In [1]:
import os
import json
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pylab as plt
import requests
from bs4 import BeautifulSoup
import sqlite3
import re
from selenium import webdriver
from selenium.webdriver.common.by import By

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')

In [2]:
conn = sqlite3.connect('red_wines_final 1.db')
c = conn.cursor()
wine_df = pd.read_sql_query("SELECT * FROM red_wines", conn)
wine_df = wine_df.sort_values(by='Price', ascending = False)
wine_df_filtered=wine_df.drop_duplicates(subset=['Producer', 'WineType'], keep = 'first')
wine_df_filtered=wine_df_filtered.sort_values(by='Price', ascending = False)
wine_df_filtered

Unnamed: 0,id,Producer,WineType,Year,Region,Country,URL,Rating,Num_Ratings,Price,url_idx
1585,1634,Bacigalupi,Zinfandel,2015.0,Russian River Valley,United States,https://www.vivino.com/bacigalupi-zinfandel/w/...,4.4,27,54513.90,62
313,318,Domaine de La Romanée-Conti,Romanée-Conti Grand Cru,2010.0,Romanée-Conti Grand Cru,France,https://www.vivino.com/domaine-de-la-romanee-c...,4.7,290,32000.00,12
366,371,Le Pin,Pomerol,1995.0,Pomerol,France,https://www.vivino.com/le-pin-pomerol/w/119749...,4.7,119,15869.00,14
126,130,Château Pétrus,Pomerol,2020.0,Pomerol,France,https://www.vivino.com/chateau-petrus-pomerol/...,4.8,58,14366.00,4
1676,1735,Domaine Arnoux-Lachaux,Vosne-Romanée 1er Cru 'Aux Reignots',2019.0,Vosne-Romanée 1er Cru 'Aux Raignots',France,https://www.vivino.com/domaine-arnoux-lachaux-...,4.3,137,9595.00,66
...,...,...,...,...,...,...,...,...,...,...,...
22214,23928,Woodbridge by Robert Mondavi,Pinot Noir,2017.0,California,United States,https://www.vivino.com/woodbridge-by-robert-mo...,3.3,772,9.72,922
21809,23519,Delheim,Cabernet Sauvignon - Shiraz,2021.0,Stellenbosch,South Africa,https://www.vivino.com/delheim-cabernet-sauvig...,3.7,57,9.05,907
22259,23975,Santa Carolina,Cabernet Sauvignon (Varietal),2023.0,Rapel Valley,Chile,https://www.vivino.com/santa-carolina-cabernet...,3.1,3534,8.75,924
22213,23927,Félix Solís,Los Molinos Valdepeñas Crianza,2023.0,Valdepeñas,Spain,https://www.vivino.com/felix-solis-los-molinos...,3.3,1249,8.70,922


In [None]:
import requests
import re
import json
import pandas as pd

# Assuming wine_df_filtered is your DataFrame that contains the URLs
wine_df_extracted = wine_df_filtered[['URL']]  

# Clean the URLs by removing query parameters (if any)
wine_df_extracted['URL'] = wine_df_extracted['URL'].str.split('?').str[0]

# Initialize lists to store the extracted data and failed URLs
all_recommended_vintages = []
all_vintages_data = []
failed_urls = []  # List to store failed URLs

# Loop through each URL in the filtered DataFrame
for index, row in wine_df_extracted.iterrows():
    url = row['URL']  # Access the URL directly
    print(f"Processing URL: {url}")  # Log the current URL being processed
    
    try:
        r = requests.get(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        })

        # Check if the request was successful
        if r.status_code == 200:
            # Search for the JavaScript data in the page source
            res = re.search(r"window\.__PRELOADED_STATE__\.winePageInformation\s*=\s*(.*});", r.text, re.MULTILINE)
            
            if res:
                data = json.loads(res.group(1))

                # Extract wine details (Producer, WineType, Region, Country, Price)
                producer = data.get("wine", {}).get("producer", "")
                wine_type = data.get("wine", {}).get("type", "")
                region = data.get("wine", {}).get("region", "")
                country = data.get("wine", {}).get("country", "")
                price = data.get("wine", {}).get("price", "")  # Extract Price, if available
                
                # Extract recommended vintages
                recommended_vintages = data.get("recommended_vintages", [])
                if recommended_vintages:
                    recommended_df = pd.DataFrame(recommended_vintages)
                    recommended_df['source_url'] = url  # Add the URL for reference
                    recommended_df['Producer'] = producer
                    recommended_df['WineType'] = wine_type
                    recommended_df['Region'] = region
                    recommended_df['Country'] = country
                    recommended_df['Price'] = price  # Add the Price
                    all_recommended_vintages.append(recommended_df)

                # Extract all vintages
                all_vintages = data.get("wine", {}).get("vintages", [])
                if all_vintages:
                    all_vintages_df = pd.DataFrame(all_vintages)
                    all_vintages_df['source_url'] = url  # Add the URL for reference
                    all_vintages_df['Producer'] = producer
                    all_vintages_df['WineType'] = wine_type
                    all_vintages_df['Region'] = region
                    all_vintages_df['Country'] = country
                    all_vintages_df['Price'] = price  # Add the Price
                    all_vintages_data.append(all_vintages_df)

            else:
                print(f"No data found for URL: {url}")
                failed_urls.append(url)  # Append failed URL
        else:
            print(f"Failed to retrieve data for URL: {url}, Status code: {r.status_code}")
            failed_urls.append(url)  # Append failed URL
    
    except Exception as e:
        print(f"An error occurred for URL: {url} - {str(e)}")
        failed_urls.append(url)  # Append failed URL

# Concatenate all the recommended vintages and all vintages data into DataFrames
if all_recommended_vintages:
    final_recommended_vintages_df = pd.concat(all_recommended_vintages, ignore_index=True)
else:
    final_recommended_vintages_df = pd.DataFrame()  # Empty DataFrame if no data

if all_vintages_data:
    final_all_vintages_df = pd.concat(all_vintages_data, ignore_index=True)
else:
    final_all_vintages_df = pd.DataFrame()  # Empty DataFrame if no data

# Display the final extracted data
print("Extracted Recommended Vintages Data:")
print(final_recommended_vintages_df)

print("\nExtracted All Vintages Data:")
print(final_all_vintages_df)

# Display the failed URLs
if failed_urls:
    print("\nFailed URLs:")
    print(failed_urls)
else:
    print("\nNo failed URLs.")


Processing URL: https://www.vivino.com/bacigalupi-zinfandel/w/2259925
Failed to retrieve data for URL: https://www.vivino.com/bacigalupi-zinfandel/w/2259925, Status code: 429
Processing URL: https://www.vivino.com/domaine-de-la-romanee-conti-romanee-conti-grand-cru/w/83912
Failed to retrieve data for URL: https://www.vivino.com/domaine-de-la-romanee-conti-romanee-conti-grand-cru/w/83912, Status code: 429
Processing URL: https://www.vivino.com/le-pin-pomerol/w/1197490
Failed to retrieve data for URL: https://www.vivino.com/le-pin-pomerol/w/1197490, Status code: 429
Processing URL: https://www.vivino.com/chateau-petrus-pomerol/w/1166837
Failed to retrieve data for URL: https://www.vivino.com/chateau-petrus-pomerol/w/1166837, Status code: 429
Processing URL: https://www.vivino.com/domaine-arnoux-lachaux-vosne-romanee-1er-cru-aux-reignots/w/1495399
Failed to retrieve data for URL: https://www.vivino.com/domaine-arnoux-lachaux-vosne-romanee-1er-cru-aux-reignots/w/1495399, Status code: 429
P

In [None]:
final_all_vintages_df

Unnamed: 0,id,Producer,WineType,statistics,Year,grapes,has_valid_ratings,source_url,Producer.1,WineType.1,Region,Country,Price
0,176916427,bacigalupi-zinfandel-2022,Bacigalupi Zinfandel 2022,"{'status': 'BelowThreshold', 'ratings_count': ...",2022,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...,,,,,
1,173856042,bacigalupi-zinfandel-2021,Bacigalupi Zinfandel 2021,"{'status': 'BelowThreshold', 'ratings_count': ...",2021,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...,,,,,
2,171559100,bacigalupi-zinfandel-2020,Bacigalupi Zinfandel 2020,"{'status': 'BelowThreshold', 'ratings_count': ...",2020,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...,,,,,
3,167792546,bacigalupi-zinfandel-2019,Bacigalupi Zinfandel 2019,"{'status': 'BelowThreshold', 'ratings_count': ...",2019,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...,,,,,
4,164394048,bacigalupi-zinfandel-2018,Bacigalupi Zinfandel 2018,"{'status': 'BelowThreshold', 'ratings_count': ...",2018,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
418650,68836723,crane-lake-pinot-noir-1993,Crane Lake Pinot Noir 1993,"{'status': 'BelowThreshold', 'ratings_count': ...",1993,,False,https://www.vivino.com/crane-lake-pinot-noir/w...,,,,,
418651,163293956,crane-lake-pinot-noir-1991,Crane Lake Pinot Noir 1991,"{'status': 'BelowThreshold', 'ratings_count': ...",1991,,False,https://www.vivino.com/crane-lake-pinot-noir/w...,,,,,
418652,38590533,crane-lake-pinot-noir-1978,Crane Lake Pinot Noir 1978,"{'status': 'BelowThreshold', 'ratings_count': ...",1978,,False,https://www.vivino.com/crane-lake-pinot-noir/w...,,,,,
418653,24872659,crane-lake-pinot-noir-1976,Crane Lake Pinot Noir 1976,"{'status': 'BelowThreshold', 'ratings_count': ...",1976,,False,https://www.vivino.com/crane-lake-pinot-noir/w...,,,,,


In [None]:
final_all_vintages_df_True = final_all_vintages_df[final_all_vintages_df["has_valid_ratings"] == True]
# all_vintages_df_True["statistics"][2]
def extract_object_data(Object_data):
    """Extracts key-value pairs from an object and returns a Series."""
    if isinstance(Object_data, dict):
        return pd.Series(Object_data)
    else:
        return pd.Series()  # Handle other data types if needed

# Apply the function to the 'object_column' and create a new DataFrame
new_columns = final_all_vintages_df_True['statistics'].apply(extract_object_data)

# Concatenate the original DataFrame with the new columns
final_all_vintages_df_True = pd.concat([final_all_vintages_df_True, new_columns], axis=1)

# Print the modified DataFrame
final_all_vintages_df_True

selected_columns = ['id', 'name', 'year', 'ratings_average', 'reviews_count', 'has_valid_ratings']
df4 = final_all_vintages_df_True[selected_columns]

df4

KeyError: "['name', 'year'] not in index"

In [None]:
# Save the DataFrame to a CSV file
df4.to_csv('final_all_vintages_filtered.csv', index=False)

print("DataFrame saved to 'final_all_vintages_filtered.csv'")


DataFrame saved to 'final_all_vintages_filtered.csv'


In [None]:
df4


Unnamed: 0,id,name,year,ratings_average,reviews_count,has_valid_ratings
6,147519855,Bacigalupi Zinfandel 2016,2016,4.5,20,True
7,154248950,Bacigalupi Zinfandel 2015,2015,4.4,14,True
22,4971178,Bacigalupi Zinfandel,0,4.4,3,True
26,162909346,Domaine de La Romanée-Conti Romanée-Conti Gran...,2020,4.7,14,True
27,159459464,Domaine de La Romanée-Conti Romanée-Conti Gran...,2019,4.9,18,True
...,...,...,...,...,...,...
418635,2175427,Crane Lake Pinot Noir 2012,2012,3.0,86,True
418636,1377237,Crane Lake Pinot Noir 2011,2011,2.9,27,True
418637,1236043,Crane Lake Pinot Noir 2010,2010,3.6,6,True
418638,1171623,Crane Lake Pinot Noir 2009,2009,3.3,13,True


DataFrame saved to 'final_all_vintages_filtered.csv'
