In [1]:
import os
import json
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pylab as plt
import requests
from bs4 import BeautifulSoup
import sqlite3
import re
from selenium import webdriver
from selenium.webdriver.common.by import By

# Configure Notebook
# Render matplotlib figures inline in the notebook output, and apply the
# 'fivethirtyeight' plot style plus seaborn's "notebook" context.
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# raised after this cell (including pandas warnings such as
# SettingWithCopyWarning) is hidden for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# Read the scraped red-wine table from the local SQLite file.
conn = sqlite3.connect('red_wines_final 1.db')
c = conn.cursor()

# Load every row and order it from the highest to the lowest Price, so that
# the de-duplication below keeps the most expensive row of each group.
wine_df = pd.read_sql_query(
    "SELECT * FROM red_wines", conn
).sort_values(by='Price', ascending=False)

# One row per (Producer, WineType) pair: the first one in the sorted order.
wine_df_filtered = wine_df.drop_duplicates(
    subset=['Producer', 'WineType'],
    keep='first',
)
print(wine_df_filtered["URL"])


1585     https://www.vivino.com/bacigalupi-zinfandel/w/...
313      https://www.vivino.com/domaine-de-la-romanee-c...
366      https://www.vivino.com/le-pin-pomerol/w/119749...
126      https://www.vivino.com/chateau-petrus-pomerol/...
1676     https://www.vivino.com/domaine-arnoux-lachaux-...
                               ...                        
22214    https://www.vivino.com/woodbridge-by-robert-mo...
21809    https://www.vivino.com/delheim-cabernet-sauvig...
22259    https://www.vivino.com/santa-carolina-cabernet...
22213    https://www.vivino.com/felix-solis-los-molinos...
22166    https://www.vivino.com/crane-lake-pinot-noir/w...
Name: URL, Length: 11240, dtype: object


In [3]:
# Create a new dataframe for vintage data extraction.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of wine_df_filtered (which triggers SettingWithCopyWarning).
wine_df_extracted = wine_df_filtered[['URL']].copy()

# Clean the URLs by removing query parameters to get the vintage results
wine_df_extracted['URL'] = wine_df_extracted['URL'].str.split('?').str[0]

# Initialize lists to store the extracted data and failed URLs
all_recommended_vintages = []
all_vintages_data = []
failed_urls = []

# Request settings. The timeout keeps one stalled connection from blocking
# the whole run; without it requests waits indefinitely.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
}
REQUEST_TIMEOUT_SECONDS = 30

# Matches the JavaScript assignment that embeds the page data as JSON;
# group 1 is the JSON object text. Compiled once instead of on every page.
PRELOADED_STATE_RE = re.compile(
    r"window\.__PRELOADED_STATE__\.winePageInformation\s*=\s*(.*});",
    re.MULTILINE,
)

# A single Session reuses the underlying connection across requests.
with requests.Session() as session:
    session.headers.update(REQUEST_HEADERS)

    # Loop through each URL in the filtered DataFrame
    for url in wine_df_extracted['URL']:
        print(f"Processing URL: {url}")  # Log the current URL being processed

        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

            # Check if the request was successful
            if r.status_code != 200:
                print(f"Failed to retrieve data for URL: {url}, Status code: {r.status_code}")
                failed_urls.append(url)  # Append failed URL
                continue

            # Search for the JavaScript data in the page source
            res = PRELOADED_STATE_RE.search(r.text)
            if not res:
                print(f"No data found for URL: {url}")
                failed_urls.append(url)  # Append failed URL
                continue

            data = json.loads(res.group(1))

            # Extract recommended vintages
            recommended_vintages = data.get("recommended_vintages", [])
            if recommended_vintages:
                recommended_df = pd.DataFrame(recommended_vintages)
                recommended_df['source_url'] = url  # Add the URL for reference
                all_recommended_vintages.append(recommended_df)

            # Extract all vintages
            all_vintages = data.get("wine", {}).get("vintages", [])
            if all_vintages:
                all_vintages_df = pd.DataFrame(all_vintages)
                all_vintages_df['source_url'] = url  # Add the URL for reference
                all_vintages_data.append(all_vintages_df)

        except Exception as e:
            # Deliberately broad: a network error, timeout, or malformed page
            # is recorded as a failure and the run continues with the next URL.
            print(f"An error occurred for URL: {url} - {str(e)}")
            failed_urls.append(url)  # Append failed URL

# Concatenate all the recommended vintages and all vintages data into DataFrames
# (an empty DataFrame is used when nothing was collected).
final_recommended_vintages_df = (
    pd.concat(all_recommended_vintages, ignore_index=True)
    if all_recommended_vintages
    else pd.DataFrame()
)
final_all_vintages_df = (
    pd.concat(all_vintages_data, ignore_index=True)
    if all_vintages_data
    else pd.DataFrame()
)

# Display the final extracted data
print("Extracted Recommended Vintages Data:")
print(final_recommended_vintages_df)

print("\nExtracted All Vintages Data:")
print(final_all_vintages_df)

# Display the failed URLs
if failed_urls:
    print("\nFailed URLs:")
    print(failed_urls)
else:
    print("\nNo failed URLs.")


Processing URL: https://www.vivino.com/bacigalupi-zinfandel/w/2259925
Processing URL: https://www.vivino.com/domaine-de-la-romanee-conti-romanee-conti-grand-cru/w/83912
Processing URL: https://www.vivino.com/le-pin-pomerol/w/1197490
Processing URL: https://www.vivino.com/chateau-petrus-pomerol/w/1166837
Processing URL: https://www.vivino.com/domaine-arnoux-lachaux-vosne-romanee-1er-cru-aux-reignots/w/1495399
Processing URL: https://www.vivino.com/tenuta-san-guido-sassicaia/w/5078
Processing URL: https://www.vivino.com/chateau-mouton-rothschild-pauillac-premier-grand-cru-classe/w/1684223
Processing URL: https://www.vivino.com/truchot-martin-vieilles-vignes-charmes-chambertin-grand-cru/w/3120598
Processing URL: https://www.vivino.com/domaine-de-la-romanee-conti-grands-echezeaux-grand-cru/w/1286354
Processing URL: https://www.vivino.com/screaming-eagle-cabernet-sauvignon/w/82025
Processing URL: https://www.vivino.com/domaine-de-la-romanee-conti-la-tache-grand-cru/w/83911
Processing URL: h

In [4]:
# Display the concatenated per-vintage records (one row per vintage, with the
# page URL it was extracted from in 'source_url').
final_all_vintages_df

Unnamed: 0,id,seo_name,name,statistics,year,grapes,has_valid_ratings,source_url
0,176916427,bacigalupi-zinfandel-2022,Bacigalupi Zinfandel 2022,"{'status': 'BelowThreshold', 'ratings_count': ...",2022,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...
1,173856042,bacigalupi-zinfandel-2021,Bacigalupi Zinfandel 2021,"{'status': 'BelowThreshold', 'ratings_count': ...",2021,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...
2,171559100,bacigalupi-zinfandel-2020,Bacigalupi Zinfandel 2020,"{'status': 'BelowThreshold', 'ratings_count': ...",2020,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...
3,167792546,bacigalupi-zinfandel-2019,Bacigalupi Zinfandel 2019,"{'status': 'BelowThreshold', 'ratings_count': ...",2019,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...
4,164394048,bacigalupi-zinfandel-2018,Bacigalupi Zinfandel 2018,"{'status': 'BelowThreshold', 'ratings_count': ...",2018,,False,https://www.vivino.com/bacigalupi-zinfandel/w/...
...,...,...,...,...,...,...,...,...
147405,159739530,la-spinetta-pin-1978,La Spinetta Pin 1978,"{'status': 'BelowThreshold', 'ratings_count': ...",1978,,False,https://www.vivino.com/la-spinetta-pin/w/15192
147406,38009325,la-spinetta-pin-1957,La Spinetta Pin 1957,"{'status': 'BelowThreshold', 'ratings_count': ...",1957,,False,https://www.vivino.com/la-spinetta-pin/w/15192
147407,152889930,la-spinetta-pin-1945,La Spinetta Pin 1945,"{'status': 'BelowThreshold', 'ratings_count': ...",1945,,False,https://www.vivino.com/la-spinetta-pin/w/15192
147408,172260183,la-spinetta-pin-1878,La Spinetta Pin 1878,"{'status': 'BelowThreshold', 'ratings_count': ...",1878,,False,https://www.vivino.com/la-spinetta-pin/w/15192


In [5]:
# Keep only the vintage rows whose 'has_valid_ratings' flag is True.
has_valid_ratings_mask = final_all_vintages_df["has_valid_ratings"] == True
final_all_vintages_df_True = final_all_vintages_df[has_valid_ratings_mask]
def extract_object_data(Object_data):
    """Turn a dict into a pandas Series of its key-value pairs.

    Parameters
    ----------
    Object_data : Any
        The value to expand. Only ``dict`` instances are expanded.

    Returns
    -------
    pandas.Series
        A Series indexed by the dict's keys when ``Object_data`` is a dict;
        otherwise an empty Series with ``object`` dtype.
    """
    if isinstance(Object_data, dict):
        return pd.Series(Object_data)
    # Explicit dtype: a bare pd.Series() defaults to float64 (with a
    # deprecation warning) on pandas 1.x and to object on pandas 2.x.
    return pd.Series(dtype=object)

# Expand each row's 'statistics' value into one column per key.
new_columns = final_all_vintages_df_True['statistics'].apply(extract_object_data)

# Place the expanded columns alongside the existing ones (aligned on the index).
frames_side_by_side = [final_all_vintages_df_True, new_columns]
final_all_vintages_df_True = pd.concat(frames_side_by_side, axis=1)

# Keep just the identifying fields and the rating figures.
selected_columns = [
    'id',
    'name',
    'year',
    'ratings_average',
    'reviews_count',
    'has_valid_ratings',
]
final_all_vintages_df1 = final_all_vintages_df_True[selected_columns]

final_all_vintages_df1

Unnamed: 0,id,name,year,ratings_average,reviews_count,has_valid_ratings
6,147519855,Bacigalupi Zinfandel 2016,2016,4.5,20,True
7,154248950,Bacigalupi Zinfandel 2015,2015,4.4,14,True
22,4971178,Bacigalupi Zinfandel,0,4.4,3,True
26,162909346,Domaine de La Romanée-Conti Romanée-Conti Gran...,2020,4.7,14,True
27,159459464,Domaine de La Romanée-Conti Romanée-Conti Gran...,2019,4.9,18,True
...,...,...,...,...,...,...
147389,1998201,La Spinetta Pin 1999,1999,4.1,31,True
147390,2812277,La Spinetta Pin 1998,1998,4.1,22,True
147391,2005350,La Spinetta Pin 1997,1997,3.8,13,True
147395,2798496,La Spinetta Pin 1993,1993,4.4,14,True


In [6]:
def _dict_field(value, key):
    """Return ``value[key]`` when ``value`` is a dict that has ``key``, else None."""
    if isinstance(value, dict):
        return value.get(key)
    return None


# Pull scalar fields out of the nested 'vintage' and 'price' values.
# All three go through the same guarded helper, so a non-dict cell
# (e.g. NaN or None) yields None instead of raising on the `in` check.
final_recommended_vintages_df['id'] = final_recommended_vintages_df['vintage'].apply(_dict_field, args=('id',))
final_recommended_vintages_df['name'] = final_recommended_vintages_df['vintage'].apply(_dict_field, args=('name',))
final_recommended_vintages_df['amount'] = final_recommended_vintages_df['price'].apply(_dict_field, args=('amount',))

# Keep only the flattened fields plus the recommendation 'type'.
selected_columns = ['id', 'name', 'type', 'amount']
final_recommended_vintages_df1 = final_recommended_vintages_df[selected_columns]
final_recommended_vintages_df1

Unnamed: 0,id,name,type,amount
0,147519855,Bacigalupi Zinfandel 2016,best_user_rated,
1,154248950,Bacigalupi Zinfandel 2015,most_user_rated,54513.9
2,167792546,Bacigalupi Zinfandel 2019,top_ranked,
3,1309310,Domaine de La Romanée-Conti Romanée-Conti Gran...,wsa_winning,
4,159459464,Domaine de La Romanée-Conti Romanée-Conti Gran...,best_user_rated,
...,...,...,...,...
12412,12407045,Cascina Adelaide Cannubi Barolo 2012,most_user_rated,
12413,153157004,Cascina Adelaide Cannubi Barolo 2016,top_ranked,
12414,2798496,La Spinetta Pin 1993,best_user_rated,
12415,9051550,La Spinetta Pin 2013,most_user_rated,


In [8]:
# Outer-join the rated vintages with the recommended vintages on 'id'.
# Both frames carry a 'name' column, so the merge suffixes them; 'name_x'
# is the one coming from final_all_vintages_df1.
final_merge_df = final_all_vintages_df1.merge(
    final_recommended_vintages_df1,
    how="outer",
    on="id",
)
selected_columns = [
    'id',
    'name_x',
    'year',
    'ratings_average',
    'reviews_count',
    'has_valid_ratings',
    'type',
    'amount',
]
all_winebottlevintage_df = final_merge_df[selected_columns]

# Row filters: vintage year from 1990 on, average rating above 3,
# and more than one review.
year_mask = all_winebottlevintage_df['year'] >= 1990
rating_mask = all_winebottlevintage_df['ratings_average'] > 3
reviews_mask = all_winebottlevintage_df['reviews_count'] > 1

all_filtered_winebottle = (
    all_winebottlevintage_df
    .loc[year_mask & rating_mask & reviews_mask]
    .sort_values(by='name_x', ascending=False)
)

# Widen the pandas display limits before showing the filtered frame.
pd.set_option('display.max_rows', 3000)  # Set to None to show all rows
pd.set_option('display.max_columns', None)
all_filtered_winebottle

Unnamed: 0,id,name_x,year,ratings_average,reviews_count,has_valid_ratings,type,amount
47480,162833040,Álvaro Palacios Quiñon de Valmira 2019,2019.0,4.6,19.0,True,best_user_rated,
46520,160297222,Álvaro Palacios Quiñon de Valmira 2018,2018.0,4.6,28.0,True,top_ranked,
44277,156609096,Álvaro Palacios Quiñon de Valmira 2017,2017.0,4.6,20.0,True,,
42252,153375629,Álvaro Palacios Quiñon de Valmira 2016,2016.0,4.5,53.0,True,,
42275,153407612,Álvaro Palacios Quiñon de Valmira 2015,2015.0,4.6,45.0,True,most_user_rated,
...,...,...,...,...,...,...,...,...
39268,146495543,50th Parallel Estate Unparalleled Pinot Noir 2016,2016.0,3.8,23.0,True,,
35195,26918652,50th Parallel Estate Unparalleled Pinot Noir 2015,2015.0,4.0,13.0,True,top_ranked,
34453,21642824,50th Parallel Estate Unparalleled Pinot Noir 2014,2014.0,3.9,41.0,True,most_user_rated,
30571,8379954,50th Parallel Estate Unparalleled Pinot Noir 2013,2013.0,3.8,15.0,True,,


In [9]:
# Save the filtered DataFrame to a CSV file (without the index column).
output_csv_path = 'all_filtered_winebottle_v1.csv'
all_filtered_winebottle.to_csv(output_csv_path, index=False)

# Report the path that was actually written; the message previously named a
# different file ('final_all_vintages_filteredv1.csv') than the one saved.
print(f"DataFrame saved to '{output_csv_path}'")


DataFrame saved to 'final_all_vintages_filteredv1.csv'


In [None]:
# Read back the CSV written by the save cell and display it.
# NOTE(review): the saved output under this cell shows columns
# id/name/year/ratings_average/reviews_count/has_valid_ratings, whereas the
# frame written to this file has name_x, type and amount columns — the
# output looks like it came from an earlier run; re-run to refresh it.
AWB = pd.read_csv('all_filtered_winebottle_v1.csv')
AWB

Unnamed: 0,id,name,year,ratings_average,reviews_count,has_valid_ratings
0,147519855,Bacigalupi Zinfandel 2016,2016,4.5,20,True
1,154248950,Bacigalupi Zinfandel 2015,2015,4.4,14,True
2,4971178,Bacigalupi Zinfandel,0,4.4,3,True
3,162909346,Domaine de La Romanée-Conti Romanée-Conti Gran...,2020,4.7,14,True
4,159459464,Domaine de La Romanée-Conti Romanée-Conti Gran...,2019,4.9,18,True
...,...,...,...,...,...,...
138202,2175427,Crane Lake Pinot Noir 2012,2012,3.0,86,True
138203,1377237,Crane Lake Pinot Noir 2011,2011,2.9,27,True
138204,1236043,Crane Lake Pinot Noir 2010,2010,3.6,6,True
138205,1171623,Crane Lake Pinot Noir 2009,2009,3.3,13,True
