In [29]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install selenium
# %pip install pandas
# %pip install geopy
# %pip install geopandas
# %pip install folium
# %pip install openmeteo-requests
# %pip install requests-cache retry-requests numpy pandas
# %pip install timezonefinder
# % pip install seaborn

In [30]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
from geopy.geocoders import OpenCage
from folium import Marker
from folium import GeoJson
from retry_requests import retry
from timezonefinder import TimezoneFinder 

Function Breakdown Section 4
---
Vintage_Dataframe
- grabs two dataframes based on the URLs pulled from the popular producers of the specific wines
- one for all the wine vintage data, and one based on popular stats for the wine 
- grabs all the vintage data about certain qualities

Final_Vintage_DataFrame 
- creates a dataframe with vintage data
- included here critically is the ratings data tied to the specific producers

Training_Data
- converts output dataframe from All_Weather_Data into a dataframe indexed across producers and years 
- the columns are monthly averages based on the daily data
- adds in the rating and price data for the specific producer of a specific wine for a specific year 
- final form set up to allow easier training on a model 

Training_Data_Model2
- formats the second training dataset, which comes from every bottle in popular districts

In [31]:
def Vintage_Dataframe(popular_wines_df):
    """
    Input: takes in a dataframe of popular wines, which includes a column with the URL for each of the wines
    Output: two dataframes 
        Recommended vintages: which has the data on the specific wines chosen 
        All Vintages: which has data on all the wines
    """
    #Rename columns 
    popular_wines_df = popular_wines_df.rename(columns = {'WineType' : 'Top WineType'
    })

    #Initialize lists to store the extracted data
    all_recommended_vintages = []
    all_vintages_data = []
    failed_urls = []

    #Stips each of the URLs so they are in proper form 
    popular_wines_df['URL'] = popular_wines_df['URL'].str.split('?').str[0]

    #Loop row in the DataFrame
    for index, row in popular_wines_df.iterrows():

        # Extract producer, wine type, region, country for the current wine
        producer = row['Producer']
        winetype = row['Top WineType']
        region_name = row['Top Region']

        #Grabs each URL and strips and remaining white space 
        url = row['URL'].strip() 
        
        try:
            r = requests.get(url, headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
            })

            #Check if the request was successful
            if r.status_code == 200:
                
                #Search for the JavaScript data in the page source
                res = re.search(r"window\.__PRELOADED_STATE__\.winePageInformation\s*=\s*(.*});", r.text, re.MULTILINE)
                
                if res:
                    data = json.loads(res.group(1))
                    
                    # Extract taste information
                    taste = data.get('highlights', [{}])[0].get('metadata', {}).get('style', {}).get('baseline_structure', {})
                    acidity = taste.get("acidity", None)
                    fizziness = taste.get("fizziness", None)
                    intensity = taste.get("intensity", None)
                    sweetness = taste.get("sweetness", None)
                    tannin = taste.get("tannin", None)
                    
                    # Extract recommended vintages
                    recommended_vintages = data.get("recommended_vintages", [])
                    if recommended_vintages:
                        recommended_df = pd.DataFrame(recommended_vintages)
                        recommended_df['source_url'] = url  # Add the URL for reference
                        # Add the wine details from 'wine_df_filtered' directly
                        recommended_df['Producer'] = producer
                        recommended_df['WineType'] = winetype
                        recommended_df['Region'] = region_name

                        # Add taste information
                        recommended_df['Acidity'] = acidity
                        recommended_df['Fizziness'] = fizziness
                        recommended_df['Intensity'] = intensity
                        recommended_df['Sweetness'] = sweetness
                        recommended_df['Tannin'] = tannin
                        all_recommended_vintages.append(recommended_df)

                    # Extract all vintages
                    all_vintages = data.get("wine", {}).get("vintages", [])
                    if all_vintages:
                        all_vintages_df = pd.DataFrame(all_vintages)
                        all_vintages_df['source_url'] = url  # Add the URL for reference
                        
                        # Add the wine details from 'wine_df_filtered' directly
                        all_vintages_df['Producer'] = producer
                        all_vintages_df['WineType'] = winetype
                        all_vintages_df['Region'] = region_name

                        # Add taste information
                        all_vintages_df['Acidity'] = acidity
                        all_vintages_df['Fizziness'] = fizziness
                        all_vintages_df['Intensity'] = intensity
                        all_vintages_df['Sweetness'] = sweetness
                        all_vintages_df['Tannin'] = tannin
                        all_vintages_data.append(all_vintages_df)


                else:
                    print(f"No data found for URL: {url}")
            else:
                print(f"Failed to retrieve data for URL: {url}, Status code: {r.status_code}")
        
        except Exception as e:
            print(f"An error occurred for URL: {url} - {str(e)}")

    #Concats all the recommended vintages and all vintages data into DataFrames
    if all_recommended_vintages:
        final_recommended_vintages_df = pd.concat(all_recommended_vintages, ignore_index=True)
    else:
        final_recommended_vintages_df = pd.DataFrame()  # Empty DataFrame if no data

    if all_vintages_data:
        final_all_vintages_df = pd.concat(all_vintages_data, ignore_index=True)
    else:
        final_all_vintages_df = pd.DataFrame() 
   
    return final_recommended_vintages_df, final_all_vintages_df

In [32]:
def Final_Vintage_Dataframe(recommended_vintages_df, all_vintages_df, popular_wines_df, lower_date_bound = 1900, lower_rating_bound  = 3, lower_review_count = 1):
    """
    Input: two dataframes 
        Recommended vintages: which has the data on the specific wines chosen 
        All Vintages: which has data on all the wines
    Output:  one dataframe 
        all_filtered_winebottle: dataframe with vintage dataframe, of importance is the rating, price, year and producer name
    """

    #Defines a function for internal function use that grabs key-values pairs
    def extract_object_data(Object_data):
        """
        Extracts key-value pairs from an object and returns a Series
        """
        if isinstance(Object_data, dict):
            return pd.Series(Object_data)
        else:
            return pd.Series()  

    #Filters the vintages df so that the all the data is only gotten for those with a valid rating 
    all_vintages_df_True = all_vintages_df[all_vintages_df["has_valid_ratings"] == True]

    #Apply the function to the 'object_column' and create a new DataFrame
    new_columns = all_vintages_df_True['statistics'].apply(extract_object_data)

    #Concatenate the original DataFrame with the new columns
    all_vintages_df_True = pd.concat([all_vintages_df_True, new_columns], axis=1)

    #Grabs specific volumns from the vintage data 
    df4 = all_vintages_df_True[[
        'id', 'name', 'year', 'ratings_average', 'reviews_count', 
        'Acidity', 'Intensity', 'Sweetness', 'Tannin',
        'Region', 'Producer', 'WineType'
        ]]

    #Grabs the id and amount from the vintage data then creates a new dataframe 
    recommended_vintages_df['id'] =recommended_vintages_df['vintage'].apply(lambda x: x.get('id'))
    df5 =recommended_vintages_df[['id', 'type']].drop_duplicates(subset = ['id'])
    
    #Merges the two dataframes
    final_merge_df = pd.merge(df4, df5, on='id', how='left')

    #Filters the data by year, ratings average and reviews count 
    all_filtered_winebottle = final_merge_df[
        (final_merge_df['year'] >= lower_date_bound) &
        (final_merge_df['ratings_average'] > lower_rating_bound) &
        (final_merge_df['reviews_count'] > lower_review_count)
    ]

    #Change the columns names 
    all_filtered_winebottle = all_filtered_winebottle.rename(columns={'id': 'ID',
                                                                      'name': 'Name',
                                                                      'year': 'Year_v',
                                                                      'ratings_average': 'Ratings_Average',
                                                                      'reviews_count': 'Reviews_Count',
                                                                      'type': 'Type',
                                                                      })
    
    #Add a producer/year column for future merging 
    all_filtered_winebottle['Producer_Year'] = all_filtered_winebottle.apply(lambda row: row['Producer'] + ' ' + str(row['Year_v']), axis=1)

    return all_filtered_winebottle

In [33]:
def Training_Data(weather_df, vintage_df):
    """
    Input: Weather dataframe indexed on the daily scale,
        Vintage dataframe which has yearly bottles from the same producer and wine type
    Output: Weather dataframe indexed based on a single row for a producer and year 
        columns will be monthly min, max or average for the respective data per year
    """
    #Weather_df.index = pd.to_datetime(weather_df.index)
    producers = weather_df['Producer'].unique()

    #Initializes a dataframe 
    final_df = pd.DataFrame()

    #Check to see if date is the index
    if type(weather_df.index[0]) == int:

        weather_df = weather_df.set_index('Date') 

    #Filter the vintage_df to make winetypes uniform 
    popular_wine_types = ['château margaux', 'cabernet sauvignon', 'pinot noir', 'zinfandel', 'syrah', 
                            'pinot gris', 'sauvignon blanc', 'chardonnay', 'baco noir', 'bordeaux',
                            'malbec', 'chardonnay', 'pinot grigio', 'merlot', 'sangiovese', 'shiraz',
                            'cabernet franc', 'muscat', 'grenache', 'sangiovese'  ]
    
    #Sets all the winetypes to be lower 
    vintage_df['WineType'] = vintage_df['WineType'].str.lower()

    #Filters through each of the winetypes, then changes the df winetype name if the wine type is in the row string 
    for winetype in popular_wine_types:

        vintage_df['WineType']  = vintage_df['WineType'].apply(lambda row: next((winetype for winetype in popular_wine_types if winetype in row), row))

    #Create a column for the year and produce all the unique years
    weather_df.index = pd.to_datetime(weather_df.index)
    weather_df['Year'] = weather_df.index.year
    unique_years = weather_df['Year'].unique()

    #Find all the unique months
    weather_df['Month'] = weather_df.index.month
    unique_months = weather_df['Month'].unique()

    #Gives key for the month names 
    month_names = {1 : 'January', 2 : 'February', 3 : 'March', 4 : 'April', 5 : 'May', 
                                  6 : 'June', 7 : 'July', 8 : 'August', 9 : 'September', 10 : 'October', 11 : 'November',
                                  12 : 'December'}

    for producer in producers:

        #Filter the data based on the producer
        producer_df = weather_df[weather_df['Producer'] == producer]

        #Iterate over each unique year
        for year in unique_years:
            
            #Filter the data for the specific year
            yearly_df = producer_df[producer_df['Year'] == year]

            #Creates initial data
            yearly_data = {'Producer' : producer, 'Year' : year}

            #Ierate over each unique month
            for month in unique_months: 

                #Filter the data for the specific month
                monthly_df = yearly_df[yearly_df['Month'] == month]

                #Group by month
                Grouped_Daily_into_Monthly = monthly_df.groupby(monthly_df.index.to_period('M'))

                #Change month number to month name
                month = month_names[month]

                yearly_data.update({
                    
                    'WineType' : Grouped_Daily_into_Monthly['Top WineType'].first()[0],
                    'District' : Grouped_Daily_into_Monthly['Top District'].first()[0],
                    f'{month} Max Temp (°C)' : Grouped_Daily_into_Monthly['Max Temp (°C)'].max().iloc[0],
                    f'{month} Min Temp (°C)' : Grouped_Daily_into_Monthly['Min Temp (°C)'].min().iloc[0],
                    f'{month} Avg Temp (°C)' : Grouped_Daily_into_Monthly['Avg Temp (°C)'].mean().iloc[0],
                    f'{month} Max Relative Humidity' : Grouped_Daily_into_Monthly['Max Relative Humidity'].max().iloc[0],
                    f'{month} Min Relative Humidity' : Grouped_Daily_into_Monthly['Min Relative Humidity'].min().iloc[0],
                    f'{month} Avg Relative Humidity' : Grouped_Daily_into_Monthly['Max Relative Humidity'].mean().iloc[0],
                    f'{month} Cumulative Rain (mm)' : Grouped_Daily_into_Monthly['Cumulative Rain (mm)'].sum().iloc[0],
                    f'{month} Cumulative Snow (mm)' : Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Cumulative Precip (mm)' : Grouped_Daily_into_Monthly['Cumulative Rain (mm)'].sum().iloc[0] + Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Cumulative Snow (mm)' : Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Avg Cloud Cover (%)' : Grouped_Daily_into_Monthly['Avg Cloud Cover (%)'].mean().iloc[0],
                    f'{month} Max Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Max Wind Speed (Km/h)'].max().iloc[0],
                    f'{month} Min Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Min Wind Speed (Km/h)'].min().iloc[0],
                    f'{month} Avg Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Avg Wind Speed (Km/h)'].mean().iloc[0],
                    f'{month} Avg Daylight Hours' : Grouped_Daily_into_Monthly['Daylight Hours'].mean().iloc[0], 
                    f'{month} Days Below 0 (°C)': (monthly_df['Avg Temp (°C)'] <= 0).sum(),
                    f'{month} Days Above 32 (°C)': (monthly_df['Avg Temp (°C)'] > 32).sum()

                })
            
            #Adds this yearly data to original df
            final_df = pd.concat([final_df, pd.DataFrame([yearly_data])], ignore_index=True)
    
    #Add a producer/year column for future merging and reset index
    final_df = final_df.reset_index()
    final_df['Producer_Year'] = final_df.apply(lambda row: row['Producer'] + ' ' + str(row['Year']), axis=1)
    weather_df_columns = final_df.drop(columns = ['index'])

    #Format vintage_df 
    vintage_df['Producer_Year'] = vintage_df['Producer_Year'].str.lower()
    final_df['Producer_Year'] = final_df['Producer_Year'].str.lower()

    #Combine the two dataframes     
    final_df = pd.merge(final_df, vintage_df, on = 'Producer_Year', how = 'left')
    
    #Modify the dataframe for easier viewing
    final_df = final_df.drop(columns = ['index','Producer_Year', 'ID', 'Reviews_Count', 'Name', 'Year_v', 'Producer_x', 'WineType_x'])
    
    final_df.insert(0, 'Producer', final_df.pop('Producer_y'))
    final_df.insert(1, 'WineType', final_df.pop('WineType_y'))
    final_df.insert(2, 'District', final_df.pop('District'))
    final_df.insert(3, 'Region', final_df.pop('Region'))
    final_df.insert(4, 'Year', final_df.pop('Year'))
    final_df.insert(5, 'Ratings_Average', final_df.pop('Ratings_Average'))

    #Drop all the rows that have NAN in the ratings average column 
    final_df = final_df.dropna(subset = ['Ratings_Average'])

    return final_df, weather_df_columns

In [61]:
def Training_Data_Model2(formatted_weather_data, vintage_df_M2, all_bottle_data):
    """
    Input: Weather data for each district, all the vintage data for each bottle, all the bottles of interest

    Function converts, merges and formats the three dataframes into one final training dataframe 

    Output: 1 dataframe where each bottle has a rating, price, and weather data for each bottle
        Weather data is for the district, so each bottle will have the same weather data within a district
        Creating this dataframe to show local trends based on vintage qualitative data

    """

    vintage_df_M2 = vintage_df_M2.dropna(subset=['Producer', 'WineType'])
    all_bottle_data = all_bottle_data.dropna(subset=['Producer', 'WineType'])

    #Add a producer/winetype column to both dataframes
    vintage_df_M2['Producer_WineType'] = (vintage_df_M2['Producer'] + '/' +  vintage_df_M2['WineType']).str.lower().str.strip()
    all_bottle_data['Producer_WineType'] = (all_bottle_data['Producer'] + '/' +  all_bottle_data['WineType']).str.lower().str.strip()

    #Drop the producer winetype columns
    vintage_df_M2 = vintage_df_M2.drop(columns = ['Producer', 'WineType'])
    all_bottle_data = all_bottle_data.drop(columns = ['Producer', 'WineType'])

    #Grab important data from all_bottle_data
    district_df = all_bottle_data[['Producer_WineType', 'Top District', 'Price']]

    #Merge on producer_winetype with vintage data, only keeping the relevent data
    int_df = pd.merge(district_df, vintage_df_M2, on = 'Producer_WineType')

    #Set out the important columns for the final_df
    final_df = int_df[['Producer_WineType', 'Top District', 'Year_v', 'Ratings_Average', 'Price', 'Acidity', 'Intensity', 'Sweetness', 'Tannin', 'Type']].copy()
    final_df.columns = ['Producer_WineType', 'District', 'Year', 'Ratings_Average', 'Price', 'Acidity', 'Intensity', 'Sweetness', 'Tannin', 'Type']

    #Create a producer, winetype, and district_year column (the expand statement for split did not work)
    final_df = final_df[final_df['Producer_WineType'].str.contains('/')]
    final_df['Producer'] = final_df['Producer_WineType'].str.split('/').str[0]
    final_df['WineType'] = final_df['Producer_WineType'].str.split('/').str[1]

    final_df['District_Year'] = (final_df['District'] + '/' + final_df['Year'].astype(str)).str.lower().str.strip()
    
    #Create a district_year column for merging and drop columns that are not relevent 
    formatted_weather_data['District_Year'] = (formatted_weather_data['District'] + '/' + formatted_weather_data['Year'].astype(str)).str.lower().str.strip()
    formatted_weather_data = formatted_weather_data.drop(columns = ['Producer', 'Year', 'WineType', 'District'])

    #Merge weather data on the final dataframe and drop irrelevent columns 
    final_df_M2 = pd.merge(final_df, formatted_weather_data, on = 'District_Year', how = 'left')  
    final_df_M2 = final_df_M2.drop(columns = ['Producer_Year', 'District_Year', 'Producer_WineType', ])

    #Re-order columns for easier viewing
    final_df_M2.insert(0, 'Producer', final_df_M2.pop('Producer'))
    final_df_M2.insert(1, 'WineType', final_df_M2.pop('WineType'))
    final_df_M2.insert(2, 'District', final_df_M2.pop('District'))
    final_df_M2.insert(3, 'Year', final_df_M2.pop('Year'))

    #Drop na from applicable columns
    final_df_M2 = final_df_M2.dropna(subset = [
        'Producer', 'WineType', 'District', 
        'Year', 'Ratings_Average', 'Acidity', 'Intensity', 
        'Sweetness', 'Tannin'
        ]) 

    return final_df_M2

---
## Variable Section 4 


### Load previous CSV Files

In [35]:
final_df_file_path = r'C:\Users\fwhal\Downloads\CME528\Project\Repo-2\BreakinBadCode\Final_DataFrames'

FINAL_wine_df_FINAL = pd.read_csv(os.path.join(final_df_file_path, 'FINAL_wine_df_FINAL.csv'))
FINAL_weather_data_FINAL = pd.read_csv(os.path.join(final_df_file_path, 'FINAL_weather_data_FINAL.csv'))
FINAL_wine_df_filtered_1Bottle_FINAL = pd.read_csv(os.path.join(final_df_file_path, 'FINAL_wine_df_filtered_1Bottle_FINAL.csv'))
FINAL_wine_df_filtered_All_Bottles_FINAL = pd.read_csv(os.path.join(final_df_file_path, 'FINAL_wine_df_filtered_All_Bottles_FINAL.csv'))

### Create training dataset 1

In [36]:
recommended_vintages_M1_df, all_vintages_M1_df = Vintage_Dataframe(FINAL_wine_df_filtered_1Bottle_FINAL)
vintage_df_M1_final = Final_Vintage_Dataframe(recommended_vintages_M1_df, all_vintages_M1_df, FINAL_wine_df_FINAL, lower_date_bound = 2000, lower_rating_bound  = 3, lower_review_count = 1)

FINAL_training_data_Model1_FINAL, Columned_Weather_Data = Training_Data(FINAL_weather_data_FINAL, vintage_df_M1_final)
FINAL_training_data_Model1_FINAL.to_csv(os.path.join(final_df_file_path, 'FINAL_training_data_Model1_FINAL.csv'), index=False)

An error occurred for URL: https://www.vivino.com/red-rooster-winery-malbec/w/94134 - list index out of range


  Grouped_Daily_into_Monthly = monthly_df.groupby(monthly_df.index.to_period('M'))
  'WineType' : Grouped_Daily_into_Monthly['Top WineType'].first()[0],
  'District' : Grouped_Daily_into_Monthly['Top District'].first()[0],


### Create training dataset 2

In [37]:
# recommended_vintages_M2_df, all_vintages_M2_df = Vintage_Dataframe(FINAL_wine_df_filtered_All_Bottles_FINAL)
# vintage_df_M2_final = Final_Vintage_Dataframe(recommended_vintages_M2_df, all_vintages_M2_df, FINAL_wine_df_FINAL, lower_date_bound = 2000, lower_rating_bound  = 3, lower_review_count = 1)
# vintage_df_M2_final.to_csv(os.path.join(final_df_file_path, 'vintage_df_M2_final.csv'), index=False)

# FINAL_training_data_Model2_FINAL = Training_Data_Model2(Columned_Weather_Data, vintage_df_M2_final, FINAL_wine_df_filtered_All_Bottles_FINAL)
# FINAL_training_data_Model2_FINAL.to_csv(os.path.join(final_df_file_path, 'FINAL_training_data_Model2_FINAL.csv'), index=False)

In [38]:
# recommended_vintages_M2_df, all_vintages_M2_df = Vintage_Dataframe(FINAL_wine_df_filtered_All_Bottles_FINAL)
# vintage_df_M2_final = Final_Vintage_Dataframe(recommended_vintages_M2_df, all_vintages_M2_df, FINAL_wine_df_FINAL, lower_date_bound = 2000, lower_rating_bound  = 3, lower_review_count = 1)
# vintage_df_M2_final.to_csv(os.path.join(final_df_file_path, 'vintage_df_M2_final.csv'), index=False)

In [39]:
vintage_df_M2_final = pd.read_csv(os.path.join(final_df_file_path, 'vintage_df_M2_final.csv'))

In [62]:
FINAL_training_data_Model2_FINAL = Training_Data_Model2(Columned_Weather_Data, vintage_df_M2_final, FINAL_wine_df_filtered_All_Bottles_FINAL)
FINAL_training_data_Model2_FINAL.to_csv(os.path.join(final_df_file_path, 'FINAL_training_data_Model2_FINAL.csv'), index=False)

---