In [1]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup


In [2]:
wines_file_path = r'C:\Users\fwhal\Downloads\CME528\Project\Repo-2\BreakinBadCode\Final_Wines_Of_Interest'
final_df_file_path = r'C:\Users\fwhal\Downloads\CME528\Project\Repo-2\BreakinBadCode\Final_DataFrames'


Dry_Wines = pd.read_csv(os.path.join(wines_file_path, 'Best_Drywines.csv'))
Medium_Wines = pd.read_csv(os.path.join(wines_file_path, 'Best_Medium_Drywines.csv'))
Sweet_Wines = pd.read_csv(os.path.join(wines_file_path, 'Best_Sweetwines.csv'))
Best_Wines = pd.read_csv(os.path.join(wines_file_path, 'Best_Wines.csv'))

FINAL_weather_data_FINAL = pd.read_csv(os.path.join(final_df_file_path, 'FINAL_weather_data_FINAL.csv'))

In [3]:
def Training_Data(weather_df, wine_df):
    """
    Input: Weather dataframe indexed on the daily scale,
        Vintage dataframe which has yearly bottles from the same producer and wine type
    Output: Weather dataframe indexed based on a single row for a producer and year 
        columns will be monthly min, max or average for the respective data per year
    """
    #Weather_df.index = pd.to_datetime(weather_df.index)
    producers = weather_df['Producer'].unique()

    #Initializes a dataframe 
    final_df = pd.DataFrame()

    #Check to see if date is the index
    if type(weather_df.index[0]) == int:

        weather_df = weather_df.set_index('Date') 

    #Filter the wine_df to make winetypes uniform 
    popular_wine_types = ['château margaux', 'cabernet sauvignon', 'pinot noir', 'zinfandel', 'syrah', 
                            'pinot gris', 'sauvignon blanc', 'chardonnay', 'baco noir', 'bordeaux',
                            'malbec', 'chardonnay', 'pinot grigio', 'merlot', 'sangiovese', 'shiraz',
                            'cabernet franc', 'muscat', 'grenache', 'sangiovese'  ]
    
    #Sets all the winetypes to be lower 
    wine_df['WineType'] = wine_df['WineType'].str.lower()

    #Filters through each of the winetypes, then changes the df winetype name if the wine type is in the row string 
    for winetype in popular_wine_types:

        wine_df['WineType']  = wine_df['WineType'].apply(lambda row: next((winetype for winetype in popular_wine_types if winetype in row), row))

    #Create a column for the year and produce all the unique years
    weather_df.index = pd.to_datetime(weather_df.index)
    weather_df['Year'] = weather_df.index.year
    unique_years = weather_df['Year'].unique()

    #Find all the unique months
    weather_df['Month'] = weather_df.index.month
    unique_months = weather_df['Month'].unique()

    #Gives key for the month names 
    month_names = {1 : 'January', 2 : 'February', 3 : 'March', 4 : 'April', 5 : 'May', 
                                  6 : 'June', 7 : 'July', 8 : 'August', 9 : 'September', 10 : 'October', 11 : 'November',
                                  12 : 'December'}

    for producer in producers:

        #Filter the data based on the producer
        producer_df = weather_df[weather_df['Producer'] == producer]

        #Iterate over each unique year
        for year in unique_years:
            
            #Filter the data for the specific year
            yearly_df = producer_df[producer_df['Year'] == year]

            #Creates initial data
            yearly_data = {'Producer' : producer, 'Year' : year}

            #Ierate over each unique month
            for month in unique_months: 

                #Filter the data for the specific month
                monthly_df = yearly_df[yearly_df['Month'] == month]

                #Group by month
                Grouped_Daily_into_Monthly = monthly_df.groupby(monthly_df.index.to_period('M'))

                #Change month number to month name
                month = month_names[month]

                yearly_data.update({
                    
                    'WineType' : Grouped_Daily_into_Monthly['Top WineType'].first()[0],
                    'District' : Grouped_Daily_into_Monthly['Top District'].first()[0],
                    f'{month} Max Temp (°C)' : Grouped_Daily_into_Monthly['Max Temp (°C)'].max().iloc[0],
                    f'{month} Min Temp (°C)' : Grouped_Daily_into_Monthly['Min Temp (°C)'].min().iloc[0],
                    f'{month} Avg Temp (°C)' : Grouped_Daily_into_Monthly['Avg Temp (°C)'].mean().iloc[0],
                    f'{month} Max Relative Humidity' : Grouped_Daily_into_Monthly['Max Relative Humidity'].max().iloc[0],
                    f'{month} Min Relative Humidity' : Grouped_Daily_into_Monthly['Min Relative Humidity'].min().iloc[0],
                    f'{month} Avg Relative Humidity' : Grouped_Daily_into_Monthly['Max Relative Humidity'].mean().iloc[0],
                    f'{month} Cumulative Rain (mm)' : Grouped_Daily_into_Monthly['Cumulative Rain (mm)'].sum().iloc[0],
                    f'{month} Cumulative Snow (mm)' : Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Cumulative Precip (mm)' : Grouped_Daily_into_Monthly['Cumulative Rain (mm)'].sum().iloc[0] + Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Cumulative Snow (mm)' : Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Avg Cloud Cover (%)' : Grouped_Daily_into_Monthly['Avg Cloud Cover (%)'].mean().iloc[0],
                    f'{month} Max Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Max Wind Speed (Km/h)'].max().iloc[0],
                    f'{month} Min Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Min Wind Speed (Km/h)'].min().iloc[0],
                    f'{month} Avg Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Avg Wind Speed (Km/h)'].mean().iloc[0],
                    f'{month} Avg Daylight Hours' : Grouped_Daily_into_Monthly['Daylight Hours'].mean().iloc[0], 
                    f'{month} Days Below 0 (°C)': (monthly_df['Avg Temp (°C)'] <= 0).sum(),
                    f'{month} Days Above 32 (°C)': (monthly_df['Avg Temp (°C)'] > 32).sum()

                })
            
            #Adds this yearly data to original df
            final_df = pd.concat([final_df, pd.DataFrame([yearly_data])], ignore_index=True)
    
    #Add a producer/year column for future merging and reset index
    final_df = final_df.reset_index()
    final_df['Producer_WineType_Year'] = final_df['Producer'] + '/' + final_df['WineType'] + '/' + str(final_df['Year'])
    final_df['Producer_WineType_Year'] = final_df['Producer_WineType_Year'].str.lower()

    weather_df_columns = final_df.drop(columns = ['index'])
    print(final_df)

    #Format wine_df 
    wine_df['Producer_WineType_Year'] = wine_df['Producer'] + '/' + wine_df['WineType'] + '/' + str(wine_df['year'])
    wine_df['Producer_WineType_Year'] = wine_df['Producer_WineType_Year'].str.lower()

    #Combine the two dataframes     
    final_df = pd.merge(final_df, wine_df, on = 'Producer_WineType_Year', how = 'left')
    
    #Modify the dataframe for easier viewing
    final_df = final_df.drop(columns = ['index', 'year', 'Producer_y', 'WineType_y', 'has_valid_ratings', 'Producer_WineType_Year'])
    
    final_df.insert(0, 'Producer', final_df.pop('Producer_x'))
    final_df.insert(1, 'WineType', final_df.pop('WineType_x'))
    final_df.insert(2, 'District', final_df.pop('District'))
    final_df.insert(3, 'Region', final_df.pop('Region'))
    final_df.insert(4, 'Country', final_df.pop('Country'))
    final_df.insert(5, 'Year', final_df.pop('Year'))
    final_df.insert(5, 'Ratings_Average', final_df.pop('ratings_average'))

    #Drop all the rows that have NAN in the ratings average column 
    final_df = final_df.dropna(subset = ['Ratings_Average'])

    return final_df

In [4]:
Dry_Wines

Unnamed: 0,id,name_x,Country,Region,Producer,WineType,year,has_valid_ratings,ratings_average,reviews_count,...,type,Tannin,Acidity,Intensity,Sweetness,Acidity_norm,Tannin_norm,Intensity_norm,score,price_range
0,156238156,Beringer Beringer Bros. Bourbon Barrel Aged Ca...,United States,Napa Valley,Beringer,Beringer Bros. Bourbon Barrel Aged Cabernet Sa...,2016.0,True,4.1,4089.0,...,price_qpr,4.0,3.5,5.0,1.0,0.25,0.75,1.0,203.813,0-30
1,167029502,Bread & Butter Pinot Noir 2021,United States,Napa Valley,Bread & Butter,Pinot Noir,2021.0,True,4.0,3171.0,...,best_user_rated,2.0,3.5,3.0,1.0,0.25,0.25,0.0,156.726,0-30
2,85993665,Casa Ferreirinha Papa Figos Douro 2016,Portugal,Douro,Casa Ferreirinha,Papa Figos Douro,2016.0,True,3.9,2767.0,...,oldest_available,4.0,3.0,5.0,1.0,0.0,0.75,1.0,136.6285,0-30
3,99119393,Rutini Cabernet - Malbec 2016,Argentina,Mendoza,Rutini,Cabernet - Malbec,2016.0,True,4.2,3286.0,...,most_user_rated,3.5,3.0,4.5,1.0,0.0,0.625,0.75,161.55525,30-60
4,156135225,Marqués de Riscal Rioja Reserva 2018,Spain,Rioja,Marqués de Riscal,Rioja Reserva,2018.0,True,4.1,3220.0,...,top_listed,4.5,4.0,4.0,1.0,0.5,0.875,0.5,158.89875,30-60
5,150266396,Bodegas Marqués de Cáceres Crianza 2017,Spain,Rioja,Bodegas Marqués de Cáceres,Crianza,2017.0,True,3.7,3146.0,...,best_user_rated,4.5,4.0,4.0,1.0,0.5,0.875,0.5,155.45975,30-60
6,162874246,Austin Hope Austin Hope Cabernet Sauvignon 2020,United States,Paso Robles,Austin Hope,Austin Hope Cabernet Sauvignon,2020.0,True,4.5,3107.0,...,price_discounted,4.5,3.5,5.0,1.0,0.25,0.875,1.0,149.25725,60-100
7,160471734,Barbanera Gigino 80 Anniversario 2018,Italy,Toscana,Barbanera,Gigino 80 Anniversario,2018.0,True,4.3,1460.0,...,price_qpr,3.0,3.0,3.0,1.0,0.0,0.5,0.0,67.086,60-100
8,92859032,Faust Cabernet Sauvignon 2016,United States,Napa Valley,Faust,Cabernet Sauvignon,2016.0,True,4.3,1429.0,...,most_user_rated,4.0,3.5,5.0,1.0,0.25,0.75,1.0,65.461,60-100
9,4051132,Bodegas Faustino I Gran Reserva 2004,Spain,Rioja,Bodegas Faustino,I Gran Reserva,2004.0,True,4.0,2356.0,...,best_user_rated,4.5,4.0,4.0,1.0,0.5,0.875,0.5,107.76975,100-150


In [5]:
FINAL_weather_data_FINAL.head(1)

Unnamed: 0,Date,Producer,Top WineType,Max Temp (°C),Min Temp (°C),Avg Temp (°C),Max Relative Humidity,Min Relative Humidity,Avg Relative Humidity,Cumulative Precip (Rain + Snow)(mm),...,Max Wind Speed (Km/h),Min Wind Speed (Km/h),Avg Wind Speed (Km/h),Daylight Hours,Top Country,Top Region,Top District,Price,Lat,Long
0,2000-01-01 00:00:00+00:00,Fisher Vineyards,cabernet sauvignon,11.8295,2.2295,5.831583,99.291336,55.711567,87.09057,0.0,...,18.25026,0.72,6.144656,9.528758,United States,California,Sonoma,240.0,38.51108,-122.847339


In [6]:
df = Training_Data(FINAL_weather_data_FINAL, Dry_Wines)
df

  Grouped_Daily_into_Monthly = monthly_df.groupby(monthly_df.index.to_period('M'))
  'WineType' : Grouped_Daily_into_Monthly['Top WineType'].first()[0],
  'District' : Grouped_Daily_into_Monthly['Top District'].first()[0],


     index          Producer  Year            WineType    District  \
0        0  Fisher Vineyards  2000  cabernet sauvignon      Sonoma   
1        1  Fisher Vineyards  2001  cabernet sauvignon      Sonoma   
2        2  Fisher Vineyards  2002  cabernet sauvignon      Sonoma   
3        3  Fisher Vineyards  2003  cabernet sauvignon      Sonoma   
4        4  Fisher Vineyards  2004  cabernet sauvignon      Sonoma   
..     ...               ...   ...                 ...         ...   
595    595     Catena Zapata  2019              malbec  San Rafael   
596    596     Catena Zapata  2020              malbec  San Rafael   
597    597     Catena Zapata  2021              malbec  San Rafael   
598    598     Catena Zapata  2022              malbec  San Rafael   
599    599     Catena Zapata  2023              malbec  San Rafael   

     January Max Temp (°C)  January Min Temp (°C)  January Avg Temp (°C)  \
0                15.929500                 1.0295               9.091798   
1      

Unnamed: 0,Producer,WineType,District,Region,Country,Ratings_Average,Year,January Max Temp (°C),January Min Temp (°C),January Avg Temp (°C),...,type,Tannin,Acidity,Intensity,Sweetness,Acidity_norm,Tannin_norm,Intensity_norm,score,price_range


In [7]:
for i in df.columns:
    print(i)

Producer
WineType
District
Region
Country
Ratings_Average
Year
January Max Temp (°C)
January Min Temp (°C)
January Avg Temp (°C)
January Max Relative Humidity
January Min Relative Humidity
January Avg Relative Humidity
January Cumulative Rain (mm)
January Cumulative Snow (mm)
January Cumulative Precip (mm)
January Avg Cloud Cover (%)
January Max Wind Speed (Km/h)
January Min Wind Speed (Km/h)
January Avg Wind Speed (Km/h)
January Avg Daylight Hours
January Days Below 0 (°C)
January Days Above 32 (°C)
February Max Temp (°C)
February Min Temp (°C)
February Avg Temp (°C)
February Max Relative Humidity
February Min Relative Humidity
February Avg Relative Humidity
February Cumulative Rain (mm)
February Cumulative Snow (mm)
February Cumulative Precip (mm)
February Avg Cloud Cover (%)
February Max Wind Speed (Km/h)
February Min Wind Speed (Km/h)
February Avg Wind Speed (Km/h)
February Avg Daylight Hours
February Days Below 0 (°C)
February Days Above 32 (°C)
March Max Temp (°C)
March Min Temp 