In [1]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
from geopy.geocoders import OpenCage
from folium import Marker
from folium import GeoJson
from retry_requests import retry
from timezonefinder import TimezoneFinder 

---
Function Breakdown
---

Lat_Long_Coordinates
- Takes in a location name and returns the (latitude, longitude) pair associated with it, or (None, None) when the name cannot be geocoded

Wine_DataFrame 
- creates a dataframe from the initial wine data that contains unique locations and coordinates

Top_Bottles 
- Takes in the dataframes of each of the top bottles and returns one dataframe in which they are all merged together, with lat/long coordinates

Weather_Data_DataFrame
- Pulls the weather data on an hourly scale for a specific lat and long, from a start date to an end date

Parsed_Weather_Data
- converts hourly data to daily data and returns desired metrics about each day 

All_Weather_Data
- parses through all the popular wines returned from the Popular_Wine_Stats
- The final dataframe contains daily weather data for each wine, combined into one large dataframe based on popular producers

Training_Data
- converts output dataframe from All_Weather_Data into a dataframe indexed across producers and years 
- the columns are monthly averages based on the daily data
- adds in the rating and price data for the specific producer of a specific wine for a specific year 

In [2]:
def Lat_Long_Coordinates(location_name):
    """
    Geocode a location name, trying OpenCage first and Nominatim second.

    name: the location to look up (for example "Bordeaux, France")
    return: a (latitude, longitude) tuple, or (None, None) when the name is missing/blank
        or neither geocoding service can resolve it
    """

    #A missing or blank name (e.g. NaN from an incomplete row) can never be geocoded, so skip the API calls
    if not isinstance(location_name, str) or not location_name.strip():
        return None, None

    #Imported after the guard so the early return above does not depend on geopy
    from geopy.exc import GeopyError

    #NOTE: the key is read from the OPENCAGE_API_KEY environment variable when it is set;
    #the literal below is only kept as a fallback so existing runs keep working and should
    #be moved out of the notebook
    opencage_key = os.environ.get('OPENCAGE_API_KEY', 'f339a0ad9adf4d79be69204907140726')

    #Geocoders in priority order; each is only built when it is actually needed
    #NOTE(review): "your_unique_user_agent" looks like a placeholder - confirm the value Nominatim should see
    geocoder_factories = (
        lambda: OpenCage(api_key = opencage_key),
        lambda: Nominatim(user_agent = "your_unique_user_agent", timeout = 10),
    )

    for make_geocoder in geocoder_factories:

        try:
            location = make_geocoder().geocode(location_name)

        except GeopyError:
            #A service error (quota, timeout, ...) should not stop the next service from being tried
            location = None

        if location:
            return location.latitude, location.longitude

    return None, None

In [3]:
def Wine_DataFrame(raw_wine_data):
    """
    Summarise a wine table by location.

    Raw_Wine_Data: a df of wines with 'Region' and 'Country' columns
        (a 'Locations' column, "Region, Country", is added to this frame in place)
    return (): a data frame indexed by unique location ('Locations'), with the columns
        Location_Instances (number of rows at that location), Lat and Long
        Rows whose Region or Country is missing are not counted

    """
    #Creates a new column with a combination of region + country from the original dataframe
    raw_wine_data['Locations'] = raw_wine_data['Region'] + ', ' + raw_wine_data['Country']

    #Unique locations in order of first appearance (missing locations are left out)
    global_locations = raw_wine_data['Locations'].dropna().unique()

    #value_counts() is ordered by count, not by appearance, so the counts are looked up
    #by location name instead of being paired with the unique locations by position
    location_counts = raw_wine_data['Locations'].value_counts().reindex(global_locations)

    #Geocode each unique location once
    coordinates = [Lat_Long_Coordinates(location) for location in global_locations]

    #Creates a data frame indexed by location with 3 columns: Location_Instances, Lat, and Long
    complete_wine_data = pd.DataFrame(
        {
            'Location_Instances' : location_counts.to_numpy(),
            'Lat' : [latitude for latitude, _ in coordinates],
            'Long' : [longitude for _, longitude in coordinates],
        },
        index = pd.Index(global_locations, name = 'Locations'),
    )

    return complete_wine_data

In [4]:
def Top_Bottles(Dry_df, Medium_df, Sweet_df, Best_df):
    """
    Input: Takes in each of the top wine dataframes (dry, medium, sweet and best)
    Output: The dataframe returned by Wine_DataFrame for the four inputs stacked on top of each other
    """
    #Stack all four tables in one call instead of growing a frame inside a loop
    #(also avoids concatenating with an empty placeholder dataframe)
    combined_wines = pd.concat([Dry_df, Medium_df, Sweet_df, Best_df], axis = 0)

    #Run the wine_dataframe function
    return Wine_DataFrame(combined_wines)

In [5]:
def Weather_Data_DataFrame (lat, long, start_date, end_date):
    """
    Input: lat/long coordinates of where weather data should be pulled, and the first and last
        year to pull (the request runs from January 1st of start_date to December 31st of end_date)
    Output: 2 dataframes, both indexed by a UTC "date" column
        the first holds hourly temperature, relative humidity, rain, snowfall, cloud cover,
            wind speed, soil temperature and soil moisture
        the second holds the daily daylight duration
    """
    #Variables asked for from the API; the response is read back by position in these lists
    hourly_fields = ["temperature_2m", "relative_humidity_2m", "precipitation", "rain", "snowfall", "cloud_cover", "wind_speed_10m", "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm"]
    daily_fields = ["sunrise", "sunset", "daylight_duration"]

    #Open-Meteo client on top of a cached session ('.cache') that retries on error
    openmeteo = openmeteo_requests.Client(
        session = retry(
            requests_cache.CachedSession('.cache', expire_after = -1),
            retries = 1,
            backoff_factor = 0.2,
        )
    )

    responses = openmeteo.weather_api(
        "https://archive-api.open-meteo.com/v1/archive",
        params = {
            "latitude": lat,
            "longitude": long,
            "start_date": f"{start_date}-01-01",
            "end_date": f"{end_date}-12-31",
            "hourly": hourly_fields,
            "daily": daily_fields,
        },
    )

    #Only one location is requested, so only the first response is used
    response = responses[0]

    def _time_axis(section):
        #Timestamps covered by one section of the response (end excluded)
        return pd.date_range(
            start = pd.to_datetime(section.Time(), unit = "s", utc = True),
            end = pd.to_datetime(section.TimeEnd(), unit = "s", utc = True),
            freq = pd.Timedelta(seconds = section.Interval()),
            inclusive = "left"
        )

    #Hourly data: every requested variable except "precipitation" becomes a column
    hourly = response.Hourly()
    hourly_columns = {"date": _time_axis(hourly)}

    for position, field in enumerate(hourly_fields):

        if field != "precipitation":
            hourly_columns[field] = hourly.Variables(position).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data = hourly_columns).set_index("date")

    #Daily data: only the daylight duration is kept
    daily = response.Daily()
    daily_columns = {
        "date": _time_axis(daily),
        "daylight_duration": daily.Variables(daily_fields.index("daylight_duration")).ValuesAsNumpy(),
    }
    daily_dataframe = pd.DataFrame(data = daily_columns).set_index("date")

    return hourly_dataframe, daily_dataframe

In [6]:
def Parsed_Weather_Data(Weather_Hourly_Data, Weather_Daily_Data):
    """
    Input: two data frames with hourly and daily data for various weather metrics
        Weather_Hourly_Data needs a datetime index and the columns temperature_2m,
            relative_humidity_2m, rain, snowfall, cloud_cover and wind_speed_10m
        Weather_Daily_Data needs a daylight_duration column (divided by 60*60 to get hours)
    Ouput: a new data frame with one row per day ('Date' column) holding the daily
        max/min/mean/sum of the hourly metrics plus a 'Daylight Hours' column
        When the daily frame has a datetime index, daylight is matched to each day by date and
        only days present in both inputs are returned; otherwise it is attached by position
    """

    #Most data has almost none or no datapoints that are NaN, however just in case, we will drop them
    Weather_Hourly_Data = Weather_Hourly_Data.dropna()
    Weather_Daily_Data = Weather_Daily_Data.dropna()

    #Group the hourly data into daily
    Grouped_Hourly_into_Daily = Weather_Hourly_Data.groupby(Weather_Hourly_Data.index.floor('D'))

    daily_rain = Grouped_Hourly_into_Daily['rain'].sum()
    daily_snow = Grouped_Hourly_into_Daily['snowfall'].sum()

    #Create the DataFrame with all the data required (indexed by day at this point)
    #NOTE(review): rain and snowfall are added as-is; confirm both are reported in the same unit
    daily_df = pd.DataFrame({

        'Max Temp (°C)' : Grouped_Hourly_into_Daily['temperature_2m'].max(),
        'Min Temp (°C)' : Grouped_Hourly_into_Daily['temperature_2m'].min(),
        'Avg Temp (°C)' : Grouped_Hourly_into_Daily['temperature_2m'].mean(),
        'Max Relative Humidity' : Grouped_Hourly_into_Daily['relative_humidity_2m'].max(),
        'Min Relative Humidity' : Grouped_Hourly_into_Daily['relative_humidity_2m'].min(),
        'Avg Relative Humidity' : Grouped_Hourly_into_Daily['relative_humidity_2m'].mean(),
        'Cumulative Precip (Rain + Snow)(mm)' : daily_rain + daily_snow,
        'Cumulative Rain (mm)' : daily_rain,
        'Cumulative Snow (mm)' : daily_snow,
        'Avg Cloud Cover (%)' : Grouped_Hourly_into_Daily['cloud_cover'].mean(),
        'Max Wind Speed (Km/h)' : Grouped_Hourly_into_Daily['wind_speed_10m'].max(),
        'Min Wind Speed (Km/h)' : Grouped_Hourly_into_Daily['wind_speed_10m'].min(),
        'Avg Wind Speed (Km/h)' : Grouped_Hourly_into_Daily['wind_speed_10m'].mean(),

    })

    #Move the day from the index into a leading 'Date' column
    daily_df = daily_df.rename_axis('Date').reset_index()

    daylight_hours = Weather_Daily_Data['daylight_duration'] / (60*60)

    if isinstance(daylight_hours.index, pd.DatetimeIndex):

        #Match by date: the two inputs drop NaN rows independently, so pairing them by
        #position would shift daylight values onto the wrong day after any gap
        daylight_hours.index = daylight_hours.index.floor('D')
        shared_days = daily_df['Date'].isin(daylight_hours.index)
        daily_df = daily_df[shared_days].reset_index(drop=True)
        daily_df['Daylight Hours'] = daylight_hours.reindex(daily_df['Date']).to_numpy()

    else:

        #No dates to match on: keep the positional pairing, on a copy so the assignment
        #does not write into a slice of another frame
        daily_df = daily_df.iloc[:len(daylight_hours)].copy()
        daily_df['Daylight Hours'] = daylight_hours.to_numpy()

    return daily_df

In [7]:
def All_Weather_Data(wine_dataframe, start_data, end_date, delay = 60, max_retries = 5):
    """
    Input: the dataframe of wines (one row per wine, with 'Top Country', 'Top Region', 'Top District',
            'Top WineType', 'Producer', 'Price', 'Lat' and 'Long' columns),
        the first and last year to pull weather for,
        delay: seconds to wait after every weather request (successful or not)
        max_retries: number of attempts per row before the row is skipped
    Output: a dataframe indexed by 'Date' that stacks the daily weather of every row,
        with the wine columns above repeated on each day ('Producer' and 'Top WineType' first)
        An empty dataframe is returned when there are no rows or no request succeeds
    Raises: KeyError when wine_dataframe has rows but is missing one of the required columns
    """
    wine_columns = ['Top Country', 'Top Region', 'Top District', 'Top WineType', 'Producer', 'Price', 'Lat', 'Long']

    #Drop an unnecessary column
    if 'Unnamed: 0' in wine_dataframe.columns:
        wine_dataframe = wine_dataframe.drop(columns = ['Unnamed: 0'])

    if wine_dataframe.empty:
        return pd.DataFrame()

    #Fail straight away on a wrong input table instead of waiting through every retry for every row
    missing_columns = [column for column in wine_columns if column not in wine_dataframe.columns]
    if missing_columns:
        raise KeyError(f"wine_dataframe is missing required columns: {missing_columns}")

    #Collected per-row results; they are concatenated once at the end
    location_frames = []

    #Iterate through each of the rows to add the respective data to a master file
    for index, row in wine_dataframe.iterrows():

        #A row without coordinates can never be requested, so do not spend retries on it
        if pd.isna(row['Lat']) or pd.isna(row['Long']):
            print(f"Skipping row {index}: no lat/long coordinates")
            continue

        #If the request fails, wait and rerun from the same row, up to max_retries attempts
        for attempt in range(1, max_retries + 1):

            #Hitting the weather website API call limit was an issue, so failed requests wait and try again
            try:
                #Grab weather data using Weather_Data_DataFrame and Parsed_Weather_Data functions
                weather_data_hourly, weather_data_daily = Weather_Data_DataFrame(row['Lat'], row['Long'], start_data, end_date)
                weather_data_daily_final = Parsed_Weather_Data(weather_data_hourly, weather_data_daily)

            except Exception as error:
                #Report the failure so that problems other than the call limit are visible
                print(f"Weather request failed for row {index} (attempt {attempt} of {max_retries}): {error!r}")
                time.sleep(delay)
                continue

            #Grab all the important columns and add them to the weather dataframe (same value on every day)
            for column in wine_columns:
                weather_data_daily_final[column] = row[column]

            #Reorder column so producer at front
            weather_data_daily_final.insert(0, 'Producer', weather_data_daily_final.pop('Producer'))
            weather_data_daily_final.insert(1, 'Top WineType', weather_data_daily_final.pop('Top WineType'))

            location_frames.append(weather_data_daily_final.set_index('Date'))

            #Add a delay to prevent API call limit
            time.sleep(delay)
            break

    if not location_frames:
        return pd.DataFrame()

    return pd.concat(location_frames)

In [8]:
def _first_valid(values):
    """Return the first non-null entry of a Series, or NaN when every entry is null."""
    valid_values = values.dropna()
    return valid_values.iloc[0] if len(valid_values) else float('nan')


def _monthly_weather_summary(monthly_df, month):
    """Collapse the daily rows of one producer for one calendar month into a dict of summary metrics."""
    monthly_rain = monthly_df['Cumulative Rain (mm)'].sum()
    monthly_snow = monthly_df['Cumulative Snow (mm)'].sum()

    return {

        'WineType' : _first_valid(monthly_df['Top WineType']),
        'District' : _first_valid(monthly_df['Top District']),
        f'{month} Max Temp (°C)' : monthly_df['Max Temp (°C)'].max(),
        f'{month} Min Temp (°C)' : monthly_df['Min Temp (°C)'].min(),
        f'{month} Avg Temp (°C)' : monthly_df['Avg Temp (°C)'].mean(),
        f'{month} Max Relative Humidity' : monthly_df['Max Relative Humidity'].max(),
        f'{month} Min Relative Humidity' : monthly_df['Min Relative Humidity'].min(),
        #Averaged from the daily averages (this used to read the daily maximum column)
        f'{month} Avg Relative Humidity' : monthly_df['Avg Relative Humidity'].mean(),
        f'{month} Cumulative Rain (mm)' : monthly_rain,
        f'{month} Cumulative Snow (mm)' : monthly_snow,
        f'{month} Cumulative Precip (mm)' : monthly_rain + monthly_snow,
        f'{month} Avg Cloud Cover (%)' : monthly_df['Avg Cloud Cover (%)'].mean(),
        f'{month} Max Wind Speed (Km/h)' : monthly_df['Max Wind Speed (Km/h)'].max(),
        f'{month} Min Wind Speed (Km/h)' : monthly_df['Min Wind Speed (Km/h)'].min(),
        f'{month} Avg Wind Speed (Km/h)' : monthly_df['Avg Wind Speed (Km/h)'].mean(),
        f'{month} Avg Daylight Hours' : monthly_df['Daylight Hours'].mean(),
        #Both day counts are based on the daily average temperature
        f'{month} Days Below 0 (°C)': (monthly_df['Avg Temp (°C)'] <= 0).sum(),
        f'{month} Days Above 32 (°C)': (monthly_df['Avg Temp (°C)'] > 32).sum()

    }


def Training_Data(weather_df, wine_df):
    """
    Input: Weather dataframe on the daily scale (dates in the index or in a 'Date' column), with
            'Producer', 'Top WineType', 'Top District' and the daily weather metric columns
        Vintage dataframe which has yearly bottles from the same producer and wine type, with
            'Producer', 'WineType', 'year', 'has_valid_ratings', 'ratings_average', 'Region' and 'Country' columns
    Output: Dataframe with a single row for a producer and year that has a rating
        columns will be monthly min, max, average or total for the respective data per year,
        followed by the remaining vintage columns
        Columns start with Producer, WineType, District, Region, Country, Ratings_Average, Year
        Rows without a matching vintage (no Ratings_Average) are dropped
    Neither input dataframe is modified
    """
    #Work on copies so the frames passed in by the caller are left untouched
    weather_df = weather_df.copy()
    wine_df = wine_df.copy()

    #Use the dates as the index when they arrive as a regular column (e.g. straight from a csv)
    if 'Date' in weather_df.columns:
        weather_df = weather_df.set_index('Date')

    weather_df.index = pd.to_datetime(weather_df.index)

    #Filter the wine_df to make winetypes uniform (the first match in this list wins)
    popular_wine_types = ['château margaux', 'cabernet sauvignon', 'pinot noir', 'zinfandel', 'syrah',
                            'pinot gris', 'sauvignon blanc', 'chardonnay', 'baco noir', 'bordeaux',
                            'malbec', 'pinot grigio', 'merlot', 'sangiovese', 'shiraz',
                            'cabernet franc', 'muscat', 'grenache']

    def _canonical_wine_type(name):
        #Missing wine types are passed through unchanged
        if not isinstance(name, str):
            return name
        return next((known_type for known_type in popular_wine_types if known_type in name), name)

    #Lower-cases the winetypes, then replaces each one by the popular wine type it contains (if any)
    wine_df['WineType'] = wine_df['WineType'].str.lower().apply(_canonical_wine_type)

    #Create columns for the year and month and find all the unique values of each
    weather_df['Year'] = weather_df.index.year
    weather_df['Month'] = weather_df.index.month
    unique_years = weather_df['Year'].unique()
    unique_months = weather_df['Month'].unique()

    #Gives key for the month names
    month_names = {1 : 'January', 2 : 'February', 3 : 'March', 4 : 'April', 5 : 'May',
                                  6 : 'June', 7 : 'July', 8 : 'August', 9 : 'September', 10 : 'October', 11 : 'November',
                                  12 : 'December'}

    #One record per producer and year; the dataframe is built once from the full list
    yearly_records = []

    for producer, producer_df in weather_df.groupby('Producer', sort = False):

        #Iterate over each unique year
        for year in unique_years:

            #Filter the data for the specific year; a producer without data for it gets no row
            yearly_df = producer_df[producer_df['Year'] == year]

            if yearly_df.empty:
                continue

            #Creates initial data
            yearly_data = {'Producer' : producer, 'Year' : year}

            #Iterate over each unique month
            for month in unique_months:

                #Filter the data for the specific month; months without data are left out of the row
                monthly_df = yearly_df[yearly_df['Month'] == month]

                if monthly_df.empty:
                    continue

                yearly_data.update(_monthly_weather_summary(monthly_df, month_names[month]))

            yearly_records.append(yearly_data)

    final_df = pd.DataFrame(yearly_records)

    #Add a lower-case producer/winetype/year key to both dataframes for merging
    final_df['Producer_WineType_Year'] = (final_df['Producer'] + '/' + final_df['WineType'] + '/' + final_df['Year'].astype(str)).str.lower()
    wine_df['Producer_WineType_Year'] = (wine_df['Producer'] + '/' + wine_df['WineType'] + '/' + wine_df['year'].astype(str)).str.lower()

    #Combine the two dataframes
    final_df = pd.merge(final_df, wine_df, on = 'Producer_WineType_Year', how = 'left')

    #Modify the dataframe for easier viewing
    final_df = final_df.drop(columns = ['year', 'Producer_y', 'WineType_y', 'has_valid_ratings', 'Producer_WineType_Year'])
    final_df = final_df.rename(columns = {'Producer_x' : 'Producer', 'WineType_x' : 'WineType', 'ratings_average' : 'Ratings_Average'})

    leading_columns = ['Producer', 'WineType', 'District', 'Region', 'Country', 'Ratings_Average', 'Year']
    other_columns = [column for column in final_df.columns if column not in leading_columns]
    final_df = final_df[leading_columns + other_columns]

    #Drop all the rows that have NAN in the ratings average column
    final_df = final_df.dropna(subset = ['Ratings_Average'])

    return final_df

---
## Code Running Section


In [9]:
#NOTE: both folders are absolute paths on the original author's machine; point them at
#your own copy of the project folders before running this cell
wines_file_path = r'C:\Users\fwhal\Downloads\CME528\Project\Repo-2\BreakinBadCode\Final_Wines_Of_Interest'
final_df_file_path = r'C:\Users\fwhal\Downloads\CME528\Project\Repo-2\BreakinBadCode\Final_DataFrames'

#Load the four top-wine tables; the names on the left are filled in the same order as the file names
Dry_Wines, Medium_Wines, Sweet_Wines, Best_Wines = (
    pd.read_csv(os.path.join(wines_file_path, csv_name))
    for csv_name in ('Best_Drywines.csv', 'Best_Medium_Drywines.csv', 'Best_Sweetwines.csv', 'Best_Wines.csv')
)

#Load the weather data that was saved to csv
FINAL_weather_data_FINAL = pd.read_csv(os.path.join(final_df_file_path, 'FINAL_weather_data_FINAL.csv'))

In [10]:
#Stack the four top-wine tables and geocode every unique "Region, Country" location;
#the result is indexed by location with Location_Instances, Lat and Long columns
wine_df = Top_Bottles(Dry_Wines, Medium_Wines, Sweet_Wines, Best_Wines)

In [None]:
#Pull daily weather for every row of wine_df for the years 2000 through 2023
#(this replaces the FINAL_weather_data_FINAL that was read from csv earlier in the notebook)
#NOTE(review): All_Weather_Data reads 'Top Country', 'Top Region', 'Top District', 'Top WineType',
#'Producer' and 'Price' from each row, but the frame returned by Top_Bottles only has
#Location_Instances, Lat and Long - TODO confirm which wine table should be passed here
FINAL_weather_data_FINAL = All_Weather_Data(wine_df, 2000, 2023)

In [None]:
#Collapse the daily weather into one row per producer and year and attach the ratings
#NOTE(review): Training_Data reads 'Producer', 'WineType', 'year' and 'ratings_average' from its
#second argument, but the frame returned by Top_Bottles only has Location_Instances, Lat and Long
#- TODO confirm which vintage table should be passed here
Top_Wines_df = Training_Data(FINAL_weather_data_FINAL, wine_df)