In [1]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install selenium
# %pip install pandas
# %pip install geopy
# %pip install geopandas
# %pip install folium
# %pip install openmeteo-requests
# %pip install requests-cache retry-requests numpy pandas
# %pip install timezonefinder
# %pip install seaborn

In [2]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
from geopy.geocoders import OpenCage
from folium import Marker
from folium import GeoJson
from retry_requests import retry
from timezonefinder import TimezoneFinder 

Function Breakdown Section 3
---
Weather_Data_DataFrame
- pulls the weather data on an hourly scale for a specific lat and long, from a start date to an end date

Parsed_Weather_Data
- converts hourly data to daily data and returns desired metrics about each day 

All_Weather_Data
- parses through all the popular wines returned from the Popular_Wine_Stats
- final dataframe contains daily weather data on each wine in a large dataframe based on popular producers

In [9]:
def _openmeteo_time_index(section):
    """
    Input: one section of an Open-Meteo response (the object returned by response.Hourly() or response.Daily())
    Output: a UTC DatetimeIndex that starts at section.Time(), stops before section.TimeEnd(),
        and steps by section.Interval() seconds
    """
    return pd.date_range(
        start = pd.to_datetime(section.Time(), unit = "s", utc = True),
        end = pd.to_datetime(section.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = section.Interval()),
        inclusive = "left"
    )


def Weather_Data_DataFrame (lat, long, start_date, end_date):
    """
    Input: lat/long coordinates of where weather data should be pulled, and the first and last YEAR wanted
        (the request runs from January 1st of start_date to December 31st of end_date)
    Output: 2 dataframes, both indexed by a UTC "date" index
        - the first holds hourly temperature, relative humidity, rain, snowfall, cloud cover, wind speed,
          soil temperature and soil moisture
        - the second holds the daily daylight duration
    Note: no timezone is sent with the request, so day boundaries follow the API default
        (GMT according to the Open-Meteo documentation)
    """
    # Setup the Open-Meteo API client with cache and retry on error
    # expire_after = -1 keeps cached responses forever, so repeated runs do not re-download the same archive data
    cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
    retry_session = retry(cache_session, retries = 1, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)

    # The API returns variables in the order they were requested, so these lists are the single
    # source of truth: positions are looked up by name below instead of being hard-coded.
    hourly_variables = ["temperature_2m", "relative_humidity_2m", "precipitation", "rain", "snowfall", "cloud_cover", "wind_speed_10m", "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm"]
    daily_variables = ["sunrise", "sunset", "daylight_duration"]

    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": long,
        "start_date": f"{start_date}-01-01",
        "end_date": f"{end_date}-12-31",
        "hourly": hourly_variables,
        "daily": daily_variables,
    }
    responses = openmeteo.weather_api(url, params=params)

    # Only one location is requested, so only the first response is used
    response = responses[0]

    # Hourly data. "precipitation" is requested but deliberately not returned, as before.
    hourly = response.Hourly()
    hourly_columns = ["temperature_2m", "relative_humidity_2m", "rain", "snowfall", "cloud_cover",
                      "wind_speed_10m", "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm"]

    hourly_data = {"date": _openmeteo_time_index(hourly)}
    for column in hourly_columns:
        hourly_data[column] = hourly.Variables(hourly_variables.index(column)).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data = hourly_data).set_index("date")

    # Daily data. Only the daylight duration is kept; sunrise and sunset are requested but unused.
    daily = response.Daily()

    daily_data = {"date": _openmeteo_time_index(daily)}
    daily_data["daylight_duration"] = daily.Variables(daily_variables.index("daylight_duration")).ValuesAsNumpy()

    daily_dataframe = pd.DataFrame(data = daily_data).set_index("date")

    return hourly_dataframe, daily_dataframe

In [10]:
def Parsed_Weather_Data(Weather_Hourly_Data, Weather_Daily_Data):
    """
    Input: two data frames indexed by timestamp
        - Weather_Hourly_Data needs the columns temperature_2m, relative_humidity_2m, rain, snowfall,
          cloud_cover and wind_speed_10m
        - Weather_Daily_Data needs the column daylight_duration (in seconds) and one row per day,
          indexed by the start of that day
    Output: a new data frame with one row per day (default 0..n-1 index) holding a 'Date' column,
        the max/min/mean/sum metrics computed from the hourly data, and 'Daylight Hours'
        Only days present in BOTH inputs after dropping NaN rows are returned
    """

    #Most data has almost none or no datapoints that are NaN, however just in case, we will drop them
    #Note this drops an hourly row if ANY of its columns is NaN, including columns not used below
    hourly_clean = Weather_Hourly_Data.dropna()
    daily_clean = Weather_Daily_Data.dropna()

    #Group the hourly data into daily
    per_day = hourly_clean.groupby(hourly_clean.index.floor('D'))

    temperature = per_day['temperature_2m']
    humidity = per_day['relative_humidity_2m']
    wind_speed = per_day['wind_speed_10m']
    rain_total = per_day['rain'].sum()
    snow_total = per_day['snowfall'].sum()

    #Create the DataFrame with all the data required
    #NOTE(review): the Open-Meteo docs give hourly snowfall in centimetres while rain is in millimetres,
    #   so the combined precipitation sum and the '(mm)' label on the snow column look inconsistent.
    #   Left unchanged because later cells rely on these column names and values - confirm before fixing.
    daily_df = pd.DataFrame({

        'Date' : rain_total.index,
        'Max Temp (°C)' : temperature.max(),
        'Min Temp (°C)' : temperature.min(),
        'Avg Temp (°C)' : temperature.mean(),
        'Max Relative Humidity' : humidity.max(),
        'Min Relative Humidity' : humidity.min(),
        'Avg Relative Humidity' : humidity.mean(),
        'Cumulative Precip (Rain + Snow)(mm)' : rain_total + snow_total,
        'Cumulative Rain (mm)' : rain_total,
        'Cumulative Snow (mm)' : snow_total,
        'Avg Cloud Cover (%)' : per_day['cloud_cover'].mean(),
        'Max Wind Speed (Km/h)' : wind_speed.max(),
        'Min Wind Speed (Km/h)' : wind_speed.min(),
        'Avg Wind Speed (Km/h)' : wind_speed.mean(),

    })

    #Attach the daylight duration BY DATE rather than by row position. Matching by position raised a
    #length error when a whole day of hourly data was dropped, and paired the wrong days when a
    #daily row was dropped.
    daylight_hours = daily_clean['daylight_duration'] / (60*60)
    has_daylight = daily_df.index.isin(daylight_hours.index)

    #.copy() so the new column is written to an independent frame (no SettingWithCopyWarning)
    daily_df = daily_df.loc[has_daylight].copy()
    daily_df['Daylight Hours'] = daylight_hours.reindex(daily_df.index).to_numpy()

    daily_df.reset_index(drop=True, inplace=True)

    return daily_df

In [11]:
def All_Weather_Data(wine_dataframe, start_data, end_date, delay = 60, max_retries = 2):
    """
    Input: the dataframe of wine locations (needs the columns Top Country, Top Region, Top District,
        Top WineType, Producer, Price, Lat and Long), the first and last year to pull
        (start_data is the start year; the name is kept so existing calls still work),
        and optionally the pause in seconds after every API attempt (delay) and the number of
        attempts made per wine before it is skipped (max_retries)
    Output: a dataframe indexed by 'Date' that contains the daily weather data for every wine location
        between the start and end years, with Producer and Top WineType as the first two columns
        Wines whose data could not be fetched are reported and left out; an empty dataframe is
        returned if nothing could be fetched
    """

    #Columns copied from each wine row onto every one of its daily weather rows
    metadata_columns = ['Top Country', 'Top Region', 'Top District', 'Top WineType',
                        'Producer', 'Price', 'Lat', 'Long']

    #Collect one frame per wine and concatenate once at the end (cheaper than growing a frame in the loop)
    per_wine_frames = []

    #Iterate through each of the rows to add the respective data to a master file
    for index, row in wine_dataframe.iterrows():

        #Hitting the weather website API call limit was an issue, so each wine is attempted up to
        #max_retries times with a pause after every attempt
        for attempt in range(1, max_retries + 1):
            try:
                #Grab weather data using Weather_Data_DataFrame and Parsed_Weather_Data functions
                weather_data_hourly, weather_data_daily = Weather_Data_DataFrame(row['Lat'], row['Long'], start_data, end_date)
                wine_weather = Parsed_Weather_Data(weather_data_hourly, weather_data_daily)

                #Grab all the important columns and add them to the weather dataframe
                #(each value is a scalar, so it is repeated on every daily row)
                for column in metadata_columns:
                    wine_weather[column] = row[column]

                #Reorder column so producer at front
                wine_weather.insert(0, 'Producer', wine_weather.pop('Producer'))
                wine_weather.insert(1, 'Top WineType', wine_weather.pop('Top WineType'))

                per_wine_frames.append(wine_weather.set_index('Date'))

            except Exception as error:
                #Exception (not a bare except) so Ctrl-C still stops the run, and the reason is shown
                #instead of being silently swallowed
                print(f"Weather fetch failed for row {index} (attempt {attempt} of {max_retries}): {error!r}")
                time.sleep(delay)

            else:
                #Add a delay to prevent API call limit
                time.sleep(delay)
                break

        else:
            print(f"Skipping row {index}: no weather data after {max_retries} attempts")

    if not per_wine_frames:
        return pd.DataFrame()

    return pd.concat(per_wine_frames)

---

Variable Section 3
---

In [None]:
#Long processing time (All_Weather_Data pauses between API calls for every wine), so the result of a
#previous run is reused from its CSV when that file exists. Set REBUILD_WEATHER_CSV = True to fetch again.
#NOTE(review): these are absolute paths on one machine - point them at a shared data folder to run elsewhere.
WINE_CSV_PATH = r'C:\Users\fwhal\Downloads\CME528\Project\BreakinBadCode\Final_DataFrames\FINAL_wine_df_filtered_FINAL.csv'
WEATHER_CSV_PATH = r'C:\Users\fwhal\Downloads\CME528\Project\BreakinBadCode\Final_DataFrames\FINAL_weather_data_FINAL.csv'
REBUILD_WEATHER_CSV = False

if os.path.exists(WEATHER_CSV_PATH) and not REBUILD_WEATHER_CSV:
    #The CSV was written with the 'Date' index as its first column
    FINAL_weather_data_FINAL = pd.read_csv(WEATHER_CSV_PATH, index_col = 0, parse_dates = True)
else:
    FINAL_weather_data_FINAL = All_Weather_Data(pd.read_csv(WINE_CSV_PATH), 2000, 2023)

    #Never replace the saved file with an empty result from a failed run
    if not FINAL_weather_data_FINAL.empty:
        FINAL_weather_data_FINAL.to_csv(WEATHER_CSV_PATH, index = True)

In [None]:
#Display the training table (pandas shortens the rendered rows and columns with "...")
#NOTE(review): FINAL_training_data_FINAL is not defined in any cell shown in this section - confirm the cell that builds it runs before this one
FINAL_training_data_FINAL

Unnamed: 0,Producer,Year,Ratings Average,WineType,January Max Temp (°C),January Min Temp (°C),January Avg Temp (°C),January Max Relative Humidity,January Min Relative Humidity,January Avg Relative Humidity,...,December Min Relative Humidity,December Avg Relative Humidity,December Cumulative Rain (mm),December Cumulative Snow (mm),December Avg Cloud Cover (%),December Max Wind Speed (Km/h),December Min Wind Speed (Km/h),December Avg Wind Speed (Km/h),December Avg Daylight Hours,Type
0,Spottswoode,2000,4.4,Cabernet Sauvignon,16.265999,2.0160,9.505382,100.0,27.844180,97.471508,...,26.004856,90.442934,15.000001,0.00,63.619624,15.379206,0.000000,5.202262,9.530056,
1,Spottswoode,2001,4.6,Cabernet Sauvignon,18.866000,0.2160,8.075879,100.0,30.725060,89.962441,...,32.204582,94.170632,247.700000,0.00,78.372312,24.627789,0.000000,7.542654,9.531516,
2,Spottswoode,2002,4.7,Cabernet Sauvignon,17.516000,-1.1840,8.166202,100.0,29.874908,92.855892,...,35.938175,96.567983,361.300000,0.77,72.435484,34.838253,0.360000,8.357912,9.532616,best_user_rated
3,Spottswoode,2003,4.5,Cabernet Sauvignon,21.166000,2.7160,11.895704,100.0,31.300783,95.859775,...,42.869484,97.018014,238.000000,0.00,76.689515,27.238941,0.509117,7.207038,9.533591,latest_available
4,Spottswoode,2004,4.6,Cabernet Sauvignon,16.316000,0.0660,8.648191,100.0,35.702976,96.035980,...,27.285332,93.196508,214.400000,0.00,55.418011,24.130743,0.360000,6.528542,9.529678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,Quinta do Vallado,2018,4.3,Tinto,18.163500,0.7135,8.395691,100.0,48.113730,96.475339,...,47.727577,98.715139,55.500000,0.00,72.076613,23.377836,0.000000,4.786329,9.266552,
1051,Quinta do Vallado,2019,4.3,Tinto,16.813500,-3.5865,5.779024,100.0,30.845370,96.142419,...,48.820850,95.361842,145.600000,0.00,66.778226,34.454840,0.000000,7.201181,9.267922,top_ranked
1052,Quinta do Vallado,2020,4.3,Tinto,18.013500,-2.1365,7.141323,100.0,35.739544,95.170381,...,46.140537,96.316910,60.400000,0.00,64.638441,18.844202,0.360000,7.005120,9.263821,
1053,Quinta do Vallado,2021,4.2,Tinto,20.363499,-3.2365,6.837895,100.0,38.745500,94.391654,...,47.642980,96.938014,53.500000,0.00,70.969086,23.333443,0.000000,6.326624,9.264957,


---