In [1]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install selenium
# %pip install pandas
# %pip install geopy
# %pip install geopandas
# %pip install folium
# %pip install openmeteo-requests
# %pip install requests-cache retry-requests numpy pandas
# %pip install timezonefinder
# % pip install seaborn

In [2]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
from geopy.geocoders import OpenCage
from folium import Marker
from folium import GeoJson
from retry_requests import retry
from timezonefinder import TimezoneFinder 

Function Breakdown Section 1
---
SQL_Files_to_df
- pulls scraped wine data from an SQL file and converts it to a dataframe

Lat_Long_Coordinates
- takes in a location name and returns a lat long value that is associated with it 

Wine_DataFrame 
- creates a dataframe from the initial wine data that contains unique locations and coordinates

Regions_and_Districts
- takes in global shape files for regions and districts, adds this data to the wine dataframe 

Popular_Wine_Stats
- returns a dataframe with the most popular producer of the most popular winetype in the most popular district and region in the top 8 most popular countries 

In [3]:
def SQL_Files_to_df(file_name, host_folder):
    """
    host_folder: name of the host folder with each of the SQL files to read
    return: DataFrame of the contents of each sql file 
    """
    
    # Connect to the SQLite database 
    conn = sqlite3.connect(os.path.join(host_folder, file_name))

    df = pd.read_sql(f'SELECT * FROM red_wines', conn)

    df.set_index('id', inplace = True)

    return df

In [4]:
def Lat_Long_Coordinates(location_name):
    """
    name: takes in a location name
    return: returns the lat/long coordinates of the names area
    """
    
    #Uses an OpenCage api_key to filter name through geolocator database
    geolocator = OpenCage(api_key = '74ac6790c4464814b25887115034e579')
    location = geolocator.geocode(location_name)

    if location:
        return location.latitude, location.longitude
    
    else:
        #To grab further data points, uses Nominatim service to filter name through additional geolocator database
        geolocator = Nominatim(user_agent="your_unique_user_agent", timeout=10)
        location = geolocator.geocode(location_name)

        if location:
            return location.latitude, location.longitude
        
        else:
            return None, None

In [5]:
def Wine_DataFrame(raw_wine_data):
    """
    Raw_Wine_Data: takes in a df of all the wines 
    return (): returns a data frame where the indices are unique locations, and columns are #instances of each location, and lat/long coordinates

    """
    #Creates a new column with a combination of region + country from the original dataframe
    raw_wine_data['Locations'] = raw_wine_data['Region'] + ', ' + raw_wine_data['Country']

    #Drops any na values in the location column
    raw_wine_data = raw_wine_data.dropna()

    #Creates a list of unique locations and number of instances of each unique locations
    global_locations = raw_wine_data['Locations'].unique()

    #Creates a data frame with 5 columns: Locations, Location_Instances, Latitude, and Longitude
    complete_wine_data = pd.DataFrame({"Locations" : global_locations,
                       'Location_Instances' : raw_wine_data['Locations'].value_counts()
                       })
    
    complete_wine_data[["Lat","Long"]] = complete_wine_data["Locations"].apply(lambda row: pd.Series(Lat_Long_Coordinates(row)))
        
    #Set index to Locations
    complete_wine_data.set_index('Locations', inplace = True)

    return complete_wine_data

In [6]:
def Regions_and_Districts(wine_data, Location_DataFrame, geojson_host_folder):
    """
    Inputs: The wine data, the intermediate location df created before and a host folder that contains teh geojson data
    Output: A modified wine data df with 3 new columns 
        LatLong_Points: geodataframe geometry point using lat long coords
        Regions: polygon geometery of the region the lat long coords are in 
        District: polygon geometery of the region the lat long coords are in 
    """

    #Defines the file names for the regional and district files
    regions_filename = 'geoBoundariesCGAZ_ADM1_regions.geojson'
    districts_filename = 'geoBoundariesCGAZ_ADM2_districts.geojson'

    #Adds the lat long coords to the wine dataframe 
    Location_df_changed = Location_DataFrame.drop(columns=['Location_Instances'])
    wine_data = pd.merge(wine_data, Location_df_changed, on = "Locations", how='left')
    wine_Data = wine_data.drop('Region', axis = 1)

    #Creates a geodataframe with the coordinates
    wine_data_gdf = gpd.GeoDataFrame(wine_data, geometry = gpd.points_from_xy(x = wine_data['Long'], y = wine_data['Lat']), crs = 'EPSG:4326')
    wine_data_gdf = wine_data_gdf.rename(columns={'geometry': 'LatLong_Points'})
    wine_data_gdf = wine_data_gdf.set_geometry('LatLong_Points')

    #Reads the regional and district geodataframes 
    regions_gdf = gpd.read_file(os.path.join(geojson_host_folder, regions_filename))
    districts_gdf = gpd.read_file(os.path.join(geojson_host_folder, districts_filename))

    #Create an intermediate file 
    wine_data_gdf_districts = wine_data_gdf

    #Iterates through the regions_gdf, checks if a lat/long is in the region and adds it if it does
    for idx, region in regions_gdf.iterrows():
        region_polygon = region['geometry']
        within_region = wine_data_gdf['LatLong_Points'].within(region_polygon)

        #Add a geometry and name column
        wine_data_gdf.loc[within_region, 'Region'] = region['shapeName']
        wine_data_gdf.loc[within_region, 'Region_Geometry'] = region_polygon

    #Iterates through the districts_gdf, checks if a lat/long is in the district and adds it if it does
    #Work performed on a dummy dataframe to ensure overwriting does not occur
    for idx, district in districts_gdf.iterrows():
        district_polygon = district['geometry']
        within_district = wine_data_gdf_districts['LatLong_Points'].within(district_polygon)

        #Add a geometry and name column
        wine_data_gdf_districts.loc[within_district, 'District'] = district['shapeName']
        wine_data_gdf_districts.loc[within_district, 'District_Geometry'] = district_polygon

    #District data from dummy dataframe added to main dataframe
    wine_data_gdf = wine_data_gdf.assign(
        District = wine_data_gdf_districts['District'], 
        District_Geometry = wine_data_gdf_districts['District_Geometry']  
    )

    #Drop a column and change the order of the columns for viewing ease
    wine_data_gdf.drop(['url_idx'], axis = 1, inplace = True)
    wine_data_gdf = wine_data_gdf[['WineType','Producer', 'Locations', 'Rating',
                                    'Num_Ratings', 'Price', 'Country',  'Lat', 'Long',
                                    'LatLong_Points', 'Region', 'Region_Geometry',
                                    'District', 'District_Geometry', 'URL']]
    
    return wine_data_gdf

In [None]:
def Popular_Wine_Stats(wine_data_final):
    """
    Input: wine_data_final which should contain a dataframe that has counties, regions, wintetype, and producers
    Output: A dataframe containing the top countries, 
            region within those countries, top 5 districts within that region,
            top winetype within that district and top producer of that wine
    """

    #Drop the geodata for easier dataframe use 
    df = wine_data_final.drop(columns = ['LatLong_Points', 'Region_Geometry', 'District_Geometry'])

    #Get total bottle counts by country and filter out countries with bottle counts more than the 75th percentile
    total_bottles_country = df['Country'].value_counts()
    top_countries = total_bottles_country[total_bottles_country > total_bottles_country.quantile(0.75)]

    #Creates dataframe for output
    popular_wines = pd.DataFrame()

    #------------------------------------------------------------------------------------------------------------#
    #Goes through the top countries and pulls the top regions and the amount of bottles associated with them
    for country in top_countries.index:

        #Checks if region is in top countries, then grabs the top region 
        top_region = df[df['Country'] == country]['Region'].value_counts().idxmax()
        top_region_count = int(df[df['Country'] == country]['Region'].value_counts().max())

        popular_wines = pd.concat([popular_wines, pd.DataFrame([{'Top Country': country, 'Country Count': int(top_countries[country]), 'Top Region': top_region, 'Region Count': top_region_count}])])     
        popular_wines.reset_index(drop = True, inplace = True)

    #------------------------------------------------------------------------------------------------------------#
    #Goes through the top countries and regions within those countries and pulls the top districts and the amount of bottles associated with them
    new_rows = []
    for index, row in popular_wines.iterrows():

        #Checks if winetype is in top region, then grabs the top winetype 
        top_district_count = df[df['Region'] == row['Top Region']]['District'].value_counts().nlargest(5)

        #Goes through each of the top 5 districts and adds the district name and count for a given country and region
        for district, count in top_district_count.items():
            new_rows.append({'index' : index, 'Top District' : district, 'District Count' : count})
    
    #Adds the district data to the popular_wines dataframe 
    district_type = pd.DataFrame(new_rows)
    district_type.set_index('index', inplace = True)

    #Merge district dataframe to the popular wines dateframe
    popular_wines = pd.merge(popular_wines, district_type, left_index=True, right_index=True, how='left')

    #------------------------------------------------------------------------------------------------------------#
    # Goes through the top counties and gets top district and region within those countries and pulls the top winetype and the amount of bottles associated with them
    new_rows = []
    for index, row in popular_wines.iterrows():

        #Filter the dataframe based on region and district
        filtered_df = df[(df['Region'] == row['Top Region']) & (df['District'] == row['Top District'])]

        #Checks if winetype is in top region, then grabs the top winetype 
        top_winetype = filtered_df['WineType'].value_counts().idxmax()
        top_winetype_count = int((filtered_df['WineType'].value_counts().max()))
        
        new_rows.append({'Top WineType' : top_winetype, 'WineType Count' : top_winetype_count})
    
    #Adds the winetype data to the popular_wines dataframe 
    wine_type = pd.DataFrame(new_rows)
    popular_wines['Top WineType'] = wine_type['Top WineType'].values
    popular_wines['WineType Count'] = wine_type['WineType Count'].values
    popular_wines.reset_index(inplace = True)

    #------------------------------------------------------------------------------------------------------------#
    #Goes through the top countries, regions within those countries, and top winetype within that region 
    #Pulls the producers of that winetype, and takes a mean of them, returning the top 5 from that mean
    
    new_rows = []
    for index, row in popular_wines.iterrows():

                #Checks if producer is in top region, district, and produces top winetype, then grabs the top producers
                top_producers = df[(df['WineType'] == row['Top WineType']) & (df['District'] == row['Top District']) & (df['Region'] == row['Top Region'])]

                #Grabs the quantiles for the number of ratings, then filters out the data below the lowest 25% 
                #The mean is then found for the ratings based on the producer 
                Num_of_Ratings_Quantile = np.quantile(df['Num_Ratings'], [0.25, 0.5, 0.75])
                top_producer = top_producers[top_producers['Num_Ratings'] >  Num_of_Ratings_Quantile[0]].groupby(by = 'Producer')['Rating'].mean().sort_values(ascending = False)

                new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})

    producer_rating = pd.DataFrame(new_rows)
    popular_wines['Producer'] = producer_rating['Producer'].values
    popular_wines['Average Rating'] = producer_rating['Average Rating'].values
    popular_wines.reset_index(inplace = True)
    #------------------------------------------------------------------------------------------------------------#

    #Add price, URL, and lat/long coords for the outputed points
    df_urllatlong = pd.DataFrame()
    df_urllatlong[['Price', 'Producer','Lat', 'Long', 'URL']] = df.drop_duplicates(subset = 'Producer')[['Price', 'Producer',  'Lat', 'Long', 'URL']]
    final_popular_wines = pd.merge(popular_wines, df_urllatlong, on = 'Producer')
    
    return final_popular_wines.iloc[:, 2:]

  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_producer[0]})
  new_rows.append({'Producer' : top_producer.index[0], 'Average Rating' : top_pr

Unnamed: 0,Top Country,Country Count,Top Region,Region Count,Top District,District Count,Top WineType,WineType Count,Producer,Average Rating,Price,Lat,Long,URL
0,United States,6735,California,2514,Sonoma,466,Cabernet Sauvignon,74,Lancaster Estate,4.42,144.99,38.671231,-122.82787,https://www.vivino.com/lancaster-estate-winema...
1,United States,6735,California,2514,Tulare,445,Pinot Noir,93,Meiomi,4.025,37.8,36.701463,-118.755997,https://www.vivino.com/meiomi-pinot-noir/w/113...
2,United States,6735,California,2514,Mendocino,363,Pinot Noir,64,Lindstrom,4.4,259.99,36.578676,-79.389888,https://www.vivino.com/lindstrom-wines-caberne...
3,United States,6735,California,2514,San Luis Obispo,318,Cabernet Sauvignon,62,My Favorite Neighbor,4.333333,73.33,35.354021,-120.375716,https://www.vivino.com/my-favorite-neighbor-ca...
4,United States,6735,California,2514,Napa,262,Cabernet Sauvignon,62,Spottswoode,4.516667,419.99,38.503831,-122.467071,https://www.vivino.com/spottswoode-cabernet-sa...


Function Breakdown Section 2
---
Plotting_Unique_Locations
- plots all the wine locations from producer using the Wine_DataFrame function 

In [8]:
def Plotting_Unique_Locations(DataFrame):
    """
    df: A dataframe with columns including Location_Instances, and lat/long coordinates
    return: a global map of where wines are located from 
    """
    #Drop any NaN from the data
    DataFrame = DataFrame.dropna()

    #Converts DataFrame into a GeoDataFrame
    Global_Areas = gpd.GeoDataFrame(DataFrame, geometry = gpd.points_from_xy(x = DataFrame['Long'], y = DataFrame['Lat']), crs = 'EPSG:4326')

    # Create map centered around Toronto 
    map_1 = folium.Map(location = [43.6426, -79.3871], 
                    tiles = 'cartodbpositron', 
                    zoom_start = 2)
        
    # Plot each buffer area and show the map 
    GeoJson(Global_Areas).add_to(map_1)

    for idx, row in Global_Areas.iterrows():

        #Creates a base size for each location, that grows with each instance recorded
        radius = 6500 + row['Location_Instances'] * 50

        folium.Circle(
            location = [row['Lat'], row['Long']],
            radius = radius,  
            color = 'blue',
            fill = True,
            fill_color = 'blue',
            fill_opacity = 0.3,
            popup = folium.Popup(f"Location: {row.name}", parse_html = True)
        ).add_to(map_1)

    return map_1

Function Breakdown Section 3
---
Weather_Data_DataFrame
- pulls the weather data on a hourly scale for a specific lat and long, from a start date to an end data

Parsed_Weather_Data
- converts hourly data to daily data and returns desired metrics about each day 

All_Weather_Data
- parses through all the popular wines returned from the Popular_Wine_Stats
- final dataframe containes daily weather data on each wine in a large dataframe based on popular producers

In [9]:
def Weather_Data_DataFrame (lat, long, start_date, end_date):
    """ 
    Input: lat/long coordinates of where weather data should be pulled, the respective timezone, and the start and end dates for the desired location
    Output: 2 dataframes, the first which includes hourly data on temp, precipitation, humiduty, cloud cover and soil data
        the second includes daily data on the amount of sun recieved each day 
    """
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
    retry_session = retry(cache_session, retries = 1, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)

    # Make sure all required weather variables are listed here
    # The order of variables in hourly or daily is important to assign them correctly below
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": long,
        "start_date": f"{start_date}-01-01",
        "end_date": f"{end_date}-12-31",
        "hourly": ["temperature_2m", "relative_humidity_2m", "precipitation", "rain", "snowfall", "cloud_cover", "wind_speed_10m", "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm"],
        "daily": ["sunrise", "sunset", "daylight_duration"],
    }
    responses = openmeteo.weather_api(url, params=params)

    # Process first location. Add a for-loop for multiple locations or weather models
    response = responses[0]

    # Process hourly data. The order of variables needs to be the same as requested.
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
    hourly_rain = hourly.Variables(3).ValuesAsNumpy()
    hourly_snowfall = hourly.Variables(4).ValuesAsNumpy()
    hourly_cloud_cover = hourly.Variables(5).ValuesAsNumpy()
    hourly_wind_speed_10m = hourly.Variables(6).ValuesAsNumpy()
    hourly_soil_temperature_0_to_7cm = hourly.Variables(7).ValuesAsNumpy()
    hourly_soil_moisture_0_to_7cm = hourly.Variables(8).ValuesAsNumpy()

    hourly_data = {"date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}
    hourly_data["temperature_2m"] = hourly_temperature_2m
    hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
    hourly_data["rain"] = hourly_rain
    hourly_data["snowfall"] = hourly_snowfall
    hourly_data["cloud_cover"] = hourly_cloud_cover
    hourly_data["wind_speed_10m"] = hourly_wind_speed_10m
    hourly_data["soil_temperature_0_to_7cm"] = hourly_soil_temperature_0_to_7cm
    hourly_data["soil_moisture_0_to_7cm"] = hourly_soil_moisture_0_to_7cm

    hourly_dataframe = pd.DataFrame(data = hourly_data)
    hourly_dataframe.set_index("date", inplace=True)

    # Process daily data. The order of variables needs to be the same as requested.
    daily = response.Daily()
    daily_daylight_duration = daily.Variables(2).ValuesAsNumpy()

    daily_data = {"date": pd.date_range(
        start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
        end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = daily.Interval()),
        inclusive = "left"
    )}

    daily_data["daylight_duration"] = daily_daylight_duration

    daily_dataframe = pd.DataFrame(data = daily_data)
    daily_dataframe.set_index("date", inplace=True)

    
    return hourly_dataframe, daily_dataframe

In [10]:
def Parsed_Weather_Data(Weather_Hourly_Data, Weather_Daily_Data):
    """
    Input: two data frames with hourly and daily data for various weather metrics
    Ouput: a new data frame with daily data that was grouped from the hourly and daily 
        input data frames, with new metrics which will be used for future comparison 
    """
    
    #Most data has almost none or no datapoints that are NaN, however just in case, we will drop them 
    Weather_Hourly_Data = Weather_Hourly_Data.dropna()
    Weather_Daily_Data = Weather_Daily_Data.dropna()

    #Group the hourly data into daily 
    Grouped_Hourly_into_Daily = Weather_Hourly_Data.groupby(Weather_Hourly_Data.index.floor('D'))    

    #Create the DataFrame with all the data required
    daily_df = pd.DataFrame({

        'Date' : Grouped_Hourly_into_Daily.size().index,
        'Max Temp (°C)' : Grouped_Hourly_into_Daily['temperature_2m'].max(),
        'Min Temp (°C)' : Grouped_Hourly_into_Daily['temperature_2m'].min(),
        'Avg Temp (°C)' : Grouped_Hourly_into_Daily['temperature_2m'].mean(),
        'Max Relative Humidity' : Grouped_Hourly_into_Daily['relative_humidity_2m'].max(),
        'Min Relative Humidity' : Grouped_Hourly_into_Daily['relative_humidity_2m'].min(),
        'Avg Relative Humidity' : Grouped_Hourly_into_Daily['relative_humidity_2m'].mean(),
        'Cumulative Precip (Rain + Snow)(mm)' : Grouped_Hourly_into_Daily['rain'].sum() + Grouped_Hourly_into_Daily['snowfall'].sum(),
        'Cumulative Rain (mm)' : Grouped_Hourly_into_Daily['rain'].sum(),
        'Cumulative Snow (mm)' : Grouped_Hourly_into_Daily['snowfall'].sum(),
        'Avg Cloud Cover (%)' : Grouped_Hourly_into_Daily['cloud_cover'].mean(),
        'Max Wind Speed (Km/h)' : Grouped_Hourly_into_Daily['wind_speed_10m'].max(),
        'Min Wind Speed (Km/h)' : Grouped_Hourly_into_Daily['wind_speed_10m'].min(),
        'Avg Wind Speed (Km/h)' : Grouped_Hourly_into_Daily['wind_speed_10m'].mean(),

    })

    daily_df.reset_index(drop=True, inplace=True)
    
    #Ensures the dataframes are the same size and adds the daylight duration column
    daily_df = daily_df.iloc[:len(Weather_Daily_Data)]
    daily_df['Daylight Hours'] = Weather_Daily_Data['daylight_duration'].values / (60*60)

    return daily_df

In [11]:
def All_Weather_Data(wine_dataframe, start_data, end_date, saved_data_folder_location):
    """
    Input: takes in the datframe of wine locations, start and end dates, and the saved folder location
    Output: a dataframe that containes all the weather data for each specific location between the start and end dates 
        Output data is all saved to the specified folder location
    """

    #Defines the initial empty dataframe
    final_df = pd.DataFrame()

    #Creates a batch size and delay for future use
    delay = 60 
    max_retries = 2

    #Iterate through each of the rows to add the respective data to a master file 
    for index, row in wine_dataframe.iterrows():
        
        #Creates a while loop, so if the process fails the iteration will wait then rerun from the same row
        success = False
        retries = 0 

        while not success and retries < max_retries:

            #Hitting the weather website API call limit was an issue so a Try and Execpt block added to iterate and wait if call limit hit
            try:
                #Grab weather data using Weather_Data_DataFrame and Parsed_Weather_Data functions
                weather_data_hourly, weather_data_daily  = Weather_Data_DataFrame(row['Lat'], row['Long'], start_data, end_date)
                weather_data_daily_final = Parsed_Weather_Data(weather_data_hourly, weather_data_daily)
                
                #Assign the index to the producer and add a lat and long column
                weather_data_daily_final['Producer'] = row['Producer']
                weather_data_daily_final['Top WineType'] = row['Top WineType']
                weather_data_daily_final['Lat'] = row['Lat']
                weather_data_daily_final['Long'] = row['Long']

                #Reorder column so producer at front
                weather_data_daily_final.insert(0,'Producer',weather_data_daily_final.pop('Producer'))
                weather_data_daily_final.insert(1,'Top WineType',weather_data_daily_final.pop('Top WineType'))

                #Add each single data set to the final data set 
                weather_data_daily_final.set_index('Date', inplace=True)
                final_df = pd.concat([final_df, weather_data_daily_final])

                #Saves the data for each location to a csv on the local drive as it processes it as a safeguard
                name = saved_data_folder_location + '\\' + 'Weather_for_' +row['Producer'] + '.csv'
                weather_data_daily_final.to_csv(name)

                #Add a delay to prevent API call limit
                time.sleep(delay) 
                success = True 

            except: 
                time.sleep(delay) 
                retries += 1

        
    return final_df

Function Breakdown Section 4
---
Vintage_Dataframe
- grabs two dataframes based on the URLs pulled from the popular producers of the specific wines
- one for all the wine vintage data, and one based on popular stats for the wine 

Final_Vintage_DataFrame 
- creates a dataframe with vintage data
- included here critically is the ratings data tied to the specific producers

Converted_Weather_Data
- converts output dataframe from All_Weather_Data into a dataframe indexed across producers and years 
- the columns are monthly averages based on the daily data
- adds in the rating and price data for the specific producer of a specific wine for a specific year 
- final form set up to allow easier training on a model 

In [12]:
def Vintage_Dataframe(popular_wines_df):
    """
    Input: takes in a dataframe of popular wines, which includes a column with the URL for each of the wines
    Output: two dataframes 
        Recommended vintages: which has the data on the specific wines chosen 
        All Vintages: which has data on all the wines
    """
    #Initialize lists to store the extracted data
    all_recommended_vintages = []
    all_vintages_data = []

    #Stips each of the URLs so they are in proper form 
    popular_wines_df['URL'] = popular_wines_df['URL'].str.split('?').str[0]

    #Loop row in the DataFrame
    for index, row in popular_wines_df.iterrows():

        #Grabs each URL and strips and remaining white space 
        url = row['URL'].strip() 
        
        try:
            r = requests.get(url, headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
            })

            #Check if the request was successful
            if r.status_code == 200:
                
                #Search for the JavaScript data in the page source
                res = re.search(r"window\.__PRELOADED_STATE__\.winePageInformation\s*=\s*(.*});", r.text, re.MULTILINE)
                
                if res:
                    data = json.loads(res.group(1))
                    
                    #Extract recommended vintages
                    recommended_vintages = data.get("recommended_vintages", [])
                    if recommended_vintages:
                        recommended_df = pd.DataFrame(recommended_vintages)

                        #Add the URL for reference
                        recommended_df['source_url'] = url  
                        all_recommended_vintages.append(recommended_df)

                    #Extract all vintages
                    all_vintages = data.get("wine", {}).get("vintages", [])
                    if all_vintages:
                        all_vintages_df = pd.DataFrame(all_vintages)

                        #Add the URL for reference
                        all_vintages_df['source_url'] = url  
                        all_vintages_data.append(all_vintages_df)

                else:
                    print(f"No data found for URL: {url}")
            else:
                print(f"Failed to retrieve data for URL: {url}, Status code: {r.status_code}")
        
        except Exception as e:
            print(f"An error occurred for URL: {url} - {str(e)}")

    #Concats all the recommended vintages and all vintages data into DataFrames
    recommended_vintages_df = pd.concat(all_recommended_vintages, ignore_index=True)
    all_vintages_df = pd.concat(all_vintages_data, ignore_index=True)
   
    return recommended_vintages_df, all_vintages_df

In [13]:
def Final_Vintage_Dataframe(recommended_vintages_df, all_vintages_df, list_of_producers, lower_date_bound = 1900, lower_rating_bound  = 3, lower_review_count = 1):
    """
    Input: two dataframes 
        Recommended vintages: which has the data on the specific wines chosen 
        All Vintages: which has data on all the wines
    Output:  one dataframe 
        all_filtered_winebottle: dataframe with vintage dataframe, of importance is the rating, price, year and producer name
    """

    #Defines a function for internal function use that grabs key-values pairs
    def extract_object_data(Object_data):
        """
        Extracts key-value pairs from an object and returns a Series
        """
        if isinstance(Object_data, dict):
            return pd.Series(Object_data)
        else:
            return pd.Series()  

    #Filters the vintages df so that the all the data is only gotten for those with a valid rating 
    all_vintages_df_True = all_vintages_df[all_vintages_df["has_valid_ratings"] == True]

    #Apply the function to the 'object_column' and create a new DataFrame
    new_columns = all_vintages_df_True['statistics'].apply(extract_object_data)

    #Concatenate the original DataFrame with the new columns
    all_vintages_df_True = pd.concat([all_vintages_df_True, new_columns], axis=1)

    #Grabs specific volumns from the vintage data 
    df4 = all_vintages_df_True[['id', 'name', 'year', 'ratings_average', 'reviews_count']]

    #Grabs the id and amount from the vintage data then creates a new dataframe 
    recommended_vintages_df['id'] =recommended_vintages_df['vintage'].apply(lambda x: x.get('id'))
    df5 =recommended_vintages_df[['id', 'type']].drop_duplicates(subset = ['id'])
    
    #Merges the two dataframes
    final_merge_df = pd.merge(df4, df5, on='id', how='left')

    #Filters the data by year, ratings average and reviews count 
    all_filtered_winebottle = final_merge_df[
        (final_merge_df['year'] >= lower_date_bound) &
        (final_merge_df['ratings_average'] > lower_rating_bound) &
        (final_merge_df['reviews_count'] > lower_review_count)
    ]

    #Change the columns names 
    all_filtered_winebottle = all_filtered_winebottle.rename(columns={'id': 'ID',
                                                                      'name': 'Producer_v',
                                                                      'year': 'Year_v',
                                                                      'ratings_average': 'Ratings Average',
                                                                      'reviews_count': 'Reviews Count',
                                                                      'type': 'Type',
                                                                      })
    
    def producer_name(row, list_of_producers):
        """
        Checks if a row is in the list of producers, and returns the producer name 
        """
        for producer in list_of_producers:

            lower_producer = producer.lower()
            lower_row = row.lower()

            if lower_producer in lower_row:

                return producer
            
        return row
    
    #Goes through the names column and changes the name to the producer name, to allow for later merging between dataframes
    all_filtered_winebottle['Producer_v'] = all_filtered_winebottle['Producer_v'].apply(lambda row: producer_name(row, list_of_producers))

    #Add a producer/year column for future merging 
    all_filtered_winebottle['Producer/Year'] = all_filtered_winebottle.apply(lambda row: row['Producer_v'] + ' ' + str(row['Year_v']), axis=1)

    return all_filtered_winebottle

In [14]:
def Training_Data(weather_df, vintage_df):
    """
    Input: Weather dataframe indexed on the daily scale,
        Vintage dataframe which has yearly bottles from the same producer and wine type
    Output: Weather dataframe indexed based on a single row for a producer and year 
        columns will be monthly min, max or average for the respective data per year
    """
    #Weather_df.index = pd.to_datetime(weather_df.index)
    producers = weather_df['Producer'].unique()

    #Initializes a dataframe 
    final_df = pd.DataFrame()

    #Check to see if date is the index
    if type(weather_df.index[0]) == int:

        weather_df = weather_df.set_index('Date') 

    #Create a column for the year and produce all the unique years
    weather_df.index = pd.to_datetime(weather_df.index)
    weather_df['Year'] = weather_df.index.year
    unique_years = weather_df['Year'].unique()

    #Find all the unique months
    weather_df['Month'] = weather_df.index.month
    unique_months = weather_df['Month'].unique()

    #Gives key for the month names 
    month_names = {1 : 'January', 2 : 'February', 3 : 'March', 4 : 'April', 5 : 'May', 
                                  6 : 'June', 7 : 'July', 8 : 'August', 9 : 'September', 10 : 'October', 11 : 'November',
                                  12 : 'December'}

    for producer in producers:

        #Filter the data based on the producer
        producer_df = weather_df[weather_df['Producer'] == producer]

        #Iterate over each unique year
        for year in unique_years:
            
            #Filter the data for the specific year
            yearly_df = producer_df[producer_df['Year'] == year]

            #Creates initial data
            yearly_data = {'Producer' : producer, 'Year' : year}

            #Ierate over each unique month
            for month in unique_months: 

                #Filter the data for the specific month
                monthly_df = yearly_df[yearly_df['Month'] == month]

                #Group by month
                Grouped_Daily_into_Monthly = monthly_df.groupby(monthly_df.index.to_period('M'))

                #Change month number to month name
                month = month_names[month]

                yearly_data.update({
                    
                    'WineType' : Grouped_Daily_into_Monthly['Top WineType'].first()[0],
                    f'{month} Max Temp (°C)' : Grouped_Daily_into_Monthly['Max Temp (°C)'].max().iloc[0],
                    f'{month} Min Temp (°C)' : Grouped_Daily_into_Monthly['Min Temp (°C)'].min().iloc[0],
                    f'{month} Avg Temp (°C)' : Grouped_Daily_into_Monthly['Avg Temp (°C)'].mean().iloc[0],
                    f'{month} Max Relative Humidity' : Grouped_Daily_into_Monthly['Max Relative Humidity'].max().iloc[0],
                    f'{month} Min Relative Humidity' : Grouped_Daily_into_Monthly['Min Relative Humidity'].min().iloc[0],
                    f'{month} Avg Relative Humidity' : Grouped_Daily_into_Monthly['Max Relative Humidity'].mean().iloc[0],
                    f'{month} Cumulative Rain (mm)' : Grouped_Daily_into_Monthly['Cumulative Rain (mm)'].sum().iloc[0],
                    f'{month} Cumulative Snow (mm)' : Grouped_Daily_into_Monthly['Cumulative Snow (mm)'].sum().iloc[0],
                    f'{month} Avg Cloud Cover (%)' : Grouped_Daily_into_Monthly['Avg Cloud Cover (%)'].mean().iloc[0],
                    f'{month} Max Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Max Wind Speed (Km/h)'].max().iloc[0],
                    f'{month} Min Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Min Wind Speed (Km/h)'].min().iloc[0],
                    f'{month} Avg Wind Speed (Km/h)' : Grouped_Daily_into_Monthly['Avg Wind Speed (Km/h)'].mean().iloc[0],
                    f'{month} Avg Daylight Hours' : Grouped_Daily_into_Monthly['Daylight Hours'].mean().iloc[0] 

                })
            
            #Adds this yearly data to original df
            final_df = pd.concat([final_df, pd.DataFrame([yearly_data])], ignore_index=True)
    
    #Add a producer/year column for future merging and reset index
    final_df = final_df.reset_index()
    final_df['Producer/Year'] = final_df.apply(lambda row: row['Producer'] + ' ' + str(row['Year']), axis=1)

    #Combine the two dataframes
    final_df = pd.merge(final_df, vintage_df, on = 'Producer/Year', how = 'left')

    #Modify the dataframe for easier viewing
    final_df = final_df.drop(columns = ['index','Producer/Year', 'ID', 'Reviews Count', 'Producer_v', 'Year_v'])
    final_df.insert(0, 'Producer', final_df.pop('Producer'))
    final_df.insert(1, 'Year', final_df.pop('Year'))
    final_df.insert(2, 'Ratings Average', final_df.pop('Ratings Average'))

    #Drop all the rows that have NAN in the ratings average column 
    final_df = final_df.dropna(subset = ['Ratings Average'])

    return final_df

---

Variable Section 1
---

In [19]:
red_wines = SQL_Files_to_df('red_wines_final.db', r'C:\Users\fwhal\Downloads\CME528\Project\Wine_Raw_Data')
location_df = Wine_DataFrame(red_wines)
wine_df_final = Regions_and_Districts(red_wines, location_df, r'C:\Users\fwhal\Downloads\CME528\Project\GeoJsonFiles')

#Long processing time, have saved previous output to csv, load from this
FINAL_wine_df_filtered_FINAL, FINAL_producer_list_FINAL = Popular_Wine_Stats(wine_df_final)
FINAL_wine_df_filtered_FINAL

  return lib.within(a, b, **kwargs)
  return lib.within(a, b, **kwargs)
  return lib.within(a, b, **kwargs)


Unnamed: 0,Top Country,Country Count,Top Region,Region Count,Top WineType,WineType Count,Producer,Average Rating,Lat,Long,URL
0,United States,6735,California,2514,Cabernet Sauvignon,318,Spottswoode,4.516667,38.503831,-122.467071,https://www.vivino.com/spottswoode-cabernet-sa...
1,United States,6735,California,2514,Cabernet Sauvignon,318,Red Cap Vineyards,4.5,38.5813,-122.451097,https://www.vivino.com/red-cap-cabernet-sauvig...
2,United States,6735,California,2514,Cabernet Sauvignon,318,Outpost,4.5,38.5813,-122.451097,https://www.vivino.com/outpost-wines-true-vine...
3,United States,6735,California,2514,Cabernet Sauvignon,318,La Jota,4.5,38.5813,-122.451097,https://www.vivino.com/la-jota-cabernet-sauvig...
4,United States,6735,California,2514,Cabernet Sauvignon,318,Lancaster Estate,4.42,38.671231,-122.82787,https://www.vivino.com/lancaster-estate-winema...
5,Italy,6188,Centro,3065,Brunello di Montalcino,437,Stella di Campalto,4.6,43.320499,11.328186,https://www.vivino.com/stella-di-campalto-brun...
6,Italy,6188,Centro,3065,Brunello di Montalcino,437,Poggio di Sotto,4.5,43.320499,11.328186,https://www.vivino.com/poggio-di-sotto-brunell...
7,Italy,6188,Centro,3065,Brunello di Montalcino,437,Cerbaiona,4.5,43.320499,11.328186,https://www.vivino.com/cerbaiona-brunello-di-m...
8,Italy,6188,Centro,3065,Brunello di Montalcino,437,Biondi-Santi,4.493333,43.320499,11.328186,https://www.vivino.com/biondi-santi-brunello-d...
9,Italy,6188,Centro,3065,Brunello di Montalcino,437,Salvioni,4.477778,43.320499,11.328186,https://www.vivino.com/salvioni-maria-grazia-b...


In [23]:
wine_df_final.head()

Unnamed: 0,WineType,Producer,Locations,Rating,Num_Ratings,Price,Country,Lat,Long,LatLong_Points,Region,Region_Geometry,District,District_Geometry,URL
0,Tenuta Tignanello 'Solaia',Antinori,"Toscana, Italy",4.7,2152,749.99,Italy,43.458654,11.13892,POINT (11.13892 43.45865),Centro,MULTIPOLYGON (((12.98497349698058 40.935411903...,Toscana,MULTIPOLYGON (((10.31862415679168 42.346868261...,https://www.vivino.com/antinori-tuscany-tenuta...
1,Château Margaux (Premier Grand Cru Classé),Château Margaux,"Margaux, France",4.7,1883,1274.99,France,45.046334,-0.672912,POINT (-0.67291 45.04633),Nouvelle-Aquitaine,MULTIPOLYGON (((-1.412478525654974 46.18438510...,Gironde,MULTIPOLYGON (((-1.016335577485999 44.65528692...,https://www.vivino.com/chateau-margaux-chateau...
2,Tenuta Tignanello 'Solaia',Antinori,"Toscana, Italy",4.7,1772,729.99,Italy,43.458654,11.13892,POINT (11.13892 43.45865),Centro,MULTIPOLYGON (((12.98497349698058 40.935411903...,Toscana,MULTIPOLYGON (((10.31862415679168 42.346868261...,https://www.vivino.com/antinori-tuscany-tenuta...
3,Cabernet Sauvignon Beckstoffer To Kalon Vineyard,Schrader,"Oakville, United States",4.7,399,509.99,United States,41.099817,-91.044402,POINT (-91.04440 41.09982),Iowa,POLYGON ((-95.76565290299999 40.58521515600006...,Louisa,POLYGON ((-91.48400181599999 41.42385555500005...,https://www.vivino.com/schrader-cellars-cabern...
4,Château Margaux (Premier Grand Cru Classé),Château Margaux,"Margaux, France",4.7,391,1374.99,France,45.046334,-0.672912,POINT (-0.67291 45.04633),Nouvelle-Aquitaine,MULTIPOLYGON (((-1.412478525654974 46.18438510...,Gironde,MULTIPOLYGON (((-1.016335577485999 44.65528692...,https://www.vivino.com/chateau-margaux-chateau...


In [24]:
red_wines.head()

Unnamed: 0_level_0,Producer,WineType,Year,Region,Country,URL,Rating,Num_Ratings,Price,url_idx,Locations
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Antinori,Tenuta Tignanello 'Solaia',2011.0,Toscana,Italy,https://www.vivino.com/antinori-tuscany-tenuta...,4.7,2152,749.99,0,"Toscana, Italy"
2,Château Margaux,Château Margaux (Premier Grand Cru Classé),2006.0,Margaux,France,https://www.vivino.com/chateau-margaux-chateau...,4.7,1883,1274.99,0,"Margaux, France"
3,Antinori,Tenuta Tignanello 'Solaia',2017.0,Toscana,Italy,https://www.vivino.com/antinori-tuscany-tenuta...,4.7,1772,729.99,0,"Toscana, Italy"
4,Schrader,Cabernet Sauvignon Beckstoffer To Kalon Vineyard,2017.0,Oakville,United States,https://www.vivino.com/schrader-cellars-cabern...,4.7,399,509.99,0,"Oakville, United States"
5,Château Margaux,Château Margaux (Premier Grand Cru Classé),2017.0,Margaux,France,https://www.vivino.com/chateau-margaux-chateau...,4.7,391,1374.99,0,"Margaux, France"


Variable Section 2
---

In [None]:
#Plotting_Unique_Locations(location_df)

Variable Section 3
---

In [None]:
#Long processing time, have saved previous output to csv, load from this
# FINAL_weather_data_FINAL = All_Weather_Data(pd.read_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_wine_df_filtered_FINAL.csv'), 2000, 2023, r'C:\Users\fwhal\Downloads\CME528\Project\Final_Weather_Data_DONOTTOUCH')
# FINAL_weather_data_FINAL.to_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_weather_data_FINAL.csv', index = True)

Variable Section 4 
---

In [None]:
recommended_vintages_df, all_vintages_df = Vintage_Dataframe(pd.read_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_wine_df_filtered_FINAL.csv'))
vintage_df_final = Final_Vintage_Dataframe(recommended_vintages_df, all_vintages_df, FINAL_producer_list_FINAL, lower_date_bound = 1900, lower_rating_bound  = 3, lower_review_count = 1)
FINAL_training_data_FINAL = Training_Data(pd.read_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_weather_data_FINAL.csv'), vintage_df_final)
FINAL_training_data_FINAL.to_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_training_data_FINAL.csv', index=False)

In [None]:
FINAL_training_data_FINAL

Unnamed: 0,Producer,Year,Ratings Average,WineType,January Max Temp (°C),January Min Temp (°C),January Avg Temp (°C),January Max Relative Humidity,January Min Relative Humidity,January Avg Relative Humidity,...,December Min Relative Humidity,December Avg Relative Humidity,December Cumulative Rain (mm),December Cumulative Snow (mm),December Avg Cloud Cover (%),December Max Wind Speed (Km/h),December Min Wind Speed (Km/h),December Avg Wind Speed (Km/h),December Avg Daylight Hours,Type
0,Spottswoode,2000,4.4,Cabernet Sauvignon,16.265999,2.0160,9.505382,100.0,27.844180,97.471508,...,26.004856,90.442934,15.000001,0.00,63.619624,15.379206,0.000000,5.202262,9.530056,
1,Spottswoode,2001,4.6,Cabernet Sauvignon,18.866000,0.2160,8.075879,100.0,30.725060,89.962441,...,32.204582,94.170632,247.700000,0.00,78.372312,24.627789,0.000000,7.542654,9.531516,
2,Spottswoode,2002,4.7,Cabernet Sauvignon,17.516000,-1.1840,8.166202,100.0,29.874908,92.855892,...,35.938175,96.567983,361.300000,0.77,72.435484,34.838253,0.360000,8.357912,9.532616,best_user_rated
3,Spottswoode,2003,4.5,Cabernet Sauvignon,21.166000,2.7160,11.895704,100.0,31.300783,95.859775,...,42.869484,97.018014,238.000000,0.00,76.689515,27.238941,0.509117,7.207038,9.533591,latest_available
4,Spottswoode,2004,4.6,Cabernet Sauvignon,16.316000,0.0660,8.648191,100.0,35.702976,96.035980,...,27.285332,93.196508,214.400000,0.00,55.418011,24.130743,0.360000,6.528542,9.529678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,Quinta do Vallado,2018,4.3,Tinto,18.163500,0.7135,8.395691,100.0,48.113730,96.475339,...,47.727577,98.715139,55.500000,0.00,72.076613,23.377836,0.000000,4.786329,9.266552,
1051,Quinta do Vallado,2019,4.3,Tinto,16.813500,-3.5865,5.779024,100.0,30.845370,96.142419,...,48.820850,95.361842,145.600000,0.00,66.778226,34.454840,0.000000,7.201181,9.267922,top_ranked
1052,Quinta do Vallado,2020,4.3,Tinto,18.013500,-2.1365,7.141323,100.0,35.739544,95.170381,...,46.140537,96.316910,60.400000,0.00,64.638441,18.844202,0.360000,7.005120,9.263821,
1053,Quinta do Vallado,2021,4.2,Tinto,20.363499,-3.2365,6.837895,100.0,38.745500,94.391654,...,47.642980,96.938014,53.500000,0.00,70.969086,23.333443,0.000000,6.326624,9.264957,


---

- format the winetypes 
- create points in each district 
