In [1]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install selenium
# %pip install pandas
# %pip install geopy
# %pip install geopandas
# %pip install folium
# %pip install openmeteo-requests
# %pip install requests-cache retry-requests numpy pandas
# %pip install timezonefinder
# %pip install seaborn

In [2]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
from geopy.geocoders import OpenCage
from folium import Marker
from folium import GeoJson
from retry_requests import retry
from timezonefinder import TimezoneFinder 

Function Breakdown Section 4
---
Vintage_Dataframe
- grabs two dataframes based on the URLs pulled from the popular producers of the specific wines
- one for all the wine vintage data, and one based on popular stats for the wine 

Final_Vintage_Dataframe
- creates a dataframe with vintage data
- included here critically is the ratings data tied to the specific producers

Training_Data
- converts the daily weather dataframe (the output of All_Weather_Data) into a dataframe with one row per producer and year
- the columns are monthly averages based on the daily data
- adds in the rating and price data for the specific producer of a specific wine for a specific year 
- final form set up to allow easier training on a model 

In [12]:
def Vintage_Dataframe(popular_wines_df, timeout=30):
    """
    Downloads every wine page listed in popular_wines_df and collects the vintage data embedded in it.

    Input:
        popular_wines_df: dataframe of popular wines, which includes a 'URL' column with the page of each wine
            (the dataframe is only read, it is never modified)
        timeout: seconds to wait for each HTTP response before giving up on that URL
    Output: two dataframes
        Recommended vintages: which has the data on the specific wines chosen
        All Vintages: which has data on all the wines
        Both carry a 'source_url' column. Either one is an empty dataframe when no page yielded data.

    Pages that fail to download or parse are reported with print() and skipped (best effort).
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    }

    #The page embeds its data as a JavaScript assignment; compile the pattern once instead of per URL
    preloaded_state = re.compile(
        r"window\.__PRELOADED_STATE__\.winePageInformation\s*=\s*(.*});", re.MULTILINE
    )

    #Initialize lists to store the extracted data
    all_recommended_vintages = []
    all_vintages_data = []

    #Strips the query string and surrounding white space from each URL.
    #Works on a derived Series so the caller's 'URL' column is left untouched; missing URLs are dropped.
    clean_urls = (
        popular_wines_df['URL']
        .dropna()
        .astype(str)
        .str.split('?').str[0]
        .str.strip()
    )

    for url in clean_urls:

        if not url:
            continue

        try:
            #Without a timeout a single stalled server would block the whole loop forever
            r = requests.get(url, headers=request_headers, timeout=timeout)

            #Check if the request was successful
            if r.status_code != 200:
                print(f"Failed to retrieve data for URL: {url}, Status code: {r.status_code}")
                continue

            #Search for the JavaScript data in the page source
            res = preloaded_state.search(r.text)
            if not res:
                print(f"No data found for URL: {url}")
                continue

            data = json.loads(res.group(1))

            #Extract recommended vintages
            recommended_vintages = data.get("recommended_vintages") or []
            if recommended_vintages:
                recommended_df = pd.DataFrame(recommended_vintages)

                #Add the URL for reference
                recommended_df['source_url'] = url
                all_recommended_vintages.append(recommended_df)

            #Extract all vintages ("wine" may be missing or null)
            all_vintages = (data.get("wine") or {}).get("vintages") or []
            if all_vintages:
                vintages_df = pd.DataFrame(all_vintages)

                #Add the URL for reference
                vintages_df['source_url'] = url
                all_vintages_data.append(vintages_df)

        except Exception as e:
            #Deliberately broad: one bad page (network error, malformed JSON, ...) must not abort the scrape
            print(f"An error occurred for URL: {url} - {str(e)}")

    #Concats the collected frames; pd.concat raises on an empty list, so fall back to an empty DataFrame
    recommended_vintages_df = (
        pd.concat(all_recommended_vintages, ignore_index=True) if all_recommended_vintages else pd.DataFrame()
    )
    all_vintages_df = (
        pd.concat(all_vintages_data, ignore_index=True) if all_vintages_data else pd.DataFrame()
    )

    return recommended_vintages_df, all_vintages_df

In [13]:
def Final_Vintage_Dataframe(recommended_vintages_df, all_vintages_df, popular_wines_df, lower_date_bound = 1900, lower_rating_bound  = 3, lower_review_count = 1):
    """
    Builds the per-vintage ratings table that is later merged with the weather data.

    Input:
        recommended_vintages_df: data on the specific wines chosen; needs a 'vintage' column of dicts
            (each with an 'id') and a 'type' column
        all_vintages_df: data on all the wines; needs 'id', 'name', 'year', 'has_valid_ratings' and a
            'statistics' column of dicts that provide 'ratings_average' and 'reviews_count'
        popular_wines_df: dataframe with a 'Producer' column, used to normalise the vintage names
        lower_date_bound: keep vintages with year >= this value
        lower_rating_bound: keep vintages with ratings average strictly above this value
        lower_review_count: keep vintages with reviews count strictly above this value
    Output: one dataframe
        all_filtered_winebottle: columns 'ID', 'Producer_v', 'Year_v', 'Ratings Average', 'Reviews Count',
            'Type' and 'Producer/Year' (the merge key "<producer> <year>")

    None of the input dataframes is modified.
    """

    #Filters the vintages df so that the all the data is only gotten for those with a valid rating
    rated_vintages = all_vintages_df[all_vintages_df["has_valid_ratings"] == True]

    #Expands each 'statistics' dict into its own columns (non-dict cells contribute no values)
    stats_records = [stats if isinstance(stats, dict) else {} for stats in rated_vintages['statistics']]
    stats_columns = pd.DataFrame(stats_records, index=rated_vintages.index)

    #Concatenate the original DataFrame with the new columns
    rated_vintages = pd.concat([rated_vintages, stats_columns], axis=1)

    #Grabs specific columns from the vintage data
    vintage_core = rated_vintages[['id', 'name', 'year', 'ratings_average', 'reviews_count']]

    #Builds an id -> type lookup from the recommended vintages.
    #Done in a new frame so the caller's recommended_vintages_df does not gain an 'id' column.
    recommended_ids = recommended_vintages_df['vintage'].map(
        lambda vintage: vintage.get('id') if isinstance(vintage, dict) else None
    )
    recommended_types = (
        pd.DataFrame({'id': recommended_ids, 'type': recommended_vintages_df['type']})
        .dropna(subset=['id'])
        .drop_duplicates(subset=['id'])
    )

    #Merges the two dataframes
    merged = pd.merge(vintage_core, recommended_types, on='id', how='left')

    #Filters the data by year, ratings average and reviews count
    keep = (
        (merged['year'] >= lower_date_bound) &
        (merged['ratings_average'] > lower_rating_bound) &
        (merged['reviews_count'] > lower_review_count)
    )

    #Change the columns names (rename returns a new frame, so the assignments below are safe)
    all_filtered_winebottle = merged[keep].rename(columns={'id': 'ID',
                                                           'name': 'Producer_v',
                                                           'year': 'Year_v',
                                                           'ratings_average': 'Ratings Average',
                                                           'reviews_count': 'Reviews Count',
                                                           'type': 'Type',
                                                           })

    #Lower-cased producer names are computed once here rather than once per row
    known_producers = [
        (producer.lower(), producer)
        for producer in popular_wines_df['Producer'].unique()
        if isinstance(producer, str) and producer
    ]

    def producer_name(vintage_name):
        """
        Returns the first known producer whose name appears (case-insensitively) inside vintage_name,
        or vintage_name unchanged when none matches or the value is not a string
        """
        if not isinstance(vintage_name, str):
            return vintage_name

        lowered_name = vintage_name.lower()
        for lowered_producer, producer in known_producers:
            if lowered_producer in lowered_name:
                return producer

        return vintage_name

    #Goes through the names column and changes the name to the producer name, to allow for later merging between dataframes
    all_filtered_winebottle['Producer_v'] = all_filtered_winebottle['Producer_v'].map(producer_name)

    #Add a producer/year column for future merging.
    #Column-wise concatenation also works when the filter left no rows (row-wise apply does not).
    all_filtered_winebottle['Producer/Year'] = (
        all_filtered_winebottle['Producer_v'].astype(str) + ' ' + all_filtered_winebottle['Year_v'].astype(str)
    )

    return all_filtered_winebottle

In [None]:
def Training_Data(weather_df, vintage_df):
    """
    Collapses daily weather into one row per producer and year and attaches the vintage ratings.

    Input:
        weather_df: weather dataframe on the daily scale. The dates come from the 'Date' column when
            there is one, otherwise from the index. Needs 'Producer', 'Top WineType' and the daily
            weather columns aggregated below.
        vintage_df: vintage dataframe which has yearly bottles from the same producer and wine type;
            needs 'Producer/Year', 'ID', 'Reviews Count', 'Producer_v', 'Year_v' and 'Ratings Average'
    Output: dataframe with a single row for a producer and year (per matching vintage row)
        columns will be monthly min, max, average or sum for the respective data per year;
        rows without a 'Ratings Average' are dropped

    Notes:
        - A month with no observations for a producer/year yields NaN in that month's columns.
        - '<month> Avg Relative Humidity' is the mean of the daily 'Avg Relative Humidity' column when
          weather_df has one; otherwise it falls back to the mean of 'Max Relative Humidity'.
        - weather_df is not modified.
    """
    #Gives key for the month names
    month_names = {1 : 'January', 2 : 'February', 3 : 'March', 4 : 'April', 5 : 'May',
                   6 : 'June', 7 : 'July', 8 : 'August', 9 : 'September', 10 : 'October',
                   11 : 'November', 12 : 'December'}

    #Work on a copy so the caller's index and columns stay as they were
    weather = weather_df.copy()

    #Make the dates the index. Checking for the column is more reliable than inspecting index[0].
    if 'Date' in weather.columns:
        weather = weather.set_index('Date')
    elif pd.api.types.is_integer_dtype(weather.index.dtype):
        #An integer index would be silently read as nanoseconds since 1970
        raise KeyError("weather_df needs a 'Date' column or a date-like index")

    #Create the year and month columns used for grouping
    weather.index = pd.to_datetime(weather.index)
    weather['Year'] = weather.index.year
    weather['Month'] = weather.index.month

    #NOTE(review): this used to average 'Max Relative Humidity', which looks like a copy-paste slip
    humidity_mean_source = (
        'Avg Relative Humidity' if 'Avg Relative Humidity' in weather.columns else 'Max Relative Humidity'
    )

    #Output statistic -> (daily column, aggregation)
    monthly_aggregations = {
        'Max Temp (°C)' : ('Max Temp (°C)', 'max'),
        'Min Temp (°C)' : ('Min Temp (°C)', 'min'),
        'Avg Temp (°C)' : ('Avg Temp (°C)', 'mean'),
        'Max Relative Humidity' : ('Max Relative Humidity', 'max'),
        'Min Relative Humidity' : ('Min Relative Humidity', 'min'),
        'Avg Relative Humidity' : (humidity_mean_source, 'mean'),
        'Cumulative Rain (mm)' : ('Cumulative Rain (mm)', 'sum'),
        'Cumulative Snow (mm)' : ('Cumulative Snow (mm)', 'sum'),
        'Avg Cloud Cover (%)' : ('Avg Cloud Cover (%)', 'mean'),
        'Max Wind Speed (Km/h)' : ('Max Wind Speed (Km/h)', 'max'),
        'Min Wind Speed (Km/h)' : ('Min Wind Speed (Km/h)', 'min'),
        'Avg Wind Speed (Km/h)' : ('Avg Wind Speed (Km/h)', 'mean'),
        'Avg Daylight Hours' : ('Daylight Hours', 'mean'),
    }

    #One aggregation pass over the data; sort=False keeps producers/years in order of appearance
    monthly = (
        weather
        .groupby(['Producer', 'Year', 'Month'], sort=False)
        .agg(**monthly_aggregations)
    )
    monthly['Cumulative Precip (mm)'] = monthly['Cumulative Rain (mm)'] + monthly['Cumulative Snow (mm)']
    monthly = monthly.reset_index()

    #Column order inside every month block
    stat_order = [
        'Max Temp (°C)', 'Min Temp (°C)', 'Avg Temp (°C)',
        'Max Relative Humidity', 'Min Relative Humidity', 'Avg Relative Humidity',
        'Cumulative Rain (mm)', 'Cumulative Snow (mm)', 'Cumulative Precip (mm)',
        'Avg Cloud Cover (%)',
        'Max Wind Speed (Km/h)', 'Min Wind Speed (Km/h)', 'Avg Wind Speed (Km/h)',
        'Avg Daylight Hours',
    ]

    producer_year = ['Producer', 'Year']

    #First wine type seen for each producer/year; this frame also defines the output rows
    wine_types = (
        weather
        .groupby(producer_year, sort=False)['Top WineType']
        .first()
        .rename('WineType')
        .to_frame()
    )

    #One block of "<month> <statistic>" columns per month, in the order the months appear in the data
    month_blocks = []
    for month_number in weather['Month'].unique():
        month_block = (
            monthly.loc[monthly['Month'] == month_number]
            .set_index(producer_year)[stat_order]
            .add_prefix(f'{month_names[month_number]} ')
        )
        month_blocks.append(month_block)

    #Side-by-side join; producer/years missing a month simply get NaN in that block
    final_df = (
        pd.concat([wine_types] + month_blocks, axis=1)
        .reindex(wine_types.index)
        .reset_index()
    )

    #Add a producer/year column for future merging
    final_df['Producer/Year'] = final_df['Producer'].astype(str) + ' ' + final_df['Year'].astype(str)

    #Combine the two dataframes
    final_df = pd.merge(final_df, vintage_df, on = 'Producer/Year', how = 'left')

    #Modify the dataframe for easier viewing
    final_df = final_df.drop(columns = ['Producer/Year', 'ID', 'Reviews Count', 'Producer_v', 'Year_v'])
    final_df.insert(0, 'Producer', final_df.pop('Producer'))
    final_df.insert(1, 'Year', final_df.pop('Year'))
    final_df.insert(2, 'Ratings Average', final_df.pop('Ratings Average'))

    #Drop all the rows that have NAN in the ratings average column
    final_df = final_df.dropna(subset = ['Ratings Average'])

    return final_df

---

Variable Section 4 
---

In [None]:
#Producer list used to normalise the vintage names.
#pd.read_csv has no `index` keyword (that belongs to DataFrame.to_csv), so passing it raised a TypeError.
#NOTE(review): this path goes through 'BreakinBadCode' while the other files do not - confirm the folder.
wine_df_final = pd.read_csv(r'C:\Users\fwhal\Downloads\CME528\Project\BreakinBadCode\Final_DataFrames\FINAL_wine_df_FINAL.csv')

#Scrape the vintage data for the filtered list of popular wines
popular_wines_filtered = pd.read_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_wine_df_filtered_FINAL.csv')
recommended_vintages_df, all_vintages_df = Vintage_Dataframe(popular_wines_filtered)

#Reduce the scraped data to rated vintages keyed by producer and year
vintage_df_final = Final_Vintage_Dataframe(recommended_vintages_df, all_vintages_df, wine_df_final, lower_date_bound = 1900, lower_rating_bound  = 3, lower_review_count = 1)

#Join the monthly weather summary with the ratings and save the training table
daily_weather = pd.read_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_weather_data_FINAL.csv')
FINAL_training_data_FINAL = Training_Data(daily_weather, vintage_df_final)
FINAL_training_data_FINAL.to_csv(r'C:\Users\fwhal\Downloads\CME528\Project\Final_DataFrames\FINAL_training_data_FINAL.csv', index=False)

---