In [None]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install selenium
# %pip install pandas
# %pip install geopy
# %pip install geopandas
# %pip install folium
# %pip install openmeteo-requests
# %pip install requests-cache retry-requests numpy pandas
# %pip install timezonefinder
# %pip install seaborn

In [None]:
import os
import time
import requests
import pandas as pd
import geopandas as gpd
import folium
import openmeteo_requests
import requests_cache
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
from geopy.geocoders import OpenCage
from folium import Marker
from folium import GeoJson
from retry_requests import retry
from timezonefinder import TimezoneFinder 

Function Breakdown Section 1
---
SQL_Files_to_df
- pulls scraped wine data from an SQL file and converts it to a dataframe

Lat_Long_Coordinates
- takes in a location name and returns the latitude/longitude coordinates associated with it (None, None when the name cannot be geocoded)

Wine_DataFrame 
- creates a dataframe from the initial wine data that contains unique locations and coordinates

Regions_and_Districts
- takes in global shape files for regions and districts, adds this data to the wine dataframe 

Popular_Wine_Stats_1Bottle
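- returns a dataframe with the top-rated producer of the most popular winetype in each of the most popular districts of the most popular region, for each of the most popular countries (those with a bottle count above the 75th percentile; described as the top 8 countries for the scraped data)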
- this dataset will focus on global trends, so a single bottle is used

Popular_Wine_Stats_All_Bottles
- returns a dataframe with all the bottles for each of the regions of interest 
- this dataset will focus on local trends

In [None]:
def SQL_Files_to_df(file_name, host_folder):
    """
    Loads the scraped red wine table from a SQLite database file into a DataFrame.

    file_name: name of the SQLite database file, relative to host_folder
    host_folder: name of the host folder that contains the database file
    return: DataFrame of the 'red_wines' table, indexed by 'id', with every row that
            contains a missing value removed
    raises: FileNotFoundError if the database file does not exist
    """

    db_path = os.path.join(host_folder, file_name)

    #sqlite3.connect silently creates an empty database when the file is missing, which would
    #only show up later as a confusing "no such table" error, so fail early instead
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f'SQLite database not found: {db_path}')

    #Connect to the SQLite database and always release the connection, even if the query fails
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql('SELECT * FROM red_wines', conn)
    finally:
        conn.close()

    return df.set_index('id').dropna()

In [None]:
def Lat_Long_Coordinates(location_name):
    """
    Geocodes a place name, trying OpenCage first and Nominatim second.

    location_name: takes in a location name (for example "Region, Country")
    return: (latitude, longitude) of the first geocoder that finds the place, or
            (None, None) when the name is blank / not a string, or when neither
            geocoder returns a result
    """

    #Nothing to look up for missing values (None, NaN) or blank strings
    if not isinstance(location_name, str) or not location_name.strip():
        return None, None

    from geopy.exc import GeopyError

    #The OpenCage key is read from the OPENCAGE_API_KEY environment variable when it is set
    #NOTE(review): the fallback key below is committed in the notebook; it should be rotated
    #and removed once every environment provides the variable
    api_key = os.environ.get('OPENCAGE_API_KEY', '74ac6790c4464814b25887115034e579')

    #Geocoders in priority order; built lazily so Nominatim is only created when OpenCage fails
    geolocator_factories = (
        lambda: OpenCage(api_key = api_key),
        #NOTE(review): "your_unique_user_agent" is a placeholder value - confirm against the
        #Nominatim usage policy, which asks for an identifying user agent
        lambda: Nominatim(user_agent = "your_unique_user_agent", timeout = 10),
    )

    for make_geolocator in geolocator_factories:
        try:
            location = make_geolocator().geocode(location_name)
        except GeopyError:
            #A timeout, quota or service error on one service should not abort the whole
            #geocoding run, so move on to the next service
            continue

        if location:
            return location.latitude, location.longitude

    return None, None

In [None]:
def Wine_DataFrame(raw_wine_data):
    """
    Builds a table of the unique wine locations with their bottle counts and coordinates.

    raw_wine_data: takes in a df of all the wines, with 'Region' and 'Country' columns.
                   The input dataframe is not modified.
    return: a dataframe indexed by 'Locations' ("Region, Country"), ordered from the most
            to the least frequent location, with the columns
                Location_Instances: number of bottles (rows) recorded for that location
                Lat, Long: coordinates from Lat_Long_Coordinates (missing when not found)
    """
    #Builds the "Region, Country" label on a copy, then drops every row with a missing value
    wine_data = raw_wine_data.assign(
        Locations = raw_wine_data['Region'] + ', ' + raw_wine_data['Country']
    ).dropna()

    #value_counts supplies both the unique locations and their counts, so each name and its
    #count stay paired (pairing unique() with value_counts() positionally mixes them up,
    #because the two are ordered differently)
    location_counts = wine_data['Locations'].value_counts()

    complete_wine_data = pd.DataFrame(
        {'Location_Instances' : location_counts.to_numpy()},
        index = pd.Index(location_counts.index, name = 'Locations')
    )

    #Geocodes each unique location once
    coordinates = [Lat_Long_Coordinates(location) for location in complete_wine_data.index]
    complete_wine_data['Lat'] = [latitude for latitude, _ in coordinates]
    complete_wine_data['Long'] = [longitude for _, longitude in coordinates]

    return complete_wine_data

In [22]:
def Regions_and_Districts(wine_data, Location_DataFrame, geojson_host_folder):
    """
    Tags every bottle with the administrative region and district its coordinates fall in.

    Inputs:
        wine_data: the wine data, with 'Region' and 'Country' columns. Rows with a missing
                   value are dropped; the caller's dataframe is not modified.
        Location_DataFrame: the intermediate location df created before (indexed by
                   'Locations', with 'Location_Instances', 'Lat' and 'Long' columns)
        geojson_host_folder: host folder that contains the two boundary geojson files
                   named below
    Output: a dataframe with the columns WineType, Producer, Locations, Rating, Num_Ratings,
            Price, Country, Region, District, Lat, Long, URL where
        Region: 'shapeName' of the regions-file polygon that contains the lat/long point
                (this replaces the 'Region' column of the input)
        District: 'shapeName' of the districts-file polygon that contains the lat/long point
        Both are missing when the point is not inside any polygon.

    Uses the 'predicate' keyword of geopandas.sjoin, which replaced the 'op' keyword
    (NOTE(review): 'predicate' needs geopandas 0.10 or newer - confirm the installed version).
    """

    #Defines the file names for the regional and district files
    regions_filename = 'geoBoundariesCGAZ_ADM1_regions.geojson'
    districts_filename = 'geoBoundariesCGAZ_ADM2_districts.geojson'

    #Adds the lat long coords to the wine dataframe, matching on the "Region, Country" label
    #(assign works on a new dataframe, so the input is never written to)
    location_coordinates = Location_DataFrame.drop(columns = ['Location_Instances'])
    wine_data = wine_data.dropna()
    wine_data = wine_data.assign(Locations = wine_data['Region'] + ', ' + wine_data['Country'])
    wine_data = pd.merge(wine_data, location_coordinates, on = "Locations", how = 'left')

    #The scraped region name is replaced further down by the name found through the spatial join
    wine_data = wine_data.drop('Region', axis = 1)

    #Creates a geodataframe with the coordinates
    wine_data_gdf = gpd.GeoDataFrame(wine_data, geometry = gpd.points_from_xy(x = wine_data['Long'], y = wine_data['Lat']), crs = 'EPSG:4326')
    wine_data_gdf = wine_data_gdf.rename(columns={'geometry': 'LatLong_Points'})
    wine_data_gdf = wine_data_gdf.set_geometry('LatLong_Points')

    #Reads the regional and district geodataframes
    regions_gdf = gpd.read_file(os.path.join(geojson_host_folder, regions_filename))
    districts_gdf = gpd.read_file(os.path.join(geojson_host_folder, districts_filename))

    #Ensure both GeoDataFrames are using the same CRS
    regions_gdf = regions_gdf.to_crs(wine_data_gdf.crs)
    districts_gdf = districts_gdf.to_crs(wine_data_gdf.crs)

    #Joins the wine data df with the regions and districts geodataframes
    #The second join carries explicit suffixes because both boundary files have a 'shapeName' column
    wine_data_gdf = gpd.sjoin(wine_data_gdf, regions_gdf[['shapeName', 'geometry']], how='left', predicate='within')
    wine_data_gdf = gpd.sjoin(wine_data_gdf, districts_gdf[['shapeName', 'geometry']], how='left', predicate='within', lsuffix='_region', rsuffix='_district')

    #Rename columns for clarity
    wine_data_gdf = wine_data_gdf.rename(columns={
        'shapeName__region' : 'Region',
        'shapeName__district' : 'District',
    })

    #Drop geodata, wont need it moving forward after locating the regions and districts for each row
    wine_data_gdf = wine_data_gdf.drop(columns = ['url_idx', 'index_right', 'LatLong_Points', 'index__district'])

    #Set dataframe order, for clarity
    return wine_data_gdf[['WineType','Producer', 'Locations', 'Rating',
                          'Num_Ratings', 'Price', 'Country', 'Region', 'District',
                          'Lat', 'Long', 'URL']]

In [42]:
def Popular_Wine_Stats_1Bottle(wine_data_final, number_of_districts = 1):
    """
    Picks one representative bottle for the busiest districts of each top country.

    Selection steps (every step works on the rows kept by the previous one):
        1. Top countries: countries whose bottle count is above the 75th percentile of
           the per-country bottle counts
        2. Top region: the most frequent Region among the bottles of a top country
        3. Top districts: the number_of_districts most frequent Districts among all
           bottles with that Region name
        4. Top winetype: the most frequent (normalised) WineType in that region/district
        5. Top producer: among those bottles whose Num_Ratings is above the dataset-wide
           25th percentile, the Producer with the highest mean Rating

    Input:
        wine_data_final: dataframe with the columns Country, Region, District, WineType,
                         Producer, Rating, Num_Ratings, Price, Lat, Long and URL. An
                         'Unnamed: 0' column (left over from a csv round trip) is ignored.
                         The input dataframe is not modified.
        number_of_districts: how many districts to keep per top region
    Output: a dataframe with one row per (country, district) for which a producer was found,
            with the columns Top Country, Country Count, Top Region, Region Count,
            Top District, District Count, Top WineType, WineType Count, Producer,
            Average Rating, Price, Lat, Long, URL.
            Price, Lat, Long and URL come from the winning producer's first qualifying
            bottle inside the selected district, so the coordinates belong to that district.
            An empty dataframe with these columns is returned when nothing qualifies.
    """

    output_columns = ['Top Country', 'Country Count', 'Top Region', 'Region Count',
                      'Top District', 'District Count', 'Top WineType', 'WineType Count',
                      'Producer', 'Average Rating', 'Price', 'Lat', 'Long', 'URL']

    #Drop an unnecessary column; drop returns a new dataframe, so the input stays untouched
    df = wine_data_final.drop(columns = ['Unnamed: 0'], errors = 'ignore')

    #Get total bottle counts by country and keep the countries with bottle counts more than the 75th percentile
    total_bottles_country = df['Country'].value_counts()
    top_countries = total_bottles_country[total_bottles_country > total_bottles_country.quantile(0.75)]

    #No country stands out (for example a single-country dataset), so there is nothing to report
    if top_countries.empty:
        return pd.DataFrame(columns = output_columns)

    #Winetype names used to make the winetypes uniform; the order matters because the first match wins
    popular_wine_types = ['château margaux', 'cabernet sauvignon', 'pinot noir', 'zinfandel', 'syrah',
                          'pinot gris', 'sauvignon blanc', 'chardonnay', 'baco noir', 'bordeaux',
                          'malbec', 'pinot grigio', 'merlot', 'sangiovese', 'shiraz',
                          'cabernet franc', 'muscat', 'grenache']

    def uniform_winetype(name):
        #Replaces a winetype with the first popular winetype contained in it; other names
        #and missing values are kept as they are
        if not isinstance(name, str):
            return name
        return next((known_type for known_type in popular_wine_types if known_type in name), name)

    #Sets all the winetypes to be lower, then makes them uniform in a single pass
    df['WineType'] = df['WineType'].str.lower().apply(uniform_winetype)

    #Bottles need more ratings than the lowest 25% of the whole dataset to count towards a producer
    minimum_num_ratings = np.quantile(df['Num_Ratings'], 0.25)

    #------------------------------------------------------------------------------------------------------------#
    records = []
    for country, country_count in top_countries.items():

        #Grabs the top region of the country
        region_counts = df.loc[df['Country'] == country, 'Region'].value_counts()
        if region_counts.empty:
            continue
        top_region = region_counts.idxmax()

        #Grabs the top districts of that region
        #NOTE(review): the region is matched by name only, so bottles of another country
        #that carry the same region name are included - confirm this is intended
        region_df = df[df['Region'] == top_region]
        district_counts = region_df['District'].value_counts().head(number_of_districts)

        for district, district_count in district_counts.items():
            if district_count <= 0:
                continue

            #Grabs the top winetype of the district
            district_df = region_df[region_df['District'] == district]
            winetype_counts = district_df['WineType'].value_counts()
            if winetype_counts.empty:
                continue
            top_winetype = winetype_counts.idxmax()

            #Grabs the producer with the best mean rating among the well-rated bottles of that winetype
            qualifying_bottles = district_df[(district_df['WineType'] == top_winetype) &
                                             (district_df['Num_Ratings'] > minimum_num_ratings)]
            producer_ratings = qualifying_bottles.groupby(by = 'Producer')['Rating'].mean().sort_values(ascending = False)

            #Districts with no qualifying producer are left out of the output
            if producer_ratings.empty:
                continue
            top_producer = producer_ratings.index[0]

            #Price, URL and lat/long coords are taken from the producer's own bottle in this district
            bottle = qualifying_bottles[qualifying_bottles['Producer'] == top_producer].iloc[0]

            records.append({'Top Country': country,
                            'Country Count': int(country_count),
                            'Top Region': top_region,
                            'Region Count': int(region_counts.max()),
                            'Top District': district,
                            'District Count': int(district_count),
                            'Top WineType': top_winetype,
                            'WineType Count': int(winetype_counts.max()),
                            'Producer': top_producer,
                            'Average Rating': producer_ratings.iloc[0],
                            'Price': bottle['Price'],
                            'Lat': bottle['Lat'],
                            'Long': bottle['Long'],
                            'URL': bottle['URL']})
    #------------------------------------------------------------------------------------------------------------#

    return pd.DataFrame(records, columns = output_columns)

In [25]:
def Popular_Wine_Stats_All_Bottles(wine_data_final, wine_data_1bottle, number_of_districts):
    """
    Lists every (winetype, producer) pair found in the busiest districts of each top country.

    Selection steps:
        1. Top countries: countries whose bottle count is above the 75th percentile of
           the per-country bottle counts
        2. Top region: the most frequent Region among the bottles of a top country
        3. Top districts: the number_of_districts most frequent Districts among all
           bottles with that Region name
        4. For each top district, one row per (WineType, Producer) pair with the mean Rating
           of that producer's bottles of that (normalised) winetype in the district

    Input:
        wine_data_final: dataframe with the columns Country, Region, District, WineType,
                         Producer, Rating, Price and URL. An 'Unnamed: 0' column (left over
                         from a csv round trip) is ignored. The input is not modified.
        wine_data_1bottle: dataframe with 'Top District', 'Lat' and 'Long' columns (the
                         output of Popular_Wine_Stats_1Bottle); it supplies one coordinate
                         per district
        number_of_districts: how many districts to keep per top region
    Output: a dataframe with the columns Top Country, Country Count, Top Region, Region Count,
            Top District, WineType, Producer, Rating, Price, URL, Lat, Long.
            'Top Country' is kept in the output. Price and URL come from the first bottle
            of the (winetype, producer) pair inside the district. Districts that are missing
            from wine_data_1bottle are left out, because they have no coordinates.
    """

    bottle_columns = ['Top Country', 'Country Count', 'Top Region', 'Region Count',
                      'Top District', 'WineType', 'Producer', 'Rating', 'Price', 'URL']

    #Drop an unnecessary column; drop returns a new dataframe, so the input stays untouched
    df = wine_data_final.drop(columns = ['Unnamed: 0'], errors = 'ignore')

    #Get total bottle counts by country and keep the countries with bottle counts more than the 75th percentile
    total_bottles_country = df['Country'].value_counts()
    top_countries = total_bottles_country[total_bottles_country > total_bottles_country.quantile(0.75)]

    #Winetype names used to make the winetypes uniform; the order matters because the first match wins
    popular_wine_types = ['château margaux', 'cabernet sauvignon', 'pinot noir', 'zinfandel', 'syrah',
                          'pinot gris', 'sauvignon blanc', 'chardonnay', 'baco noir', 'bordeaux',
                          'malbec', 'pinot grigio', 'merlot', 'sangiovese', 'shiraz',
                          'cabernet franc', 'muscat', 'grenache']

    def uniform_winetype(name):
        #Replaces a winetype with the first popular winetype contained in it; other names
        #and missing values are kept as they are
        if not isinstance(name, str):
            return name
        return next((known_type for known_type in popular_wine_types if known_type in name), name)

    #Sets all the winetypes to be lower, then makes them uniform in a single pass
    df['WineType'] = df['WineType'].str.lower().apply(uniform_winetype)

    #------------------------------------------------------------------------------------------------------------#
    records = []
    for country, country_count in top_countries.items():

        #Grabs the top region of the country
        region_counts = df.loc[df['Country'] == country, 'Region'].value_counts()
        if region_counts.empty:
            continue
        top_region = region_counts.idxmax()

        #Grabs the top districts of that region
        #NOTE(review): the region is matched by name only, so bottles of another country
        #that carry the same region name are included - confirm this is intended
        region_df = df[df['Region'] == top_region]
        top_districts = region_df['District'].value_counts().nlargest(number_of_districts).index

        for district in top_districts:

            #One row per winetype and producer: mean rating, plus the price and URL of the
            #pair's first bottle in this district
            district_df = region_df[region_df['District'] == district]
            bottles = district_df.groupby(by = ['WineType', 'Producer']).agg(
                Rating = ('Rating', 'mean'),
                Price = ('Price', 'first'),
                URL = ('URL', 'first')
            ).reset_index()

            for bottle in bottles.itertuples(index = False):
                records.append({'Top Country': country,
                                'Country Count': int(country_count),
                                'Top Region': top_region,
                                'Region Count': int(region_counts.max()),
                                'Top District': district,
                                'WineType': bottle.WineType,
                                'Producer': bottle.Producer,
                                'Rating': bottle.Rating,
                                'Price': bottle.Price,
                                'URL': bottle.URL})

    all_bottles = pd.DataFrame(records, columns = bottle_columns)
    #------------------------------------------------------------------------------------------------------------#

    #Add lat/long coords for each district; one coordinate per district so rows are not multiplied
    district_coordinates = wine_data_1bottle[['Top District', 'Lat', 'Long']].drop_duplicates(subset = 'Top District')

    return pd.merge(all_bottles, district_coordinates, on = 'Top District')

---

# Variable Section 1


In [None]:
#Project root folder. Set the WINE_PROJECT_DIR environment variable to run the notebook from
#another checkout; the original local folder is kept as the default.
file_path = os.environ.get('WINE_PROJECT_DIR', r'C:\Users\fwhal\Downloads\CME528\Project\Repo-2\BreakinBadCode')

#Folder that holds the processed dataframes saved as csv files, derived from the project root
final_df_file_path = os.path.join(file_path, 'Final_DataFrames')

In [None]:
#Load the scraped red wines; the relative path is built with os.path.join so the separator
#is correct on every operating system, not only on Windows
red_wines = SQL_Files_to_df(os.path.join('Wine_Raw_Data', 'red_wines_final.db'), file_path)

#Unique "Region, Country" locations with their bottle counts and geocoded coordinates
location_df = Wine_DataFrame(red_wines)

In [None]:
#Long processing time, have saved previous output to csv, load from this
FINAL_wine_df_FINAL = Regions_and_Districts(red_wines, location_df, os.path.join(r'C:\Users\fwhal\Downloads\CME528\Project', 'GeoJsonFiles'))
FINAL_wine_df_FINAL.to_csv(os.path.join(final_df_file_path, 'FINAL_wine_df_FINAL.csv'), index = True)

### Wine Data For a Single Bottle

In [43]:
#Reload the region/district tagged wine data from the saved csv
FINAL_wine_df_FINAL = pd.read_csv(
    os.path.join(final_df_file_path, 'FINAL_wine_df_FINAL.csv')
)

#One representative bottle for each of the top 6 districts of every top country, saved to csv
FINAL_wine_df_filtered_1Bottle_FINAL = Popular_Wine_Stats_1Bottle(
    wine_data_final = FINAL_wine_df_FINAL,
    number_of_districts = 6,
)
FINAL_wine_df_filtered_1Bottle_FINAL.to_csv(
    os.path.join(final_df_file_path, 'FINAL_wine_df_filtered_1Bottle_FINAL.csv'),
    index = True,
)

### Wine Data For All Bottles

In [None]:
# FINAL_wine_df_filtered_All_Bottles_FINAL = Popular_Wine_Stats_All_Bottles(FINAL_wine_df_FINAL, FINAL_wine_df_filtered_1Bottle_FINAL, number_of_districts = 5)
# FINAL_wine_df_filtered_All_Bottles_FINAL.to_csv(os.path.join(final_df_file_path, 'FINAL_wine_df_filtered_All_Bottles_FINAL.csv'), index = True)

---

Function Breakdown Section 2
---
Plotting_Unique_Locations
- plots every unique wine location on a world map, using the location dataframe returned by the Wine_DataFrame function

In [None]:
def Plotting_Unique_Locations(DataFrame):
    """
    Draws every wine location on a world map, with a circle that grows with the bottle count.

    DataFrame: a dataframe indexed by location name, with Location_Instances and
               Lat/Long columns; rows with a missing value are not plotted
    return: a folium map with one point marker and one blue circle per location
    """
    #Only rows with complete data can be placed on the map
    located = DataFrame.dropna()

    #Point geometry for each location, in WGS84 lat/long
    location_points = gpd.points_from_xy(x = located['Long'], y = located['Lat'])
    Global_Areas = gpd.GeoDataFrame(located, geometry = location_points, crs = 'EPSG:4326')

    #World view; the map centre is the Toronto coordinate used by the project
    world_map = folium.Map(location = [43.6426, -79.3871],
                           tiles = 'cartodbpositron',
                           zoom_start = 2)

    #Adds the point layer for all the locations
    GeoJson(Global_Areas).add_to(world_map)

    #Adds one circle per location: a base radius that grows with each instance recorded
    for location_name, location_row in Global_Areas.iterrows():

        circle = folium.Circle(
            location = [location_row['Lat'], location_row['Long']],
            radius = 6500 + location_row['Location_Instances'] * 50,
            color = 'blue',
            fill = True,
            fill_color = 'blue',
            fill_opacity = 0.3,
            popup = folium.Popup(f"Location: {location_name}", parse_html = True)
        )
        circle.add_to(world_map)

    return world_map

---
# Variable Section 2


In [None]:
#The returned folium map is the last expression of the cell, so the notebook displays it
Plotting_Unique_Locations(DataFrame = location_df)