In [1]:
import requests
import geopandas as gpd
import pandas as pd

In [2]:
from bs4 import BeautifulSoup 
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Open GIS Data

In [110]:
# Read the state shapefile into a GeoDataFrame. The path is relative to the
# directory the notebook is run from. Later cells rely on this frame's
# 'STUSPS', 'NAME' and 'geometry' columns.
file_path = 'data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp'
states_df = gpd.read_file(file_path)

## Get Population Data

In [111]:
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame};
# the sheet that is actually needed is picked out by name in the next cell.
population_workbook_path = 'data/NST-EST2023-POP.xlsx'
state_populations = pd.read_excel(
    population_workbook_path,
    sheet_name=None,
    engine='openpyxl',
)

In [112]:
# Labels of the two columns we need, exactly as they appear in the loaded sheet.
# The first one holds the area names; per its own text, leading dots mark sub-parts.
# NOTE(review): 'Unnamed: 5' is presumably the most recent population estimate - confirm against the workbook.
census_name_column = 'table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)'
census_population_column = 'Unnamed: 5'

# Keep only the name and population columns and give them merge-friendly names.
state_populations_df = (
    state_populations['NST-EST2023-POP'][[census_name_column, census_population_column]]
    .rename(columns={census_name_column: "NAME", census_population_column: 'POPULATION'})
)

# Strip the leading dot(s) so names line up with the shapefile's NAME column.
# lstrip('.') only touches values that really start with a dot; blindly slicing
# off the first character would mangle any label that has no dot prefix
# (e.g. "United States" -> "nited States") and silently break its merge.
state_populations_df['NAME'] = state_populations_df['NAME'].str.lstrip('.')

In [114]:
# Attach the population figures to the state geometries. A left join keeps every
# row of states_df, leaving POPULATION empty where no name matches. Only the
# columns used downstream are retained.
output_columns = ['STUSPS', 'NAME', 'POPULATION', 'geometry']
states_with_population_df = states_df.merge(
    state_populations_df,
    on='NAME',
    how='left',
)[output_columns]

## Get Winery Data

In [48]:
# Send a browser-like User-Agent with the request.
# NOTE(review): presumably the site rejects the default python-requests agent - unverified.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# requests has no default timeout, so without one a stalled connection would hang
# the notebook indefinitely.
r = requests.get(
    'https://www.winerelease.com/Winery_List/Alphabetical_Winery_List.html',
    headers=headers,
    timeout=30,
)
# Fail here, with a clear HTTP error, instead of feeding an error page to the
# text-splitting cells below where it would surface as an obscure IndexError.
r.raise_for_status()

In [49]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [50]:
# Take the first <body> element of the page; its text is sliced up in the next cell.
body_tags = soup.find_all('body')
body_with_list = body_tags[0]

In [81]:
# Cut the page text down to the span between the first "Thanks" and the next
# "FAQs", drop newlines, and split what is left into ';'-separated pieces.
page_text = body_with_list.text
text_after_thanks = page_text.split("Thanks")[1]
winery_section = text_after_thanks.split("FAQs")[0]
list_of_wineries = winery_section.replace("\n", "").split(";")

In [90]:
# Discard the first two ';'-separated pieces of the scraped text.
# NOTE(review): presumably these are page preamble rather than winery entries - confirm against the page.
# NOTE(review): this rebinds list_of_wineries to a slice of itself, so re-running
# this cell on its own removes two more entries every time.
list_of_wineries = list_of_wineries[2:]

In [101]:
# Names searched for (as plain substrings) in each scraped winery entry.
# "District of Columbia" must be a single element: split in two, entries for DC
# end up counted under the fragment "of Columbia", which matches no NAME value.
# Order matters to the counting loop, which keeps the LAST name that matches:
# "Virginia" has to stay ahead of "West Virginia" so the longer name wins.
# NOTE(review): "Virgin Islands" may not equal the shapefile's NAME value for
# that territory - confirm before relying on its row in the merged output.
state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
               "California", "Colorado", "Connecticut", "District of Columbia",
               "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
               "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts",
               "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
               "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
               "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma",
               "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina",
               "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands",
               "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

In [105]:
# Tally how many scraped entries mention each state.
state_counts = {}
for winery_entry in tqdm(list_of_wineries, desc='Parsing Wineries'):

    # Start every entry with no match. Without this reset, an entry that names
    # no known state would be counted under whatever state the previous entry
    # matched (or raise NameError if it happened to be the very first entry).
    state_name = None

    # Plain substring search; when several names match, the one latest in
    # state_names wins (e.g. "West Virginia" over "Virginia").
    for state in state_names:
        if state in winery_entry:
            state_name = state

    # Entries with no recognisable state are left out of the counts.
    if state_name is None:
        continue

    state_counts[state_name] = state_counts.get(state_name, 0) + 1

Parsing Distilleries:   0%|          | 0/2978 [00:00<?, ?it/s]

In [121]:
# Turn the {state name: count} dict into a two-column frame: the dict keys become
# the 'NAME' column and the counts become the 'wineries' column.
wineries_per_state_series = pd.Series(state_counts, name="wineries")
wineries_per_state_df = (
    wineries_per_state_series
    .reset_index()
    .rename(columns={'index': 'NAME'})
)

## Merge Data

In [122]:
# Join the winery counts onto the state geometry/population frame. The right join
# keeps one row per counted name; names that match no state get empty
# geometry/POPULATION, and states with no counted winery are not included.
#
# Only 'NAME' and 'wineries' are taken from the right-hand frame. This cell
# overwrites wineries_per_state_df with the merged result, so if it is run a
# second time the right-hand input is already merged; restricting it to these
# two columns yields the same output again instead of duplicated *_x / *_y
# columns that would break the per-capita cell below.
wineries_per_state_df = states_with_population_df.merge(
    wineries_per_state_df[['NAME', 'wineries']],
    on='NAME',
    how='right',
)

In [128]:
# Wineries per N residents, for three values of N. Each new column is
# wineries / (POPULATION / N); rows without a population come out as NaN.
per_capita_scales = {
    'per_100k': 100000,
    'per_500k': 500_000,
    'per_1m': 1_000_000,
}
for rate_column, residents_per_unit in per_capita_scales.items():
    wineries_per_state_df[rate_column] = (
        wineries_per_state_df['wineries']
        / (wineries_per_state_df['POPULATION'] / residents_per_unit)
    )

In [129]:
# Write the merged frame to a GeoPackage. The driver is named explicitly rather
# than inferred from the file extension, so the output format does not depend
# on the installed GeoPandas version.
wineries_per_state_df.to_file('data/wineries.gpkg', driver='GPKG')