In [127]:
# specify the path and then load packages
import sys
sys.path.insert(0, '/Users/timtamothy/Documents/GitHub/adsi_at2/')

import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from src.data.sets import save_sets

In [128]:
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [129]:
# load data
df = pd.read_csv('../data/raw/beer_reviews.csv')

In [130]:
# reduce the dataset to the features that will be used in the API
df_cleaned = df.drop(['brewery_id', 'review_time', 'review_overall', 'review_profilename', 'beer_name', 'beer_abv', 'beer_beerid'], axis = 1)

In [33]:
# preliminary eda on data
ProfileReport(df, title="EDA of raw data")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



EDA notes:

1. brewery_name is missing 15 values

2. review_aroma, appearance, palate, taste could be considered ordinal values

3. 104 different distinct beer styles... quite many. Unsure whether or not the few features are enough to classify with this granularity



In [131]:
df_cleaned.shape

(1586614, 6)

In [132]:
# remove the beers with missing brewery names
df_cleaned.replace(r'^\s*$', np.nan, regex=True)
df_cleaned.dropna(inplace = True)

In [133]:
df_cleaned.shape

(1586599, 6)

In [134]:
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5


In [135]:
df_cleaned['beer_style'] = df_cleaned['beer_style'].str.strip()

In [136]:
df_cleaned.to_csv('../data/interim/clean_beers.csv', index=False)

In [137]:
df_small = df_cleaned.sample(frac=0.1)

In [138]:
# define the categorical columns, which is all the features
cat_cols = ['brewery_name', 'beer_style']

In [139]:
unique_breweries = df_cleaned.brewery_name.unique()
unique_styles = df_cleaned.beer_style.unique()

In [140]:
unique_breweries

array(['Vecchio Birraio', 'Caldera Brewing Company',
       'Amstel Brouwerij B. V.', ..., 'Wissey Valley Brewery',
       'Outback Brewery Pty Ltd', 'Georg Meinel Bierbrauerei KG'],
      dtype=object)

In [141]:
cat_dict = {
    'brewery_name': [unique_breweries],
    'beer_style': [unique_styles],
}

In [142]:
from sklearn.preprocessing import OrdinalEncoder

for col, cats in cat_dict.items():
    col_encoder = OrdinalEncoder(categories=cats)
    df_cleaned[col] = col_encoder.fit_transform(df_cleaned[[col]])

In [143]:
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
0,0.0,2.0,2.5,0.0,1.5,1.5
1,0.0,2.5,3.0,1.0,3.0,3.0
2,0.0,2.5,3.0,2.0,3.0,3.0
3,0.0,3.0,3.5,3.0,2.5,3.0
4,1.0,4.5,4.0,4.0,4.0,4.5


In [144]:
# Save the ordinal encoded file
df_cleaned.to_csv('../data/interim/ordinal.csv', index=False)

In [145]:
# Convert y column into integer
df_cleaned['beer_style'] = df_cleaned['beer_style'].astype(int)

In [146]:
# copy df_cleaned
df_cleaned2 = df_cleaned

# Split the data into X and y
y = df_cleaned2.pop('beer_style')
X = df_cleaned2

### Use Undersample to reduce the majority classes

In [147]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X, y)

In [148]:
len(X_over)

12228736

In [152]:
from sklearn.model_selection import train_test_split
X, X_bye, y, y_bye = train_test_split(X_over, y_over, test_size=0.99, random_state=42, stratify=y_over)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [153]:
# Save sets
!mkdir ../data/processed/ordinal_full
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, path='../data/processed/ordinal_full/')

mkdir: ../data/processed/ordinal_full: File exists


In [154]:
len(X_train)

97829

In [62]:
import torch
from torch.utils.data import Dataset, DataLoader
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)