# 1. Set up Environment

In [1]:
%pwd

'/home/jovyan/work/notebooks'

In [2]:
%cd '/home/jovyan/work'

/home/jovyan/work


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
import pandas as pd
import numpy as np

In [5]:
pd.options.display.max_rows = 10000

# 2. Load and Explore Data

In [6]:
df = pd.read_csv('data_files/raw/beer_reviews.csv')

In [7]:
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [8]:
df.shape

(1586614, 13)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586599 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586266 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1518829 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [10]:
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


# 3. Prepare Data

In [11]:
df_cleaned = df.copy()

### Drop unused variables

In [12]:
df_cleaned = df_cleaned.drop(['brewery_id', 'review_time','review_profilename','beer_beerid','beer_name','beer_abv'], axis=1)

### Create Categorical Variable Dictionary

In [13]:
arr_brewery_name = df_cleaned.brewery_name.unique()
arr_beer_style = df_cleaned.beer_style.unique()

In [14]:
lst_brewery_name = list(arr_brewery_name)
lst_beer_style = list(arr_beer_style)

In [15]:
cats_dict = {
    'brewery_name': [lst_brewery_name],
    'beer_style': [lst_beer_style]
}

### Quantify NULL Values

In [16]:
df_cleaned.isnull().sum()

brewery_name         15
review_overall        0
review_aroma          0
review_appearance     0
beer_style            0
review_palate         0
review_taste          0
dtype: int64

In [17]:
df_cleaned.dropna(how='any', inplace=True)

### Transform Categorical column values with encoder

In [18]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [19]:
for col, cats in cats_dict.items():
    col_encoder = OrdinalEncoder(categories=cats)
    df_cleaned[col] = col_encoder.fit_transform(df_cleaned[[col]])

In [20]:
num_cols = ['brewery_name','review_overall', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste']

In [21]:
target_col = 'beer_style'

In [22]:
sc = StandardScaler()

In [23]:
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [24]:
df_cleaned['beer_style'] = df_cleaned['beer_style'].astype(int)

In [25]:
X = df_cleaned

In [26]:
X.describe()

Unnamed: 0,brewery_name,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste
count,1586599.0,1586599.0,1586599.0,1586599.0,1586599.0,1586599.0,1586599.0
mean,4.562954e-16,-1.835786e-16,-3.4680740000000006e-17,1.038989e-16,33.18427,1.577114e-16,-1.372899e-16
std,1.0,1.0,1.0,1.0,25.52354,1.0,1.0
min,-1.658902,-5.294872,-3.921421,-6.235533,0.0,-4.021772,-3.815568
25%,-0.8812362,-0.4379353,-0.3377775,-0.5545413,13.0,-0.3572271,-0.4001069
50%,-0.04116536,0.2559127,0.3789512,0.2570289,25.0,0.375682,0.2829853
75%,0.840909,0.9497608,0.3789512,0.2570289,53.0,0.375682,0.9660776
max,1.786589,1.643609,1.812409,1.880169,103.0,1.8415,1.64917


### Split Data

In [27]:
from sets import split_sets_by_time, save_sets, split_sets_random

In [28]:
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(X, target_col=target_col, test_ratio=0.2, to_numpy=True)

In [29]:
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='data_files/processed/beer')

# 4. Model

In [30]:
from pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [31]:
from null import NullModel

In [32]:
baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

In [33]:
from performance import print_class_perf

In [34]:
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.07430256975352931
F1 Training: 0.01027805764859095


# 5. Define Architecture

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [36]:
from pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

In [37]:
from pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

# 6. Train Model

In [38]:
criterion = nn.CrossEntropyLoss()

In [39]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [40]:
from train_classification_model import train_classification
from test_classification_model import test_classification

In [43]:
N_EPOCHS = 10
BATCH_SIZE = 32

In [44]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 1
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 2
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 3
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 4
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 5
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 6
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 7
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 8
	(train)	|	Loss: 0.1433	|	Acc: 7.4%
Epoch: 9
	(train)	|	Loss: 0.1433	|	Acc: 7.4%


In [45]:
torch.save(model, "models/pytorch_multi_beer_evaluation.pt")

In [46]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.1f}')

	Loss: 0.1434	|	Accuracy: 0.1
