# Beer Type Prediction

## Initialise Data and Environment

In [1]:
import numpy as np
import pandas as pd
import os

# set parent folder for this notebook folder as working directory
os.chdir(os.getcwd().rsplit('/',1)[0])

#### Load and transform data: 
brewery_name is dropped for now, because: 

- there are over 5,000 brewery names
    - hypothesis is that brewery_name is not going to help with model performance (this hypothesis is not tested with experiment in this report, but is suggested for future exercise)


- brewery_name may also cause typo problem in API, this will also increase complexity with API design

only 4 numeric features remains for modelling:

    review_aroma, review_appearance, review_palate, review_taste

In [2]:
# set loadstatus to false by default
loadstatus = False
traindata_path = 'data/processed/'

# check if training data is already stored, 
# if exist: load data and and set loadstatus to true

if os.path.exists(traindata_path):
    X_train = np.load(f'{traindata_path}X_train.npy')
    X_val = np.load(f'{traindata_path}X_val.npy')
    X_test = np.load(f'{traindata_path}X_test.npy')
    y_train = np.load(f'{traindata_path}y_train.npy')
    y_val = np.load(f'{traindata_path}y_val.npy')
    y_test = np.load(f'{traindata_path}y_test.npy')
    
    loadstatus = True

In [3]:
from src.data.load_dataset import load_data

if loadstatus == False:
    raw_filepath = 'data/processed/beer_review_cleaned.csv'
    cleaned_filepath = 'data/processed/beer_review_cleaned.csv'

    # load and cleanse data
    df_cleaned=load_data(raw_filepath, cleaned_filepath)
    
    # extract target label column
    target = df_cleaned.pop('beer_style')

In [4]:
# scale data with standard scaler, save the fitted scaler to folder
from src.data.load_dataset import scale_features
from sklearn.preprocessing import StandardScaler

if loadstatus == False:
    sc = StandardScaler()
    df_scaled = scale_features(df_cleaned, sc)

In [5]:
# encode target with label encoder
from src.data.load_dataset import encode_label
from sklearn.preprocessing import LabelEncoder

if loadstatus == False:
    enc = LabelEncoder()
    enc_target = encode_label(target, enc)

In [6]:
from src.models.train_model import split_data

if loadstatus == False:
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_scaled, enc_target)

### Train Neural Network with Hyper Parameter Tuning

Convert datasets into tensors

In [7]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [9]:
from src.models.pytorch import PytorchMC3layers

model = PytorchMC3layers(X_train.shape[1], layer2_neurons=512)

print(model)

PytorchMC3layers(
  (layer_1): Linear(in_features=4, out_features=512, bias=True)
  (layer_out): Linear(in_features=512, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)


Detect GPU

In [19]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMC3layers(
  (layer_1): Linear(in_features=4, out_features=512, bias=True)
  (layer_out): Linear(in_features=512, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

define loss function and optimiser

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

define number of epochs and batch size

In [17]:
N_EPOCHS = 3
BATCH_SIZE = 16

In [18]:
from src.models.pytorch import train_classification, test_classification

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')
    
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)

Epoch: 0
	(train)	|	Loss: 0.2858	|	Acc: 8.9%
	(valid)	|	Loss: 0.2857	|	Acc: 8.9%
Epoch: 1
	(train)	|	Loss: 0.2857	|	Acc: 8.9%
	(valid)	|	Loss: 0.2857	|	Acc: 8.9%
Epoch: 2
	(train)	|	Loss: 0.2857	|	Acc: 8.9%
	(valid)	|	Loss: 0.2858	|	Acc: 8.9%
