In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
os.chdir(os.getcwd().rsplit('/',1)[0])

In [3]:
# nominate columns
col_tar = ['beer_style']
col_cat = ['brewery_name']
col_num = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']

In [6]:
filepath = 'data/processed/beer_review_cleaned.csv'

# load data from csv
df = pd.read_csv('data/raw/beer_reviews.csv')

# clean up dataset: drop unrelated columns and drop rows that contain NA 
df_cleaned = df.copy()
# df_cleaned = df[col_cat + col_num + col_tar]
df_cleaned = df[col_num + col_tar]

# store data
df_cleaned.dropna(inplace=True)
df_cleaned.to_csv('data/processed/beer_review_cleaned.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [8]:
from sklearn.model_selection import train_test_split

# extract target label column
target = df_cleaned.pop(col_tar[0])

In [9]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
enc = LabelEncoder()
enc_target = np.array(target)
enc_target = enc.fit_transform(target)

In [8]:
enc.transform(['American IPA'])

array([12])

In [9]:
enc.inverse_transform([12])

array(['American IPA'], dtype=object)

In [10]:
import joblib

joblib.dump(enc, 'models/output_encoder.joblib')

['models/output_encoder.joblib']

In [11]:
sc = StandardScaler()
df_scaled = sc.fit_transform(df_cleaned)
df_scaled.shape

(1586614, 4)

In [14]:
joblib.dump(sc, 'models/standard_scaler.joblib')

['models/standard_scaler.joblib']

In [12]:
# split data
X_data, X_test, y_data, y_test = train_test_split(df_scaled, enc_target, stratify=enc_target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, stratify=y_data, test_size=0.15, random_state=42)

In [13]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

## Build Neural Network

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [15]:
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

In [16]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=4, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [17]:
len(np.unique(y_train))

104

In [18]:
device

device(type='cpu')

In [19]:
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=4, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)


In [20]:
criterion = nn.CrossEntropyLoss()

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [23]:
N_EPOCHS = 5
BATCH_SIZE = 16

In [None]:
from src.models.pytorch import train_classification, test_classification
import time

# store train and validation scores for chartting
train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

# train model
for epoch in range(N_EPOCHS):
    # time trianing
    tic = time.perf_counter()

    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)

    # time training in seconds
    toc = time.perf_counter()
    print(f"Epoch took {toc - tic:0.4f} seconds")

epoch_count = range(1, len(train_accs)+1)

plt.plot(epoch_count, valid_losses, 'r--')
plt.plot(epoch_count, valid_accs, 'b-')
plt.legend(['Validation Loss', 'Validation Accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Epoch Scores')
plt.show()

Epoch: 0
	(train)	|	Loss: 0.2867	|	Acc: 7.4%
	(valid)	|	Loss: 0.2867	|	Acc: 7.4%
Epoch took 221.8989 seconds
Epoch: 1
	(train)	|	Loss: 0.2867	|	Acc: 7.4%
	(valid)	|	Loss: 0.2867	|	Acc: 7.4%
Epoch took 277.0414 seconds
Epoch: 2
	(train)	|	Loss: 0.2867	|	Acc: 7.4%
	(valid)	|	Loss: 0.2867	|	Acc: 7.4%
Epoch took 264.8372 seconds


In [27]:
y_pred_list = []

with torch.no_grad():
    model.eval()
    data = DataLoader(test_dataset, batch_size=1, collate_fn=None)
    for X_batch, _ in data:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim=-1)
        y_pred_list.append(y_pred_tags.cpu().numpy())

In [28]:
y_pred_list[:20]

[array([9]),
 array([1]),
 array([14]),
 array([12]),
 array([12]),
 array([12]),
 array([12]),
 array([12]),
 array([12]),
 array([9]),
 array([9]),
 array([12]),
 array([12]),
 array([14]),
 array([12]),
 array([12]),
 array([12]),
 array([12]),
 array([60]),
 array([9])]

In [40]:
torch.save(model.state_dict(), "models/pt_base_state_dict")

In [69]:
arch = model._modules
arch = {i: arch[i] for i in model._modules}
arch

{'layer_1': Linear(in_features=4, out_features=128, bias=True),
 'layer_out': Linear(in_features=128, out_features=104, bias=True),
 'softmax': Softmax(dim=1)}

In [47]:
[[print(k) for k,v in model._modules]]

TypeError: 'PytorchMultiClass' object is not iterable

In [72]:
from torchsummary import summary

In [81]:
model._modules

OrderedDict([('layer_1', Linear(in_features=4, out_features=128, bias=True)),
             ('layer_out',
              Linear(in_features=128, out_features=104, bias=True)),
             ('softmax', Softmax(dim=1))])

In [2]:
[print(i) for i in model._modules]

NameError: name 'model' is not defined