In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
os.chdir(os.getcwd().rsplit('/',1)[0])

In [3]:
# nominate columns
col_tar = ['beer_style']
col_cat = ['brewery_name']
col_num = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']

In [4]:
filepath = 'data/processed/beer_review_cleaned.csv'

# load data from csv
df = pd.read_csv('data/raw/beer_reviews.csv')

# clean up dataset: drop unrelated columns and drop rows that contain NA 
df_cleaned = df.copy()
df_cleaned = df[col_cat + col_num + col_tar]
# df_cleaned = df[col_num + col_tar]

# store data
df_cleaned.dropna(inplace=True)
df_cleaned.to_csv('data/processed/beer_review_cleaned.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [5]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586599 entries, 0 to 1586613
Data columns (total 6 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   brewery_name       1586599 non-null  object 
 1   review_aroma       1586599 non-null  float64
 2   review_appearance  1586599 non-null  float64
 3   review_palate      1586599 non-null  float64
 4   review_taste       1586599 non-null  float64
 5   beer_style         1586599 non-null  object 
dtypes: float64(4), object(2)
memory usage: 84.7+ MB


In [6]:
from sklearn.model_selection import train_test_split

# remove brewery name as it's irrelevant
bco = df_cleaned.pop('brewery_name')

# extract target label column
target = df_cleaned.pop(col_tar[0])

In [7]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
enc = OrdinalEncoder()
enc_target = np.array(target)
enc_target = enc_target.reshape(-1,1)
enc_target = enc.fit_transform(enc_target)

In [8]:
# split data
X_data, X_test, y_data, y_test = train_test_split(df_cleaned, enc_target, stratify=enc_target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, stratify=y_data, test_size=0.15, random_state=42)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_tfm = Pipeline(steps=[('scaler', StandardScaler())])
cat_tfm = Pipeline(steps=[('one_hot_encoder', OneHotEncoder(sparse=False))])

In [10]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('col_num', num_tfm, col_num)
    ]
)

In [11]:
from src.models.pytorch import PytorchDataset

In [12]:
train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [13]:
train_dataset

<src.models.pytorch.PytorchDataset at 0x7f69950896a0>

## Build Neural Network

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [15]:
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

In [16]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=4, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)

In [17]:
device

device(type='cuda', index=0)

In [18]:
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=4, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)


In [19]:
criterion = nn.CrossEntropyLoss()

In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

In [29]:
def train_classification(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, generate_batch=None):
    """Train a Pytorch multi-class classification model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0
    train_acc = 0
    
    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:

        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = model(feature)
        
        # Calculate loss for given batch
#         loss = criterion(output, target_class.long())
        loss = criterion(output, torch.max(target_class, 1)[1])       

        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()

        # Update Weights
        optimizer.step()
        
        # Calculate global accuracy
        train_acc += (output.argmax(1) == target_class).sum().item()

    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)

In [30]:
def test_classification(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = model(feature)
            
            # Calculate loss for given batch
#             loss = criterion(output, target_class.long())
            loss = criterion(output, torch.max(target_class, 1)[1])

            # Calculate global loss
            test_loss += loss.item()
            
            # Calculate global accuracy
            test_acc += (output.argmax(1) == target_class).sum().item()

    return test_loss / len(test_data), test_acc / len(test_data)

In [33]:
N_EPOCHS = 8
BATCH_SIZE = 8

In [34]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0930	|	Acc: 3.9%
	(valid)	|	Loss: 0.0930	|	Acc: 3.9%
Epoch: 1
	(train)	|	Loss: 0.0930	|	Acc: 3.9%
	(valid)	|	Loss: 0.0930	|	Acc: 3.9%


KeyboardInterrupt: 