# Train a PyTorch Classifier
In the Udacity course we were introduced to a plagiarism dataset, which we will use here to train our model. Rather than going through the feature engineering steps again, we'll simply take the training and testing data from that notebook as our starting point.

# Constants

In [None]:
# Directory that receives the serialized torch model files
MODEL_DIR = "model"

## Load Data

In [None]:
import pickle

# Load the pre-engineered train/test data.
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
# The context manager guarantees the file handle is closed once loading is
# done (the bare open() call previously left it open).
with open('../udacity/plagiarism_data.p', 'rb') as data_file:
    plagiarism_data = pickle.load(data_file)

# show which entries the pickle contains
plagiarism_data.keys()

In [None]:
# Split the loaded mapping into its four arrays.
# NOTE(review): this relies on the values being stored in the order
# train_x, train_y, test_x, test_y -- confirm against the keys printed above.
[train_x, train_y, test_x, test_y] = plagiarism_data.values()

In [None]:
# sanity check: report the shape of every split, one line per array
print("\n".join(
    f"{label} has shape {split.shape}"
    for label, split in (
        ("train_x", train_x),
        ("train_y", train_y),
        ("test_x", test_x),
        ("test_y", test_y),
    )
))

# Visualise Training Data
Visualising our training data may give an indication of the relationship between the inputs and the targets, and of the level of non-linearity present in the data. This will inform our choice of algorithm.

In [None]:
import matplotlib.pyplot as plt

# Column 1 of train_x plotted against the class label. The x-axis is labelled
# 'lcs' so that it agrees with the title and with the two-way scatter plot
# further down, which labels this same column 'lcs' (it was mislabelled 'c20').
plt.figure(figsize=(8, 5))
plt.scatter(train_x[:, 1], train_y)
plt.title('Longest Common Subsequence vs Plagiarism')
plt.xlabel('lcs')
plt.ylabel('plagiarism')
plt.show()

Every sample with a longest common subsequence value above 0.38 appears to be plagiarised.

In [None]:
# Column 0 of train_x (labelled 'c20') plotted against the class label.
plt.figure(figsize=(8, 5))
plt.scatter(train_x[:, 0], train_y)
plt.title('Containment vs Plagiarism')  # spelling of the displayed title fixed
plt.xlabel('c20')
plt.ylabel('plagiarism')
plt.show()

Containment looks to be a much better predictor of plagiarism, with everything above 0 indicating plagiarism.

In [None]:
# Both features in a single plot; each point's colour encodes its class (train_y)
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
plt.scatter(x=train_x[:, 0], y=train_x[:, 1], c=train_y)
# title and axis labels are set in one call on the current axes
plt.gca().set(title='Two Way Scatter Plot', xlabel='c20', ylabel='lcs')
plt.show()

Using both these features in a model should give us a good decision boundary for classifying plagiarism

# Training the Model

We are going to train a neural network to classify the students' answers as plagiarised or non-plagiarised. As part of the Udacity course, boilerplate code was provided to train the network using Amazon SageMaker.

In our situation we are not looking to deploy the model as an API, so using SageMaker is overkill and too time-consuming. Instead we'll write a basic PyTorch classifier ourselves, and score the medium data using the local machine.

In [None]:
import torch.utils.data

# Convert the training arrays to float32 tensors.
# torch.as_tensor accepts tensors as well as numpy arrays, so this cell can be
# re-run after train_x / train_y have already been rebound to tensors;
# torch.from_numpy raised a TypeError in that situation.
train_x = torch.as_tensor(train_x, dtype=torch.float32)
train_y = torch.as_tensor(train_y, dtype=torch.float32).squeeze()

# pair every feature row with its label
train_ds = torch.utils.data.TensorDataset(train_x, train_y)

# serve the training set in mini-batches of 10
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=10)

In [None]:
def train(model, train_loader, epochs, optimizer, criterion, device):
    """
    Run the optimisation loop and print the mean batch loss after every epoch.

    :param model: the network to optimise; it is NOT moved to `device` here,
        so the caller must place it on the same device beforehand
    :param train_loader: iterable of (features, target) batches; must support len()
    :param epochs: number of passes to make over train_loader
    :param optimizer: optimizer, stepped once per batch
    :param criterion: loss callable, invoked as criterion(output, target)
    :param device: device every batch is moved to before the forward pass
    """
    for epoch in range(1, epochs + 1):
        # put the model in training mode at the start of each epoch
        model.train()

        batch_losses = []
        for features, labels in train_loader:
            # reshape the labels into a single column, then move the batch
            labels = labels.contiguous().view(-1, 1).to(device)
            features = features.to(device)

            # clear gradients left over from the previous batch
            optimizer.zero_grad()
            batch_loss = criterion(model(features), labels)
            batch_loss.backward()
            optimizer.step()

            batch_losses.append(batch_loss.item())

        # mean of the per-batch losses for this epoch
        train_loss = sum(batch_losses) / len(train_loader)
        print(f"Epoch: {epoch}, train_loss: {train_loss}")

In [None]:
# torch imports
import torch.nn.functional as F
import torch.nn as nn

class BinaryClassifier(nn.Module):
    """
    Feed-forward binary classifier with a single hidden layer.

    Architecture: Linear -> ReLU -> Dropout(0.3) -> Linear -> Sigmoid.
    The final sigmoid produces one value per output unit that can be rounded
    to a label, 0 or 1.

    Notes on training:
    Pair this model with BCELoss (binary cross entropy loss), documentation:
    https://pytorch.org/docs/stable/nn.html#torch.nn.BCELoss
    """

    def __init__(self, input_features, hidden_dim, output_dim):
        """
        Build the layers of the network.
        :param input_features: the number of input features in the training/test data
        :param hidden_dim: the number of nodes in the hidden layer
        :param output_dim: the number of outputs the network produces
        """
        super().__init__()

        # The attribute names below become the keys of the model's state_dict,
        # so they must stay stable for saved parameters to load again.
        self.fc1 = nn.Linear(in_features=input_features, out_features=hidden_dim)
        self.dropout = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

        # squashes the final layer's output with a sigmoid
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """
        Run a forward pass on a batch of input features.
        :param x: A batch of input features of size (batch_size, input_features)
        :return: The sigmoid-activated output of the final linear layer
        """
        # hidden layer: linear -> ReLU -> dropout (dropout is a no-op in eval mode)
        hidden = self.dropout(F.relu(self.fc1(x)))

        # output layer: linear -> sigmoid
        return self.sig(self.fc2(hidden))

In [None]:
import torch.optim as optim

# network size and optimisation hyper-parameters
input_features = 2
hidden_dim = 7
output_dim = 1
learning_rate = 0.001
epochs = 300

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model onto the training device BEFORE creating the optimizer.
# train() sends every batch to `device`, so a model left on the CPU fails with
# a device-mismatch error whenever CUDA is available.
model = BinaryClassifier(input_features, hidden_dim, output_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCELoss()

train(model, train_dl, epochs, optimizer, loss_fn, device)

# Predict Test Data

In [None]:
# Provided predict function
def predict_fn(input_data, model):
    """
    Return the model's output for a numpy array of input features.

    :param input_data: numpy array of features; cast to float32 before inference
    :param model: torch module; as a side effect it is moved to the inference
        device (CUDA when available, otherwise CPU) and put into eval mode
    :return: numpy array holding the model output for the input rows
    """
    print('Predicting class probabilities for the input data...')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The model and the data must live on the same device; moving only the
    # data fails with a device mismatch when CUDA is available and the model
    # is still on the CPU.
    model.to(device)
    data = torch.from_numpy(input_data.astype('float32')).to(device)

    # Put the model into evaluation mode
    model.eval()

    # Inference needs no gradients, so skip building the autograd graph
    with torch.no_grad():
        probabilities = model(data).cpu().numpy()

    return probabilities

In [None]:
# run the trained network on the held-out test features
probabilities = predict_fn(input_data=test_x, model=model)

In [None]:
# round every model output to the nearest whole number to get class labels
labels = probabilities.round(decimals=0)

In [None]:
# display the shape of the rounded predictions
labels.shape

## Evaluate Model

In [None]:
# returns a variety of model metrics
import pandas as pd
import numpy as np

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def evaluate(test_preds, test_labels, verbose=True):
    """
    Compare predicted class labels with the true labels.
    Return binary classification metrics.
    :param test_preds: Predicted class labels (0 or 1); any array-like, flattened to 1-D
    :param test_labels: Class labels for test data (0 or 1); any array-like, flattened to 1-D
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics with the keys 'TP', 'FP', 'FN',
        'TN', 'Precision', 'Recall' and 'Accuracy'. A metric whose denominator
        is zero (e.g. precision when nothing was predicted positive) is NaN.
    :raises ValueError: If the number of predictions and labels differ
    """
    # Flatten BOTH inputs. Squeezing only the predictions let column-vector
    # labels of shape (n, 1) broadcast against (n,) predictions into an (n, n)
    # matrix, which silently produced wrong counts.
    preds = np.asarray(test_preds).reshape(-1)
    actuals = np.asarray(test_labels).reshape(-1)
    if preds.shape != actuals.shape:
        raise ValueError(
            "test_preds and test_labels must contain the same number of elements, "
            "got {} and {}".format(preds.size, actuals.size))

    # any non-zero value counts as the positive class
    predicted_pos = preds.astype(bool)
    actual_pos = actuals.astype(bool)

    # calculate true positives, false positives, true negatives, false negatives
    tp = np.sum(actual_pos & predicted_pos)
    fp = np.sum(~actual_pos & predicted_pos)
    tn = np.sum(~actual_pos & ~predicted_pos)
    fn = np.sum(actual_pos & ~predicted_pos)

    # calculate binary classification metrics (NaN when undefined)
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)

    # print metrics
    if verbose:
        print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['predictions']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()

    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}

In [None]:
# print the confusion table and metrics for the test set, keeping the dict
metrics = evaluate(test_preds=labels, test_labels=test_y, verbose=True)

# Save Model

In [None]:
# Keep the keys of the model_info dictionary as they are
import os

# open(..., 'wb') raises FileNotFoundError when the target directory is
# missing, so create it first (a no-op if it already exists).
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the constructor arguments needed to rebuild the network
model_info_path = os.path.join(MODEL_DIR, 'model_info.pth')
with open(model_info_path, 'wb') as f:
    model_info = {
        'input_features': input_features,
        'hidden_dim': hidden_dim,
        'output_dim': output_dim,
    }
    torch.save(model_info, f)

# Save the model parameters; model.cpu() moves the model to the CPU (in place)
# before its state dict is serialised
model_path = os.path.join(MODEL_DIR, 'model.pth')
with open(model_path, 'wb') as f:
    torch.save(model.cpu().state_dict(), f)