In [None]:
import scipy.io 
import numpy as np 
import pandas as pd
import scipy.sparse
import os
import json
import random
import time
import yaml
from matplotlib import pyplot as plt
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 14, 8
from sklearn.model_selection import train_test_split
from sklearn.metrics import PrecisionRecallDisplay

import utils
import multi_layer_perceptron as mlp
import perceptron

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Seed Python's built-in `random` module with a fixed, hand-picked value.
SEED = 8 # after some testing this seed seems to work very well
random.seed(SEED)

## Data

### load the data and bring it into the desired format

In [None]:
# Parse the MATLAB file into a dict and display it as the cell output.
data = scipy.io.loadmat(file_name='emails.mat')
data

The X data is given as a 57173x10000 sparse matrix. For this project I want it to be row by row instead of column by column, and to transpose the matrix it has to be decompressed. Decompression can be done with the object's method todense(), whose result is then converted into a standard numpy array. This array can then be transposed into the wanted format.\
The Y data is not compressed and can be read directly. It is however stored as a two-dimensional array (1x10000) and for this project I want it as a one-dimensional vector (10000), which can be achieved by reshaping it into just one dimension (-1 means the size of the dimension should be inferred).

In [None]:
# Features: cast the sparse matrix to int16, densify it with todense(), then swap the two axes.
X = np.asarray(np.transpose(data['X'].astype('int16').todense(), (1, 0)))
# Labels: not compressed, so they are read directly and flattened (-1 lets NumPy infer the length).
Y = np.asarray(np.reshape(data['Y'], -1))

In [None]:
# add a one to all instances for the offset
# Append a constant column of ones to every instance for the offset term.
# NOTE: re-running this cell appends another column each time.
offset_column = np.ones((X.shape[0], 1), dtype='int16')
X = np.concatenate((X, offset_column), axis=1)

In [None]:
# Show the shapes of the feature matrix and the label vector, one per line.
print(X.shape, Y.shape, sep='\n')

### split data into train/test sets

I chose a train/test split of 70/30 and did not create a validation split since there are no hyperparameters to be tuned with this model.

In [None]:
# Shuffled 70/30 train/test split; SEED fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    shuffle=True,
    random_state=SEED,
)

## Perceptron from scratch

Implementation of the Perceptron Model and algorithm in NumPy. For documentation look at perceptron.py

A pretrained version of this model is stored in models/perceptron_weights.npy. To look at the results, skip to the 'pre-trained' subsection after initializing the model.

In [None]:
# The perceptron is configured through a dict; 'weigth_size' is set to the number of columns of X_train.
config = dict(weigth_size=X_train.shape[1])
# Build the model from that configuration.
model = perceptron.Perceptron(config)

### training

In [None]:
# Train the perceptron on the training split; the test split is passed along as well.
# NOTE(review): the last argument (50) is presumably the number of epochs — confirm in perceptron.py.
perceptron.train_perceptron(model, (X_train, Y_train), (X_test, Y_test), 50)

### saving

In [None]:
# Uncomment to write the current weights to the file that the 'pre-trained' cell loads.
# with open(os.path.join('models', 'perceptron_weigths.npy'), 'wb') as f:
#     np.save(f, model.weights)

### pre-trained

In [None]:
# Replace the model's weights with the stored ones, then run the perceptron test routine on both splits.
weights_path = os.path.join('models', 'perceptron_weigths.npy')
with open(weights_path, 'rb') as weights_file:
    model.weights = np.load(weights_file)
perceptron.test_perceptron(model, (X_train, Y_train), (X_test, Y_test))

### precision-recall curve

In [None]:
# Run the model on every instance of the full data set, one row at a time.
scores = np.empty(len(Y))
for idx, sample in enumerate(X):
    scores[idx] = model.forward(sample)
# Keep only the sign of each output as the prediction.
preds = np.sign(scores)

In [None]:
# Plot precision against recall for the perceptron's predictions on the full data set.
# from_predictions expects the ground-truth labels first and the predictions second;
# passing them the other way round treats the model output as the truth.
display = PrecisionRecallDisplay.from_predictions(
    Y, preds, name="Perceptron"
)
_ = display.ax_.set_title("Perceptron prediction curve")

## Multi-layer Perceptron as a more complex approach

The model can be trained in this notebook, though I recommend running it directly from the command line (`$ python multi_layer_perceptron.py`), especially when running this on a GPU, since Jupyter notebooks will store values in your GPU's RAM even after the computation.

A pre-trained model is again provided. To run it you can skip to the 'pre-trained' subsection after initialization.

Additional documentation is provided in multi_layer_perceptron.py

### initialization

In [None]:
# Read the experiment configuration from the YAML file.
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Seed PyTorch for reproducibility; the CUDA generator is seeded separately for GPU runs.
manual_seed = config['manual_seed']
torch.manual_seed(manual_seed)
torch.cuda.manual_seed(manual_seed)

# Reload the data as float32: densify and transpose the features, flatten the labels.
data = scipy.io.loadmat('emails.mat')
X_torch = np.asarray(data['X'].astype('float32').todense().transpose(1, 0))
Y = np.asarray(data['Y'].astype('float32').reshape(-1))
# Labels are given as [-1, 1] but binary cross-entropy needs [0, 1].
Y_torch = (Y + 1) / 2
# The input size is taken from the number of feature columns.
config['in_size'] = X_torch.shape[1]

# Shuffled split with the configured train fraction, seeded with the same manual seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_torch,
    Y_torch,
    train_size=config['train_size'],
    shuffle=True,
    random_state=manual_seed,
)

# Wrap both splits in datasets and batch them with the configured batch size.
batch_size = config['batch_size']
trainset = utils.Dataset(X_train, Y_train)
train_loader = DataLoader(trainset, batch_size=batch_size)
testset = utils.Dataset(X_test, Y_test)
test_loader = DataLoader(testset, batch_size=batch_size)

# Build the MLP from the configuration and move it to the selected device
# (the move only has an effect in GPU environments).
model = mlp.MLP(config).to(device)

### training

In [None]:
# training the model
model.train()
optimizer = optim.Adam(params = model.parameters(), lr=config['lr']) # Adam seems to be a good multi-purpose Optimizer
loss_fn = nn.BCELoss() # binary Cross Entropy Loss
losses, train_accs, test_accs = mlp.train(model, optimizer, loss_fn, train_loader, config, test_loader) # trains the model and returns results

# logging the training results
log_data = {'losses': losses, 'train_accs': train_accs, 'test_accs': test_accs}
with open(os.path.join('results', f'{config["manual_seed"]}.json'), 'w') as f:
    json.dump(log_data, f) # log training results
print()

# running tests and printing results
model.eval()
utils.run_analysis(model, train_loader, test_loader, device)

In [None]:
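### saving ###
# Uncomment to write the current model parameters to models/mlp.pth, the file the 'pre-trained' cell loads.
# torch.save(model.state_dict(), os.path.join('models', 'mlp.pth'))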

### pre-trained

In [None]:
# Load the stored parameters into the model.
# map_location remaps the checkpoint's tensors onto the device selected for this session,
# so a checkpoint saved from a GPU also loads on a CPU-only machine.
state_dict = torch.load(os.path.join('models', 'mlp.pth'), map_location=device)
model.load_state_dict(state_dict)
# Switch to evaluation mode and run the analysis on both loaders.
model.eval()
utils.run_analysis(model, train_loader, test_loader, device)

### precision-recall curve

In [None]:
# Forward the full data set through the model. Gradients are not needed for
# inference, so autograd is disabled to avoid keeping the graph in memory.
with torch.no_grad():
    preds = model(torch.tensor(X_torch, device=device))
# Move to the CPU and flatten, so a trailing singleton dimension in the model
# output does not produce a 2-D prediction array.
preds = preds.cpu().numpy().reshape(-1)
# Threshold at 0.5 and map to -1/1 so the predictions use the same encoding as Y.
preds = np.sign(preds-0.5)
# from_predictions expects the ground-truth labels first and the predictions second.
display = PrecisionRecallDisplay.from_predictions(
    Y, preds, name="MLP"
)
_ = display.ax_.set_title("MLP")

## Data visualizations

### Perceptron

In [None]:
# Read the logged perceptron results from JSON and show the first rows as a table.
results_path = os.path.join('results', 'perceptron2.json')
with open(results_path, 'r') as results_file:
    data = json.load(results_file)
df = pd.DataFrame.from_dict(data)
df.head()

In [None]:
# Plot the logged loss against the row index of the results table (labelled as epochs).
fig = plt.figure(dpi=180)
ax = fig.add_axes((0, 0, 1, 1))
# With no x values given, matplotlib uses 0..len-1, i.e. the row index.
ax.plot(df['losses'].to_numpy())
ax.set(xlabel='epochs', ylabel='Loss')
plt.show()

In [None]:
# Plot the logged training and test accuracy against the row index (labelled as epochs).
fig = plt.figure(dpi=180)
ax = fig.add_axes((0, 0, 1, 1))
# With no x values given, matplotlib uses 0..len-1, i.e. the row index.
ax.plot(df['train_accs'].to_numpy(), label='Training Data')
ax.plot(df['test_accs'].to_numpy(), label='Test Data')
ax.legend()
ax.set(xlabel='epochs', ylabel='Accuracy')
plt.show()

### Multi-layer Perceptron

In [None]:
# Read the logged MLP results from JSON and show the first rows as a table.
results_path = os.path.join('results', '42.json')
with open(results_path, 'r') as results_file:
    data = json.load(results_file)
df = pd.DataFrame.from_dict(data)
df.head()

In [None]:
# Plot the logged loss against the row index of the results table (labelled as epochs).
fig = plt.figure(dpi=180)
ax = fig.add_axes((0, 0, 1, 1))
# With no x values given, matplotlib uses 0..len-1, i.e. the row index.
ax.plot(df['losses'].to_numpy())
ax.set(xlabel='epochs', ylabel='Loss')
plt.show()

In [None]:
# Plot the logged training and test accuracy against the row index (labelled as epochs).
fig = plt.figure(dpi=180)
ax = fig.add_axes((0, 0, 1, 1))
# With no x values given, matplotlib uses 0..len-1, i.e. the row index.
ax.plot(df['train_accs'].to_numpy(), label='Training Data')
ax.plot(df['test_accs'].to_numpy(), label='Test Data')
ax.legend()
ax.set(xlabel='epochs', ylabel='Accuracy')
plt.show()