# Notebook Setup
Installs necessary requirements and configures proper notebook behavior  
Note: The notebook assumes you are using the base conda environment

In [25]:
# Enable IPython's autoreload extension so edits to imported modules are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before each cell runs.
%autoreload 2
    
print("Notebook setup has completed")

Notebook setup has completed


# Environment Setup
Downloads configured dataset and performs necessary environment bootstrapping

In [24]:
# Import the project's bootstrap entry point under a throwaway alias so it can be
# discarded again once the environment is ready.
from bootstrap import setup as _bootstrap_environment

# Please rename .env-template to .env and adjust values as needed
_bootstrap_environment()
print("Environment setup has completed")

# Keep the one-shot helper out of the notebook namespace.
del _bootstrap_environment

Environment setup has completed


# Sandbox
The review text is vectorized and the available data is split into training and testing sets (no validation set is used here)

In [154]:
from preprocessing import get_dataframe, get_subsets_no_val
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Raw dataset, in the form [features, labels]. It can have as many feature columns
# as needed, but the last column must be the label column. The columns read below
# are "review" (text) and "sentiment" (label).
df = get_dataframe()

# Bag-of-words representation suitable for an MLP: word-level unigrams and bigrams,
# dropping terms that occur in fewer than 0.1% or in more than 70% of the reviews.
word_vectorizer = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=0.001,
    max_df=0.7,
)
X = word_vectorizer.fit_transform(df["review"].to_numpy())

# Binary target: 1 for 'positive' reviews, 0 for anything else.
y = (df['sentiment'] == 'positive').astype("int64").to_numpy()

# Train/test split only; no validation subset is produced here.
X_train, X_test, y_train, y_test = get_subsets_no_val(X, y)


# Manual Testing

In [155]:
# Sanity check: report the container type and dimensions of the training split.
print("Data and shapes:")
for label, subset in (("X_train:", X_train), ("Y_train:", y_train)):
    print(label, type(subset), "of shape", subset.shape)

Data and shapes:
X_train: <class 'scipy.sparse._csr.csr_matrix'> of shape (40000, 33418)
Y_train: <class 'numpy.ndarray'> of shape (40000,)


In [108]:
from mlp import MultiLayerPerceptron
from analysis import accuracy

# One input unit per column of the vectorized training matrix.
input_size = X_train.shape[1]

# Hand-written MLP: two hidden layers (30 and 10 units), a single output unit,
# sigmoid activation, trained for 150 epochs at a learning rate of 0.05.
mlp = MultiLayerPerceptron(
    input_layer=input_size,
    hidden_layers=[30, 10],
    output_layer=1,
    activation="sigmoid",
    lr=0.05,
    epochs=150,
)
mlp.fit(X_train, y_train)

100%|██████████| 150/150 [15:32<00:00,  6.21s/it]


In [113]:
# Score the hand-written MLP on the held-out test split.
test_predictions = mlp.predict(X_test)
print(accuracy(test_predictions, y_test))

0.55


In [156]:
from multiprocessing import Pool, cpu_count
from multip import train_mlp
import numpy as np

# Train one independent classifier per worker process and keep the best one.
# Leave one core free for the notebook/OS, but never go below a single worker:
# on a single-core machine `cpu_count() - 1` is 0 and `Pool(0)` raises ValueError.
PROCCESSES = max(1, cpu_count() - 1)

# Every worker receives the same (X_train, y_train, X_test, y_test) tuple as its
# single argument; `train_mlp` is expected to return a fitted classifier.
train_args = (X_train, y_train, X_test, y_test)
with Pool(PROCCESSES) as pool:
    classifiers = pool.map(train_mlp, [train_args] * PROCCESSES)

# Test-set accuracy of each run, followed by the mean across runs.
accuracies = [clf.score(X_test, y_test) for clf in classifiers]
print(accuracies, ":", np.mean(accuracies))

# Resolve the winning index once and reuse it for both the report and the lookup.
best_index = int(np.argmax(accuracies))
print(best_index, "is the best classifier")
best_clf = classifiers[best_index]

[0.9096, 0.9101, 0.9123, 0.9111, 0.9086, 0.911, 0.9091, 0.9147, 0.9124, 0.9123, 0.9119, 0.9102, 0.9092, 0.913, 0.9111] : 0.9111066666666666
7 is the best classifier


In [157]:

# Collapse the two class-probability columns into one signed score per test review
# (second column minus first column). Assumes the columns are ordered
# [class 0, class 1], i.e. [negative, positive] - TODO confirm against best_clf.classes_.
class_signs = np.array([[-1], [1]])
p = best_clf.predict_proba(X_test) @ class_signs
print(
    f"Most positive example is {np.argmax(p)}",
    f"Most negative example is {np.argmin(p)}",
    f"Most neutral example is {np.argmin(np.abs(p))}",
    sep="\n",
)

# Rescale the signed score from [-1, 1] to [0, 1] so it is comparable with the 0/1
# labels, then find the prediction that lies furthest from its true label.
p = (p*0.5 + 0.5)
incorrect_amount = p - y_test.reshape(-1, 1)
print(f'Most incorrect example is {np.argmax(np.abs(incorrect_amount))}')


Most positive example is 6197
Most negative example is 8165
Most neutral example is 5622
Most incorrect example is 2694


In [165]:
# Row index (into the test split) of the review to inspect; 6197 is the index the
# preceding cell's recorded output reported as the most positive example.
example = 6197
print(f"Predicted: {best_clf.predict(X_test[example])[0]} {p[example]}\nActual: {y_test[example]}")

# Map the non-zero feature columns of this row back to their vocabulary terms.
matched_terms = word_vectorizer.inverse_transform(X_test[example])
print(matched_terms[0])


Predicted: 1 [1.]
Actual: 1
['one' 'has' 'just' 'you' 'be' 'are' 'right' 'as' 'what' 'with' 'br'
 'violence' 'which' 'from' 'go' 'not' 'or' 'no' 'on' 'city' 'an' 'where'
 'all' 'have' 'so' 'more' 'far' 'fact' 'audiences' 'around' 'ever' 'who'
 'out' 'get' 'well' 'middle' 'into' 'experience' 'if' 'can' 'side'
 'one of' 'of the' 'this is' 'br br' 'br the' 'from the' 'in the' 'it is'
 'as that' 'that is' 'is the' 'to the' 'all the' 'on the' 'more so'
 'that it' 'out for' 'and get' 'with it' 'what is' 'you can' 'very' 'time'
 'sense' 'realism' 'only' 'he' 'down' 'pat' 'see' 'by' 'written' 'his'
 'life' 'things' 'rather' 'than' 'sets' 'done' 'sense of' 'of realism'
 'not only' 'by the' 'but it' 'written and' 'rather than' 'and the' 'way'
 'dialogue' 'characters' 'even' 'serial' 'killer' 'while' 'some' 'style'
 'us' 'most' 'at' 'years' 'been' 'she' 'her' 'young' 'woman' 'interesting'
 'the dialogue' 'dialogue is' 'the characters' 'characters are'
 'serial killer' 'of us' 'the most' 'in this'

In [None]:
# Scratch cell intentionally left without code: the stray, incomplete `y_`
# expression that was here named an undefined variable and raised NameError
# on Restart & Run All.

## Runs
1. 89.98: (10,5), Logistic, Early Stopping
2. 89.95: (10,5), ReLU, Early Stopping
3. 89.71: (5), ReLU, Early Stopping
4. 85.12: (5), ReLU
5. 91.11: (5), ReLU, Early Stopping, 2-Grams

In [18]:
from sklearn.neural_network import MLPClassifier
from analysis import accuracy

# scikit-learn baseline with a single hidden layer of 5 units.
clf = MLPClassifier(
    # Written as a 1-tuple: `(5)` is just the int 5, whereas the parameter is
    # documented as a sequence with one entry per hidden layer.
    hidden_layer_sizes=(5,),
    # Early stopping is switched off for this run; uncomment to re-enable it.
    # early_stopping=True,
)
# fit() returns the estimator itself, so `clf` remains the fitted classifier.
clf = clf.fit(X_train, y_train)

In [19]:
# Compare accuracy on the training split with accuracy on the held-out split;
# a large gap between the two indicates overfitting.
print(
    f"Training accuracy: {accuracy(clf.predict(X_train), y_train)}",
    f"Testing accuracy: {accuracy(clf.predict(X_test), y_test)}",
    sep="\n",
)

Training accuracy: 1.0
Testing accuracy: 0.74


# Training

In [None]:
from sympy import Lambda, Symbol
from training import matrix_train

# Symbolic variable used to express the activation function below.
x = Symbol("x")

# Search axes: 20 log-spaced integer epoch counts between 100 and 100000, and
# 20 log-spaced learning rates between 1e-5 and 0.1.
epoch_axis = np.logspace(np.log10(100), np.log10(100000), num=20, dtype="int64")
lr_axis = np.logspace(np.log10(1e-5), np.log10(0.1), num=20)
square_activation = Lambda(x, x**2)

# each axis must be an iterable. if you want to use a constant, wrap it in an iterable of len 1
hyperparameter_matrix = {
    "epochs": epoch_axis,
    "lr": lr_axis,
    "hidden_layers": [5, 6, 7],
    "neurons_per_layer": [3],
    "activation": [square_activation],
}

# NOTE(review): presumably evaluates the combinations in the matrix and returns the
# best-scoring parameter set - confirm against training.matrix_train.
best_params = matrix_train(
    hyperparameter_matrix,
    MultiLayerPerceptron,
    X_train,
    y_train,
    X_test,
    y_test,
)

# The model itself is rebuilt from these parameters in the Performance section.
print(best_params)


# Performance

In [None]:
from analysis import accuracy, confusion, report

# Rebuild the model from the best parameter set found during training and refit it.
mlp = MultiLayerPerceptron(**best_params)
mlp.fit(X_train, y_train)

# Predict once on the held-out split, then summarise.
tuned_predictions = mlp.predict(X_test)

# Alternative summaries, left disabled:
# print(accuracy(mlp.predict(X_test), y_test))
# print(confusion(mlp.predict(X_test), y_test))
print(report(tuned_predictions, y_test))


# Benchmarking

In [None]:
# TODO utilize other classifiers and compare performance