# Notebook Setup
Configures notebook behavior (automatic reloading of edited modules); the required packages must already be installed  
Note: The notebook assumes you are using the base conda environment

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
    
# Visible confirmation that the setup cell ran to completion.
print("Notebook setup has completed")

# Load Data
Load the data from the JSON review files into a pandas dataframe

In [None]:
def _review_params(filename):
    """Return the loader settings for one review file.

    Every dataset shares the same settings (JSON file type, ``reviewText`` as
    the features field, ``overall`` as the labels field); only the file name
    differs. A fresh dict is returned on every call so the datasets never
    share state.
    """
    return {
        'file': filename,
        'filetype': 'json',
        'features': "reviewText",
        'labels': "overall",
    }


# One settings dict per available review category.
CDS_AND_VINYL_JSON_PARAMS = _review_params('reviews_CDs_and_Vinyl_5.json')
CELL_PHONE_JSON_PARAMS = _review_params('reviews_Cell_Phones_and_Accessories_5.json')
CLOTHING_JSON_PARAMS = _review_params('reviews_Clothing_Shoes_and_Jewelry_5.json')
ELECTRONICS_JSON_PARAMS = _review_params('reviews_Electronics_5.json')
HOME_AND_KITCHEN_JSON_PARAMS = _review_params('reviews_Home_and_Kitchen_5.json')
KINDLE_STORE_JSON_PARAMS = _review_params('reviews_Kindle_Store_5.json')
MOVIES_JSON_PARAMS = _review_params('reviews_Movies_and_TV_5.json')
SPORTS_JSON_PARAMS = _review_params('reviews_Sports_and_Outdoors_5.json')

In [None]:
# List here the parameter sets defined above whose files are present in your
# data directory; only these datasets are loaded.
files = [
    ELECTRONICS_JSON_PARAMS,
    CDS_AND_VINYL_JSON_PARAMS,
    CELL_PHONE_JSON_PARAMS,
    HOME_AND_KITCHEN_JSON_PARAMS,
    SPORTS_JSON_PARAMS,
]

In [None]:
from preprocessing import get_dataframe_file
# Load every selected dataset with the same loader arguments, one frame per file.
frames = [
    get_dataframe_file(params=dataset_params, points=50_000, equalize=True)
    for dataset_params in files
]

In [None]:
import pandas as pd
# Stack the per-dataset frames into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index so row labels coming from different files cannot
# collide (duplicate labels make label-based lookups ambiguous).
df = pd.concat(frames, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame as a sanity check.
df.head()

# Preprocess Data
Vectorize the review text, then split the data into training, validation, and testing sets

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing import get_subsets

# Bag-of-words counts. min_df / max_df are document-frequency proportions:
# terms rarer than min_df or more common than max_df are dropped.
vectorizer_settings = {"min_df": 0.0001, "max_df": 0.7}
word_vectorizer = CountVectorizer(**vectorizer_settings)

# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so validation/test text influences which terms are kept — confirm this is intended.
review_texts = df["reviewText"].to_numpy()
X = word_vectorizer.fit_transform(review_texts)
y = df['overall'].to_numpy()

# 80/10/10 split into train / validation / test subsets.
X_train, X_val, X_test, y_train, y_val, y_test = get_subsets(
    X,
    y,
    train_split=0.8,
    val_split=0.1,
    test_split=0.1,
)


# Visualize Data Distribution

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Count the training samples per distinct label once, then draw one bar per label.
ratings, rating_counts = np.unique(y_train, return_counts=True)
plt.bar(ratings, rating_counts)
plt.xlabel("Rating")
plt.ylabel("Count")
plt.show()

In [None]:
# Report the container type and shape of the training features and labels.
print("Data and shapes:")
for label, split in (("X_train:", X_train), ("Y_train:", y_train)):
    print(label, type(split), "of shape", split.shape)

# Training the Model

In [None]:
from mlp import MultiLayerPerceptron
# Hyperparameters of the custom network, gathered in one place for easy tuning.
mlp_settings = dict(
    epochs=50,
    lr=0.1,
    hidden_layers=[500, 200],
    activation="sigmoid",
)
clf = MultiLayerPerceptron(**mlp_settings)

# Train on the training split (the validation split is passed alongside it),
# then plot the recorded loss.
clf.fit(X_train, y_train, X_val, y_val, batch_size=100)
clf.plot_loss()

In [None]:
# Score the custom network on the data it was trained on and on the held-out test split.
train_score = clf.score(X_train, y_train)
print(f"Training accuracy: {train_score}")
test_score = clf.score(X_test, y_test)
print(f"Testing accuracy: {test_score}")

In [None]:
from sklearn.metrics import classification_report

# Predict the test split with the custom network and print scikit-learn's
# per-class text report.
y_pred = clf.predict(X_test)
# Despite its name, `cm` holds the classification report, not a confusion matrix.
cm = classification_report(y_test, y_pred)
print(cm)

# Benchmarking

In [None]:
from sklearn.neural_network import MLPClassifier
# scikit-learn baseline using the same (500, 200) hidden layout and logistic
# (sigmoid) activation as the custom network trained earlier in this notebook.
# early_stopping holds out validation_fraction of the training data internally.
# random_state pins the weight initialisation, the early-stopping split and the
# batch sampling so the benchmark is reproducible across kernel restarts.
clf = MLPClassifier(
    hidden_layer_sizes=(500, 200),
    activation="logistic",
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42,
)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Evaluate the scikit-learn MLP: test predictions, train/test scores, per-class report.
pred = clf.predict(X_test)

train_score = clf.score(X_train, y_train)
print(f"Training accuracy: {train_score}")
test_score = clf.score(X_test, y_test)
print(f"Testing accuracy: {test_score}")

report = classification_report(y_test, pred)
print(report)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline: 100 trees, each limited to depth 50.
# random_state pins the bootstrap sampling and feature selection so the
# benchmark is reproducible across kernel restarts.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    random_state=42,
)

clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Evaluate the random forest: test predictions, train/test scores, per-class report.
pred = clf.predict(X_test)

train_score = clf.score(X_train, y_train)
print(f"Training accuracy: {train_score}")
test_score = clf.score(X_test, y_test)
print(f"Testing accuracy: {test_score}")

report = classification_report(y_test, pred)
print(report)

In [None]:

# Class probabilities for every test row from the most recently fitted `clf`.
p = clf.predict_proba(X_test)
# Collapse the two probability columns into one signed score per row:
# (second column) - (first column), kept as a column vector of shape (n, 1).
# NOTE(review): the (2, 1) weight vector only works when predict_proba returns
# exactly two columns, i.e. a binary problem — confirm the labels are binarized upstream.
p = p @ np.array([-1,1]).reshape(2,1)
# The indices below are row positions in X_test (p has a single column).
print(f"Most positive example is {np.argmax(p)}")
print(f"Most negative example is {np.argmin(p)}")
print(f"Most neutral example is {np.argmin(np.abs(p))}")

# Map the signed score from [-1, 1] onto [0, 1] so it can be compared with the labels.
p = (p*0.5 + 0.5)
# NOTE(review): subtracting y_test directly presumes the labels are encoded as 0/1 — confirm.
incorrect_amount = p - y_test.reshape(-1,1)
print(f'Most incorrect example is {np.argmax(np.abs(incorrect_amount))}')


In [None]:
# Inspect a single test review: predicted label, its rescaled score `p`,
# the true label, and the vocabulary terms present in the review.
example = 6197  # hand-picked row position in X_test
# Use `clf`, the model that produced `p`; `best_clf` is never defined in this
# notebook and raised NameError on a fresh kernel.
print(f"Predicted: {clf.predict(X_test[example])[0]} {p[example]}\nActual: {y_test[example]}")
print(word_vectorizer.inverse_transform(X_test[example])[0])


## Runs
1. 89.98: (10,5), Logistic, Early Stopping
2. 89.95: (10,5), Relu, Early Stopping,
3. 89.71: (5), Relu, Early Stopping
4. 85.12: (5), Relu, 
5. 91.11: (5), Relu, Early Stopping, 2-Grams

# Training

In [None]:
from sympy import Lambda, Symbol
from training import matrix_train


# Symbol used to express the activation function as a sympy Lambda.
x = Symbol("x")


# each axis must be an iterable. if you want to use a constant, wrap it in an iterable of len 1
# NOTE(review): "hidden_layers" is an integer count here and is paired with
# "neurons_per_layer", whereas the earlier MultiLayerPerceptron call passed a
# list of layer sizes and a string activation — confirm both forms are accepted.
hyperparameter_matrix = {
    # 20 log-spaced epoch counts between 100 and 100000, truncated to integers
    "epochs": np.logspace(np.log10(100), np.log10(100000), num=20, dtype="int64"),
    # 20 log-spaced learning rates between 1e-5 and 0.1
    "lr": np.logspace(np.log10(.00001), np.log10(.1), num=20),
    "hidden_layers": [5, 6, 7],
    "neurons_per_layer": [3],
    "activation": [Lambda(x, x**2)],
}

# Select hyperparameters on the validation split so the test split stays
# untouched for the final performance report; choosing them on the test split
# would make that report optimistically biased.
# NOTE(review): assumes the last two positional arguments of matrix_train are
# the evaluation features/labels — confirm against its signature.
best_params = matrix_train(hyperparameter_matrix, MultiLayerPerceptron, X_train, y_train, X_val, y_val)

print(best_params)


# Performance

In [None]:
from analysis import accuracy, confusion, report


# Refit a network with the selected hyperparameters and print the evaluation
# report for the test split.
mlp = MultiLayerPerceptron(**best_params)
# NOTE(review): fit is called here without the X_val / y_val / batch_size
# arguments used earlier in this notebook — confirm they are optional.
mlp.fit(X_train, y_train)

# print(accuracy(mlp.predict(X_test), y_test))
# print(confusion(mlp.predict(X_test), y_test))
# NOTE(review): arguments are passed as (predictions, targets), the reverse of
# sklearn's classification_report order — confirm against analysis.report.
print(report(mlp.predict(X_test), y_test))


# Benchmarking

In [None]:
# TODO utilize other classifiers and compare performance