# Notebook Setup
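Configures notebook behavior (automatic module reloading and inline plots). No packages are installed by this cell, so the required dependencies must already be available in the environment.  
Note: The notebook assumes you are using the base conda environment.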

In [1]:
# Load the autoreload extension and use mode 2 so that imported modules are
# re-imported before each cell runs (edits to local .py files take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline
    
print("Notebook setup has completed")

Notebook setup has completed


# Environment Setup
Downloads configured dataset and performs necessary environment bootstrapping

In [8]:
from bootstrap import setup


# Run the project's bootstrap routine. According to the section description it
# downloads the configured dataset and prepares the environment.
# Please rename .env-template to .env and adjust values as needed
setup()

print("Environment setup has completed")

# Remove the helper from the notebook namespace now that it has run.
del setup

Environment setup has completed


# Sandbox
The available data is split into training, testing, and validation sets.

In [12]:
from preprocessing import get_dataframe, get_subsets


# Load the full dataset as a single frame.
df = get_dataframe()  # In the form [features, labels]

# TODO: Transform raw text into a representation suitable for an MLP
# df can have as many feature columns as needed, but the last column must be the label column

# Split into train / test / validation feature and label subsets.
# NOTE(review): split proportions and shuffling are decided inside get_subsets — confirm there.
X_train, X_test, X_validate, y_train, y_test, y_validate = get_subsets(df)


# Implementation

In [5]:
from typing import Callable, Iterable, Union

import numpy as np


class MultiLayerPerceptron:
    def __init__(
        self,
        epochs: int,
        lr: float | Callable[[int], float],
        hidden_layers: int,
        neurons_per_layer: int,
        activation: Union[Callable, Iterable[Callable]]
    ):
        if isinstance(activation, Iterable):
            assert len(activation) == hidden_layers
        
        self.num_epochs = epochs
        self.lr = (lambda x: lr) if isinstance(lr, float) else lr
        self.hidden_layers = hidden_layers
        self.neurons_per_layer = neurons_per_layer
        self.activation = activation
        self._units = None
    
    def epochs(self):
        for i in range(self.num_epochs):
            yield i, self.lr(i)
    
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:        
        for epoch_num, lr in self.epochs():
            pass
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        pass



# Training

In [14]:
from sympy import Lambda, Symbol
from training import matrix_train


# Symbolic variable used to build the activation expression below.
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'sympy'" — sympy must be installed in the kernel's environment.
x = Symbol("x")


# each axis must be an iterable. if you want to use a constant, wrap it in an iterable of len 1
hyperparameter_matrix = {
    # 20 logarithmically spaced integer epoch counts between 100 and 100000
    "epochs": np.logspace(np.log10(100), np.log10(100000), num=20, dtype="int64"),
    # 20 logarithmically spaced learning rates between 1e-5 and 0.1
    "lr": np.logspace(np.log10(.00001), np.log10(.1), num=20),
    "hidden_layers": [5, 6, 7],
    "neurons_per_layer": [3],
    # single candidate activation: the symbolic function x -> x**2
    "activation": [Lambda(x, x**2)],
}

# Search the matrix above; the result is later unpacked as keyword arguments for MultiLayerPerceptron.
# NOTE(review): how matrix_train enumerates and scores combinations is defined in the training module — confirm there.
best_params = matrix_train(hyperparameter_matrix, MultiLayerPerceptron, X_train, y_train, X_test, y_test)

print(best_params)
# mlp = MultiLayerPerceptron(**best_params)


ModuleNotFoundError: No module named 'sympy'

# Performance

In [None]:
from analysis import accuracy, confusion, report


# Build the model from the hyperparameters selected in the Training section and train it.
mlp = MultiLayerPerceptron(**best_params)
mlp.fit(X_train, y_train)

# Alternative metrics from the analysis module, currently disabled:
# print(accuracy(mlp.predict(X_test), y_test))
# print(confusion(mlp.predict(X_test), y_test))
# Print the report for predictions on the test split (predictions first, true labels second).
print(report(mlp.predict(X_test), y_test))


# Benchmarking

In [30]:
# TODO utilize other classifiers and compare performance
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import SVC

In [29]:
# Bag-of-words vectorizer with default settings.
v = CountVectorizer()
# Learn the vocabulary from, and vectorize, column 0 of the first 10,000 training rows.
# NOTE(review): assumes column 0 holds the raw text — confirm against get_dataframe().
X_train_format = v.fit_transform(X_train[:10_000, 0])

In [32]:
# Baseline classifier: linear-kernel SVM with a fixed seed, trained on the
# labels of the same first 10,000 rows that were vectorized above.
# fit() returns the estimator, so construction and training are chained.
clf = SVC(kernel='linear', random_state=42).fit(X_train_format, y_train[:10_000])

In [35]:
# Vectorize the validation text with the vocabulary already learned by `v`
# (transform only, no refitting).
X_validate_format = v.transform(X_validate[:, 0])
# Last expression of the cell: displays the classifier's score on the validation split.
clf.score(X_validate_format, y_validate)

0.8509

In [38]:
# Sanity-check the trained classifier on a single hand-written sentence.
# Distinct names are used so the notebook-level `x` (the sympy Symbol defined
# in the Training section) is not overwritten with an unrelated array/matrix.
sample_text = np.array(["This company is great"])
sample_features = v.transform(sample_text)
# Last expression of the cell: displays the predicted label.
clf.predict(sample_features)

array(['positive'], dtype=object)