In [1]:
# work in progress

In [3]:
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd()
if PROJECT_DIR.stem == 'notebooks':
    PROJECT_DIR = PROJECT_DIR.parent
    sys.path.insert(0, '..')
    %load_ext autoreload
    %autoreload 2 # for local development purposes

In [3]:
import importlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from skorch import NeuralNetClassifier
from skorch.callbacks import TensorBoard
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GroupKFold, cross_val_score


from sklearn.metrics import (
    accuracy_score,
    matthews_corrcoef, 
    confusion_matrix, 
    ConfusionMatrixDisplay
    )

# Report the installed torch version and whether torch reports CUDA as available
print(f"{torch.__version__ = }")
print(f"{torch.cuda.is_available() = }")

torch.__version__ = '1.13.1'
torch.cuda.is_available() = False


### Utils

##### Preprocessing

In [4]:
from src.features.scalers_3d import StandardScaler3D, MinMaxScaler3D, RobustScaler3D

# TODO: add group standardization? or just use robust scaler?
# -> could be important for transfer learning
# https://stackoverflow.com/questions/55601928/apply-multiple-standardscalers-to-individual-groups

from src.features.reshape_features_to_2d import reshape_features_to_2D

##### Pytorch models

In [5]:
# Callbacks handed to the skorch networks defined below.
# TODO: work through the "basic usage" notebook from the skorch documentation
# and change the path of the TensorBoard logs (the `runs` folder).
writer = SummaryWriter()
callbacks = [TensorBoard(writer)]

In [6]:
# Define MLP with pytorch
class MultiLayerPerceptron(nn.Module):
    """Two-hidden-layer perceptron that outputs class probabilities.

    Parameters
    ----------
    num_units : int, default=100
        Width of both hidden layers.
    input_dim : int or None, default=None
        Number of features per (flattened) sample. With ``None`` the size is
        derived from the module-level ``data`` array when the module is
        constructed (product of all axes after the first), which keeps the
        notebook's previous behaviour. Passing the value explicitly (with
        skorch: ``module__input_dim=...``) removes that hidden dependency.
    num_classes : int, default=2
        Number of output classes.
    dropout : float, default=0.5
        Dropout probability applied after the first hidden layer.
    """

    def __init__(self, num_units=100, input_dim=None, num_classes=2, dropout=0.5):
        super().__init__()

        if input_dim is None:
            # Backward-compatible fallback on the global training array.
            # Using the product of the trailing axes gives the same width for
            # the 3-D array and for an already flattened 2-D copy of it.
            input_dim = int(np.prod(data.shape[1:]))

        self.dense0 = nn.Linear(input_dim, num_units)
        self.nonlin = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_units, num_units)
        self.output = nn.Linear(num_units, num_classes)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, **kwargs):
        """Map a batch of flattened samples to per-class probabilities.

        ``X`` must have ``input_dim`` entries on its last axis; extra keyword
        arguments are accepted and ignored.
        """
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        # Softmax over the last axis: the module returns probabilities, not logits
        X = self.softmax(self.output(X))
        return X


# Wrap the MLP module in a skorch classifier so it can be used as the last
# step of a scikit-learn pipeline
mlp = NeuralNetClassifier(
    MultiLayerPerceptron,
    callbacks=callbacks,
    # Re-shuffle the training data at the start of every epoch
    iterator_train__shuffle=True,
    lr=0.1,
    max_epochs=10,
)


In [7]:
# Define LSTM with pytorch
class LongShortTermMemory(nn.Module):
    """Single-layer LSTM classifier that outputs class probabilities.

    Parameters
    ----------
    num_units : int, default=10
        Size of the LSTM hidden state.
    input_size : int or None, default=None
        Number of features per time step. With ``None`` the size is read from
        the third axis of the module-level ``data`` array when the module is
        constructed, which keeps the notebook's previous behaviour. Passing the
        value explicitly (with skorch: ``module__input_size=...``) removes that
        hidden dependency.
    num_classes : int, default=2
        Number of output classes.
    """

    def __init__(self, num_units=10, input_size=None, num_classes=2):
        super().__init__()

        if input_size is None:
            # Backward-compatible fallback on the global 3-D training array
            input_size = data.shape[2]

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=num_units,
            num_layers=1,
            batch_first=True
        )
        self.output = nn.Linear(num_units, num_classes)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, **kwargs):
        """Classify each sequence in a ``(batch, time, feature)`` tensor.

        Extra keyword arguments are accepted and ignored.
        """
        X, _ = self.lstm(X)
        # Only the LSTM output at the last time step feeds the classifier head
        X = self.softmax(self.output(X[:, -1, :]))
        return X


# Wrap the LSTM module in a skorch classifier so it can be used as the last
# step of a scikit-learn pipeline
lstm = NeuralNetClassifier(
    LongShortTermMemory,
    callbacks=callbacks,
    # Re-shuffle the training data at the start of every epoch
    iterator_train__shuffle=True,
    lr=0.1,
    max_epochs=10,
)


TODO

- check whether a softmax output combined with the default NLL criterion gives the same result as raw logits combined with a cross-entropy loss

### Load data

In [8]:
# Locations of the data folder and of the dummy data set inside it
DATA_DIR = PROJECT_DIR.joinpath('data')
DUMMY_DIR = DATA_DIR.joinpath('dummy')

# Features as float32, labels and group ids as int64
data = np.load(DUMMY_DIR.joinpath("data.npy")).astype(np.float32)
target = np.load(DUMMY_DIR.joinpath("target.npy")).astype(np.int64)
groups = np.load(DUMMY_DIR.joinpath("groups.npy")).astype(np.int64)

# One line per array, in the same order as they were loaded
print(f"{data.shape = }")
print(f"{target.shape = }")
print(f"{groups.shape = }")


data.shape = (4000, 500, 3)
target.shape = (4000,)
groups.shape = (4000,)


## Modeling

In [9]:
# scikit-learn pipelines: preprocessing followed by the skorch classifier.
# MLP: 3-D standardisation, then reshaping of the features to 2-D.
mlp_pipe = make_pipeline(StandardScaler3D(), FunctionTransformer(reshape_features_to_2D), mlp)

# LSTM: 3-D min-max scaling only, the array is passed on without reshaping.
lstm_pipe = make_pipeline(MinMaxScaler3D(), lstm)

In [10]:
# Fit the complete MLP pipeline (scaler, reshape step, network) on the training data
mlp_pipe.fit(data, target)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.4225[0m       [32m0.8363[0m        [35m0.2892[0m  0.2846
      2        [36m0.3639[0m       0.7875        0.3066  0.8328
      3        [36m0.3525[0m       0.8350        [35m0.2855[0m  0.3814
      4        [36m0.3486[0m       0.7950        0.2951  0.4150
      5        [36m0.3480[0m       0.7887        0.2960  0.2967
      6        [36m0.3448[0m       0.8213        0.2895  0.5782
      7        0.3471       0.8213        0.2879  0.2992
      8        0.3465       0.8325        [35m0.2854[0m  0.2807
      9        0.3453       0.8337        [35m0.2849[0m  0.3647
     10        [36m0.3444[0m       0.8200        [35m0.2839[0m  0.2463


In [None]:
# Fit the complete LSTM pipeline (scaler, network) on the training data
lstm_pipe.fit(data, target)

# Test data

In [None]:
# Load data
data_test = np.load(DUMMY_DIR / "data_test.npy").astype(np.float32)
# Class labels are integers: use the same dtype as the training `target`
# (int64) so that predictions and ground truth share one label type.
target_test = np.load(DUMMY_DIR / "target_test.npy").astype(np.int64)

print(f"{data_test.shape = }\n{target_test.shape = }")

In [None]:
# Evaluate the fitted MLP pipeline on the test set

target_predicted = mlp_pipe.predict(data_test)

# Scalar metrics and the confusion matrix all compare truth against prediction
acc = accuracy_score(target_test, target_predicted)
mcc = matthews_corrcoef(target_test, target_predicted)
cm = confusion_matrix(target_test, target_predicted)

# Draw the confusion matrix, labelled with the classes known to the pipeline
ConfusionMatrixDisplay(cm, display_labels=mlp_pipe.classes_).plot()

print("Metrics for MLP classifier:")
print(f"{acc = }")
print(f"{mcc = }")


In [None]:
# Score both fitted pipelines on the test set.
# NOTE(review): presumably `score` is the mean accuracy (scikit-learn's default
# for classifiers) - confirm for the skorch estimator.
mlp_score = mlp_pipe.score(data_test, target_test)
print(f"{mlp_score = }")

# Requires `lstm_pipe` to have been fitted in an earlier cell
lstm_score = lstm_pipe.score(data_test, target_test)
print(f"{lstm_score = }")


# GroupKFold cross-validation 
(doing it right)

In [None]:
# Cross-validate the MLP pipeline with 5 group-wise folds, so the splitter
# receives the group label of every sample
cv = GroupKFold(n_splits=5)
test_score = cross_val_score(mlp_pipe, X=data, y=target, groups=groups, cv=cv, n_jobs=5)

# Mean and standard deviation over the folds
print(f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}")


In [None]:
# Cross-validate the LSTM pipeline with 5 group-wise folds
# (5 is also GroupKFold's default number of splits)
cv = GroupKFold(n_splits=5)
test_score = cross_val_score(lstm_pipe, X=data, y=target, groups=groups, cv=cv, n_jobs=5)

# Mean and standard deviation over the folds
print(f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}")


# Grid search
TODO: use a randomized parameter search (`RandomizedSearchCV`) instead of an exhaustive grid search: https://scikit-learn.org/stable/modules/grid_search.html#grid-search

# More stuff (WIP)

In [None]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np

# How often the nested / non-nested comparison is repeated
NUM_TRIALS = 30

# Features and labels of the iris data set
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Hyper-parameter grid for a Support Vector Classifier with "rbf" kernel
p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}
svm = SVC(kernel="rbf")

# One score per trial for each of the two evaluation schemes
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

for trial in range(NUM_TRIALS):

    # Splitters for the inner and outer loop, both seeded with the trial index.
    # Other techniques such as "GroupKFold", "LeaveOneOut" or
    # "LeaveOneGroupOut" could be used here instead.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=trial)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=trial)

    # Non-nested: the best score of the grid search itself is recorded
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=outer_cv)
    clf.fit(X_iris, y_iris)
    non_nested_scores[trial] = clf.best_score_

    # Nested: grid search on the inner splits, scored on the outer splits
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv)
    nested_scores[trial] = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv).mean()

# Per-trial gap between the non-nested and the nested score
score_difference = non_nested_scores - nested_scores

print(
    f"Average difference of {score_difference.mean():6f} "
    f"with std. dev. of {score_difference.std():6f}."
)

# Two stacked panels built with the pyplot state machine; every call below
# acts on the most recently selected subplot, so the statement order matters.

# Upper panel: scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
# plt.plot returns a one-element list; unpack the line handles for the legend
(non_nested_scores_line,) = plt.plot(non_nested_scores, color="r")
(nested_line,) = plt.plot(nested_scores, color="b")
plt.ylabel("score", fontsize="14")
plt.legend(
    [non_nested_scores_line, nested_line],
    ["Non-Nested CV", "Nested CV"],
    bbox_to_anchor=(0, 0.4, 0.5, 0),
)
plt.title(
    "Non-Nested and Nested Cross Validation on Iris Dataset",
    x=0.5,
    y=1.1,
    fontsize="15",
)

# Lower panel: bar chart of the per-trial score difference
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend(
    [difference_plot],
    ["Non-Nested CV - Nested CV Score"],
    bbox_to_anchor=(0, 1, 0.8, 0),
)
plt.ylabel("score difference", fontsize="14")

plt.show()

In [None]:
# # with grid search
# from sklearn.model_selection import GridSearchCV

# # deactivate skorch-internal train-valid split and verbose logging
# net.set_params(train_split=False, verbose=0)
# params = {
#     'lr': [0.01, 0.02],
#     'max_epochs': [10, 20],
#     'module__num_units': [100, 1000],
# }
# gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy', verbose=2)

# gs.fit(data, target)
# print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

# Test set

In [None]:
# Load data
# Anchor the paths at DATA_DIR (<project root>/data) instead of the current
# working directory, so the cell also works when the kernel runs in `notebooks/`.
data_test = np.load(DATA_DIR / "data_test.npy")
target_test = np.load(DATA_DIR / "target_test.npy")

# Scale data
# NOTE(review): the scaler is fitted on the test data itself; if
# StandardScaler3D learns statistics in `fit`, they should presumably come
# from the training data instead - confirm.
data_test = StandardScaler3D().fit_transform(data_test)

# Reshape X to 1-D for MLP: the last two axes are merged into one, in
# Fortran order
data_test = np.reshape(data_test, (
    data_test.shape[0],
    data_test.shape[2]*data_test.shape[1]
), order="F")

# Visual spot check of a single flattened sample
plt.plot(data_test[142]);
print(f"{data_test.shape = }")

## Sklearn implementation


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GroupKFold

# scikit-learn's MLPClassifier only accepts 2-D input, while `data` is a 3-D
# array. Apply the same scaling and flattening steps as `mlp_pipe` so the
# baseline runs on a fresh kernel and is comparable with the skorch MLP.
model = make_pipeline(
    StandardScaler3D(),
    FunctionTransformer(reshape_features_to_2D),
    MLPClassifier(
        random_state=1,
        max_iter=500,
        early_stopping=True),
)

# Group-wise cross-validation, as for the skorch pipelines
cv = GroupKFold()
test_score = cross_val_score(model, data, target, groups=groups, cv=cv,
                             n_jobs=5)
print(f"The average accuracy is "
      f"{test_score.mean():.3f} ± "
      f"{test_score.std():.3f}")

In [None]:
# One column of fold scores, named after the cross-validation strategy
all_scores = pd.DataFrame({"KFold with groups": test_score})

# Histogram of the per-fold scores
all_scores.plot.hist(bins=10, edgecolor="black", alpha=0.7)
plt.xlabel("Accuracy score")
_ = plt.title("Distribution of the test scores")

# Tests

In [None]:
# TODO: compare 3d standardization with old version, which was done by hand

In [None]:
# test for transform_to_1d

# Reshape X to 1-D for MLP.
# The result is stored under a new name: overwriting `data` would make this
# cell fail when it is run a second time (the array would no longer have a
# third axis) and would hand a flattened array to every other cell that
# expects the original 3-D `data`.
data_flat = np.reshape(data, (
    data.shape[0],
    data.shape[2]*data.shape[1]
), order="F")

# Visual spot check of a single flattened sample
plt.plot(data_flat[142]);
print(f"{data_flat.shape = }")

In [None]:
from sklearn.model_selection import GroupKFold
# Toy GroupKFold split: 100 dummy samples, the first 100 labels and every
# 10th group id taken from the first 1000 entries of `groups`
X = range(100)
y = target[:100]
groups_ = groups[0:1000:10]
gkf = GroupKFold(n_splits=6)
for train, test in gkf.split(X, y, groups=groups_):
    # Indices of the train and test part of this fold, then their sizes
    print(f"{train} {test}")
    print(f"{train.shape = }")
    print(f"{test.shape = }")