In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import numpy as np
from torch import nn
from skorch import NeuralNetClassifier
from skorch.callbacks import TensorBoard
from torch.utils.tensorboard import SummaryWriter

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, cross_val_score


In [2]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer


class StandardScaler3D(BaseEstimator, TransformerMixin):
    """Feature-wise standardization for 3-D arrays.

    The data is temporarily reshaped from [samples, time-steps, features]
    to [samples * time-steps, features], so that one mean and one standard
    deviation are learned per feature column (the last axis), pooled over
    all samples and time steps. The result is returned in the input shape.

    Code modified from https://stackoverflow.com/a/61617645
    """

    def __init__(self):
        self.scaler = StandardScaler()

    @staticmethod
    def _as_feature_columns(X):
        """Collapse all leading axes so that every column is one feature."""
        X = np.asarray(X)
        return X.reshape(-1, X.shape[-1])

    def fit(self, X, y=None):
        """Learn the per-feature statistics from X.

        Parameters
        ----------
        X : array-like with the features on the last axis.
        y : ignored, present for sklearn pipeline compatibility.

        Returns
        -------
        self
        """
        self.scaler.fit(self._as_feature_columns(X))
        return self

    def transform(self, X):
        """Scale X with the statistics learned in `fit`, keeping X's shape."""
        X = np.asarray(X)
        scaled = self.scaler.transform(self._as_feature_columns(X))
        return scaled.reshape(X.shape)


class MinMaxScaler3D(BaseEstimator, TransformerMixin):
    """Feature-wise min-max scaling for 3-D arrays.

    The data is temporarily reshaped from [samples, time-steps, features]
    to [samples * time-steps, features], so that one minimum and one maximum
    are learned per feature column (the last axis), pooled over all samples
    and time steps. The result is returned in the input shape.

    Code modified from https://stackoverflow.com/a/61617645
    """

    def __init__(self):
        self.scaler = MinMaxScaler()

    @staticmethod
    def _as_feature_columns(X):
        """Collapse all leading axes so that every column is one feature."""
        X = np.asarray(X)
        return X.reshape(-1, X.shape[-1])

    def fit(self, X, y=None):
        """Learn the per-feature minimum and maximum from X.

        Parameters
        ----------
        X : array-like with the features on the last axis.
        y : ignored, present for sklearn pipeline compatibility.

        Returns
        -------
        self
        """
        self.scaler.fit(self._as_feature_columns(X))
        return self

    def transform(self, X):
        """Scale X with the range learned in `fit`, keeping X's shape."""
        X = np.asarray(X)
        scaled = self.scaler.transform(self._as_feature_columns(X))
        return scaled.reshape(X.shape)


def feature_matrix_to_vector(X):
    """Flatten every sample of a 3-D array into a single row.

    The last two axes are merged in column-major ("F") order, so the values
    along axis 1 for index 0 of axis 2 come first, followed by those for
    index 1 of axis 2, and so on. The result has shape
    (X.shape[0], X.shape[1] * X.shape[2]).
    """
    n_rows = X.shape[0]
    row_length = X.shape[2] * X.shape[1]
    flat_shape = (n_rows, row_length)
    return np.reshape(X, flat_shape, order="F")


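# TODO: verify the scaling of the train and test sets.
# The scaler has to be fitted on the training data only, e.g. scaler.fit(train_data);
# the learned statistics (e.g. scaler.scale_ and scaler.mean_) are then applied
# to the test data via scaler.transform(test_data).

# Hint: this is the standard fit/transform workflow in sklearn.
# Use the scalers above and write a unit test with manually calculated values
# for a train and a test set.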

# TODO: Visualize data (1 trial)


In [3]:
# Load the training arrays: the data as float32, targets and groups as int64
data, target, groups = (
    np.load(Path(file_name)).astype(dtype)
    for file_name, dtype in (
        (r"data/data.npy", np.float32),
        (r"data/target.npy", np.int64),
        (r"data/groups.npy", np.int64),
    )
)

print(f"{data.shape = }\n{target.shape = }\n{groups.shape = }")


data.shape = (4000, 500, 3)
target.shape = (4000,)
groups.shape = (4000,)


## 1.1 Baseline model (MLP)

## Skorch implementation

In [4]:
# Define callbacks: a single TensorBoard logger backed by one SummaryWriter
writer = SummaryWriter()
callbacks = [TensorBoard(writer)]


In [5]:
# Define MLP
class MultiLayerPerceptron(nn.Module):
    """Fully connected network with two hidden layers and a 2-class softmax output.

    Parameters
    ----------
    num_units : int
        Width of both hidden layers.
    input_dim : int or None
        Length of one flattened input sample. When None, it falls back to
        ``data.shape[1] * data.shape[2]`` of the notebook-level ``data`` array
        (the original behaviour). Pass it explicitly to make the module
        independent of notebook state.
    """

    def __init__(self, num_units=100, input_dim=None):
        super().__init__()

        if input_dim is None:
            # Backward-compatible default: derive the width from the global
            # training array, which must still be 3-D at this point.
            input_dim = data.shape[1] * data.shape[2]

        self.dense0 = nn.Linear(input_dim, num_units)
        self.nonlin = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.dense1 = nn.Linear(num_units, num_units)
        self.output = nn.Linear(num_units, 2)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, **kwargs):
        """Return softmax class probabilities for the flattened samples in X."""
        X = self.nonlin(self.dense0(X))
        # Dropout is applied after the first hidden layer only
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.softmax(self.output(X))
        return X

# Create MLP: skorch classifier wrapping the MultiLayerPerceptron module
MLP = NeuralNetClassifier(
    module=MultiLayerPerceptron,
    callbacks=callbacks,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    lr=0.1,
    max_epochs=10,
)


In [6]:
# Define LSTM
class LongShortTermMemory(nn.Module):
    """Single-layer LSTM followed by a 2-class softmax on the last time step.

    Parameters
    ----------
    num_units : int
        Size of the LSTM hidden state.
    input_size : int or None
        Number of features per time step. When None, it falls back to
        ``data.shape[2]`` of the notebook-level ``data`` array (the original
        behaviour). Pass it explicitly to make the module independent of
        notebook state.
    """

    def __init__(self, num_units=10, input_size=None):
        super().__init__()

        if input_size is None:
            # Backward-compatible default: read the feature count from the
            # global training array, which must still be 3-D at this point.
            input_size = data.shape[2]

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=num_units,
            num_layers=1,
            batch_first=True
        )
        self.output = nn.Linear(num_units, 2)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, **kwargs):
        """Return softmax class probabilities for a batch-first input X."""
        X, _ = self.lstm(X)
        # Classify from the LSTM output at the last position of axis 1
        X = self.softmax(self.output(X[:, -1, :]))
        return X
    
# Create LSTM: skorch classifier wrapping the LongShortTermMemory module
LSTM = NeuralNetClassifier(
    module=LongShortTermMemory,
    callbacks=callbacks,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    lr=0.1,
    max_epochs=10,
)

TODO

- Check whether a softmax output combined with the default NLL criterion gives the same result as using cross-entropy loss.

In [43]:
# Create pipelines
# MLP: standardize, flatten each sample to one row, then classify
mlp_steps = [
    StandardScaler3D(),
    FunctionTransformer(feature_matrix_to_vector),
    MLP,
]
MLP_pipe = make_pipeline(*mlp_steps)

# LSTM: min-max scale, then classify the 3-D input directly
lstm_steps = [MinMaxScaler3D(), LSTM]
LSTM_pipe = make_pipeline(*lstm_steps)


In [44]:
# Fit the whole MLP pipeline (scaler -> flatten -> network) on the training data
MLP_pipe.fit(data, target)


  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.4016[0m       [32m0.8325[0m        [35m0.2927[0m  0.2009
      2        [36m0.3461[0m       0.8187        0.2942  0.1457
      3        [36m0.3370[0m       0.8237        0.2957  0.1489
      4        [36m0.3341[0m       0.8200        0.3029  0.1494
      5        [36m0.3315[0m       0.7963        0.3070  0.1489
      6        [36m0.3262[0m       0.8063        0.3083  0.1642
      7        [36m0.3168[0m       0.8150        0.3037  0.1491
      8        [36m0.3163[0m       0.8237        0.2985  0.1319
      9        0.3189       0.7937        0.3213  0.1490
     10        [36m0.3110[0m       0.8213        0.3045  0.1297


In [None]:
# Fit the whole LSTM pipeline (scaler -> network) on the training data
LSTM_pipe.fit(data, target)

# Test data

In [None]:
# Load the held-out test arrays: the data as float32, targets as int64
test_data, test_target = (
    np.load(Path(file_name)).astype(dtype)
    for file_name, dtype in (
        (r"data/test_data.npy", np.float32),
        (r"data/test_target.npy", np.int64),
    )
)

print(f"{test_data.shape = }\n{test_target.shape = }")

In [None]:
# Score of the fitted MLP pipeline on the held-out test set
MLP_score = MLP_pipe.score(X=test_data, y=test_target)
print(f"{MLP_score = }")

# Score of the fitted LSTM pipeline on the held-out test set
LSTM_score = LSTM_pipe.score(X=test_data, y=test_target)
print(f"{LSTM_score = }")


# CV tests

In [None]:
# 5-fold cross-validation of the LSTM pipeline, split by `groups`
cv = GroupKFold(n_splits=5)
test_score = cross_val_score(
    estimator=LSTM_pipe,
    X=data,
    y=target,
    groups=groups,
    cv=cv,
    n_jobs=5,
)
print(f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}")

In [None]:
from sklearn.model_selection import GroupKFold

# Toy illustration of how GroupKFold keeps all samples of a group in one fold.
# The toy inputs use dedicated `toy_*` names so that the real `groups` array
# loaded from disk is not overwritten; later cells still pass it to
# cross_val_score together with the full-size `data` and `target`.
toy_X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
toy_y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
toy_groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = GroupKFold(n_splits=3)
# Print the train and test indices of every split
for train, test in gkf.split(toy_X, toy_y, groups=toy_groups):
    print("%s %s" % (train, test))

In [None]:
# Cross-validation of the LSTM pipeline with GroupKFold's default settings
cv = GroupKFold()
test_score = cross_val_score(
    estimator=LSTM_pipe,
    X=data,
    y=target,
    groups=groups,
    cv=cv,
    n_jobs=5,
)
print(f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}")


In [None]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np

# Nested vs. non-nested cross-validation on the Iris dataset.

# Number of random trials
NUM_TRIALS = 30

# Iris features and labels
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Parameter grid searched for the support vector classifier with "rbf" kernel
p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}
svm = SVC(kernel="rbf")

# One score per trial for each evaluation strategy
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

for i in range(NUM_TRIALS):
    # Inner and outer splitters are chosen independently of the dataset and
    # seeded with the trial index. Other splitters such as "GroupKFold",
    # "LeaveOneOut" or "LeaveOneGroupOut" could be used instead.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non-nested: the grid search's own best score is the reported score
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=outer_cv)
    clf.fit(X_iris, y_iris)
    non_nested_scores[i] = clf.best_score_

    # Nested: the grid search (inner loop) is itself cross-validated (outer loop)
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv)
    nested_score = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv)
    nested_scores[i] = nested_score.mean()

score_difference = non_nested_scores - nested_scores

print(
    f"Average difference of {score_difference.mean():6f} "
    f"with std. dev. of {score_difference.std():6f}."
)

# Upper panel: scores on each trial for nested and non-nested CV
fig = plt.figure()
ax_scores = fig.add_subplot(211)
(non_nested_scores_line,) = ax_scores.plot(non_nested_scores, color="r")
(nested_line,) = ax_scores.plot(nested_scores, color="b")
ax_scores.set_ylabel("score", fontsize="14")
ax_scores.legend(
    [non_nested_scores_line, nested_line],
    ["Non-Nested CV", "Nested CV"],
    bbox_to_anchor=(0, 0.4, 0.5, 0),
)
ax_scores.set_title(
    "Non-Nested and Nested Cross Validation on Iris Dataset",
    x=0.5,
    y=1.1,
    fontsize="15",
)

# Lower panel: bar chart of the per-trial difference
ax_diff = fig.add_subplot(212)
difference_plot = ax_diff.bar(range(NUM_TRIALS), score_difference)
ax_diff.set_xlabel("Individual Trial #")
ax_diff.legend(
    [difference_plot],
    ["Non-Nested CV - Nested CV Score"],
    bbox_to_anchor=(0, 1, 0.8, 0),
)
ax_diff.set_ylabel("score difference", fontsize="14")

plt.show()

In [None]:
# # with grid search
# from sklearn.model_selection import GridSearchCV

# # deactivate skorch-internal train-valid split and verbose logging
# net.set_params(train_split=False, verbose=0)
# params = {
#     'lr': [0.01, 0.02],
#     'max_epochs': [10, 20],
#     'module__num_units': [100, 1000],
# }
# gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy', verbose=2)

# gs.fit(data, target)
# print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

# Test set

In [None]:
# Load data
test_data = np.load(Path(r"data/test_data.npy"))
test_target = np.load(Path(r"data/test_target.npy"))

# Scale data with statistics learned from the training set only. Fitting a
# new scaler on the test set would scale train and test data differently and
# leak test-set statistics into the evaluation.
# NOTE: `data` must still be the 3-D training array when this cell runs.
train_scaler = StandardScaler3D().fit(data)
test_data = train_scaler.transform(test_data)

# Reshape X to 1-D for MLP (same helper as used inside the MLP pipeline)
test_data = feature_matrix_to_vector(test_data)

plt.plot(test_data[142]);
print(f"{test_data.shape = }")

## Sklearn implementation


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GroupKFold

model = MLPClassifier(
    random_state=1,
    max_iter=500,
    early_stopping=True)

# MLPClassifier expects 2-D input, so flatten each 3-D sample into one row
# first. If `data` has already been flattened elsewhere, it is used as is.
X_flat = feature_matrix_to_vector(data) if data.ndim == 3 else data

cv = GroupKFold()
test_score = cross_val_score(model, X_flat, target, groups=groups, cv=cv,
                             n_jobs=5)
print(f"The average accuracy is "
      f"{test_score.mean():.3f} ± "
      f"{test_score.std():.3f}")

In [None]:
# Collect the per-fold scores in a single-column frame, one row per fold
all_scores = pd.DataFrame({"KFold with groups": test_score})

# Histogram of the cross-validation scores
ax = all_scores.plot.hist(bins=10, edgecolor="black", alpha=0.7)
ax.set_xlabel("Accuracy score")
_ = ax.set_title("Distribution of the test scores")

# Tests

In [None]:
# test for transform_to_1d

# Reshape X to 1-D for MLP. The result goes into a new variable so that
# `data` stays 3-D: the cell can then be re-run, and earlier cells that read
# data.shape[2] keep working.
data_flat = feature_matrix_to_vector(data)

# Each sample becomes one row of length time-steps * features
n_steps = data.shape[1]
assert data_flat.shape == (data.shape[0], n_steps * data.shape[2])
# The first `n_steps` columns hold the complete series of feature 0
np.testing.assert_array_equal(data_flat[142, :n_steps], data[142, :, 0])

plt.plot(data_flat[142]);
print(f"{data_flat.shape = }")