In [None]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np
import matplotlib.pyplot as plt

from model import AutoEncoder
from generate import sin_cos, arma, wind
from train import train
from functional import *
from utils import *

import seaborn as sns
import scipy
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import calinski_harabasz_score
import pandas as pd

# Fix both RNGs (PyTorch and NumPy's legacy global generator) to the same seed
# so that a fresh "Restart & Run All" reproduces the same results.
_SEED = 4444
torch.manual_seed(_SEED)
np.random.seed(_SEED)

# Create Dataset

In [None]:
# Number of observations assigned to the train / validation / test splits.
n_train, n_valid, n_test = 700, 100, 100
n = n_train + n_valid + n_test  # total number of observations requested

length = 64  # each observation is a vector of size (1,length)

## Generate documents

In [None]:
# Data source. The synthetic generators are kept commented out for quick switching:
#X = arma(n, length)
#X = sin_cos(n, length)
# NOTE(review): wind() is not given `n`, so the number of rows is whatever it
# returns — confirm it provides at least n_train + n_valid + n_test rows.
X = torch.from_numpy(wind(num_elems=length))

# Contiguous split along the first axis: train, then validation, then everything
# that is left becomes the test split.
X_train = X[:n_train]
X_valid = X[n_train:n_train+n_valid]
X_test = X[n_train+n_valid:]

In [None]:
# Display the dimensions of the full dataset tensor as a quick sanity check.
X.shape

# Model

In [None]:
# Architecture hyper-parameters (also reused by later analysis cells).
M = 1              # number of filters per conv
Lf = 3             # size of the filters
bottleneck_nn = 6  # passed to AutoEncoder as bottleneck_nn

model = AutoEncoder(
    length=length,
    Lf=Lf,
    M=M,
    bottleneck_nn=bottleneck_nn,
)

## Train

In [None]:
# Fit the autoencoder; train() returns the loss histories that are plotted in
# the next cell.
train_losses, valid_losses = train(
    model,
    X_train,
    X_valid,
    iters=3000,
    early_stopping_rounds=30,
)

In [None]:
# Training vs. validation loss curves, drawn through the explicit Axes interface.
loss_ax = plt.gca()
loss_ax.set_title("Cost")
loss_ax.plot(train_losses, label="train")
loss_ax.plot(valid_losses, label="validation")
loss_ax.legend()

## Load/Save model

In [None]:
# Optional checkpointing: uncomment the first line to save the trained weights,
# or the second to restore previously saved weights into `model`.
# NOTE(review): the two paths point at different files ("wind64_model" vs
# "wind6_model") — confirm which checkpoint is intended before loading.
#torch.save(model.state_dict(), "../saved_weights/wind64_model")
#model.load_state_dict(torch.load("../saved_weights/wind6_model"))

# Testing

## Average and std correlation

In [None]:
# Evaluate the model on the test split: print the test loss, the per-series
# Spearman correlation between reconstruction and original (mean and std), and
# plot a few originals next to their reconstructions.

# Single forward pass; its output is reused for the loss, the correlations and
# the plots so that all three describe exactly the same predictions.
pred1 = model(X_test)

print(f"test loss: {my_mse(X_test,pred1) + my_l2(model)}")

pred1 = pred1.detach().numpy()

# Spearman correlation for every test series. Iterate over the rows actually
# present in the predictions rather than over n_test: X_test is an open-ended
# slice of X, so its length is not guaranteed to equal n_test.
cors = [scipy.stats.spearmanr(pred1[i,0], X_test[i,0]).correlation for i in range(len(pred1))]
print("correlation avg and std:", np.mean(cors), np.std(cors))
print()

# Plot n_plots consecutive examples beginning at index `start`
# (change the multiplier to page through the test split).
n_plots = 4
start = n_plots*0

fig, axs = plt.subplots(nrows=2, ncols=n_plots, figsize=(25,5))
for i in range(n_plots):
    idx = start + i

    # Top row: original series.
    axs[0,i].axis("off")
    axs[0,i].set_title("Original")
    axs[0,i].plot(X_test[idx,0])

    # Bottom row: reconstruction of the same series.
    axs[1,i].axis("off")
    axs[1,i].set_title("Reconstructed")
    axs[1,i].plot(pred1[idx,0])

    # Reuse the correlation computed above instead of recomputing it.
    print("spearman:", cors[idx])

# Latent space

In [None]:
# Only runs when the bottleneck has exactly two units.
# NOTE(review): latent_space comes from a star import; presumably it visualizes
# a 2-D latent space, which would explain the guard — confirm.
if bottleneck_nn == 2:
    latent_space(model, n=10)

# Choose bottleneck

In [None]:
# NOTE(review): choose_bottleneck is a project helper brought in by a star
# import; presumably it evaluates several bottleneck sizes and returns one
# collection of values per size — confirm. Note the argument order: the test
# split is passed first, before the train and validation splits.
vals = choose_bottleneck(X_test, X_train, X_valid, length, M, Lf)

In [None]:
# Mean of each entry of `vals` (line with markers), drawn together with a
# constant reference line at 1 of the same length.
bottleneck_means = [np.mean(x) for x in vals]
reference_line = [1]*len(vals)
plt.plot(bottleneck_means, "-o", reference_line)

# Checking important filters

In [None]:
# Importance of each conv filter for each bottleneck neuron, measured as the
# mean absolute weight of model.full1 over the slice of inputs attributed to
# that filter.
num_filter = 4 * M                  # NOTE(review): presumably 4 conv layers with M filters each — confirm
w_per_filter = length - Lf + 1      # weights per filter

full1_weight = model.full1.weight
w = np.empty((bottleneck_nn, num_filter))
for neuron in range(bottleneck_nn):
    for filt in range(num_filter):
        lo = filt * w_per_filter
        block = full1_weight[neuron, lo:lo + w_per_filter]
        w[neuron, filt] = block.abs().mean().item()

# Tick label format: "<filter index>-d:<dilation>".
x_axis_labels = [f"{i}-d:{2**(i//M)}" for i in range(w.shape[1])]

# y-axis => neuron of the bottleneck, x-axis => one filter per position, ordered by dilation
sns.heatmap(
    w,
    cmap="coolwarm",
    xticklabels=x_axis_labels,
)

In [None]:
# Split the filters into two groups with k-means. `w` is transposed so that the
# clustering is done over columns (one sample per filter).
clustering = KMeans(n_clusters=2).fit(w.T)
clustering.labels_

# Clustering using bottleneck

In [None]:
# Bottleneck output for the test split, converted to a NumPy array for the
# scikit-learn / pandas cells below.
# The module is called directly instead of through .forward() so that
# nn.Module.__call__ runs (including any registered hooks), consistent with how
# the model is invoked in the testing cell.
bns = model(X_test, get_bottleneck=True).detach().numpy()

In [None]:
# Calinski-Harabasz score of k-means on the bottleneck output for k = 2..14,
# plotted against k.
cluster_counts = range(2,15)
chs = []
for k in cluster_counts:
    clustering = KMeans(n_clusters=k).fit(bns)
    chs.append(calinski_harabasz_score(bns, clustering.labels_))
plt.plot(cluster_counts, chs, "o-")

In [None]:
# Pairwise plots between the columns of the bottleneck output `bns`
# (one DataFrame column per bottleneck dimension).
sns.pairplot(pd.DataFrame(bns))