This notebook performs an exploratory data analysis of the datasets used in the paper (all taken from the UCR archive).

In [1]:
import os

from tslearn.datasets import UCR_UEA_datasets
from sklearn.manifold import TSNE

import os
import numpy as np
from scipy.spatial.distance import pdist, squareform
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.graphics.tsaplots import plot_acf

In [2]:
# Directory that receives the per-dataset PDF reports written by this notebook.
# os.makedirs creates the intermediate './AnalysisResults/' level as well, and
# exist_ok=True keeps the cell idempotent on re-run while avoiding the
# check-then-create race of a separate os.path.exists() test.
reports_save_path = './AnalysisResults/ExploratoryDataAnalysis/'
os.makedirs(reports_save_path, exist_ok=True)

In [3]:
# Datasets covered by this notebook, keyed by dataset name.
# NOTE(review): the integer stored for each dataset is not read by any cell
# visible here; presumably a per-dataset setting used elsewhere -- confirm.
datasets_dict = dict(
    BeetleFly=8,
    BirdChicken=8,
    Computers=10,
    Earthquakes=8,
    MoteStrain=4,
    PhalangesOutlinesCorrect=4,
    ProximalPhalanxOutlineCorrect=4,
    ShapeletSim=10,
    ItalyPowerDemand=4,
    WormsTwoClass=10,
)

# Dataset names in declaration order (dicts preserve insertion order).
datasets_list = [name for name in datasets_dict]

In [4]:
def plot_tsne(distance_matrix, y_sorted, ax=plt):
    """Scatter-plot a 2-D t-SNE embedding of a precomputed distance matrix.

    Parameters
    ----------
    distance_matrix : array-like of shape (n_samples, n_samples)
        Pairwise distances, handed to ``TSNE`` with ``metric="precomputed"``.
    y_sorted : array-like of length n_samples
        Label of each row of ``distance_matrix``, in the same order. One
        scatter series (and one legend entry) is drawn per distinct label.
    ax : object exposing ``scatter`` and ``legend``, optional
        Where to draw. Defaults to the ``pyplot`` module itself, i.e. the
        current axes.

    Returns
    -------
    The ``ax`` object that was drawn on (``plt`` when left at its default).
    """
    # Coerce to an array so the element-wise comparison below also works when
    # the labels arrive as a plain list.
    labels = np.asarray(y_sorted)

    # scikit-learn requires perplexity < n_samples. Keep the library default of
    # 30 whenever it is valid and shrink it only for small inputs, where the
    # default would otherwise be rejected.
    n_samples = np.shape(distance_matrix)[0]
    perplexity = min(30.0, float(n_samples - 1))

    tsne = TSNE(random_state=0, metric="precomputed", init='random',
                perplexity=perplexity)
    embedding = tsne.fit_transform(distance_matrix)

    for class_name in np.unique(labels):
        in_class = labels == class_name  # computed once, used for both axes
        ax.scatter(embedding[in_class, 0], embedding[in_class, 1],
                   label=str(class_name))
    ax.legend()
    return ax

In [5]:
def plot_report(dataset_name='MoteStrain', n_examples=10):
    """Build, save and display a five-row exploratory figure for one dataset.

    The train and test splits are concatenated and NaNs are replaced by zeros.
    The figure (5 rows x 2 columns) contains:

    * row 0 -- up to ``n_examples`` randomly chosen series overlaid, one panel
      for each of the first two classes (in ``np.unique`` order);
    * row 1 -- correlation matrix between series (rows sorted by label) and a
      t-SNE projection of ``1 - correlation``;
    * row 2 -- Euclidean distance matrix between series and its t-SNE
      projection;
    * rows 3-4 -- autocorrelation plot of one randomly chosen series per class
      (first two classes), drawn independently for each row.

    The matrices are computed on channel 0 only (``X[:, :, 0]``).

    Parameters
    ----------
    dataset_name : str
        Name passed to ``UCR_UEA_datasets().load_dataset``; also used as the
        figure title and as the PDF file name.
    n_examples : int, optional
        Maximum number of example series drawn per class in row 0.

    Raises
    ------
    ValueError
        If the dataset could not be loaded.

    Notes
    -----
    Sampling uses NumPy's global random state, so seed it beforehand for
    reproducible figures. The PDF is written into the module-level
    ``reports_save_path`` directory.
    """
    # Load before creating the figure so that a failed load does not leave an
    # empty figure behind. tslearn signals failure by returning None values.
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(dataset_name)
    if X_train is None or X_test is None:
        raise ValueError(f"Could not load dataset {dataset_name!r}")
    X = np.nan_to_num(np.concatenate((X_train, X_test)))
    y = np.concatenate((y_train, y_test))

    # Only the first two classes are illustrated; keep their member indices.
    per_class = [(class_name, np.where(y == class_name)[0])
                 for class_name in np.unique(y)[:2]]

    N = 5
    f, ax = plt.subplots(N, 2, figsize=(10, 17))
    f.suptitle(dataset_name)

    # Row 0: example series per class. Sample without replacement so the same
    # series is never drawn twice on top of itself; cap at the class size.
    part = 0
    for col, (class_name, members) in enumerate(per_class):
        picked = np.random.choice(members,
                                  size=min(n_examples, members.size),
                                  replace=False)
        ax[part, col].set_title("Class {}".format(class_name))
        for sample_index in picked:
            ax[part, col].plot(X[sample_index].flatten())

    # Sort series by label so the class structure shows up as blocks in the
    # heatmaps; keep channel 0 only.
    index_label = np.argsort(y)
    labels_sorted = y[index_label]
    X_univariate_reindexed = X[index_label, :, 0]

    # Row 1: correlation between series, and t-SNE on the derived distance.
    part += 1
    affinity = np.corrcoef(X_univariate_reindexed)
    ax_ = ax[part, 0]
    ax_.set_title("Correlation matrix")
    sns.heatmap(affinity, ax=ax_)

    ax_ = ax[part, 1]
    ax_.set_title("Correlation matrix - TSNE projection")
    plot_tsne(1 - affinity, labels_sorted, ax=ax_)

    # Row 2: Euclidean distances between series, and their t-SNE projection.
    part += 1
    dists = squareform(pdist(X_univariate_reindexed, metric="euclidean"))
    ax_ = ax[part, 0]
    ax_.set_title("Euclidean distance matrix")
    sns.heatmap(dists, ax=ax_)

    ax_ = ax[part, 1]
    ax_.set_title("Euclidean distance matrix - TSNE projection")
    plot_tsne(dists, labels_sorted, ax=ax_)

    # Rows 3-4: autocorrelation of one random series per class, per row.
    for i in range(2):
        part += 1
        for col, (class_name, members) in enumerate(per_class):
            sample_index = np.random.choice(members, 1)[0]
            plot_acf(X[sample_index].flatten(),
                     title=f"Class {class_name} - Sample {i}",
                     ax=ax[part, col])

    f.align_ylabels(ax[0, :])
    f.tight_layout()
    # reports_save_path already ends with a separator; os.path.join avoids the
    # doubled slash that plain string concatenation produced.
    f.savefig(os.path.join(reports_save_path, f"{dataset_name}.pdf"))
    plt.show()
    # Release the figure explicitly: this function is called once per dataset
    # in a loop, and open figures would otherwise accumulate.
    plt.close(f)

In [None]:
# Generate, save and display the exploratory report of every configured
# dataset; tqdm shows progress across datasets.
for dataset in tqdm(datasets_list):
    plot_report(dataset_name=dataset)