# Time Series Clustering with K-Means and Euclidean Distance Matrix for total (n2) with Engineered Features


This notebook clusters the total (n2) column of the acceleration data, using scripted trips only. The exact approach is specified below.

----
**Specification of experiment:**
- scripted trips only
- n2 column of acceleration data was used
- Euclidean Distance was calculated
- Feature Engineering with Quantiles, Standard deviation, maximum break/acceleration length

-----
**Results:**

1) KMeans Clustering with 3 Clusters:
 - 'Estimated number of clusters: 3',
 - 'True number of clusters: 3' *,
 - 'Homogeneity: 0.897',
 - 'Completeness: 0.510',
 - 'V-measure: 0.650',
 - 'Adjusted MI: 0.509',
 - 'Silhouette Coefficient: 0.800'


*Assuming that the transport modes are the true clusters.

In [None]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
#Visualisation Libraries
%matplotlib inline
# Uncomment if you want interactive 3D plots --> does not work in the github rendering
#%matplotlib notebook
from copy import deepcopy

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
matplotlib.style.use('ggplot')
import seaborn as sns

from IPython.display import display_markdown

# Make the project's 'src' directory importable from this notebook.
# The path is built relative to the current working directory.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path again on every execution.
if src_dir not in sys.path:
    sys.path.append(src_dir)

%aimport visualization.visualize
from visualization.visualize import get_color_encoding
from visualization.visualize import plot_timeseries_clustering
from visualization.visualize import get_plot_timeseries_clustering_variables
from visualization.visualize import get_distribution_of_cluster_labels_for
from visualization.visualize import plot_distribution_of_cluster_labels_for_target
from visualization.visualize import plot_all_trips_with_cluster_coloring
from visualization.visualize import plot_2D_tsne_with_coloring_per_targets



%aimport data.preprocessing
from data.preprocessing import Preprocessor
%aimport data.download
from data.download import DatasetDownloader
%aimport utils.utilities
from utils.utilities import get_cluster_labels
%aimport features.build_features
from features.build_features import calculate_maximum_break_length
from features.build_features import calculate_maximum_acceleration_length
%aimport models.cluster
from models.cluster import get_clustering_performance

In [None]:
# Switch for the final cell: when True, every trip is plotted at the end.
plot_all_trips = True

# Absolute path of the data directory reported by the downloader, and the
# preprocessed dump stored beneath it.
data_dir = os.path.abspath(DatasetDownloader.get_data_dir())
file_path = os.path.join(data_dir, "preprocessed", "preprocessed_data.dat")

# Load the previously preprocessed data from disk.
dfs = Preprocessor.restore_preprocessed_data_from_disk(file_path)

In [None]:
# Columns that are removed from the distance matrix below so that only the
# distance values remain.
categorical_columns = ["mode", "notes", "scripted", "token", "trip_id"]

# Only the "total" target is requested, so take the first (and only) result.
trips_cut_per_30_sec = Preprocessor.get_cut_trip_snippets_for_targets(dfs, ["total"])[0]

# Keep the rows flagged as scripted and renumber them from 0.
scripted_trips_only = (
    trips_cut_per_30_sec
    .loc[trips_cut_per_30_sec["scripted"] == 1]
    .reset_index(drop=True)
)

# Euclidean distances for the n2 column; the result still carries the
# categorical columns, which are dropped for the distances-only view.
distance_matrix = Preprocessor.calculate_distance_for_n2(scripted_trips_only, metric="euclidean")
distances_only = distance_matrix.drop(categorical_columns, axis="columns")

### Choose which features to use

In [None]:
# Select the feature table that is fed into the clustering.
# NOTE(review): `summary` is not defined in any cell visible in this notebook;
# presumably the feature-engineering cell that builds it (quantiles, standard
# deviation, maximum break/acceleration length) is missing -- TODO confirm.
# On a fresh kernel this line raises NameError. The next code cell drops
# `categorical_columns` from it, so `summary` must still contain them.
features_used = summary

### Prepare features for clustering

In [None]:
# Remove the categorical columns, then standardise the remaining features
# with sklearn's `scale` and wrap the result in a DataFrame that keeps the
# original column names.
features_used = features_used.drop(categorical_columns, axis="columns")
features_used = pd.DataFrame(
    data=scale(features_used),
    columns=features_used.columns,
)
# Show the first two rows as the cell output.
features_used.head(2)

------
# Model Building

## Create Model with Clustering by KMeans

Use summaries of each distance

In [None]:
from sklearn.cluster import KMeans

# Fixed seed so the cluster assignment is reproducible between runs.
random_state = 0

# n_init is pinned to 10 (the historical scikit-learn default). Newer
# scikit-learn releases changed the default to 'auto', which would silently
# alter the clustering and the metrics reported at the top of this notebook.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit(features_used)
cluster_labels = kmeans.labels_

# Store the label of each row next to its categorical columns so the
# evaluation and plotting cells below can group by cluster.
distance_matrix["cluster_labels"] = cluster_labels

-----
## Summary Statistics for KMeans Clustering:

Here the performance is evaluated with regard to the "true labels", which correspond to the transport modes.

In [None]:
# Evaluate the clustering, using the transport mode column as true labels.
cluster_performance = get_clustering_performance(
    features_used,
    cluster_labels,
    true_labels=distance_matrix["mode"],
)

# Render every returned entry as its own markdown bullet.
for line in cluster_performance:
    display_markdown(" - {0}".format(line), raw=True)

## Visualise Clustering on 2D - TSNE Plot:

In [None]:
# Columns handed to the t-SNE plot as colour encodings (presumably one
# colouring per column -- the helper is defined in visualization.visualize).
color_encodings = distance_matrix[["cluster_labels", "mode", "token"]]
plot_2D_tsne_with_coloring_per_targets(features_used, color_encodings)

## Inspect the distribution of modes to cluster label for all trips

From this we can see that the clustering does not clearly distinguish between "WALK" and "TRAM".

In [None]:
# Distribution of cluster labels for the "mode" column, computed by a project
# helper from distance_matrix (which carries the "cluster_labels" column
# assigned after fitting KMeans).
# NOTE(review): exact layout of the returned table is not visible here.
mode_dist_df = get_distribution_of_cluster_labels_for("mode", distance_matrix)
# Bare expression so the notebook displays the result as the cell output.
mode_dist_df

### Plot distribution of cluster labels for transport modes

In [None]:
# Plot how the cluster labels are distributed over the transport modes
# (helper from visualization.visualize; plot details are not visible here).
plot_distribution_of_cluster_labels_for_target("mode", distance_matrix)

### Plot distribution of cluster labels for tokens

In [None]:
# Plot how the cluster labels are distributed over the "token" column
# (helper from visualization.visualize; plot details are not visible here).
plot_distribution_of_cluster_labels_for_target("token", distance_matrix)

------
## Plot all trips with their clustering labels

In [None]:
# Optional step, controlled by the `plot_all_trips` flag set near the top of
# the notebook.
if plot_all_trips:
    # Unpack the restored data into individual trips, then plot them together
    # with distance_matrix, which holds the "cluster_labels" column.
    # NOTE(review): how the helper maps labels onto trips is not visible here.
    trips_unpacked = Preprocessor.unpack_all_trips(dfs)
    plot_all_trips_with_cluster_coloring(trips_unpacked, distance_matrix)