## Parameter optimization for t-SNE

In [1]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
import hdbscan
import scipy
#Visualisation Libraries
%matplotlib inline
# Uncomment if you want interactive 3D plots --> does not work in the github rendering
#%matplotlib notebook
from copy import deepcopy

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
matplotlib.style.use('ggplot')
import seaborn as sns
# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

%aimport visualization.visualize
from visualization.visualize import get_color_encoding
from visualization.visualize import plot_timeseries_clustering
from visualization.visualize import get_plot_timeseries_clustering_variables
%aimport data.preprocessing
from data.preprocessing import Preprocessor
%aimport data.download
from data.download import DatasetDownloader
%aimport utils.utilities
from utils.utilities import get_cluster_labels

%aimport models.cluster
from models.cluster import get_clustering_performance
%aimport models.dimensionality_reduction
from models.dimensionality_reduction.TSNEModel import TSNEModel
from models.dimensionality_reduction.BayesianTSNEOptimizer import BayesianTSNEOptimizer

Load data from disk.

In [2]:
# Restore the preprocessed data from <data dir>/preprocessed/preprocessed_data.dat.
# The data directory is resolved to an absolute path first.
data_dir = os.path.abspath(DatasetDownloader.get_data_dir())
file_path = os.path.join(data_dir, "preprocessed", "preprocessed_data.dat")
dfs = Preprocessor.restore_preprocessed_data_from_disk(file_path)

Calculate distances.

In [3]:
# Cut the restored data into trip snippets.
# NOTE(review): the variable name suggests 30-second snippets -- confirm in Preprocessor.
trips_cut_per_30_sec = Preprocessor.get_cut_trip_snippets_for_total(dfs)

# Distances between the snippets, using the Euclidean metric.
euclidean_distances = Preprocessor.calculate_distance_for_n2(
    trips_cut_per_30_sec,
    metric="euclidean",
)

Prepare distance data for fitting of t-SNE model.

In [4]:
# Metadata columns that must not be fed into t-SNE; everything that remains
# after dropping them is used as the distance matrix.
categorical_columns = ["mode", "notes", "scripted", "token", "trip_id"]
segment_distance_matrix = euclidean_distances.drop(columns=categorical_columns)

Run the Bayesian optimization of the t-SNE parameters with `BayesianTSNEOptimizer`. The results of each run are recorded on disk and ingested at the next start as initialization values.

In [None]:
# Start from the model's default parameter ranges and pin every parameter that
# must stay constant during the search to a single-element range.
param_ranges = deepcopy(TSNEModel.PARAMETER_RANGES)
fixed_parameters = {
    # Categorical parameters are encoded by their position in CATEGORICAL_VALUES.
    "metric": TSNEModel.CATEGORICAL_VALUES["metric"].index("precomputed"),
    "init_method": TSNEModel.CATEGORICAL_VALUES["init_method"].index("random"),
    "random_state": 42,
    "n_components": 3,
    "n_iter": 7000,
}
for parameter_name, fixed_value in fixed_parameters.items():
    param_ranges[parameter_name] = (fixed_value,)

# Set up a new optimizer on the distance matrix; the transport mode of each
# segment is passed as its cluster membership.
boOpt = BayesianTSNEOptimizer(
    high_dim_data=segment_distance_matrix,
    cluster_memberships=euclidean_distances["mode"].values,
    parameters=param_ranges
)

# Results of earlier runs, if any were persisted.
history = BayesianTSNEOptimizer.load_result_dict("tsne_results")
if history is not None:
    print("Number of models generated so far: ", len(history["values"]))

# Run the optimization, seeded with the earlier results.
# Use a higher init_fraction if only few initialization datapoints are available.
results = boOpt.run(
    num_iterations=50,
    init_fraction=0.3,
    init_values=history,
    kappa=8.0
)

# Merge the new results with the earlier ones and write the combined set back.
# NOTE(review): history may be None here (see the check above) -- confirm that
# merge_result_dictionaries accepts None as its second argument.
all_results = BayesianTSNEOptimizer.merge_result_dictionaries(results, history)
BayesianTSNEOptimizer.persist_result_dict(results=all_results, filename="tsne_results")

[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     angle |   early_exaggeration |   learning_rate |   min_grad_norm |   perplexity | 
    1 | 00m39s | [35m   0.31545[0m | [32m   0.3670[0m | [32m             20.7664[0m | [32m      1362.7504[0m | [32m         0.0746[0m | [32m     83.7103[0m | 
    2 | 00m47s |    0.30508 |    0.1838 |              46.1613 |       1190.6741 |          0.0692 |      88.4468 | 
    3 | 01m24s | [35m   0.33150[0m | [32m   0.1819[0m | [32m             27.9298[0m | [32m       124.7082[0m | [32m         0.0151[0m | [32m     63.0446[0m | 
    4 | 01m02s |    0.30647 |    0.4377 |              29.9633 |       1489.2382 |          0.0269 |      22.1565 | 
    5 | 01m04s | [35m   0.33906[0m | [32m   0.2580[0m | [32m             12.1081[0m | [32m       224.7585[0m | [32m         0.0865[0m | [32m     46

Sort the results by score and pick the parameter set with the highest score.

In [None]:
# Select the parameter set of the best-scoring run.
# np.argsort returns the indices that sort the scores in ascending order,
# so the last index belongs to the highest score.
all_results_sorted_idx = np.argsort(all_results["values"])
max_score_index = all_results_sorted_idx[-1]
# NOTE(review): assumes all_results["params"] is ordered like all_results["values"] -- confirm.
best_param_set = all_results["params"][max_score_index]
print(best_param_set)

(Re-)generate the model with the best parameter set found above, since we didn't store the model results of each optimization run.

In [None]:
# Constructor arguments for the model: the tuned values come from the best
# parameter set, the remaining ones are set to fixed values here.
tsne_parameters = {
    "num_dimensions": 3,
    "perplexity": best_param_set["perplexity"],
    "early_exaggeration": best_param_set["early_exaggeration"],
    "learning_rate": best_param_set["learning_rate"],
    # Rounded and cast to int before being handed to the model.
    "num_iterations": int(round(best_param_set["n_iter"])),
    "min_grad_norm": best_param_set["min_grad_norm"],
    "random_state": 42,
    "angle": best_param_set["angle"],
    "metric": 'precomputed',
    "init_method": 'random',
}
tsne = TSNEModel(**tsne_parameters)

# Fit the t-SNE model on the distance matrix.
tsne_results = tsne.run(segment_distance_matrix.values)

In [None]:
# Scatter plots of the 3D t-SNE embedding, one panel per pair of embedding
# dimensions. Top row: segments whose notes contain 'scripted'; bottom row:
# all other segments. Colour encodes the transport mode, marker shape the token.
transport_modes = {
    'WALK': 'blue',
    'METRO': 'red',
    'TRAM': 'green'
}
tokens = {
    '355007075245007': 'x',
    '358568053229914': 'o',
    '868049020858898': 'v'
}
# Pairs of embedding dimensions shown in the three figure columns.
dimension_pairs = [(0, 1), (0, 2), (1, 2)]

fig, ax = plt.subplots(2, 3, figsize=(20, 10))

# Computed once and reused for both rows. na=False treats missing notes as
# "not scripted" for both the scripted and the unscripted selection, instead
# of relying on implicit NaN handling in the boolean `&`.
# NOTE(review): a substring match on 'scripted' would also match notes that
# contain 'unscripted' -- confirm against the values found in "notes".
is_scripted = euclidean_distances["notes"].str.contains('scripted', na=False)

for transport_mode, transport_mode_color in transport_modes.items():
    is_transport_mode = euclidean_distances["mode"] == transport_mode
    transport_mode_scripted = euclidean_distances[is_transport_mode & is_scripted]
    transport_mode_unscripted = euclidean_distances[is_transport_mode & ~is_scripted]

    for token, token_symbol in tokens.items():
        # Index labels are used as row positions in tsne_results below.
        # NOTE(review): assumes euclidean_distances has a default 0..n-1 index
        # aligned with the rows of tsne_results -- confirm.
        transport_mode_scripted_for_token = transport_mode_scripted[
            transport_mode_scripted["token"] == token
        ].index.values
        transport_mode_unscripted_for_token = transport_mode_unscripted[
            transport_mode_unscripted["token"] == token
        ].index.values

        segments_per_row = (
            transport_mode_scripted_for_token,
            transport_mode_unscripted_for_token,
        )
        for row, segment_indices in enumerate(segments_per_row):
            for col, (dim_x, dim_y) in enumerate(dimension_pairs):
                ax[row, col].scatter(
                    tsne_results[segment_indices, dim_x],
                    tsne_results[segment_indices, dim_y],
                    c=transport_mode_color,
                    marker=token_symbol,
                    alpha=0.5
                )

# Title and axis labels so that every panel can be read on its own.
for row, row_title in enumerate(('Scripted', 'Unscripted')):
    for col, (dim_x, dim_y) in enumerate(dimension_pairs):
        ax[row, col].set_title(row_title)
        ax[row, col].set_xlabel('t-SNE dimension {}'.format(dim_x))
        ax[row, col].set_ylabel('t-SNE dimension {}'.format(dim_y))