## Parameter optimization for t-SNE

In [1]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
#Visualisation Libraries
%matplotlib inline
# Uncomment if you want interactive 3D plots --> does not work in the github rendering
#%matplotlib notebook
from copy import deepcopy

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
matplotlib.style.use('ggplot')
import seaborn as sns
# Make the project's 'src' directory (a sibling of the current working
# directory) importable so the local packages below can be loaded.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

%aimport visualization.visualize
from visualization.visualize import get_color_encoding
from visualization.visualize import plot_timeseries_clustering
from visualization.visualize import get_plot_timeseries_clustering_variables
%aimport data.preprocessing
from data.preprocessing import Preprocessor
%aimport data.download
from data.download import DatasetDownloader
%aimport utils.utilities
from utils.utilities import get_cluster_labels

%aimport models.cluster
from models.cluster import get_clustering_performance
%aimport models.dimensionality_reduction
from models.dimensionality_reduction.TSNEModel import TSNEModel

Load data from disk.

In [2]:
# Resolve the project's data directory to an absolute path, point at the
# preprocessed dataset file inside it, and restore the stored data.
data_dir = os.path.abspath(DatasetDownloader.get_data_dir())
file_path = os.path.join(data_dir, "preprocessed", "preprocessed_data.dat")
dfs = Preprocessor.restore_preprocessed_data_from_disk(file_path)

Calculate distances.

In [3]:
# Cut the restored trips into snippets, then compute euclidean distances on
# those snippets.
# NOTE(review): the variable name suggests 30-second snippets -- confirm the
# actual snippet length in Preprocessor.
trips_cut_per_30_sec = Preprocessor.get_cut_trip_snippets_for_total(dfs)
euclidean_distances = Preprocessor.calculate_distance_for_n2(
    trips_cut_per_30_sec,
    metric="euclidean",
)

Generate t-SNE model.

In [4]:
# Hyperparameters for the initial t-SNE run, gathered in one mapping so they
# can be inspected as a unit before being handed to the model.
# NOTE(review): metric='precomputed' presumably means the model is fitted on a
# distance matrix rather than on raw features -- confirm against TSNEModel.
tsne_params = dict(
    num_dimensions=2,
    perplexity=20,
    early_exaggeration=5.0,
    learning_rate=100,
    num_iterations=500,
    min_grad_norm=0.01,
    random_state=42,  # fixed seed
    angle=0.2,
    metric='precomputed',
    init_method='random',
)
tsne = TSNEModel(**tsne_params)

Prepare distance data for fitting of t-SNE model.

In [5]:
# Columns treated as categorical metadata here; they are dropped so that the
# remaining frame can be passed to the model as the distance data.
categorical_columns = ["mode", "notes", "scripted", "token", "trip_id"]
segment_distance_matrix = euclidean_distances.drop(columns=categorical_columns)

Fit t-SNE model to data and calculate quality measures.

In [7]:
# Fit the t-SNE model on the raw values of the distance frame, then compute
# quality measures from the same values.
# NOTE(review): calculate_quality_measures presumably relies on state set by
# run(), so keep this order -- confirm against TSNEModel.
# NOTE(review): this cell's execution count skips a number; verify the notebook
# still works with "Restart Kernel -> Run All".
tsne_results = tsne.run(segment_distance_matrix.values)
quality_measures = tsne.calculate_quality_measures(segment_distance_matrix.values)

Next steps: integrate `BayesianTSNEOptimizer` and start the optimization, recording the results so that they can be ingested as initialization values at the next start.