In [None]:
!pip install tsfresh hdbscan giotto-tda gudhi kaleido
!pip install --upgrade --user numpy

In [2]:
import numpy as np
import scipy.io as spio
import pandas as pd
from tqdm import tqdm
from glob import glob
from random import randint, seed
import math
import os

from sklearn.metrics import homogeneity_score, v_measure_score
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import _supervised
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from hdbscan import HDBSCAN

import kaleido
import imageio
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from gtda.time_series import SingleTakensEmbedding, TakensEmbedding
from gtda.plotting import plot_point_cloud, plot_diagram
from gtda.homology import VietorisRipsPersistence, WeakAlphaPersistence
from gtda.diagrams import PersistenceEntropy
from gudhi import bottleneck_distance

from tsfresh import extract_features

Uploading data

In [3]:
matdata = spio.loadmat('snp452-data.mat')

code2name = {}
code2class = {}
code2data = {}

data = matdata['X']

codes = []
classes = []

for i in tqdm(range(452)):
    code = matdata['stock'][0][i][0][0][0][0][1:-1]
    codes += [code]

    name = matdata['stock'][0][i][0][0][1][0][1:-1]
    cls = matdata['stock'][0][i][0][0][2][0][1:-1]
    classes += [cls]

    dt = matdata['X'][:, i]
    #print(dt.shape)

    code2name[code] = name
    code2class[code] = cls
    code2data[code] = dt

print()
print(set(classes))
print(len(classes))

100%|██████████| 452/452 [00:00<00:00, 133321.06it/s]


{'Telecommunications Services', 'Utilities', 'Energy', 'Financials', 'Information Technology', 'Health Care', 'Industrials', 'Consumer Staples', 'Consumer Discretionary', 'Materials'}
452





Calculate log-returns

In [5]:
log_return_df = np.log(1 + pd.DataFrame(data.T).pct_change(axis=1).dropna(axis=1, how='all'))
log_return_data = log_return_df.to_numpy()

#code_col = []
#time_col = []
#price_col = []
#for i, row in tqdm(enumerate(log_return_data)):
#    code_col += [codes[i]] * len(row)
#    time_col += list(range(len(row)))
#    price_col += list(row)

#df = pd.DataFrame(data={"code": code_col, "time": time_col, "price": price_col})
#print(df.head())

#extracted_features = extract_features(df, column_id='code', column_sort='time')
#extracted_features.to_csv("log_return_data_tsfresh_features.csv")

Check the size of each class

In [6]:
class2num = dict(zip(list(set(classes)), [0] * 10))

for code in codes:
    class2num[code2class[code]] += 1

class2num

{'Telecommunications Services': 6,
 'Utilities': 32,
 'Energy': 37,
 'Financials': 74,
 'Information Technology': 64,
 'Health Care': 46,
 'Industrials': 59,
 'Consumer Staples': 35,
 'Consumer Discretionary': 70,
 'Materials': 29}

Pick a subset to test Statistical clustering

In [7]:
subdata_ind = []

for i, code in enumerate(codes):
    if code2class[code] in ['Telecommunications Services', 'Materials', 'Utilities']:
        subdata_ind += [i]

len(subdata_ind)

67

In [8]:
log_subdata = log_return_data[subdata_ind]
log_subdata.shape

(67, 1257)

Statistical clustering based on Energy distance

In [None]:
from scipy.spatial import distance_matrix

def energy_distance(Y, Z):
    n = Y.shape[0]
    d1 = distance_matrix(Y, Z)
    d2 = distance_matrix(Y, Y)
    d3 = distance_matrix(Z, Z)

    return d1.sum() * 2 / n ** 2 - d2.sum() / n ** 2 - d3.sum() / n **2

def d_jk(data, h, j, k):
    Y = []
    Z = []

    for t in range(data.shape[1] - h):
        Y += [data[j][t : t + h]]
        Z += [data[k][t : t + h]]

    Y = np.array(Y)
    Z = np.array(Z)

    return energy_distance(Y, Z)

def dist_matrix(data, h):
    D = np.zeros((data.shape[0], data.shape[0]))

    for j in range(data.shape[0]):
        print(j)
        for k in tqdm(range(j, data.shape[0])):
            d = d_jk(data, h, j, k)
            #D[j, k] = D[k, j] = d
            pass

    return D

h = 1

D = dist_matrix(log_subdata, h)
D.shape

In [None]:
!pip install git+https://github.com/fbkarsdorp/diachronic-text-analysis@master
!pip install ete3

In [11]:
from HACluster import Clusterer
import numpy as np
from scipy.cluster.hierarchy import to_tree
from sklearn.metrics import pairwise_distances

clst = Clusterer(D, 'ward', num_clusters=3)
clst.cluster(verbose=0)

In [12]:
len(clst._dendrogram[0].leaves()), len(clst._dendrogram[1].leaves()), len(clst._dendrogram[2].leaves())

(65, 1, 1)

Utilities functions

In [13]:
def smoothing(df, window_n, func:str):
  if func == 'mean':
    return df.rolling(window_n, axis=1).mean().dropna(axis=1, how='all')
  elif func == 'min':
    return df.rolling(window_n, axis=1).min().dropna(axis=1, how='all')
  elif func == 'max':
    return df.rolling(window_n, axis=1).max().dropna(axis=1, how='all')
  elif func == 'median':
    return df.rolling(window_n, axis=1).median().dropna(axis=1, how='all')
  else:
    assert 1 == 0, "Error !!!"

def diff(df, order):
  diff_df = df.diff(axis=1).dropna(axis=1, how='all')
  if order == 1:
    return diff_df
  elif order == 2:
    return diff_df.diff(axis=1).dropna(axis=1, how='all')
  else:
    assert 1 == 2, "No"

def mynormalize(df, func='mean'):
  if func == 'minmax':
    return (df-df.min())/(df.max()-df.min())
  elif func == 'mean':
    return (df-df.mean())/df.std()
  else:
    assert 1 == 2, "No"

Plot and save a few raw time series

In [14]:
N = 452

for i, ts in enumerate(data.T[:N:200]):
    #ts = ts[1:] - ts[:-1]
    fig = px.line(
        x=range(len(ts)),
        y=ts,
        width=800,
        height=400
    )
    fig.show()
    fig.write_image(str(i) + ".png")

Smooth time series

In [15]:
smooth_df = smoothing(pd.DataFrame(data.T), 50, 'mean')
smooth = smooth_df.to_numpy()

Animation related utilities functions

In [16]:
def rotation_matrix(axis, theta):
    """
    Return the rotation matrix associated with counterclockwise rotation about
    the given axis by theta radians.
    """
    axis = np.asarray(axis)
    axis = axis / math.sqrt(np.dot(axis, axis))
    a = math.cos(theta / 2.0)
    b, c, d = -axis * math.sin(theta / 2.0)
    aa, bb, cc, dd = a * a, b * b, c * c, d * d
    bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d
    return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)],
                     [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
                     [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])

axis = [0, 0, 1]
theta = -np.pi / 100

def animation_save(embeddings_pca, title):
    os.makedirs(title, exist_ok=True)

    fig = go.Figure(data=[go.Scatter3d(x=[], y=[], z=[], mode="markers")])

    X = np.vstack([(rotation_matrix(axis, theta * k) @ embeddings_pca.T).T[:, 0] for k in range(200)])
    Y = np.vstack([(rotation_matrix(axis, theta * k) @ embeddings_pca.T).T[:, 1] for k in range(200)])
    Z = np.vstack([(rotation_matrix(axis, theta * k) @ embeddings_pca.T).T[:, 2] for k in range(200)])

    lng = embeddings_pca.shape[0]

    frames = [go.Frame(data=[go.Scatter3d(
        x=X[k],
        y=Y[k],
        z=Z[k],
        marker=dict(color='rgba(0, 0, 255, 1)', line=dict(width=0), size=np.ones_like(X[k]) * sz * 1.5))],
        traces=[0],
        name=f'frame{k}'
    ) for k in range(200)]

    fig.update(frames=frames)

    fig.update_layout(
        updatemenus=[dict(type="buttons",
            buttons=[
                dict(label="Play",
                method="animate",
                args=[
                    None, dict(frame=dict(duration=30, redraw=True, fromcurrent=True, mode='immediate'), transition=dict(
                                      duration=0,
                                      easing="linear"
                                  ))
                ])
            ])
        ])

    fig.show()

    for k in tqdm(range(200)):
        fig = px.scatter_3d(
            x=X[k],
            y=Y[k],
            z=Z[k])
        fig.update_traces(
            marker=dict(color='rgba(0, 0, 255, 1)',
                        line=dict(width=0),
                        size=np.ones_like(X[k]) * sz * 1.5)
        )

        fig.write_image(title + "/" + str(k) + ".png")

    with imageio.get_writer(title + '_topology.gif', mode='I') as writer:
        for k in range(200):
            image = imageio.imread(title + "/" + str(k) + ".png")
            writer.append_data(image)

1) Plotting and saving smoothed time series

2) Extracting time delay embeddings

3) Using PCA for visialization

4) Saving plots/animation of point clouds of time delay embeddings after PCA

5) Obtaining persistence diagrams from Vietoris-Rips filtrations of point clouds of time delay embeddings after PCA

In [17]:
max_embedding_dimension = 30
max_time_delay = 35
stride = 5

N = 452
point_clouds = []

for i, ts in enumerate(smooth[:N:200]):
    #ts = ts[1:] - ts[:-1]
    sz = 3

    fig = px.line(
        x=range(len(ts)),
        y=ts,
        width=800,
        height=400
    )
    fig.show()
    fig.write_image(str(i) + "_smooth.png")

    embedder = SingleTakensEmbedding(
        parameters_type="search",
        time_delay=max_time_delay,
        dimension=max_embedding_dimension,
        stride=stride,
    )

    embeddings = embedder.fit_transform(ts)
    #print(embeddings.shape)
    pca = PCA(n_components=min(embeddings.shape[1], 3))
    embeddings_pca = pca.fit_transform(embeddings)

    if embeddings_pca.shape[1] == 3:
        fig = px.scatter_3d(
            x=embeddings_pca[:, 0],
            y=embeddings_pca[:, 1],
            z=embeddings_pca[:, 2],
            size=np.ones_like(embeddings_pca[:, 0]) * sz * 1.5,
            size_max=sz * 1.5,
            #width=800,
            #height=400
        )
        animation_save(embeddings_pca, str(i))
    else:
        fig = px.scatter(
            x=embeddings_pca[:, 0],
            y=embeddings_pca[:, 1],
            size=np.ones_like(embeddings_pca[:, 0]) * sz,
            size_max=sz,
            #width=800,
            #height=400
        )
        fig.write_image(str(i) + "_topology.png")

        embeddings_pca_ = np.zeros((embeddings_pca.shape[0], 3))
        embeddings_pca_[:, 1:] = embeddings_pca
        embeddings_pca = embeddings_pca_

    fig.update_traces(marker=dict(color='rgba(0, 0, 255, 1)', line=dict(width=0)))
    fig.show()

    point_clouds += [embeddings_pca]

VR = VietorisRipsPersistence(homology_dimensions=[0, 1, 2])
diagrams = VR.fit_transform(point_clouds)

Output hidden; open in https://colab.research.google.com to view.

Plotting and saving diagrams

In [18]:
fig = plot_diagram(diagrams[0])
fig.write_image("0_diag.png")

fig = plot_diagram(diagrams[1])
fig.write_image("1_diag.png")

fig = plot_diagram(diagrams[2])
fig.write_image("2_diag.png")

Calculating Persistence Entropy of diagrams

In [19]:
PE = PersistenceEntropy()
features = PE.fit_transform(diagrams)
#print(features.shape)

In [20]:
#pd.DataFrame(features).to_csv("PE.csv")
#features = pd.read_csv("PE.csv", index_col=0)
#features = features.to_numpy()

Plotting and saving Persistence Entropy for different time series classes

In [21]:
os.makedirs("PE", exist_ok=True)

fig = go.Figure(data=[go.Scatter3d(x=[], y=[], z=[], mode="markers")])

X = np.vstack([(rotation_matrix(axis, theta * k) @ features.T).T[:, 0] for k in range(200)])
Y = np.vstack([(rotation_matrix(axis, theta * k) @ features.T).T[:, 1] for k in range(200)])
Z = np.vstack([(rotation_matrix(axis, theta * k) @ features.T).T[:, 2] for k in range(200)])
lng = embeddings_pca.shape[0]

colors = ['#0d0887', '#46039f', '#7201a8', '#9c179e', '#bd3786', '#d8576b', '#ed7953', '#fb9f3a', '#fdca26', '#f0f921']
class2color = dict(zip(set(classes), colors))

frames = [go.Frame(data=[go.Scatter3d(
    x=X[k],
    y=Y[k],
    z=Z[k],
    marker=dict(color=[class2color[code2class[code]] for code in codes], line=dict(width=0), size=np.ones_like(X[k]) * sz * 1.5))],
    traces=[0],
    name=f'frame{k}'
) for k in range(200)]

fig.update(frames=frames)

fig.update_layout(
    updatemenus=[dict(type="buttons",
    buttons=[
        dict(label="Play",
             method="animate",
             args=[
                 None, dict(frame=dict(duration=30, redraw=True, fromcurrent=True, mode='immediate'),
                            transition=dict(
                                duration=0,
                                easing="linear"
                                ))
                 ])
        ])
    ])

fig.show()

for k in tqdm(range(200)):
    fig = px.scatter_3d(
        x=X[k],
        y=Y[k],
        z=Z[k])
    fig.update_traces(
        marker=dict(color=[class2color[code2class[code]] for code in codes],
            line=dict(width=0),
                size=np.ones_like(X[k]) * sz * 1.5)
        )

    fig.write_image("PE/" + str(k) + ".png")

with imageio.get_writer('PE.gif', mode='I') as writer:
    for k in range(200):
        image = imageio.imread("PE/" + str(k) + ".png")
        writer.append_data(image)

100%|██████████| 200/200 [07:42<00:00,  2.31s/it]


