## Clustering of time series data

In [None]:
import pandas as pd
import numpy as np
import glob
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def read_filedata(filenames):
    """Load each CSV in *filenames* and return the data frames as a list.

    Every file is parsed with its first row as the header and without
    promoting any column to the index. The returned list follows the
    order of *filenames*.
    """
    return [pd.read_csv(path, index_col=None, header=0) for path in filenames]

def time_series_data(dataframes, target_col=None):
    """Extract one column from each data frame and pad the series to equal length.

    Args:
        dataframes: Iterable of data frames; each must contain ``target_col``.
        target_col: Label of the column holding the time series values.

    Returns:
        A NumPy array built from the padded series, one row per data frame,
        in the order the data frames were given. Series shorter than the
        longest one are extended by repeating their last value.

    Raises:
        IndexError: If a series is empty while another one is not, because
            there is no last value to repeat.
    """
    # One plain Python list of values per data frame.
    tsdata = [df[target_col].values.tolist() for df in dataframes]

    # Length of the longest series (0 when no data frames were given).
    len_max = max((len(ts) for ts in tsdata), default=0)

    # Pad once, after all series are collected; series that are already
    # long enough are left untouched so ts[-1] is only read when needed.
    tsdata = [
        ts + [ts[-1]] * (len_max - len(ts)) if len(ts) < len_max else ts
        for ts in tsdata
    ]
    return np.array(tsdata)

def transform_vector(time_series_array):
    """Turn every series into a column vector and stack them on a new first axis.

    For one-dimensional series of equal length this yields an array of
    shape (number of series, series length, 1).
    """
    columns = []
    for series in time_series_array:
        values = np.array(series)
        # View the series as a single row, then transpose it into a column.
        as_row = values.reshape((1, len(values)))
        columns.append(as_row.T)
    return np.stack(columns, axis=0)


#filenames = sorted(glob.glob('sample/sample_data*.csv'))
#df = read_filedata(filenames=filenames)
#tsdata = time_series_data(dataframes=df, target_col='data')
#stack_data = transform_vector(time_series_array=tsdata)

In [None]:
# Import data
import csv

# Each CSV row is read as one time series. Every value is converted to float
# and wrapped in its own single-element list, so the nesting is
# series -> time step -> [value].
results = []

# newline='' is what the csv module documentation recommends for files
# that are handed to csv.reader.
with open('test_CL0.csv', 'r', newline='') as f:
    lines = csv.reader(f)
    for line in lines:
        # csv.reader yields an empty list for a blank line; skip those so no
        # zero-length series ends up in the data set.
        if not line:
            continue
        results.append([[float(i)] for i in line])

stack_data = results

In [None]:
# Soft-DTW-k-means
import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler

# The same seed is used for NumPy's global RNG and the estimator's random_state.
seed = 0
np.random.seed(seed)

# Number of clusters, defined once so the model, the plotting loop and the
# subplot grid cannot drift apart.
n_clusters = 4

# Rescale the series with the scaler configured for mean 0.0 and std 1.0.
stack_data = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(stack_data)

# NOTE(review): newer tslearn examples appear to spell this key "gamma" --
# confirm against the installed tslearn version.
sdtw_km = TimeSeriesKMeans(n_clusters=n_clusters, metric="softdtw", metric_params={"gamma_sdtw": .01},
                           verbose=True, random_state=seed)
y_pred = sdtw_km.fit_predict(stack_data)

# Visualization: one subplot per cluster, member series in translucent black
# and the cluster center in red.
plt.figure(figsize=(30,36))
for yi in range(n_clusters):
    plt.subplot(n_clusters, 1, 1 + yi)
    for xx in stack_data[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()

In [None]:
# k-shape
# The same seed is used for NumPy's global RNG and the estimator's random_state.
seed = 0
np.random.seed(seed)

# Number of clusters, defined once so the model, the plotting loop and the
# subplot grid cannot drift apart.
n_clusters = 5

# To calculate cross-correlation, the data needs to be normalized.
# TimeSeriesScalerMeanVariance is the class that normalizes the data.
stack_data = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(stack_data)

# Instantiate the KShape class and assign each series to a cluster.
ks = KShape(n_clusters=n_clusters, n_init=10, verbose=True, random_state=seed)
y_pred = ks.fit_predict(stack_data)

# Visualization: one subplot per cluster, member series in translucent black.
plt.figure(figsize=(30,36))
for yi in range(n_clusters):
    plt.subplot(n_clusters, 1, 1 + yi)
    for xx in stack_data[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    # The cluster-center overlay is currently disabled:
    #plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()

In [None]:
# Export cluster results
# Wrap the predicted labels in a DataFrame and write it out as CSV.
df = pd.DataFrame(data=y_pred)
df.to_csv(path_or_buf="file_path.csv")

## Calculation of the number of clusters by the elbow method

In [None]:
# NOTE(review): this cell relies on `seed` and `stack_data` assigned in
# earlier cells.
distortions = []

# Candidate cluster counts (1 through 10), shared by the fitting loop and
# the x-axis of the plot so the two cannot get out of sync.
cluster_counts = range(1, 11)

for i in cluster_counts:
    ks = KShape(n_clusters=i, n_init=10, verbose=True, random_state=seed)
    # Execute the clustering calculation
    ks.fit(stack_data)
    # The fitted model's inertia_ attribute is recorded as the distortion
    distortions.append(ks.inertia_)

plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()