In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from scipy.signal import detrend
import seaborn as sns
from dtaidistance import dtw
import numpy as np
from itertools import combinations



In [None]:
# Load the raw per-device samples.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading has finished.
with open('../data/raw/timeseries_samples.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
# Preprocess every device's series: standardise, detrend, then smooth.
# NOTE: this mutates the loaded frames and rebinds the entries of `data`, so it
# is meant to run once per load; running it again re-applies the smoothing.
SMOOTHING_WINDOW = 5  # number of consecutive samples averaged by the rolling mean

# list(...) takes a snapshot of the items because entries of `data` are
# reassigned inside the loop.
for k, v in list(data.items()):
    # scale: zero mean, unit standard deviation
    v.value = (v.value - v.value.mean()) / v.value.std()

    # detrend: scipy.signal.detrend with its default settings
    v.value = detrend(v.value)

    # smooth: the previous `v = v.rolling(5).mean()` only rebound the loop
    # variable, so the smoothed frame was thrown away. Store it back in `data`.
    # Rows without a full window (NaN after the rolling mean) are dropped.
    data[k] = v.rolling(SMOOTHING_WINDOW).mean().dropna()

In [None]:
# Place the per-device frames side by side (concatenation along the column
# axis) and name each column after the device key it came from.
data_df = pd.concat(list(data.values()), axis=1)
data_df.columns = list(data)

In [None]:
# Tag every row with the calendar period it falls in, at several granularities,
# so the frame can be grouped by these columns.
# The periods are derived from the index with one vectorised call per frequency
# instead of resetting the index and converting the timestamps one by one.
# Assumes data_df is indexed by a DatetimeIndex (the values were previously read
# back through the 'datetime' column produced by reset_index()).
PERIOD_COLUMNS = {
    'freq_5min': '5min',
    'freq_hour': 'H',
    'freq_date': 'D',
    'freq_week': 'W',
    'freq_month': 'M',
}
for column, freq in PERIOD_COLUMNS.items():
    data_df[column] = data_df.index.to_period(freq)

In [None]:
# Preview the first rows of the combined frame, including the period columns.
data_df.head()

In [None]:
# Plot the per-month mean of the frame.
(
    data_df
    .groupby('freq_month')
    .mean()
    .plot()
)
# data_df[data.keys()].plot(subplots=True, layout=(2, 5), figsize=(12, 8))
# sns.lineplot(data_df.groupby('freq_month').mean())

In [None]:
# Plot the per-week mean of the frame.
(
    data_df
    .groupby('freq_week')
    .mean()
    .plot()
)

In [None]:
# Plot the per-week mean of the frame.
# NOTE(review): this is the same weekly plot as the preceding cell; possibly an
# accidental duplicate or a different frequency was intended. Confirm.
(
    data_df
    .groupby('freq_week')
    .mean()
    .plot()
)

In [None]:
# Plot the per-day mean of the frame.
(
    data_df
    .groupby('freq_date')
    .mean()
    .plot()
)

In [None]:
# Pairwise DTW distance between devices, computed on their monthly median series.
# `distances` is a long list of (device1, device2, distance) rows.
SELF_DISTANCE = 1e-10  # value stored for a device paired with itself

# The monthly medians do not depend on the pair being compared, so compute them
# once instead of re-running the groupby for every combination.
monthly_median = data_df.groupby('freq_month')[list(data.keys())].median()

distances = []
for device1, device2 in combinations(data.keys(), 2):
    # Keep only the months for which both devices have a value.
    _df = monthly_median[[device1, device2]].dropna()
    distance = dtw.distance_fast(_df[device1].values, _df[device2].values, use_pruning=True)
    # Record the same value for both orderings of the pair.
    distances.extend(((device1, device2, distance), (device2, device1, distance)))
distances.extend((device1, device1, SELF_DISTANCE) for device1 in data.keys())

In [None]:
# Summary statistics over every recorded distance row.
(
    pd.DataFrame(distances, columns=['device1', 'device2', 'dta_distance'])
    .dta_distance
    .describe()
)

In [None]:
# Reshape the long (device1, device2, distance) rows into a device1 x device2 table.
(
    pd.DataFrame(distances, columns=['device1', 'device2', 'dta_distance'])
    .pivot(index='device1', columns=['device2'], values=['dta_distance'])
)

In [None]:
from dtaidistance import dtw_visualisation as dtwvis

# Visualise how DTW aligns the daily mean series of two devices.
series_a, series_b = 'device_0', 'device_2'

# Select the two devices before aggregating so that only their columns are
# averaged (the same pattern the daily-mean cell further down uses), then drop
# the days on which either device has no value.
x = data_df.groupby('freq_date')[[series_a, series_b]].mean().dropna()

fig, ax = plt.subplots(2, 1)
path = dtw.warping_path(x[series_a].values, x[series_b].values)
dtwvis.plot_warping(x[series_a].values, x[series_b].values, path,
                    fig=fig, axs=ax)
# Title names the series actually plotted.
ax[0].set_title(f'DTW warping path between {series_a} and {series_b}')
fig.tight_layout()

In [None]:
# Per-day mean of device_0 and device_1.
(
    data_df
    .groupby('freq_date')[['device_0', 'device_1']]
    .mean()
)

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# Spectral clustering into two groups. With 'precomputed_nearest_neighbors' the
# matrix passed to fit / fit_predict is interpreted as precomputed distances,
# from which a nearest-neighbour graph is built.
# random_state is fixed so that re-running the notebook yields the same labels.
# NOTE(review): n_neighbors is left at its default. Confirm it is smaller than
# the number of devices, otherwise every device neighbours every other one.
clustering = SpectralClustering(
    n_clusters=2,
    assign_labels='discretize',
    affinity='precomputed_nearest_neighbors',
    random_state=0,
)

In [None]:
from scipy.sparse import csr_matrix

# Device-by-device distance table built from the long `distances` rows.
# The order of its rows is the order of the returned labels.
distance_matrix = (
    pd.DataFrame(distances, columns=['device1', 'device2', 'dta_distance'])
    .pivot(index='device1', columns='device2', values='dta_distance')
)

# `labels` stays a plain array of cluster ids, one per row of distance_matrix.
labels = clustering.fit_predict(distance_matrix.values)

# Show each label next to its device instead of a bare array whose order is implicit.
pd.Series(labels, index=distance_matrix.index, name='cluster')

In [None]:
# from sklearn.metrics import silhouette_score

In [None]:
# vs = []
# for k, v in data.items():
#     v['device'] = k
#     vs.append(v)

# x = pd.concat(vs)
# x.head()

# from tsfresh import extract_features
# from tsfresh.feature_extraction import MinimalFCParameters

# df = x.reset_index()
# extracted_features = extract_features(df, column_id='device', column_sort='datetime',
#                                       default_fc_parameters=MinimalFCParameters())

# silhouette_score(extracted_features, labels)

In [None]:
# from sklearn.cluster import KMeans
# KMeans(2).fit_predict(extracted_features)

In [None]:
# vs = []
# for k, v in data.items():
#     v['device'] = k
#     vs.append(v)

In [None]:
# x = pd.concat(vs)
# x.head()

In [None]:
# from tsfresh import extract_features
# from tsfresh.feature_extraction import MinimalFCParameters

# df = x.reset_index()
# extracted_features = extract_features(df, column_id='device', column_sort='datetime',
#                                       default_fc_parameters=MinimalFCParameters())

In [None]:
# extracted_features

In [None]:
# from sklearn.cluster import DBSCAN
# from sklearn.preprocessing import StandardScaler

# # prep = StandardScaler()
# kmeans = DBSCAN(eps=3, min_samples=2)

# # scaled_data = prep.fit_transform(extracted_features)
# # kmeans.fit(scaled_data)

In [None]:
# scaled_data = prep.fit_transform(extracted_features)

In [None]:
# kmeans.fit(extracted_features.values)

In [None]:
# # pd.DataFrame(distances, columns=['device1', 'device2', 'dta_distance']).pivot(index='device1', columns=['device2'], values=['dta_distance']).sparse.to_coo()
# x = pd.DataFrame(distances, columns=['device1', 'device2', 'dta_distance']).pivot(index='device1', columns=['device2'], values=['dta_distance']).fillna(0)
# sparse_df = x.astype(pd.SparseDtype("float64",0))
# clustering.fit(sparse_df.sparse.to_coo().tocsr())

In [None]:
# ## Dynamic time warping

# for k, v in data.items():
#     for k1, v1 in data.items():
#         print(k, k1, dtw.distance_fast(v.value.values, v1.value.values, use_pruning=True))

In [None]:
# from statsmodels.tsa import stattools
# from statsmodels.tsa import seasonal

In [None]:
# x = data_df[['freq_date', 'device_0']].set_index('freq_date')
# x.index = x.index.to_timestamp()
# s = seasonal.seasonal_decompose(x.dropna(), period=1)

In [None]:
# s.seasonal.plot()

In [None]:
# for k, v in data.items():
#     print(k, len(v))
#     v['freq_5min'] = v.reset_index()['datetime'].apply(lambda x: str(x.to_period('5min'))).values
#     v['freq_hour'] = v.reset_index()['datetime'].apply(lambda x: str(x.to_period('H'))).values
#     v['freq_date'] = v.reset_index()['datetime'].apply(lambda x: str(x.to_period('D'))).values
#     v['freq_week'] = v.reset_index()['datetime'].apply(lambda x: str(x.to_period('W'))).values
#     v['freq_month'] = v.reset_index()['datetime'].apply(lambda x: str(x.to_period('M'))).values

In [None]:
# data['device_0']

In [None]:
# data_freq_5min = [
#     v[['freq_5min', 'value']].groupby('freq_5min').mean()
#     for k, v in data.items()
# ]


In [None]:
# x = pd.concat(data.values(), axis=1)
# # x.columns = [f"device_{x}" for x in range(10)]
# x

In [None]:
# x.device_3.plot()

In [None]:
# ## prepare data

# vs = []
# for k, v in data.items():
#     _v = v.reset_index()
#     _v.datetime = _v.datetime.round('1s')
#     _v.set_index('datetime', inplace=True)
#     v = _v


In [None]:
# x = pd.concat(vs, axis=1)
# x.columns = data.keys()

In [None]:
# fig, ax = plt.subplots(figsize = (12, 6))

# for k, v in data.items():
#     ax.plot(v, alpha=0.5)

In [None]:
# ## scale data

# for k, v in data.items():
#     v.value = (v.value - v.value.mean()) / v.value.std()

In [None]:
# ## detrending

# from scipy.signal import detrend

# for k, v in data.items():
#     v.value = detrend(v.value)

In [None]:
# ## smooth

# for k, v in data.items():
#     v = v.rolling(5).mean().dropna()

In [None]:
# fig, ax = plt.subplots(figsize = (12, 6))

# for k, v in data.items():
#     ax.plot(v, alpha=0.5)

In [None]:
# ## Euclidean distance
# from scipy.spatial import distance

# for k, v in data.items():
#     for k1, v1 in data.items():
#         print(k, k1, distance.cdist(v.value.values.reshape(-1, 1), v1.value.values.reshape(-1, 1)))

In [None]:
# ## Dynamic time warping

# from dtaidistance import dtw
# import numpy as np

# for k, v in data.items():
#     for k1, v1 in data.items():
#         print(k, k1, dtw.distance_fast(v.value.values, v1.value.values, use_pruning=True))
