In [None]:
# Import modules

import sys
import pandas as pd
import numpy as np
import random
import pickle
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import tsfresh
from sklearn import preprocessing
from sklearn.cluster import KMeans
from scipy import stats

%run MyFunctions.ipynb

In [None]:
# tsfresh feature columns kept from the full extraction.
# NOTE(review): this list was commented out even though the extraction cells select
# columns with `my_features`; it is restored so a fresh kernel can run top to bottom.
# Confirm that MyFunctions.ipynb does not define a different `my_features`.
my_features = ['Hb__absolute_sum_of_changes', 'Hb__autocorrelation__lag_0', 'Hb__autocorrelation__lag_1', 'Hb__autocorrelation__lag_2',
               'Hb__autocorrelation__lag_3', 'Hb__autocorrelation__lag_4', 'Hb__autocorrelation__lag_5', 'Hb__autocorrelation__lag_6',
               'Hb__autocorrelation__lag_7', 'Hb__autocorrelation__lag_8', 'Hb__autocorrelation__lag_9', 'Hb__kurtosis',
               'Hb__linear_trend__attr_"intercept"', 'Hb__linear_trend__attr_"slope"', 'Hb__linear_trend__attr_"stderr"',
               'Hb__mean_abs_change', 'Hb__sample_entropy', 'Hb__skewness', 'Hb__variance']

# Load the two weekly tables (the "_f" and "_m" datasets) from their JSON dumps.
df_weekly_f = pd.read_json('X:/df_weekly_f_5000.json')
df_weekly_m = pd.read_json('X:/df_weekly_m_5000.json')

In [None]:
# Reshape the "f" weekly table to long format: columns become KeyID, the old
# column axis becomes the first ('Time') column, and missing values are dropped.
y_f = (
    df_weekly_f
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Time'})
)
y_f = y_f.melt(
    id_vars=y_f.columns[0],
    value_vars=y_f.columns[1:],
    var_name='KeyID',
    value_name='Hb',
).dropna()

# Run the tsfresh extraction, one time series per KeyID ordered by Time.
extracted_features_f = tsfresh.extract_features(
    y_f,
    column_id="KeyID",
    column_sort="Time",
    column_value='Hb',
)

# Keep only the columns listed in `my_features` and cache them as JSON.
df_features_f = extracted_features_f.loc[:, my_features]
df_features_f.to_json('X:/df_features_f.json')

In [None]:
# Reshape the "m" weekly table to long format: columns become KeyID, the old
# column axis becomes the first ('Time') column, and missing values are dropped.
y_m = (
    df_weekly_m
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Time'})
)
y_m = y_m.melt(
    id_vars=y_m.columns[0],
    value_vars=y_m.columns[1:],
    var_name='KeyID',
    value_name='Hb',
).dropna()

# Run the tsfresh extraction, one time series per KeyID ordered by Time.
extracted_features_m = tsfresh.extract_features(
    y_m,
    column_id="KeyID",
    column_sort="Time",
    column_value='Hb',
)

# Keep only the columns listed in `my_features` and cache them as JSON.
df_features_m = extracted_features_m.loc[:, my_features]
df_features_m.to_json('X:/df_features_m.json')

In [None]:
# Re-read the weekly tables and the cached feature tables from disk
# (presumably so the cells below can run without repeating the tsfresh extraction).
df_weekly_f = pd.read_json(path_or_buf='X:/df_weekly_f_5000.json')
df_weekly_m = pd.read_json(path_or_buf='X:/df_weekly_m_5000.json')

df_features_f = pd.read_json(path_or_buf='X:/df_features_f.json')
df_features_m = pd.read_json(path_or_buf='X:/df_features_m.json')

# Narrow both feature tables to the linear-trend intercept and slope, in that order.
df_features_f = df_features_f[
    ['Hb__linear_trend__attr_"intercept"', 'Hb__linear_trend__attr_"slope"']
]
df_features_m = df_features_m[
    ['Hb__linear_trend__attr_"intercept"', 'Hb__linear_trend__attr_"slope"']
]

In [None]:
# Normalize both feature tables in place of the originals.
# normalizeFeatures is not defined in this notebook (presumably it comes from
# MyFunctions.ipynb via %run); note that re-running this cell normalizes again.
df_features_f = normalizeFeatures(df_features_f)
df_features_m = normalizeFeatures(df_features_m)

# Compute centroids for 2, 3, 4 and 5 clusters.
# NOTE(review): later cells index the result as [0]..[3], so calcCentroids presumably
# returns one entry per requested cluster count, in the order given — confirm.
centroidsf = calcCentroids(df_features_f, df_weekly_f, [2, 3, 4, 5])
centroidsm = calcCentroids(df_features_m, df_weekly_m, [2, 3, 4, 5])

# Persist the centroid results with pickle.
with open('X:/clusterresultsglobal2/centroidsf.pkl', 'wb') as f:
     pickle.dump(centroidsf, f)
        
with open('X:/clusterresultsglobal2/centroidsm.pkl', 'wb') as f:
     pickle.dump(centroidsm, f)

In [None]:
def calcDistances(df_features, num_clust, centroids, df, w=5, random_state=None):
    """Cluster the feature rows with k-means and compute each series' DTW distance
    to the centroid of the cluster it was assigned to.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature table KMeans is fitted on; its index supplies the series ids.
    num_clust : int
        Number of k-means clusters.
    centroids : pandas.DataFrame
        Centroid series, looked up with ``centroids.loc[c_id]`` for
        ``c_id`` in ``0 .. num_clust - 1``.
    df : pandas.DataFrame
        Series table, looked up with ``df.loc[s_id]`` for every id in
        ``df_features.index``.
    w : int, default 5
        Passed unchanged to ``DTWDistance`` (presumably the warping window — confirm).
    random_state : int or None, default None
        Passed to ``KMeans``. ``None`` keeps the previous unseeded behaviour;
        pass an int to make the cluster assignment reproducible.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df_features`` with columns ``'cluster'`` and ``'distance'``.

    Notes
    -----
    NOTE(review): k-means label numbers are arbitrary for each fit. This function
    assumes label ``c_id`` from the fit below corresponds to row ``c_id`` of
    ``centroids`` — verify against how ``calcCentroids`` numbers its clusters.
    """
    kmeans = KMeans(n_clusters=num_clust, random_state=random_state)
    kmeans.fit(df_features)
    labels = kmeans.predict(df_features)

    distances = pd.DataFrame(index=df_features.index, columns=['cluster', 'distance'])
    for c_id in range(num_clust):
        c_row = centroids.loc[c_id]
        member_ids = df_features.index[labels == c_id]
        for s_id in member_ids:
            dist = DTWDistance(df.loc[s_id], c_row, w)
            # Label-based assignment: avoids building a full boolean mask over the
            # index for every series (which made the loop quadratic in the row count).
            distances.loc[s_id, 'cluster'] = c_id
            distances.loc[s_id, 'distance'] = dist
    return distances

In [None]:
# DTW distance of every "f" series to its cluster centroid, for k = 2..5.
# centroidsf was built for the cluster counts [2, 3, 4, 5], so entry i belongs to
# k = i + 2. The k = 2 run must therefore use centroidsf[0] (it previously used
# centroidsf[1], i.e. the 3-cluster centroids).
# NOTE(review): distances_f2 is written to 'clusterresultsglobal2' while the other
# results go to 'clusterresultsglobal' — confirm which directory is intended.
distances_f2 = calcDistances(df_features_f, 2, centroidsf[0], df_weekly_f)
distances_f2.to_json('X:/clusterresultsglobal2/distances_f2.json')

distances_f3 = calcDistances(df_features_f, 3, centroidsf[1], df_weekly_f)
distances_f3.to_json('X:/clusterresultsglobal/distances_f3.json')

distances_f4 = calcDistances(df_features_f, 4, centroidsf[2], df_weekly_f)
distances_f4.to_json('X:/clusterresultsglobal/distances_f4.json')

distances_f5 = calcDistances(df_features_f, 5, centroidsf[3], df_weekly_f)
distances_f5.to_json('X:/clusterresultsglobal/distances_f5.json')

In [None]:
# DTW distance of every "m" series to its cluster centroid, for k = 2..5.
# centroidsm[i] is paired with k = i + 2; each result is written to JSON right
# after it is computed.
distances_m2 = calcDistances(
    df_features=df_features_m, num_clust=2, centroids=centroidsm[0], df=df_weekly_m
)
distances_m2.to_json(path_or_buf='X:/clusterresultsglobal/distances_m2.json')

distances_m3 = calcDistances(
    df_features=df_features_m, num_clust=3, centroids=centroidsm[1], df=df_weekly_m
)
distances_m3.to_json(path_or_buf='X:/clusterresultsglobal/distances_m3.json')

distances_m4 = calcDistances(
    df_features=df_features_m, num_clust=4, centroids=centroidsm[2], df=df_weekly_m
)
distances_m4.to_json(path_or_buf='X:/clusterresultsglobal/distances_m4.json')

distances_m5 = calcDistances(
    df_features=df_features_m, num_clust=5, centroids=centroidsm[3], df=df_weekly_m
)
distances_m5.to_json(path_or_buf='X:/clusterresultsglobal/distances_m5.json')

In [None]:
# Preview the first rows of the "f" weekly table.
df_weekly_f.head()

In [None]:
# Column labels of the "f" weekly table, used below as the x values of the per-row
# linear fits. NOTE(review): the same X is also used for df_weekly_m — this assumes
# both tables share the same column labels; confirm.
X = list(df_weekly_f.columns)

In [None]:
def getSlopeIntercept(X, Y):
    """Least-squares line through the non-missing points of one series.

    Parameters
    ----------
    X : sequence
        x values (e.g. the column labels of the weekly table).
    Y : array-like of float
        y values; NaN marks a missing observation.

    Returns
    -------
    (intercept, slope) : tuple of float
        Note the order: intercept first. ``(nan, nan)`` is returned when fewer
        than two non-missing points are available, since no line can be fitted.
    """
    y = np.asarray(Y, dtype=float)
    x = np.asarray(X)
    valid = ~np.isnan(y)

    if len(x) == len(y):
        # Drop the x value of every missing y so each observation stays paired
        # with its own x, even when the gaps are in the middle of the series.
        x = x[valid]
    else:
        # Lengths differ: keep the earlier behaviour of pairing the remaining
        # y values with the leading x values.
        x = x[:int(valid.sum())]
    y = y[valid]

    if len(y) < 2:
        return np.nan, np.nan

    fit = stats.linregress(x, y)
    return fit.intercept, fit.slope

In [None]:
# Spot check: (intercept, slope) for the single series with index label 'DK.00000162'.
getSlopeIntercept(X, df_weekly_f.loc['DK.00000162'])

In [None]:
# Fit every weekly series once and split the (intercept, slope) pair into two
# columns; previously each row was regressed twice, once per output column.
# result_type='expand' turns the returned tuples into columns 0 (intercept) and 1 (slope).
fits_f = df_weekly_f.apply(lambda row: getSlopeIntercept(X, row), axis=1, result_type='expand')
df_features_f['intercept2'] = fits_f[0]
df_features_f['slope2'] = fits_f[1]

# The "m" table is fitted against its own column labels rather than those of the
# "f" table; the result is identical whenever both tables share the same columns.
X_m = list(df_weekly_m.columns)
fits_m = df_weekly_m.apply(lambda row: getSlopeIntercept(X_m, row), axis=1, result_type='expand')
df_features_m['intercept2'] = fits_m[0]
df_features_m['slope2'] = fits_m[1]

In [None]:
# Rename the first four columns by position (tsfresh intercept/slope, then the
# linregress intercept/slope). Any further columns are kept as they are, so the
# cell can be re-run after later cells have added columns instead of raising a
# length-mismatch ValueError.
df_features_f.columns = (
    ['intercept1', 'slope1', 'intercept2', 'slope2'] + list(df_features_f.columns[4:])
)

In [None]:
# Compare the two slope estimates: absolute difference and ratio (slope1 / slope2).
# NOTE(review): if the normalizeFeatures cell ran before this one, slope1 has been
# normalized while slope2 is computed from the raw weekly values — confirm the two
# are on a comparable scale.
df_features_f['slopediff'] = (df_features_f['slope1'] - df_features_f['slope2']).abs()
df_features_f['sloperatio'] = df_features_f['slope1'].div(df_features_f['slope2'])

In [None]:
# Show the five rows where the two slope estimates disagree the most.
df_features_f.sort_values(by='slopediff', ascending=False).head(n=5)