# Modelling

## 1. Import packages

In [21]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly_calplot
from tslearn.clustering import TimeSeriesKMeans

In [22]:
# Directory that holds the processed input files.
# It can be overridden with the DS_LAB_DATA_DIR environment variable so the
# notebook is not tied to one machine; the default is the original local folder.
# os.path.join(..., '') guarantees a trailing separator, because file names are
# appended with plain string concatenation (data_input_path + '<file>.csv').
data_input_path = os.path.join(
    os.environ.get(
        'DS_LAB_DATA_DIR',
        '/Users/szejozsef00/Desktop/MSC/MSC 2. félév/DS Lab I/project/data/processed/',
    ),
    '',
)

## 2. Load the data

In [23]:
# Read the semicolon-separated file (decimal comma), parse the DATETIME
# column into timestamps and replace missing values with 0.
df = (
    pd.read_csv(data_input_path + 'standardized_data.csv', sep=';', decimal=',')
    .assign(DATETIME=lambda frame: pd.to_datetime(frame['DATETIME']))
    .fillna(0)
)

## 3. Transform the data

In [24]:
# Derive the calendar day (datetime.date) of every timestamp into a DATE column
df['DATE'] = df['DATETIME'].pipe(pd.to_datetime).dt.date

In [25]:
# Display the latest calendar day present in the loaded data.
df['DATE'].max()

datetime.date(2010, 9, 24)

In [26]:
# Bounds of the analysed window; the filter cell keeps rows with
# min_date_1 <= DATE < max_date_1 (upper bound excluded).
min_date_1 = datetime.date(year=2010, month=6, day=1)
max_date_1 = datetime.date(year=2010, month=9, day=1)

In [27]:
# Keep an independent (deep) copy of the full data set before `df` is narrowed down.
# NOTE: re-running this cell after the filter cell overwrites all_df with the filtered data.
all_df = df.copy()

In [28]:
# Restrict the working frame to the analysed window (lower bound included, upper bound excluded)
df = all_df.loc[
    lambda frame: (frame['DATE'] >= min_date_1) & (frame['DATE'] < max_date_1)
]

In [29]:
# Melt the dataframe to DATE-DATETIME-LOCATION-VALUE format
melted_fact_df = pd.melt(
    df,
    id_vars=['DATE', 'DATETIME'],
    var_name='LOCATION',
    value_name='VALUE',
)
# The location ids come from the column headers, so they are strings after the melt.
melted_fact_df['LOCATION'] = melted_fact_df['LOCATION'].astype(int)
# Sort by location and then chronologically. The original assigned the sorted
# frame to a misspelled name, so the sort was silently discarded. DATETIME is
# part of the sort key because the next step collects the values of each
# (DATE, LOCATION) group in row order, so rows must be chronological within a day.
melted_fact_df = (
    melted_fact_df
    .sort_values(['LOCATION', 'DATETIME'])
    .reset_index(drop=True)
)
melted_fact_df.head(5)

In [None]:
# Collapse every (DATE, LOCATION) group into one numpy array of its values:
# DATE-LOCATION-VALUE_LIST format
daily_fact_df = (
    melted_fact_df
    .groupby(['DATE', 'LOCATION'])['VALUE']
    .apply(np.array)
    .reset_index(name='VALUE_LIST')
)
daily_fact_df.head(5)

Unnamed: 0,DATE,LOCATION,VALUE_LIST
0,2010-06-01,0,"[-0.16701062, -0.0088891145, 0.057786137, 0.05..."
1,2010-06-01,1,"[0.6189931, 0.7809212, 0.90916926, 0.9611215, ..."
2,2010-06-01,2,"[-1.5094813, -1.5149962, -1.6476423, -1.625408..."
3,2010-06-01,3,"[0.34800974, 0.5028136, 0.54332435, 0.5931182,..."
4,2010-06-01,4,"[0.12918377, 0.27380937, 0.26099712, 0.2584206..."


In [None]:
# Group the daily arrays per location: LOCATION-VALUE_LIST format
location_fact_df = (
    daily_fact_df
    .groupby(['LOCATION'])['VALUE_LIST']
    .apply(np.array)
    .reset_index(name='VALUE_LIST')
)
# Stack the daily arrays of a location into one (n_days, n_samples) matrix and
# append a trailing feature axis -> (n_days, n_samples, 1), the 3-D layout that
# is passed to TimeSeriesKMeans. The shape is taken from the data instead of
# the previous hard-coded (92, 288, 1), so changing the date window or the
# sampling rate no longer breaks the cell. np.stack still raises if the days
# of a location do not all have the same number of samples.
location_fact_df['VALUE_LIST'] = location_fact_df['VALUE_LIST'].apply(
    lambda daily_arrays: np.stack(daily_arrays, axis=0)[:, :, np.newaxis]
)
location_fact_df.head(5)

Unnamed: 0,LOCATION,VALUE_LIST
0,0,"[[[-0.16701062], [-0.0088891145], [0.057786137..."
1,1,"[[[0.6189931], [0.7809212], [0.90916926], [0.9..."
2,2,"[[[-1.5094813], [-1.5149962], [-1.6476423], [-..."
3,3,"[[[0.34800974], [0.5028136], [0.54332435], [0...."
4,4,"[[[0.12918377], [0.27380937], [0.26099712], [0..."


## 4. Modelling

In [None]:
# Gap-free daily calendar from the first to the last day present in the melted data
min_date, max_date = melted_fact_df['DATE'].min(), melted_fact_df['DATE'].max()
date_interval = pd.date_range(start=min_date, end=max_date, freq='D')

# control_datetimes = pd.date_range(
#     start = melted_fact_df['DATETIME'].min(),
#     end = melted_fact_df['DATETIME'].max(),
#     freq = "5T"
# )

# # TODO: also slice vertically along DATE
# #

In [None]:
# Number of clusters requested from TimeSeriesKMeans for every location.
cluster_number: int = 4
# Upper bound of the location loops: locations 0 .. location_number - 1 are clustered and plotted.
location_number: int = 20

In [None]:
# DTW k-means: cluster the daily series of every location separately.
#
# Input : location_fact_df['VALUE_LIST'] - one 3-D array per location
# Output: location_clusters - DATE, LOCATION, CLUSTERS (label of each day)
#         cluster_centers   - LOCATION, CLUSTERS, CLUSTER_CENTERS (centre as a list)
RANDOM_SEED = 42  # fixed seed so that Restart & Run All reproduces the same clusters

# Collect the per-location frames and concatenate once after the loop;
# concatenating inside the loop re-copies the accumulated frame on every pass.
location_cluster_frames = []
cluster_center_frames = []

for loc in range(location_number):
    print(f"LOCATION {loc}, DTW k-means")
    # NOTE: the variable keeps its historical name; the metric is plain "dtw".
    sdtw_km = TimeSeriesKMeans(n_clusters=cluster_number,
                               max_iter=3,
                               metric="dtw",
                               # metric_params={"gamma": .01},
                               verbose=False,
                               random_state=RANDOM_SEED)
    curr_input = location_fact_df.loc[location_fact_df['LOCATION'] == loc, 'VALUE_LIST'].iloc[0]
    curr_y_pred = sdtw_km.fit_predict(curr_input)
    print("Best:", sdtw_km.inertia_)

    # date_interval is a gap-free daily calendar, so it only lines up with the
    # labels when the location has exactly one series per calendar day.
    if len(curr_y_pred) != len(date_interval):
        raise ValueError(
            f"LOCATION {loc}: {len(curr_y_pred)} daily series but "
            f"{len(date_interval)} calendar days in date_interval"
        )
    curr_location_clusters = pd.DataFrame({
        'DATE': date_interval,
        'LOCATION': loc,
        'CLUSTERS': curr_y_pred,
    })
    location_cluster_frames.append(curr_location_clusters)

    # Flatten every centre to a plain list; -1 infers the series length instead
    # of hard-coding 288 samples per day.
    curr_cluster_centers = pd.DataFrame({
        'LOCATION': loc,
        'CLUSTERS': range(cluster_number),
        'CLUSTER_CENTERS': sdtw_km.cluster_centers_.reshape(cluster_number, -1).tolist(),
    })
    cluster_center_frames.append(curr_cluster_centers)

# pd.concat rejects an empty list, so fall back to an empty frame (as before)
# when no location was processed. The per-location row index is kept.
location_clusters = pd.concat(location_cluster_frames) if location_cluster_frames else pd.DataFrame()
cluster_centers = pd.concat(cluster_center_frames) if cluster_center_frames else pd.DataFrame()


LOCATION 0, DTW k-means
Best: 4.141245665528949
LOCATION 1, DTW k-means
Best: 5.260433079217711
LOCATION 2, DTW k-means
Best: 5.167480125730722
LOCATION 3, DTW k-means
Best: 4.325992152342479
LOCATION 4, DTW k-means
Best: 7.925355244184322
LOCATION 5, DTW k-means
Best: 5.254550198370817
LOCATION 6, DTW k-means
Best: 5.360870792155921
LOCATION 7, DTW k-means
Best: 6.297435583401762
LOCATION 8, DTW k-means
Best: 5.206574328703065
LOCATION 9, DTW k-means
Best: 4.595514579874001
LOCATION 10, DTW k-means
Best: 4.001199492054047
LOCATION 11, DTW k-means
Best: 3.3311625329536807
LOCATION 12, DTW k-means
Best: 5.579794364134184
LOCATION 13, DTW k-means
Best: 3.230133391092172
LOCATION 14, DTW k-means
Best: 2.5997223926515485
LOCATION 15, DTW k-means
Best: 2.47608983636638
LOCATION 16, DTW k-means
Best: 0.11364326107181015
LOCATION 17, DTW k-means
Best: 4.531863925852884
LOCATION 18, DTW k-means
Best: 0.7467907944112406
LOCATION 19, DTW k-means
Best: 2.9370925847527505


In [None]:
# Make the column labels of the clustering result explicit
location_clusters = location_clusters.set_axis(['DATE', 'LOCATION', 'CLUSTERS'], axis=1)

In [None]:
# Increase every cluster label by one.
# NOTE: this cell is not idempotent - running it again shifts the labels once more.
location_clusters['CLUSTERS'] = location_clusters['CLUSTERS'] + 1
location_clusters.head()

Unnamed: 0,DATE,LOCATION,CLUSTERS
0,2010-06-01,0,1
1,2010-06-02,0,1
2,2010-06-03,0,3
3,2010-06-04,0,1
4,2010-06-05,0,1


In [None]:
# Make sure DATE is stored as pandas datetimes
location_clusters['DATE'] = location_clusters['DATE'].pipe(pd.to_datetime)
# Using DATE as the index is left disabled; the calendar plot reads DATE as a column.
# location_clusters.set_index('DATE', inplace = True)

In [None]:
# Display the clustering result: one row per (DATE, LOCATION) with its cluster label.
location_clusters

Unnamed: 0,DATE,LOCATION,CLUSTERS
0,2010-06-01,0,1
1,2010-06-02,0,1
2,2010-06-03,0,3
3,2010-06-04,0,1
4,2010-06-05,0,1
...,...,...,...
87,2010-08-27,19,1
88,2010-08-28,19,1
89,2010-08-29,19,3
90,2010-08-30,19,3


In [None]:
# Draw one calendar heatmap per location, coloured by the cluster label of each day.
for loc in range(location_number):
    # Rows of the current location only, copied so the plot gets its own frame
    curr = location_clusters.loc[location_clusters['LOCATION'] == loc].copy()

    fig = plotly_calplot.calplot(
        curr,
        x="DATE",
        y="CLUSTERS",
        years_title=True,
        showscale=True,
        colorscale='viridis',
        title=f"LOCATION {loc} clustering result",
        start_month=5,
        end_month=10,
    )

    # Wide, short figure with a little room for the title
    fig.update_layout(height=300, width=1200, margin={'b': 20, 't': 60})
    fig.show()

    # Alternative rendering with the `calplot` package, kept for reference:
    # ax = calplot.calplot(
    #     data = curr['CLUSTERS'],
    #     how = None,
    #     cmap = 'Dark2',
    #     figsize = (16, 8),
    #     suptitle = "Title"
    # )


## Scratch

In [None]:
# cluster_centers.to_csv(data_input_path + 'clust_cent_test_first_100.csv',index=False, sep=';')
# location_clusters.to_csv(data_input_path + 'loc_clus_test_first_100.csv',index=False, sep=';')

# cluster_centers = pd.read_csv(data_input_path + 'clust_cent_test_first_100.csv',sep=';',decimal='.')
# location_clusters = pd.read_csv(data_input_path + 'loc_clus_test_first_100.csv',sep=';',decimal='.')

# daily_fact_df['DATE'] = pd.to_datetime(daily_fact_df['DATE'])
# location_clusters['DATE'] = pd.to_datetime(location_clusters['DATE'])
# clustered_daily_fact_df = daily_fact_df.merge(location_clusters,on=['DATE','LOCATION'],how='left')
# clustered_daily_fact_df

# clustered_daily_fact_df.to_csv(data_input_path + 'clustered_daily_fact_first_100.csv',index=False, sep=';')