# Task: Extract and visualize the Seasonal patterns out of the data
## Data: Monthly "page impressions" and "conversion rate" of 100 e-Commerce product categories (100 csv-files)

---

# EDA

In [46]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tslearn.clustering import TimeSeriesKMeans, KShape, KernelKMeans
from tslearn.clustering import silhouette_score as ts_silhouette_score
from sklearn.metrics import silhouette_score

### Loading the data

In [13]:
##Create a dataframe of all categories and features
#*.csv Filenames are: "pis_" + a range of 1 to 100
data = pd.read_csv("./data/pis_1.csv", quoting=2)
data["cat_id"] = 1
data.sort_values(by=["month"], axis=0, ascending=True, inplace=True)

for i in range(2,101):
    add_data = pd.read_csv(f"./data/pis_{i}.csv", quoting=2)
    add_data["cat_id"] = i
    add_data.sort_values(by=["month"], axis=0, ascending=True, inplace=True)
    data = pd.concat([data, add_data], ignore_index=True)
data.columns = ["cat_index", "category", "month", "year", "pageimpressions", "CR", "cat_id"]
data.drop(['cat_index', 'year'], axis=1, inplace=True)
data.head(13)

Unnamed: 0,category,month,pageimpressions,CR,cat_id
0,Herren-Halbschuhe,1.0,86496.0,0.239572,1
1,Herren-Halbschuhe,2.0,75529.0,0.240967,1
2,Herren-Halbschuhe,3.0,,0.232252,1
3,Herren-Halbschuhe,4.0,92861.0,0.226737,1
4,Herren-Halbschuhe,5.0,93876.0,0.24461,1
5,Herren-Halbschuhe,6.0,73075.0,0.209826,1
6,Herren-Halbschuhe,7.0,81014.0,0.205211,1
7,Herren-Halbschuhe,8.0,86341.0,0.210549,1
8,Herren-Halbschuhe,9.0,92675.0,0.219239,1
9,Herren-Halbschuhe,10.0,99431.0,0.22141,1


In [14]:
data.describe()

Unnamed: 0,month,pageimpressions,CR,cat_id
count,1200.0,1170.0,1200.0,1200.0
mean,6.5,67183.402564,0.250398,50.5
std,3.453492,27991.398914,0.056206,28.878105
min,1.0,8404.0,0.105283,1.0
25%,3.75,49982.0,0.213111,25.75
50%,6.5,61557.5,0.239441,50.5
75%,9.25,78468.25,0.274,75.25
max,12.0,230439.0,0.464537,100.0


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         1200 non-null   object 
 1   month            1200 non-null   float64
 2   pageimpressions  1170 non-null   float64
 3   CR               1200 non-null   float64
 4   cat_id           1200 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 47.0+ KB


Check the distribution of nan-values:

In [16]:
data.loc[data['pageimpressions'].isna(), :]['month'].value_counts()

10.0    6
1.0     4
2.0     4
3.0     3
8.0     3
11.0    3
6.0     2
5.0     2
4.0     2
12.0    1
Name: month, dtype: int64

In [17]:
# impute nan-values with mean of neighbouring values:
data.interpolate(inplace=True)
data.head()

Unnamed: 0,category,month,pageimpressions,CR,cat_id
0,Herren-Halbschuhe,1.0,86496.0,0.239572,1
1,Herren-Halbschuhe,2.0,75529.0,0.240967,1
2,Herren-Halbschuhe,3.0,84195.0,0.232252,1
3,Herren-Halbschuhe,4.0,92861.0,0.226737,1
4,Herren-Halbschuhe,5.0,93876.0,0.24461,1


Calculate "lead-out" values by multiplying "page impressions" and "conversian rate":

In [18]:
data['lead-out'] = round(data["pageimpressions"] * data["CR"])
data.head()

Unnamed: 0,category,month,pageimpressions,CR,cat_id,lead-out
0,Herren-Halbschuhe,1.0,86496.0,0.239572,1,20722.0
1,Herren-Halbschuhe,2.0,75529.0,0.240967,1,18200.0
2,Herren-Halbschuhe,3.0,84195.0,0.232252,1,19554.0
3,Herren-Halbschuhe,4.0,92861.0,0.226737,1,21055.0
4,Herren-Halbschuhe,5.0,93876.0,0.24461,1,22963.0


In [19]:
#data.to_csv("./data/data.csv", sep=";")

### Comparing page impressions of some categories

In [20]:
modeleisenbahnen = data.loc[data['category'] == "Modelleisenbahnen"]
fig = px.bar(modeleisenbahnen, x="month", y="pageimpressions", template='plotly_dark',
                  title='Page-impressions in category "Modeleisenbahnen"')
fig.update_layout(xaxis = dict(tickmode = 'linear', dtick = 1))
fig.show()

In [21]:
motorsensen = data.loc[data['category'] == "Motorsensen"]
fig = px.bar(motorsensen, x="month", y="pageimpressions", template='plotly_dark',
                 title='Page-impressions in category "Motorsensen"')
fig.update_layout(xaxis = dict(tickmode = 'linear', dtick = 1))
fig.show()

In [22]:
#gelenkundmuskel = data.loc[data['category'] == "Gelenk- & Muskelpräparate"]
gelenkundmuskel = data.loc[data['cat_id'] == 9]
fig = px.bar(gelenkundmuskel, x="month", y="pageimpressions", template='plotly_dark',
                    title='Page-impressions in category "Gelenk- & Muskelpräparate"')
fig.update_layout(xaxis = dict(tickmode = 'linear', dtick = 1))
fig.show()

Bar Plot with two y-axes for "page impressions" & "conversion rate":

In [23]:
cat_name = "Kinderroller"
cat = data.loc[data['category'] == cat_name]

fig = go.Figure(
    data=[
        go.Bar(name='page impressions', x=cat["month"], y=cat["pageimpressions"], yaxis='y', offsetgroup=1),
        go.Bar(name='conversion rate', x=cat["month"], y=cat["CR"], yaxis='y2', offsetgroup=2)
    ],
    layout={
        'yaxis': {'title': 'page impressions'},
        'yaxis2': {'title': 'conversion rate', 'overlaying': 'y', 'side': 'right'}
    }
)
# Change the bar mode
fig.update_xaxes(title_text="months", dtick=[len(cat.index)])
fig.update_layout(title_text=f"Page Impressions and Conversion Rate of Category: {cat_name}", 
                barmode='group', template='plotly_dark',
                legend=dict(yanchor="bottom",y=0.99, xanchor="right",x=0.99))
fig.show()

Plot and save all categories:

In [24]:
#Plot and save page impressions of all categories:
for cat_id in range(1,101):
    cat = data.loc[data['cat_id'] == cat_id]
    cat_name = cat.category.iloc[0]

    fig = go.Figure(
        data=[
            go.Bar(name='page impressions', x=cat["month"], y=cat["pageimpressions"], yaxis='y', offsetgroup=1),
            go.Bar(name='conversion rate', x=cat["month"], y=cat["CR"], yaxis='y2', offsetgroup=2)
        ],
        layout={
            'yaxis': {'title': 'page impressions'},
            'yaxis2': {'title': 'conversion rate', 'overlaying': 'y', 'side': 'right'}
        }
    )
    # Change the bar mode
    fig.update_xaxes(title_text="months", dtick=[len(cat.index)])
    fig.update_layout(title_text=f"Page Impressions and Conversion Rate of Category: {cat_name}", 
                    barmode='group', template='plotly_dark',
                    legend=dict(yanchor="bottom",y=0.99, xanchor="right",x=0.99))
    #fig.show()
    #fig.write_image(f"./data/plots/cat_PI_{cat_id}.png", format='png', scale=1, width=1200, height=600)

### Page impressions & conversion rate of all categories (monthly mean)

In [25]:
data["lead-out"] = round(data['pageimpressions'] * data['CR'])
data.head()


Unnamed: 0,category,month,pageimpressions,CR,cat_id,lead-out
0,Herren-Halbschuhe,1.0,86496.0,0.239572,1,20722.0
1,Herren-Halbschuhe,2.0,75529.0,0.240967,1,18200.0
2,Herren-Halbschuhe,3.0,84195.0,0.232252,1,19554.0
3,Herren-Halbschuhe,4.0,92861.0,0.226737,1,21055.0
4,Herren-Halbschuhe,5.0,93876.0,0.24461,1,22963.0


In [26]:
monthly_pi = data['pageimpressions'].groupby(data['month']).mean()
monthly_cr = data['CR'].groupby(data['month']).mean()
monthly_lo = data['lead-out'].groupby(data['month']).mean()

fig = make_subplots(specs=[[{"secondary_y": True, "type": "xy"}]])
fig.add_trace(go.Scatter(y=monthly_pi, x=monthly_pi.index, name="page impressions"), secondary_y=False)
fig.add_trace(go.Scatter(y=monthly_lo, x=monthly_lo.index, name="lead-out"), secondary_y=False)
fig.add_trace(go.Scatter(y=monthly_cr*100, x=monthly_cr.index, name="conversion rate"), secondary_y=True)
fig.update_layout(title_text="Page impressions, conversion rate & lead-out (all categories, mean)", template='plotly_dark')
fig.update_xaxes(title_text="months", dtick=[1,len(monthly_cr.index)])
fig.update_yaxes(title_text="page impressions / lead-out", secondary_y=False, range=[10000,90000])
fig.update_yaxes(title_text="conversion rate %", secondary_y=True, range=[24,26])
fig.show()

# Clustering Seasonality Patterns

https://tslearn.readthedocs.io/en/stable/auto_examples/clustering/plot_kmeans.html#sphx-glr-auto-examples-clustering-plot-kmeans-py

Prepare Data for Clustering:
- page impressions
- conversion rate
- lead-out (pi * cr)

In [82]:
data_pi = data[['category', 'month', 'pageimpressions']]
data_cr = data[['category', 'month', 'CR']]
data_lo = data[['category', 'month', 'lead-out']]
data_pi.head(3), data_cr.head(3), data_lo.head(3)


(            category  month  pageimpressions
 0  Herren-Halbschuhe    1.0          86496.0
 1  Herren-Halbschuhe    2.0          75529.0
 2  Herren-Halbschuhe    3.0          84195.0,
             category  month        CR
 0  Herren-Halbschuhe    1.0  0.239572
 1  Herren-Halbschuhe    2.0  0.240967
 2  Herren-Halbschuhe    3.0  0.232252,
             category  month  lead-out
 0  Herren-Halbschuhe    1.0   20722.0
 1  Herren-Halbschuhe    2.0   18200.0
 2  Herren-Halbschuhe    3.0   19554.0)

In [83]:
data_pi_wide =  pd.pivot_table(data_pi, index='month', columns='category', values='pageimpressions')
#data_pi_wide.to_csv("./data/data_pi_wide.csv", sep=";")
data_cr_wide =  pd.pivot_table(data_cr, index='month', columns='category', values='CR')
#data_cr_wide.to_csv("./data/data_cr_wide.csv", sep=";")
data_lo_wide =  pd.pivot_table(data_lo, index='month', columns='category', values='lead-out')
#data_lo_wide.to_csv("./data/data_lo_wide.csv", sep=";")
data_pi_wide.head()

category,Akkus & Batterien,All in One PCs,"Augen-, Ohren- & Nasenmedikamente",Autobatterien,Bestecke,Betriebssysteme,Bohrer & Bits,Bridgekameras,"Champagner, Sekt & Prosecco",Clogs,...,Trekkingsandalen,Trinkflaschen,USB Sticks,Umhängetaschen,VR-Brillen,Verstärker,WC,Waschtrockner,Xbox One Spiele,Zubehör für Küchengeräte
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,85837.0,79256.0,66158.0,140228.0,109343.0,73941.0,65603.0,72261.0,59294.0,54682.0,...,25949.0,71637.0,96297.0,48267.0,77844.0,102167.0,72377.0,77931.0,77307.0,81669.0
2.0,57004.0,64025.0,60818.0,81306.0,68012.0,68449.5,52766.0,59766.0,45497.0,47439.0,...,27387.0,34986.0,67531.0,36728.0,40863.0,74299.0,62557.0,52693.0,54913.0,59523.0
3.0,60308.0,69544.0,63310.0,78226.0,71481.0,62958.0,74365.0,76072.0,48162.0,60777.0,...,37763.0,44189.0,83430.0,43493.0,50260.0,80303.0,70651.0,64458.0,66850.0,64530.0
4.0,52951.0,59271.0,61108.0,76142.0,63153.0,48650.0,66463.0,67330.0,47216.0,66324.0,...,54522.0,46096.5,65050.0,44078.0,36173.0,63780.0,54638.0,51924.0,52446.0,55452.0
5.0,53609.0,64724.0,60403.0,69188.0,64833.0,50789.0,67628.0,63150.0,52014.0,63495.0,...,59141.0,48004.0,66729.0,44663.0,39560.0,63375.0,54674.0,54839.0,49444.0,52876.0


In [29]:
# prepare data as 2dim-array for the use in the model:
X = data_pi_wide.transpose().values
X

array([[ 85837. ,  57004. ,  60308. , ...,  76197. ,  90985. ,  93308. ],
       [ 79256. ,  64025. ,  69544. , ...,  88703. , 134463. , 109622. ],
       [ 66158. ,  60818. ,  63310. , ...,  53851. ,  60295. ,  56467. ],
       ...,
       [ 77931. ,  52693. ,  64458. , ...,  72817. , 101172. ,  70408. ],
       [ 77307. ,  54913. ,  66850. , ...,  85936.5,  94533. ,  89577. ],
       [ 81669. ,  59523. ,  64530. , ...,  59478. , 104043. , 118892. ]])

In [30]:
model = TimeSeriesKMeans(n_clusters=6, metric="euclidean", max_iter=100, n_init=2).fit(X)

In [31]:
# Dataframe to map categories to their cluster labels
df_cluster = pd.DataFrame(list(zip(data_pi_wide.columns, model.labels_)), columns=['category', 'cluster'])

# dictionaries and lists for use in plots:
cluster_cat_dict = df_cluster.groupby(['cluster'])['category'].apply(lambda x: [x for x in x]).to_dict()
cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
clusters_all = [cluster for cluster in cluster_len_dict]
clusters_all.sort()
#print(f"df_cluster: {df_cluster}")
#print(f"clusters_all: {clusters_all}")
#print(f"cluster_cat_dict: {cluster_cat_dict}")
#print(f"cluster_len_dict: {cluster_len_dict}")

Make a quality assesment of each cluster with a correlation matrix:

In [32]:
cluster_quality_dict = {}
for cluster_number in clusters_all:
    # get quality score based on the correlation between categories in the cluster
    # For clusters with only one item x_corr_mean is set to 0
    if len(cluster_cat_dict[cluster_number]) > 1:
        x_corr = data_pi_wide[cluster_cat_dict[cluster_number]].corr().abs()
        # get the mean of the values in the upper triangle of the correlation matrix (and round to .2)
        x_corr_mean = round(x_corr.values[np.triu_indices_from(x_corr.values,1)].mean(), 2)
    else:
        x_corr_mean = 0
        # add it to the cluster-quality-dictionary
    cluster_quality_dict[cluster_number] = x_corr_mean
    
correlation_mean = sum(cluster_quality_dict.values())/len(cluster_quality_dict)
cluster_quality_dict, f"Mean cluster quality: {correlation_mean}"

({0: 0.84, 1: 0.83, 2: 0.84, 3: 0.55, 4: 0.66, 5: 0.56},
 'Mean cluster quality: 0.7133333333333333')

In [33]:
# Dataframe for cluster quality and size:
df_cluster_quality = pd.DataFrame.from_dict(cluster_len_dict, orient='index', columns=['n'])
df_cluster_quality.index.names = ['cluster']
df_cluster_quality['quality_score'] = df_cluster_quality.index.map(cluster_quality_dict)

df_cluster_quality = df_cluster_quality.sort_values('quality_score', ascending=False)
print(f"Mean quality: {df_cluster_quality['quality_score'].mean()}")
print(f"Median quality: {df_cluster_quality['quality_score'].median()}")
df_cluster_quality

Mean quality: 0.7133333333333334
Median quality: 0.745


Unnamed: 0_level_0,n,quality_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
2,7,0.84
0,3,0.84
1,8,0.83
4,11,0.66
5,43,0.56
3,28,0.55


Plot each cluster as a line plot:

In [80]:
def plot_clusters(df, labels, renderer, title):
    df_cluster = pd.DataFrame(list(zip(df.columns, labels)), columns=['category', 'cluster'])

    # dictionaries and lists for use in plots:
    cluster_cat_dict = df_cluster.groupby(['cluster'])['category'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_all = [cluster for cluster in cluster_len_dict]

    for cluster_number in clusters_all:
        if len(cluster_cat_dict[cluster_number]) > 1:
            x_corr = df[cluster_cat_dict[cluster_number]].corr().abs()
            # get the mean of the values in the upper triangle of the correlation matrix (and round to .2)
            x_corr_mean = round(x_corr.values[np.triu_indices_from(x_corr.values,1)].mean(), 2)
        else:
            x_corr_mean = 0 # for clusters of only 1 item
        # plot each cluster
        plot_title = f'{title} cluster {cluster_number} (quality={x_corr_mean}, n={cluster_len_dict[cluster_number]})'
        fig = go.Figure()
        cols = cluster_cat_dict[cluster_number]
        ind = df.index
        for i, col in enumerate(cols):
            fig.add_trace(
                go.Scatter(
                    x=ind, y=df[col], name=col, line={'width':1}, hoverlabel={'namelength':-1}, showlegend=True # line=dict(width=1) #namelength=-1
                )
            )    
        fig.update_xaxes(title_text="months", dtick=[1,len(df.index)+1])
        fig.update_layout(xaxis_rangeslider_visible=False)
        fig.update_layout(title_text=plot_title, template="plotly_dark", height=600)
        fig.show(renderer=renderer)


In [81]:
plot_clusters(data_pi_wide, model.labels_, renderer="browser", title="Page impressions")
# opens in browser!

### Normalize Data

In [59]:
def normalize_df(df):
    df_ = df.reset_index()
    df_norm = (df_ - df_.min()) / (df_.max() - df_.min())
    # normalize by mean:
    #df_norm = (df_ / df_.mean()*100)
    df_norm.drop(['month'], axis=1, inplace=True)
    df_norm = pd.concat((df_norm, df_.month), axis=1)
    df_norm.set_index("month", inplace=True)
    return df_norm

In [64]:
# Scaling example with range
def min_max_scale(X, range=(0, 100)):
    mi, ma = range
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (ma - mi) + mi
    return X_scaled

#print(min_max_scale(data_pi_wide))

In [65]:
data_pi_wide_norm = normalize_df(data_pi_wide)
data_pi_wide_norm

Unnamed: 0_level_0,Akkus & Batterien,All in One PCs,"Augen-, Ohren- & Nasenmedikamente",Autobatterien,Bestecke,Betriebssysteme,Bohrer & Bits,Bridgekameras,"Champagner, Sekt & Prosecco",Clogs,...,Trekkingsandalen,Trinkflaschen,USB Sticks,Umhängetaschen,VR-Brillen,Verstärker,WC,Waschtrockner,Xbox One Spiele,Zubehör für Küchengeräte
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.856407,0.323651,1.0,1.0,0.674159,0.698537,0.373614,0.881421,0.412078,0.263929,...,0.08782,0.861849,1.0,0.209747,0.532727,1.0,1.0,0.573693,0.664688,0.516817
2.0,0.302235,0.137054,0.70302,0.28276,0.195287,0.57972,0.0,0.492641,0.040772,0.0,...,0.09898,0.0,0.344036,0.0,0.059958,0.51843,0.690348,0.110754,0.228778,0.229344
3.0,0.365738,0.204668,0.841611,0.245268,0.235479,0.460903,0.628627,1.0,0.112493,0.486026,...,0.179505,0.216409,0.706588,0.122969,0.18009,0.622181,0.945574,0.326559,0.461137,0.294339
4.0,0.224336,0.078812,0.719148,0.2199,0.138989,0.151326,0.398644,0.727994,0.087034,0.688154,...,0.309567,0.261264,0.28746,0.133602,0.0,0.336657,0.440639,0.096649,0.180756,0.1765
5.0,0.236983,0.145617,0.67994,0.135251,0.158454,0.197607,0.43255,0.597934,0.216158,0.585067,...,0.345414,0.306119,0.325747,0.144236,0.0433,0.329658,0.441775,0.150118,0.122321,0.143061
6.0,0.0,0.0,0.132139,0.0,0.0,0.0,0.073314,0.445938,0.0,0.833072,...,1.0,0.457955,0.0,0.056004,0.134694,0.0,0.0,0.0,0.0,0.0
7.0,0.197063,0.079706,0.092097,0.087242,0.145697,0.224674,0.490963,0.659292,0.207304,1.0,...,0.962221,0.652142,0.315782,0.341204,0.072486,0.136533,0.262006,0.057248,0.026259,0.054675
8.0,0.237675,0.164459,0.0,0.189334,0.1883,0.327318,0.197881,0.297894,0.192583,0.822323,...,0.695221,0.847129,0.404853,0.378994,0.092218,0.207503,0.44477,0.15043,0.104588,0.015382
9.0,0.353841,0.248172,0.009677,0.281104,0.188879,0.400926,0.138712,0.131056,0.129447,0.496921,...,0.239123,0.784626,0.435455,0.23394,0.111951,0.380013,0.514521,0.30739,0.66533,0.0631
10.0,0.671126,0.439387,0.315555,0.490195,0.248815,0.724977,1.0,0.0,0.322057,0.386073,...,0.080401,0.455839,0.47153,0.295125,0.14037,0.582056,0.4711,0.479887,0.832665,0.22876


In [74]:
plot_clusters(data_pi_wide_norm, model.labels_, renderer="browser", title="Page impressions normalized")

### Clustering "conversion rate" data the same way

In [75]:
data_cr = data[['category', 'month', 'CR']]
data_cr_wide =  pd.pivot_table(data_cr,index='month',columns='category',values='CR')
#data_cr_wide.to_csv("./data/data_cr_wide.csv", sep=";")


In [76]:
X = data_cr_wide.transpose().values
model_cr = TimeSeriesKMeans(n_clusters=6, metric="euclidean", max_iter=100, n_init=2).fit(X)

In [79]:
plot_clusters(data_cr_wide, model_cr.labels_, renderer="browser", title="Conversion rate")

# Quality Assesment of clustering methods

## For a rough quality assesment of a clustering method we can get the mean of the correlation within all clusters:

In [28]:
def correlation_mean(df, labels):
    df_cluster = pd.DataFrame(list(zip(df.columns, labels)), columns=['category', 'cluster'])
    # dictionaries and lists for use in plots:
    cluster_cat_dict = df_cluster.groupby(['cluster'])['category'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_all = [cluster for cluster in cluster_len_dict]
    clusters_all.sort()
    cluster_quality_dict = {}
    for cluster_number in clusters_all:
        # get quality score based on the correlation between categories in the cluster
        # For clusters with only one item x_corr_mean is set to 0
        if len(cluster_cat_dict[cluster_number]) > 1:
            x_corr = df[cluster_cat_dict[cluster_number]].corr().abs()
            # get the mean of the values in the upper triangle of the correlation matrix (and round to .2)
            x_corr_mean = round(x_corr.values[np.triu_indices_from(x_corr.values,1)].mean(), 2)
        else:
            x_corr_mean = 0
            # add it to the cluster-quality-dictionary
        cluster_quality_dict[cluster_number] = x_corr_mean
        
    correlation_mean = sum(cluster_quality_dict.values()) / len(cluster_quality_dict)
    return correlation_mean

In [29]:
def method_quality_corr(df, norm=False, random_state=13, n_init=2, max_iter=25):
    if norm == True:
        x = normalize_df(df).transpose().values
    else:
        x = df.transpose().values
    
    results_df = pd.DataFrame(
            columns=["n_clusters", "KMeans euclidean", "KMeans dtw", "KMeans soft dtw", "KernelMeans", "KShape"])

    for n_clusters in range(2,13):
        kmeans_euc_model = TimeSeriesKMeans(n_clusters=n_clusters, metric="euclidean", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kmeans_euc_labels = kmeans_euc_model.labels_
        kmeans_euc_corr_mean = correlation_mean(df, kmeans_euc_labels)

        kmeans_dtw_model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kmeans_dtw_labels = kmeans_dtw_model.labels_
        kmeans_dtw_corr_mean = correlation_mean(df, kmeans_dtw_labels)

        kmeans_sdtw_model = TimeSeriesKMeans(n_clusters=n_clusters, metric="softdtw", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kmeans_sdtw_labels = kmeans_sdtw_model.labels_
        kmeans_sdtw_corr_mean = correlation_mean(df, kmeans_sdtw_labels)

        kernel_means_model = KernelKMeans(n_clusters=n_clusters, kernel="gak", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kernel_means_labels = kernel_means_model.labels_
        kernel_means_corr_mean = correlation_mean(df, kernel_means_labels)

        kshape_model = KShape(n_clusters=n_clusters, max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kshape_labels = kshape_model.labels_
        kshape_corr_mean = correlation_mean(df, kshape_labels)

        results = {
            "n_clusters": n_clusters, 
            "KMeans euclidean": kmeans_euc_corr_mean, 
            "KMeans dtw": kmeans_dtw_corr_mean, 
            "KMeans soft dtw": kmeans_sdtw_corr_mean, 
            "KernelMeans": kernel_means_corr_mean, 
            "KShape": kshape_corr_mean
        }
        results_df = pd.concat([results_df, pd.DataFrame.from_records([results])], ignore_index=True)

    results_df.set_index('n_clusters', inplace=True)    
    return results_df

Calculate Clusters and quality assessment for 2 to 12 clusters. The results are saved to csv-files for later use.

In [30]:
# not normalized data:
data_pi_wide = pd.read_csv("./dash_app/datasets/data_pi_wide.csv", sep=";", index_col="month")

method_quality_df = method_quality_corr(
    data_pi_wide, 
    norm=False,
    random_state=15,
    n_init=6,
    max_iter=100,
)
method_quality_df.to_csv("./dash_app/datasets/method_quality_corr.csv", sep=";")

In [31]:
# normalized data:
method_quality_df = method_quality_corr(
   data_pi_wide, 
   norm=True,
   random_state=16,
   n_init=8,
   max_iter=100,
)
method_quality_df.to_csv("./dash_app/datasets/method_quality_corr_norm.csv", sep=";")

A plot of the results for not normalized data:

In [32]:
df = pd.read_csv("./dash_app/datasets/method_quality_corr.csv", sep=";", index_col="n_clusters")

fig = go.Figure()
plot_title = "Clustering quality measurement of all methods with correlation mean (not normalized data)"
# Loop dataframe columns and plot columns to the figure
for i in range(0, len(df.columns)):
    col_name = df.columns.values[i]
    fig.add_trace(go.Scatter(x=df.index, y=df[col_name], mode='lines', name=col_name))
fig.update_xaxes(title_text="n_clusters", dtick=[2,len(df.index)+1])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.update_layout(title_text=plot_title, template="plotly_dark", height=500)
fig.show()

A plot of the results for not normalized data:

In [33]:
df = pd.read_csv("./dash_app/datasets/method_quality_corr_norm.csv", sep=";", index_col="n_clusters")

fig = go.Figure()
plot_title = "Clustering quality measurement of all methods with correlation mean (normalized data)"
# Loop dataframe columns and plot columns to the figure
for i in range(0, len(df.columns)):
    col_name = df.columns.values[i]
    fig.add_trace(go.Scatter(x=df.index, y=df[col_name], mode='lines', name=col_name))#, line_shape="spline"))
fig.update_xaxes(title_text="n_clusters", dtick=[2,len(df.index)+1])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.update_layout(title_text=plot_title, template="plotly_dark", height=500)
fig.show()

## Quality Assesment of clustering methods with "silhouette score"

https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient

If the ground truth labels are not known, evaluation must be performed using the model itself. The Silhouette Coefficient (sklearn.metrics.silhouette_score) is an example of such an evaluation, where a higher Silhouette Coefficient score relates to a model with better defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores:

    a: The mean distance between a sample and all other points in the same class.
    b: The mean distance between a sample and all other points in the next nearest cluster.

The Silhouette Coefficient s for a single sample is then given as:

$$s = \frac{b - a}{max(a, b)}$$



The Silhouette Coefficient for a set of samples is given as the mean of the Silhouette Coefficient for each sample.

In [34]:
def method_quality(df, norm=False, random_state=13, n_init=2, max_iter=25):
    if norm == True:
        x = normalize_df(df).transpose().values
    else:
        x = df.transpose().values
    
    results_df = pd.DataFrame(
            columns=["n_clusters", "KMeans euclidean", "KMeans dtw", "KMeans soft dtw", "KernelMeans", "KShape"])

    for n_clusters in range(2,13):
        kmeans_euc_model = TimeSeriesKMeans(n_clusters=n_clusters, metric="euclidean", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kmeans_euc_labels = kmeans_euc_model.labels_
        kmeans_euc_score = silhouette_score(x, kmeans_euc_labels)

        kmeans_dtw_model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kmeans_dtw_labels = kmeans_dtw_model.labels_
        kmeans_dtw_score = silhouette_score(x, kmeans_dtw_labels)

        kmeans_sdtw_model = TimeSeriesKMeans(n_clusters=n_clusters, metric="softdtw", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kmeans_sdtw_labels = kmeans_sdtw_model.labels_
        kmeans_sdtw_score = silhouette_score(x, kmeans_sdtw_labels)

        kernel_means_model = KernelKMeans(n_clusters=n_clusters, kernel="gak", max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kernel_means_labels = kernel_means_model.labels_
        kernel_means_score = silhouette_score(x, kernel_means_labels)

        kshape_model = KShape(n_clusters=n_clusters, max_iter=max_iter, n_init=n_init, random_state=random_state).fit(x)
        kshape_labels = kshape_model.labels_
        kshape_score = silhouette_score(x, kshape_labels)

        results = {
            "n_clusters": n_clusters, 
            "KMeans euclidean": kmeans_euc_score, 
            "KMeans dtw": kmeans_dtw_score, 
            "KMeans soft dtw": kmeans_sdtw_score, 
            "KernelMeans": kernel_means_score, 
            "KShape": kshape_score
        }
        results_df = pd.concat([results_df, pd.DataFrame.from_records([results])], ignore_index=True)

    results_df.set_index('n_clusters', inplace=True)    
    return results_df

In [35]:
# not normalized data:
data_pi_wide = pd.read_csv("./dash_app/datasets/data_pi_wide.csv", sep=";", index_col="month")

method_quality_df = method_quality(
    data_pi_wide, 
    norm=False,
    random_state=15,
    n_init=6,
    max_iter=100,
)
method_quality_df.to_csv("./dash_app/datasets/method_quality.csv", sep=";")


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)



In [36]:
# normalized data:
method_quality_df = method_quality(
   data_pi_wide, 
   norm=True,
   random_state=16,
   n_init=8,
   max_iter=100,
)
method_quality_df.to_csv("./dash_app/datasets/method_quality_norm.csv", sep=";")


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


2-Dimensional data passed. Assuming these are 100 1-dimensional timeseries


In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)



Plot the results for not normalized data:

In [37]:
df = pd.read_csv("./dash_app/datasets/method_quality.csv", sep=";", index_col="n_clusters")

fig = go.Figure()
plot_title = "Clustering quality measurement of all methods with silhouette score (not normalized data)"
# Loop dataframe columns and plot columns to the figure
for i in range(0, len(df.columns)):
    col_name = df.columns.values[i]
    fig.add_trace(go.Scatter(x=df.index, y=df[col_name], mode='lines', name=col_name))
fig.update_xaxes(title_text="n_clusters", dtick=[2,len(df.index)+1])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.update_layout(title_text=plot_title, template="plotly_dark", height=500)
fig.show()

Results for normalized data:

In [38]:
df = pd.read_csv("./dash_app/datasets/method_quality_norm.csv", sep=";", index_col="n_clusters")

fig = go.Figure()
plot_title = "Clustering quality measurement of all methods with silhouette score (normalized data)"
# Loop dataframe columns and plot columns to the figure
for i in range(0, len(df.columns)):
    col_name = df.columns.values[i]
    fig.add_trace(go.Scatter(x=df.index, y=df[col_name], mode='lines', name=col_name))#, line_shape="spline"))
fig.update_xaxes(title_text="n_clusters", dtick=[2,len(df.index)+1])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.update_layout(title_text=plot_title, template="plotly_dark", height=500)
fig.show()

### Some results of the quality assessment:

- KMeans euclidean is very fast and performs well
- KernelMeans performs well only on the normalized data
- Soft dtw is slow on the unnormalized data
- KShape behaves pretty randomly
- Silhouette score decreases with n_clusters increasing
- The correlation mean score increases with n_clusters on normalized data

### Some results of the seasonality-clustering:

There are several typical clusters of page impressions:
  - relatively low throughout the year and high before christmas (Spielzeug-Fahrzeuge, Kuscheltiere)
  - high in winter (Skihelme & Snowboardhelme, Thermostate)
  - high in summer (Gartenschläuche, Motorsensen, Sonnenpflege, Sonnenschirme)
  - high before summer (Gartenstühle, Fahrradschlösser, Dünger)
  - high around easter and before christmas (Kinderroller)
  - no suitable clusters (Autobatterien, Steuersoftware)