### В данном блоке проводим кластеризация на уровне пользователь-день по целевым событиям
### Далее планируется отследить в каких кластерах был пользователь и использовать его кластеры как новые "мета-признаки" для кластеризации

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import MiniBatchKMeans

In [3]:
from sklearn.metrics import calinski_harabasz_score
import datetime as dt

In [4]:
df = pd.read_csv('data1m.csv', dtype = {'party_rk': str})

In [5]:
df['KEY'] = df['party_rk'] + df['day']

In [6]:
df = df.drop_duplicates().reset_index(drop=True)

In [7]:
lbe = LabelEncoder()
lbe.fit(df['event_name'])
df.loc[:,'event_name'] = lbe.transform(df.loc[:, 'event_name'])

In [8]:
ohe = OneHotEncoder()
ohe.fit(df[['event_name']])

OneHotEncoder()

In [9]:
sparse = ohe.transform(df[['event_name']])
event_names = [f'event_{i}' for i in range(1, sparse.shape[1]+1)]
event_names[:5]

['event_1', 'event_2', 'event_3', 'event_4', 'event_5']

In [10]:
sparse_df = pd.DataFrame(sparse.toarray(), columns=event_names)
df_modelling = pd.concat((df['KEY'], sparse_df), axis=1)

In [11]:
df_modelling = df_modelling.groupby('KEY').agg('sum')
df = df[['KEY', 'party_rk']].drop_duplicates().reset_index(drop=True)

In [12]:
# Рассчитываем KMeans и считаем метрики
np.random.seed(42)

for n_clusters in range(2,40):
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, batch_size=300, n_init=n_clusters)
    cluster_labels = mbk.fit_predict(df_modelling[event_names])

    calinski_harabasz = calinski_harabasz_score(np.array(df_modelling[event_names]), cluster_labels)
    
    print(f"For n_clusters = {n_clusters}, The calinski_harabasz_score is :{calinski_harabasz}")

For n_clusters = 30, The calinski_harabasz_score is :96121.21361069386


In [13]:
# 30 и 37 кластеров показали себя лучше всех, остановимся на 30
cluster_labels = mbk.fit_predict(df_modelling[event_names])

In [14]:
vitrina = pd.merge(df_modelling.reset_index(), df[['KEY', 'party_rk']], on='KEY', how='left')
vitrina['clusters'] = cluster_labels
vitrina = vitrina[['party_rk', 'clusters']]

In [15]:
sparse = ohe.fit_transform(vitrina[['clusters']])
cluster_names = [f'cluster_{i}' for i in range(1, sparse.shape[1]+1)]

sparse = pd.DataFrame(sparse.toarray(), columns=cluster_names)
vitrina = pd.concat((vitrina[['party_rk']], sparse), axis=1)
vitrina.head()

Unnamed: 0,party_rk,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,...,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30
0,1000083,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1000129,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10001639,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
vitrina = vitrina.groupby('party_rk').agg('sum').reset_index()

In [19]:
vitrina['total_acts'] = vitrina[cluster_names].apply(sum, axis=1)

In [21]:
vitrina.to_csv('./vitrina.csv', index=False)