### Обучение автоэнкодера для снижения размерности признакового пространства (202 -> 2) и последующей кластеризации латентного пространства

In [2]:
import pandas as pd
import numpy as np
import warnings

# Silence every Python warning for the rest of the session. This also hides
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import MiniBatchKMeans

from sklearn.metrics import calinski_harabasz_score
import datetime as dt

In [4]:
import torch

from torch import nn
from torch.nn import functional as F

from torch.utils.data import TensorDataset, DataLoader

### Создаем витрину для обучения 

In [5]:
# Load the raw event log; party_rk is forced to text so the identifier is
# kept exactly as written in the file.
df = pd.read_csv('data1m.csv', dtype={'party_rk': str})

# KEY is party_rk and day glued together, then exact duplicate rows are
# dropped and the index is rebuilt as 0..n-1.
# NOTE(review): assumes 'day' is parsed as a string column; there is no
# separator between the two parts, so confirm the keys cannot collide.
df = (
    df.assign(KEY=df['party_rk'] + df['day'])
      .drop_duplicates()
      .reset_index(drop=True)
)

In [6]:
# Learn the event_name vocabulary and overwrite the column with its integer
# codes. The fitted encoder is kept in `lbe` so codes can be mapped back.
lbe = LabelEncoder().fit(df['event_name'])
df.loc[:, 'event_name'] = lbe.transform(df['event_name'])

In [7]:
# One-hot encoder over the (already integer-coded) event_name column.
# The input is a single-column DataFrame, hence the double brackets.
ohe = OneHotEncoder()
ohe.fit(X=df[['event_name']])

OneHotEncoder()

In [8]:
# One row per event record, one indicator column per distinct event_name.
sparse = ohe.transform(df[['event_name']])

# Column labels are 1-based: event_1 .. event_<number of one-hot columns>.
event_names = ['event_{}'.format(col + 1) for col in range(sparse.shape[1])]
event_names[:5]

['event_1', 'event_2', 'event_3', 'event_4', 'event_5']

In [9]:
# Densify the one-hot matrix into a labelled frame.
# NOTE(review): toarray() materialises the full dense matrix in memory.
sparse_df = pd.DataFrame(data=sparse.toarray(), columns=event_names)

# Put KEY next to the indicator columns; concat aligns the two on row index.
df_modelling = pd.concat([df['KEY'], sparse_df], axis='columns')

In [13]:
# Collapse to one row per KEY: each event_* column becomes the number of
# records with that event for the key. KEY is restored as a regular column.
df_modelling = df_modelling.groupby('KEY').sum().reset_index()

### Преобразуем признаки в тензоры

In [25]:
# Feature matrix (rows = keys, columns = event_names) as a float32 tensor.
X = torch.tensor(df_modelling[event_names].to_numpy(), dtype=torch.float32)

In [27]:
# Mini-batch size used when iterating over X.
batch_size = 1024

# Batches are served in the stored row order (no shuffling is requested).
# NOTE(review): if len(X) % batch_size == 1 the final batch holds a single
# row, which BatchNorm1d rejects in training mode - confirm or use drop_last.
dataloader = DataLoader(dataset=X, batch_size=batch_size)

### Создаем Auto Encoder

In [53]:
class AE(nn.Module):
    """Fully connected autoencoder with a 2-dimensional bottleneck.

    The encoder maps ``input_len`` features to 32 hidden units and then to a
    2-dimensional latent code; the decoder maps the code back through 32
    hidden units to ``input_len`` outputs.

    Args:
        input_len: Number of input (and reconstructed) features.
    """

    def __init__(self, input_len):
        super().__init__()

        # input_len -> 32 -> 2; the latent code is batch-normalised and
        # passed through SELU.
        self.encoder = nn.Sequential(
            nn.Linear(input_len, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.BatchNorm1d(2),
            nn.SELU(),
        )

        # 2 -> 32 -> input_len; the final layer is linear (no activation).
        self.decoder = nn.Sequential(
            nn.Linear(2, 32),
            nn.BatchNorm1d(32),
            nn.SELU(),
            nn.Linear(32, input_len),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        latent = self.encoder(x)
        return self.decoder(latent)

    def get_latent(self, x):
        """Return the 2-dimensional latent code of ``x`` without gradients.

        The encoder runs under ``torch.no_grad()`` so no autograd graph is
        built for the (possibly very large) input; the result does not
        require grad, exactly like the previously detached tensor.

        The train/eval mode of the module is not changed here: call
        ``.eval()`` beforehand if BatchNorm should use its running statistics.
        """
        with torch.no_grad():
            return self.encoder(x)

In [54]:
# The network width follows the number of one-hot event columns.
input_len = len(event_names)
model = AE(input_len=input_len)

# Reconstruction error: mean squared error between output and input.
criterion = nn.MSELoss()

# NOTE(review): the learning rate was previously spelled `3*10e-4`, which
# evaluates to 3e-3 (10e-4 == 1e-3). The effective value is kept here; if
# 3e-4 was intended, change it deliberately and re-run training.
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3)

### Обучаемся 

In [55]:
# Number of full passes over the dataloader.
epochs = 10

for epoch in range(epochs):
    # Per-batch monitoring losses for this epoch, stored as plain floats so
    # the mean does not depend on implicit tensor -> NumPy conversion.
    losses = []

    for X_batch in dataloader:
        # Optimisation step: BatchNorm layers use batch statistics.
        model.train()

        # Call the module itself (not .forward) so nn.Module hooks run.
        outp = model(X_batch)
        loss = criterion(outp, X_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Monitoring: re-score the same batch after the update, in eval mode
        # and without building an autograd graph.
        model.eval()
        with torch.no_grad():
            outp = model(X_batch)
            losses.append(criterion(outp, X_batch).item())

    print(f'EPOCH {epoch+1}, MSE:', round(float(np.mean(losses)), 4))

# Leave the model in inference mode for the code that follows, regardless of
# how many batches were processed.
model.eval()

EPOCH 1, MSE: 0.0059
EPOCH 2, MSE: 0.0035
EPOCH 3, MSE: 0.0028
EPOCH 4, MSE: 0.0025
EPOCH 5, MSE: 0.0024
EPOCH 6, MSE: 0.0023
EPOCH 7, MSE: 0.0022
EPOCH 8, MSE: 0.0022
EPOCH 9, MSE: 0.0021
EPOCH 10, MSE: 0.0021


### Извлекаем скрытое пространство для нашей витрины и сохраняем результаты 

In [62]:
# Switch to inference mode explicitly so BatchNorm uses its running
# statistics (and does not update them) no matter what state earlier code
# left the model in.
model.eval()

# Latent codes for every row of X, returned detached from autograd.
encoded_features = model.get_latent(X)

In [67]:
# Wrap the two latent coordinates in a labelled frame and write it to disk
# without the index column.
# NOTE(review): KEY is not written, so rows can only be matched back to
# df_modelling by position - confirm this is what downstream code expects.
embeddings_df = pd.DataFrame(
    data=encoded_features.numpy(),
    columns=['emb_1', 'emb_2'],
)
embeddings_df.to_csv('./embeddings_df.csv', index=False)