In [1]:
import pandas as pd
import numpy as np
from help.preprocessing_okved import fix_okved
import gc

## Без агрегирования нижних уровней

In [2]:
okved_graph = pd.read_pickle('./data/okved_graph.pickle')
okved_data = pd.read_csv('./data/okved_2014_w_sections.csv', index_col=0)
# Словари id-шники - коды и коды - id-шники
id_to_code = okved_data['native_code'].to_dict()
id_to_code[0] = '0'
code_to_id = {v: u for u, v in id_to_code.items()}

# Обрезанные до максимум 1 точки коды
new_id_to_code = {}
for _id, code in id_to_code.items():
    new_id_to_code[_id] = fix_okved(code[:5])

pd.Series(new_id_to_code).map(len).value_counts()

okved_graph.edges()[1].unique().__len__()

2637

### Применим RGCN

In [3]:
from models.rgcn import *
from help.negative_sampler import NegativeSamplerRel

In [None]:
num_negs = 1
neg_share = True
device = torch.device('cuda')
num_hidden = 512
n_emb = 256
num_layers = 6
regularizer = None
num_bases = None
dropout = 0.1
lr = 0.001
num_epochs = 1000
best_loss = 1000000
last_improvement = 0
require_improvements = 50
best_state = None
n_rels = 3
reg_param = 0.01

In [None]:
g = okved_graph.clone().to(device)

nfeat = g.ndata['feat'].float().to(device)
in_feats = nfeat.shape[1]
n_edges = g.num_edges()

bsize = 2 ** 14 * 3
n_batch = n_edges // bsize  # Размер батча

rgcn = RGCN(in_feats=in_feats,  # Размер тензора на узле
            n_hidden=num_hidden,  # Количество скрытых слоев
            n_classes=n_emb,  # Размер эмбеддинга 
            n_layers=num_layers,  # Количество слоев
            activation=torch.nn.functional.relu,  # Функция активации
            dropout=dropout,  
            regularizer=regularizer,  # Regulazir для RGCN слоев. None, basis, и bdd
            n_rels=n_rels,  # Количество типов связей
            num_bases=num_bases  # Количество слагаемых в линейной комбинации
           ).to(device)

model = LinkPredictor(rgcn, n_rels=n_rels, reg_param=reg_param).to(device)
sampler = NegativeSamplerRel(k=num_negs)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 10, eta_min=1e-7)

In [None]:
%%time
for epoch in range(num_epochs):
    epoch_loss = []
    
    for i in range(0, n_edges - 1, bsize):
        
        g_batch = g.edge_subgraph(list(range(i, min(i + bsize, n_edges))))
        n_feat_batch = g_batch.ndata['feat'].float().to(device)
        neg_graph = sampler(g_batch, device)
        # Compute loss and prediction
        pred = model(g_batch, n_feat_batch)
        loss = model.get_loss(pred, g_batch, neg_graph)
        epoch_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        gc.collect()
        torch.cuda.empty_cache()

    loss = np.mean(epoch_loss)
    if epoch % 100 == 99:
        print(f'Epoch : {epoch + 1:02d}  |  Loss : {loss:.4f}')

    if loss < best_loss:
        best_loss = loss
        last_improvement = 0
        best_state = model.state_dict()
    else:
        last_improvement += 1

    if last_improvement > require_improvements:
        print(f"No improvement found during the {require_improvements} last iterations, stopping optimization.")
        model.load_state_dict(best_state)
        break

### Валидация

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold
from tqdm import tqdm

embeddings_bert = np.load('./data/okved_embeddings.npy')
okved_consumption = pd.read_csv('./data/okved_consumption.csv')

In [None]:
model.eval()
model = model.to('cpu')
torch.cuda.empty_cache()
embeddings_model = model(g.cpu(), nfeat.cpu()).detach().numpy()[1:]

embeddings_model_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_model)
embeddings_bert_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_bert)

In [None]:
# Модель 128 скрытый слой, 10 слоев, 100 эпох, Модель: 0.04 ± 0.07
# torch.save(model.state_dict(), './weights/model_128x10x256x100.pt')
# torch.load('./weights/model_128x10x256x100.pt')

# Модель 512 скрытый слой, 6 слоев, 639 эпох, Модель: 0.04 ± 0.07
torch.save(model.state_dict(), './weights/model_512x6x256x639.pt')
# torch.load('./weights/model_512x6x256x639.pt')

In [None]:
import plotly.express as px
import plotly

plotly.offline.init_notebook_mode(connected=True)
sections = list(map(str, okved_data['section_id'].values))

fig = px.scatter(x=embeddings_model_2d[:, 0], y=embeddings_model_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. OUR MODEL</b>', hover_name=okved_data['name_section'])

fig

In [None]:
fig = px.scatter(x=embeddings_bert_2d[:, 0], y=embeddings_bert_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. BERT</b>', hover_name=okved_data['name_section'])

fig

In [None]:
prov_indices = okved_consumption['okved_provider'].map(fix_okved).map(code_to_id)
cons_indices = okved_consumption['okved_consumer'].map(fix_okved).map(code_to_id)

X_bert = np.column_stack((embeddings_bert[prov_indices], embeddings_bert[cons_indices]))
X_bert = StandardScaler().fit_transform(X_bert)

X_model = np.column_stack((embeddings_model[prov_indices], embeddings_model[cons_indices]))
X_model = StandardScaler().fit_transform(X_model)

y = okved_consumption['normalized_consumption']

### Линейная модель

In [None]:
model = LinearRegression().fit(X_model, y)
score = model.score(X_model, y)
print(f'Модель: {score: 0.4f}')

model = LinearRegression().fit(X_bert, y)
score = model.score(X_bert, y)
print(f'Берт: {score: 0.4f}')

### Многослойный перцептрон

In [None]:
from models.mlp_regressor import MLPRegressorTorch

X_model_th = torch.FloatTensor(X_model).to(device)
X_bert_th = torch.FloatTensor(X_bert).to(device)
y_th = torch.FloatTensor(y.values).to(device)

In [None]:
kf = KFold(n_splits=6)

model = MLPRegressorTorch(X_model.shape[1], (100,)).to(device)
val_scores = []
for train, val in kf.split(X_model_th):
    model.fit(X_model_th[train], y_th[train], val=[X_model_th[val], y_th[val]], epochs=200)
    val_scores.append(model.r2_score(X_model_th[val], y_th[val]))

print(f'AVG VAL model R2: {np.mean(val_scores): 0.4f}')

In [None]:
model = MLPRegressorTorch(X_bert.shape[1], (100,)).to(device)
val_scores = []
for train, val in kf.split(X_bert_th):
    model.fit(X_bert_th[train], y_th[train], val=[X_bert_th[val], y_th[val]], epochs=200)
    val_scores.append(model.r2_score(X_bert_th[val], y_th[val]))

print(f'AVG VAL bert R2: {np.mean(val_scores): 0.4f}')

In [None]:
bert_scores = []
model_scores = []

kf = KFold(n_splits=6)

for i in tqdm(range(10)):

    model = MLPRegressorTorch(X_model.shape[1], (100,)).to(device)  # эмбеддинги нашей моделью
    val_scores = []
    # Обучаем модель с помощью кросс валидации
    for train, val in kf.split(X_model_th):
        # Во время обучения контролируем переобучение с помощью валидационных данных
        model.fit(X_model_th[train], y_th[train],
                  val=[X_model_th[val], y_th[val]],
                  epochs=200)
        val_scores.append(model.r2_score(X_model_th[val], y_th[val]))

    model_scores.append(np.mean(val_scores))

    model = MLPRegressorTorch(X_bert.shape[1], (100,)).to(device)
    val_scores = []

    for train, val in kf.split(X_bert_th):
        model.fit(X_bert_th[train], y_th[train],
                  val=[X_bert_th[val], y_th[val]],
                  epochs=200)

        val_scores.append(model.r2_score(X_bert_th[val], y_th[val]))

    bert_scores.append(np.mean(val_scores))

In [17]:
print(f'Модель: {np.mean(model_scores):.2f} ± {np.std(model_scores):.2f}')
print(f'Берт: {np.mean(bert_scores):.2f} ± {np.std(bert_scores):.2f}')

Модель: 0.06 ± 0.06
Берт: 0.12 ± 0.10
