In [26]:
import pandas as pd
import numpy as np
from help.preprocessing_okved import fix_okved
import gc

## Без агрегирования нижних уровней

In [27]:
okved_graph = pd.read_pickle('./data/okved_graph.pickle')
okved_data = pd.read_csv('./data/okved_2014_w_sections.csv', index_col=0)
# Словари id-шники - коды и коды - id-шники
id_to_code = okved_data['native_code'].to_dict()
id_to_code[0] = '0'
code_to_id = {v: u for u, v in id_to_code.items()}

# Обрезанные до максимум 1 точки коды
new_id_to_code = {}
for _id, code in id_to_code.items():
    new_id_to_code[_id] = fix_okved(code[:5])

pd.Series(new_id_to_code).map(len).value_counts()

okved_graph.edges()[1].unique().__len__()

2637

### Применим RGCN

In [28]:
from models.rgcn import *
from help.negative_sampler import NegativeSamplerRel

In [29]:
num_negs = 1
neg_share = False
device = torch.device('cuda')
num_hidden = 312
num_layers = 7
regularizer = 'basis'
num_bases = 10
dropout = 0.1
lr = 0.0001
num_epochs = 200
best_loss = 1000000
last_improvement = 0
require_improvements = 50
best_state = None
n_rels = 3
reg_param = 0

In [30]:
g = okved_graph.clone().to(device)

nfeat = g.ndata['feat'].float().to(device)
in_feats = nfeat.shape[1]
n_edges = g.num_edges()

bsize = 131072
n_batch = n_edges // bsize  # Размер батча

rgcn = RGCN(in_feats, num_hidden, num_hidden, num_layers,
            torch.nn.functional.relu, dropout, regularizer, n_rels, num_bases).to(device)

model = LinkPredictor(rgcn, n_rels=n_rels, reg_param=reg_param).to(device)
sampler = NegativeSamplerRel(k=num_negs)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [31]:
%%time
for epoch in range(num_epochs):
    epoch_loss = []
    for i in range(0, n_edges - 1, bsize):
        g_batch = g.edge_subgraph(list(range(i, min(i + bsize, n_edges))))
        n_feat_batch = g_batch.ndata['feat'].float().to(device)
        neg_graph = sampler(g_batch, device)
        # Compute loss and prediction
        pred = model(g_batch, n_feat_batch)
        loss = model.get_loss(pred, g_batch, neg_graph)
        epoch_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        gc.collect()
        torch.cuda.empty_cache()

    loss = np.mean(epoch_loss)
    print(f'Epoch : {epoch:02d}  |  Loss : {loss:.4f}')

    if loss < best_loss:
        best_loss = loss
        last_improvement = 0
        best_state = model.state_dict()
    else:
        last_improvement += 1

    if last_improvement > require_improvements:
        print(f"No improvement found during the {require_improvements} last iterations, stopping optimization.")
        model.load_state_dict(best_state)
        break

Epoch : 00  |  Loss : 0.6892
Epoch : 01  |  Loss : 0.6792
Epoch : 02  |  Loss : 0.6740
Epoch : 03  |  Loss : 0.6706
Epoch : 04  |  Loss : 0.6660
Epoch : 05  |  Loss : 0.6613
Epoch : 06  |  Loss : 0.6528
Epoch : 07  |  Loss : 0.6428
Epoch : 08  |  Loss : 0.6320
Epoch : 09  |  Loss : 0.6220
Epoch : 10  |  Loss : 0.6094
Epoch : 11  |  Loss : 0.6030
Epoch : 12  |  Loss : 0.6023
Epoch : 13  |  Loss : 0.5943
Epoch : 14  |  Loss : 0.5853
Epoch : 15  |  Loss : 0.5807
Epoch : 16  |  Loss : 0.5773
Epoch : 17  |  Loss : 0.5704
Epoch : 18  |  Loss : 0.5689
Epoch : 19  |  Loss : 0.5660
Epoch : 20  |  Loss : 0.5624
Epoch : 21  |  Loss : 0.5591
Epoch : 22  |  Loss : 0.5550
Epoch : 23  |  Loss : 0.5543
Epoch : 24  |  Loss : 0.5507
Epoch : 25  |  Loss : 0.5487
Epoch : 26  |  Loss : 0.5469
Epoch : 27  |  Loss : 0.5435
Epoch : 28  |  Loss : 0.5437
Epoch : 29  |  Loss : 0.5414
Epoch : 30  |  Loss : 0.5382
Epoch : 31  |  Loss : 0.5355
Epoch : 32  |  Loss : 0.5340
Epoch : 33  |  Loss : 0.5320
Epoch : 34  | 

### Валидация

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold
from tqdm import tqdm

embeddings_bert = np.load('./data/okved_embeddings.npy')
okved_consumption = pd.read_csv('./data/okved_consumption.csv')

In [36]:
model.eval()
torch.cuda.empty_cache()
embeddings_model = model(g, nfeat).detach().cpu().numpy()[1:]

embeddings_model_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_model)
embeddings_bert_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_bert)

RuntimeError: CUDA out of memory. Tried to allocate 524.00 MiB (GPU 0; 6.00 GiB total capacity; 4.69 GiB already allocated; 0 bytes free; 5.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [23]:
import plotly.express as px
import plotly

plotly.offline.init_notebook_mode(connected=True)

sections = list(map(str, okved_data['section_id'].values))
fig = px.scatter(x=embeddings_model_2d[:, 0], y=embeddings_model_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. OUR MODEL</b>')

fig

In [24]:
fig = px.scatter(x=embeddings_bert_2d[:, 0], y=embeddings_bert_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. BERT</b>')

fig

In [16]:
prov_indices = okved_consumption['okved_provider'].map(fix_okved).map(code_to_id)
cons_indices = okved_consumption['okved_consumer'].map(fix_okved).map(code_to_id)

X_bert = np.column_stack((embeddings_bert[prov_indices], embeddings_bert[cons_indices]))
X_bert = StandardScaler().fit_transform(X_bert)

X_model = np.column_stack((embeddings_model[prov_indices], embeddings_model[cons_indices]))
X_model = StandardScaler().fit_transform(X_model)

y = okved_consumption['normalized_consumption']

#### Линейная модель

In [17]:
model = LinearRegression().fit(X_model, y)
score = model.score(X_model, y)
print(f'Модель: {score: 0.4f}')

model = LinearRegression().fit(X_bert, y)
score = model.score(X_bert, y)
print(f'Берт: {score: 0.4f}')

Модель:  0.1210
Берт:  0.0996


### Многослойный перцептрон

In [18]:
from models.mlp_regressor import MLPRegressorTorch

X_model_th = torch.FloatTensor(X_model).to(device)
X_bert_th = torch.FloatTensor(X_bert).to(device)
y_th = torch.FloatTensor(y.values).to(device)

In [19]:
kf = KFold(n_splits=6)

model = MLPRegressorTorch(X_model.shape[1], (100,)).to(device)
val_scores = []
for train, val in kf.split(X_model_th):
    model.fit(X_model_th[train], y_th[train], val=[X_model_th[val], y_th[val]], epochs=200)
    val_scores.append(model.r2_score(X_model_th[val], y_th[val]))

print(f'AVG VAL model R2: {np.mean(val_scores): 0.4f}')

AVG VAL model R2: -0.0577


In [20]:
model = MLPRegressorTorch(X_bert.shape[1], (100,)).to(device)
val_scores = []
for train, val in kf.split(X_bert_th):
    model.fit(X_bert_th[train], y_th[train], val=[X_bert_th[val], y_th[val]], epochs=200)
    val_scores.append(model.r2_score(X_bert_th[val], y_th[val]))

print(f'AVG VAL bert R2: {np.mean(val_scores): 0.4f}')

AVG VAL bert R2:  0.1404


In [21]:
bert_scores = []
model_scores = []

kf = KFold(n_splits=6)

for i in tqdm(range(20)):

    model = MLPRegressorTorch(X_model.shape[1], (100,)).to(device)  # эмбеддинги нашей моделью
    val_scores = []
    # Обучаем модель с помощью кросс валидации
    for train, val in kf.split(X_model_th):
        # Во время обучения контролируем переобучение с помощью валидационных данных
        model.fit(X_model_th[train], y_th[train],
                  val=[X_model_th[val], y_th[val]],
                  epochs=200)
        val_scores.append(model.r2_score(X_model_th[val], y_th[val]))

    model_scores.append(np.mean(val_scores))

    model = MLPRegressorTorch(X_bert.shape[1], (100,)).to(device)
    val_scores = []

    for train, val in kf.split(X_bert_th):
        model.fit(X_bert_th[train], y_th[train],
                  val=[X_bert_th[val], y_th[val]],
                  epochs=200)

        val_scores.append(model.r2_score(X_bert_th[val], y_th[val]))

    bert_scores.append(np.mean(val_scores))

100%|██████████| 20/20 [01:18<00:00,  3.93s/it]


In [22]:
print(f'Модель: {np.mean(model_scores):.2f} ± {np.std(model_scores):.2f}')
print(f'Берт: {np.mean(bert_scores):.2f} ± {np.std(bert_scores):.2f}')

Модель: 0.08 ± 0.08
Берт: 0.13 ± 0.08
