In [1]:
import plotly
plotly.offline.init_notebook_mode(connected = True)

In [2]:
import pandas as pd
import numpy as np
from help.preprocessing_okved import fix_okved
import gc

# Стрижка дерева кодов ОКВЭД
Просто удаляем связи, ??**.**??**.**\* 

## Аггрегирование нижних уровней

In [3]:
okved_graph = pd.read_pickle('./data/okved_graph.pickle')
okved_data = pd.read_csv('./data/okved_2014_w_sections.csv', index_col=0)
okved_graph.edges()[1].unique().__len__()

2637

In [4]:
okved_data['native_code'].map(len).value_counts()  # Укоротим граф до 4 знаков

7    1184
8     617
5     488
4     259
2      88
Name: native_code, dtype: int64

In [5]:
# Словари id-шники - коды и коды - id-шники
id_to_code = okved_data['native_code'].to_dict()
id_to_code[0] = '0'
code_to_id = {v: u for u, v in id_to_code.items()}

# Обрезанные до максимум 1 точки коды
new_id_to_code = {}
for _id, code in id_to_code.items():
    new_id_to_code[_id] = fix_okved(code[:5])

pd.Series(new_id_to_code).map(len).value_counts()

5    1840
4     701
2      95
1       1
dtype: int64

In [6]:
mask = pd.Series(id_to_code)[pd.Series(id_to_code).str.len() < 5].index
okved_graph = okved_graph.subgraph(mask)
okved_graph.num_edges()

19308

### Применим RGCN

In [7]:
from models.rgcn import *
from help.negative_sampler import NegativeSamplerRel

In [8]:
num_negs = 1
neg_share = True
device = torch.device('cuda')
num_hidden = 512
n_emb = 256
num_layers = 3
regularizer = None
num_bases = None
dropout = 0.1
lr = 0.001
num_epochs = 100
best_loss = 1000000
last_improvement = 0
require_improvements = 50
best_state = None
n_rels = 3
reg_param = 0.01

In [9]:
g = okved_graph.clone().to(device)

nfeat = g.ndata['feat'].float().to(device)
in_feats = nfeat.shape[1]
n_edges = g.num_edges()

bsize = 19308
n_batch = n_edges // bsize  # Размер батча

rgcn = RGCN(in_feats=in_feats,  # Размер тензора на узле
            n_hidden=num_hidden,  # Количество скрытых слоев
            n_classes=n_emb,  # Размер эмбеддинга 
            n_layers=num_layers,  # Количество слоев
            activation=torch.nn.functional.relu,  # Функция активации
            dropout=dropout,  
            regularizer=regularizer,  # Regulazir для RGCN слоев. None, basis, и bdd
            n_rels=n_rels,  # Количество типов связей
            num_bases=num_bases  # Количество слагаемых в линейной комбинации
           ).to(device)

model = LinkPredictor(rgcn, n_rels=n_rels, reg_param=reg_param).to(device)
sampler = NegativeSamplerRel(k=num_negs)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 10, eta_min=1e-7)

In [10]:
%%time
for epoch in range(num_epochs):
    epoch_loss = []
    
    for i in range(0, n_edges - 1, bsize):
        
        g_batch = g.edge_subgraph(list(range(i, min(i + bsize, n_edges))))
        n_feat_batch = g_batch.ndata['feat'].float().to(device)
        neg_graph = sampler(g_batch, device)
        # Compute loss and prediction
        pred = model(g_batch, n_feat_batch)
        loss = model.get_loss(pred, g_batch, neg_graph)
        epoch_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        gc.collect()
        torch.cuda.empty_cache()

    loss = np.mean(epoch_loss)
    if epoch % 10 == 9:
        print(f'Epoch : {epoch + 1:02d}  |  Loss : {loss:.4f}')

    if loss < best_loss:
        best_loss = loss
        last_improvement = 0
        best_state = model.state_dict()
    else:
        last_improvement += 1

    if last_improvement > require_improvements:
        print(f"No improvement found during the {require_improvements} last iterations, stopping optimization.")
        model.load_state_dict(best_state)
        break

Epoch : 10  |  Loss : 0.5995
Epoch : 20  |  Loss : 0.5707
Epoch : 30  |  Loss : 0.5274
Epoch : 40  |  Loss : 0.5217
Epoch : 50  |  Loss : 0.5053
Epoch : 60  |  Loss : 0.4971
Epoch : 70  |  Loss : 0.4890
Epoch : 80  |  Loss : 0.4851
Epoch : 90  |  Loss : 0.4759
Epoch : 100  |  Loss : 0.4712
Wall time: 9.36 s


### Валидация

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.exceptions import ConvergenceWarning
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from tqdm import tqdm

embeddings_bert = np.load('./data/okved_embeddings.npy')
okved_consumption = pd.read_csv('./data/okved_consumption.csv')

In [12]:
model.eval()
embeddings_model = model(g, nfeat).detach().cpu().numpy()
embeddings_model_z = torch.zeros(2637, 256)
embeddings_model_z[mask] = torch.tensor(embeddings_model)
embeddings_model = embeddings_model_z[1:]

embeddings_model_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_model)
embeddings_bert_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_bert)

In [13]:
torch.save(model, './weights/RGCN_CUT_model.th')
torch.save(model.state_dict(), './weights/RGCN_CUT_weights.th')
torch.save(embeddings_model, './weights/RGCN_CUT_emb.th')

In [14]:
import plotly.express as px
import plotly

plotly.offline.init_notebook_mode(connected=True)
sections = list(map(str, okved_data['section_id'].values))

fig = px.scatter(x=embeddings_model_2d[:, 0], y=embeddings_model_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. OUR MODEL</b>', hover_name=okved_data['name_okved'])

fig

In [15]:
fig = px.scatter(x=embeddings_bert_2d[:, 0], y=embeddings_bert_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. BERT</b>', hover_name=okved_data['name_okved'])

fig

In [16]:
prov_indices = okved_consumption['okved_provider'].map(fix_okved).map(code_to_id)
cons_indices = okved_consumption['okved_consumer'].map(fix_okved).map(code_to_id)

X_bert = np.column_stack((embeddings_bert[prov_indices], embeddings_bert[cons_indices]))
X_bert = StandardScaler().fit_transform(X_bert)

X_model = np.column_stack((embeddings_model[prov_indices], embeddings_model[cons_indices]))
X_model = StandardScaler().fit_transform(X_model)

y = okved_consumption['normalized_consumption']

#### Линейная модель

In [17]:
model_linear = LinearRegression().fit(X_model, y)
score = model_linear.score(X_model, y)
print(f'Модель: {score: 0.4f}')

model_linear = LinearRegression().fit(X_bert, y)
score = model_linear.score(X_bert, y)
print(f'Берт: {score: 0.4f}')

Модель:  0.0698
Берт:  0.0996
