In [1]:
import pandas as pd
import numpy as np
from preprocessing_okved import fix_okved
import gc

# Задача 1
Агрегируем связи с более низких уровней на более высокие, создаем "укороченный" граф, решает link prediction, тестируем

## Версия 1. Стрижка дерева кодов ОКВЭД
Просто заменяем те коды, ??**.**??**.**\* на ??**.**??

## Аггрегирование нижних уровней

In [2]:
okved_graph = pd.read_pickle('../data/graph/okved_graph.pickle')
okved_data = pd.read_csv('../data/okved2/okved_2014_w_sections.csv', index_col=0)
okved_graph.edges()[1].unique().__len__()

2637

In [3]:
okved_data['native_code'].map(len).value_counts()  # Укоротим граф до 4 знаков

7    1184
8     617
5     488
4     259
2      88
Name: native_code, dtype: int64

In [4]:
# Словари id-шники - коды и коды - id-шники
id_to_code = okved_data['native_code'].to_dict()
id_to_code[0] = '0'
code_to_id = {v: u for u, v in id_to_code.items()}

# Обрезанные до максимум 1 точки коды
new_id_to_code = {}
for _id, code in id_to_code.items():
    new_id_to_code[_id] = fix_okved(code[:5])

pd.Series(new_id_to_code).map(len).value_counts()

5    1840
4     701
2      95
1       1
dtype: int64

In [5]:
def replace_code(code_id: int) -> int:
    """
    Меняет код на родительский или оставляет таким же. Коды родительские те, которые с 0 или 1 точкой
    """
    code_id = int(code_id)
    native_code = new_id_to_code[code_id]

    return code_to_id[native_code]

In [6]:
okved_graph.edges()[0].apply_(replace_code)
okved_graph.edges()[1].apply_(replace_code)

tensor([   3,    3,    3,  ..., 2614, 2615, 2616])

In [7]:
okved_graph.edges()[1].unique().__len__()

836

### Применим RGCN

In [8]:
import dgl
import dgl.nn as gnn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from negative_sampler import NegativeSamplerRel


class RGCN(nn.Module):
    def __init__(self, in_feats, n_hidden, n_classes, n_layers, activation, dropout, n_rels):
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_classes = n_classes
        self.layers = nn.ModuleList()

        if n_layers > 1:
            self.layers.append(gnn.RelGraphConv(in_feats, n_hidden, num_rels=n_rels))
            for i in range(1, n_layers - 1):
                self.layers.append(gnn.RelGraphConv(n_hidden, n_hidden, num_rels=n_rels))

            self.layers.append(gnn.RelGraphConv(n_hidden, n_classes, num_rels=n_rels))
        else:
            self.layers.append(gnn.RelGraphConv(in_feats, n_classes, num_rels=n_rels))

        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, g, x):
        h = x
        edge_types = g.edata['type']
        norm = g.edata['norm'].view(-1, 1)
        for l, layer in enumerate(self.layers):
            h = layer(g, h, edge_types, norm).to(device)
            if l != len(self.layers) - 1:
                h = self.activation(h)
                h = self.dropout(h)
        return h


class LinkPredictor(nn.Module):
    def __init__(self, rgcn, n_rels, reg_param=0.01):
        """
        Parameters
        ----------
        rgcn : RGCN
            Модель графовой нейронной сети
        n_rels : int

        reg_param : float
            Параметр регуляризации
        """
        super().__init__()
        self.rgcn = rgcn
        self.reg_param = reg_param
        self.w_relation = nn.Parameter(torch.Tensor(n_rels, self.rgcn.n_classes))
        nn.init.xavier_uniform_(self.w_relation, gain=nn.init.calculate_gain('relu'))

    def forward(self, g, x):
        """
        Parameters
        ----------
        g : dgl.DGLHeteroGraph
            граф кодов ОКВЭД
        x : torch.Tensor
            эмбеддинги описаний
        """
        return F.dropout(self.rgcn(g, x), p=0.2)

    def calc_score(self, embedding, graph):
        """
        Возвращает DistMult. https://pykeen.readthedocs.io/en/stable/api/pykeen.models.DistMult.html

        Parameters
        ----------
        embedding : torch.Tensor
            эмбеддинг узлов
        graph : dgl.DGLHeteroGraph
            граф кодов ОКВЭД
        """
        # DistMult
        source, target, num_relation = graph.edges(form='all')
        edge_types = graph.edata['type'][num_relation]  # edge type
        s = embedding[source]
        r = self.w_relation[edge_types]
        o = embedding[target]
        score = torch.sum(s * r * o, dim=1)
        return score

    def regularization_loss(self, embedding):
        """
        Возвращает l2 регуляризацию в квадрате
         
        Parameters
        ----------
        embedding : torch.Tensor
            эмбеддинг узлов
        """
        return torch.mean(embedding.pow(2)) + torch.mean(self.w_relation.pow(2))

    def get_loss(self, embedding, pos_graph, neg_graph):
        """
        Вычисляет полную ошибку, по положительным и отрицательным примерам
        
        Parameters
        ----------
        embedding : torch.Tensor
            эмбеддинг узлов
        pos_graph : dgl.DGLHeteroGraph
            граф кодов ОКВЭД
        neg_graph : dgl.DGLHeteroGraph
            граф случайно созданных связей между кодами ОКВЭД

        """
        pos_score = self.calc_score(embedding, pos_graph)
        neg_score = self.calc_score(embedding, neg_graph)
        score = torch.cat([pos_score, neg_score])
        label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)]).long()
        predict_loss = F.binary_cross_entropy_with_logits(score, label.float())

        reg_loss = self.regularization_loss(embedding)
        return predict_loss + self.reg_param * reg_loss


In [9]:
num_negs = 1
neg_share = False
device = torch.device('cuda')
num_hidden = 256
num_layers = 2
dropout = 0.2
lr = 0.001
num_epochs = 20
best_loss = 1000000
last_improvement = 0
require_improvements = 50
best_state = None
n_rels = 3
reg_param = 0

In [10]:
g = okved_graph.clone().to(device)

nfeat = g.ndata['feat'].float().to(device)
in_feats = nfeat.shape[1]
n_edges = g.num_edges()

bsize = 131072
n_batch = n_edges // bsize  # Размер батча

rgcn = RGCN(in_feats, num_hidden, num_hidden, num_layers, F.relu, dropout, n_rels)
model = LinkPredictor(rgcn, n_rels=n_rels, reg_param=reg_param).to(device)
sampler = NegativeSamplerRel(k=num_negs)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [11]:
% % time
for epoch in range(num_epochs):
    epoch_loss = []
    for i in range(0, n_edges - 1, bsize):
        g_batch = g.edge_subgraph(list(range(i, min(i + bsize, n_edges))))
        nfeat_batch = g_batch.ndata['feat'].float().to(device)
        neg_graph = sampler(g_batch).to(device)

        # Compute loss and prediction
        pred = model(g_batch, nfeat_batch)
        loss = model.get_loss(pred, g_batch, neg_graph)
        epoch_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()

        # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # clip gradients
        optimizer.step()
        gc.collect()
        torch.cuda.empty_cache()

    loss = np.mean(epoch_loss)
    print(f'Epoch : {epoch:02d}  |  Loss : {loss:.4f}')

    if loss < best_loss:
        best_loss = loss
        last_improvement = 0
        best_state = model.state_dict()
    else:
        last_improvement += 1

    if last_improvement > require_improvements:
        print(f"No improvement found during the {require_improvements} last iterations, stopping optimization.")
        model.load_state_dict(best_state)
        break

UsageError: Line magic function `%` not found.


### Валидация

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.exceptions import ConvergenceWarning
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from tqdm import tqdm

embeddings_bert = np.load('../data/okved2/okved_embeddings.npy')
okved_consumption = pd.read_csv('../data/stats/okved_consumption.csv')

In [None]:
model.eval()
embeddings_model = model(g, nfeat).detach().cpu().numpy()[1:]

embeddings_model_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_model)
embeddings_bert_2d = TSNE(n_components=2, init='random').fit_transform(embeddings_bert)

In [None]:
import plotly.express as px
import plotly

plotly.offline.init_notebook_mode(connected=True)

sections = okved_data['section_id'].values
fig = px.scatter(x=embeddings_model_2d[:, 0], y=embeddings_model_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. OUR MODEL</b>')

fig

In [None]:
fig = px.scatter(x=embeddings_bert_2d[:, 0], y=embeddings_bert_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. BERT</b>')

fig

In [None]:
prov_indices = okved_consumption['okved_provider'].map(fix_okved).map(code_to_id)
cons_indices = okved_consumption['okved_consumer'].map(fix_okved).map(code_to_id)

X_bert = np.column_stack((embeddings_bert[prov_indices], embeddings_bert[cons_indices]))
X_bert = StandardScaler().fit_transform(X_bert)

X_model = np.column_stack((embeddings_model[prov_indices], embeddings_model[cons_indices]))
X_model = StandardScaler().fit_transform(X_model)

y = okved_consumption['normalized_consumption']

#### Линейная модель

In [None]:
model = LinearRegression().fit(X_model, y)
score = model.score(X_model, y)
print(f'Модель: {score: 0.4f}')

model = LinearRegression().fit(X_bert, y)
score = model.score(X_bert, y)
print(f'Берт: {score: 0.4f}')

### Многослойный перцептрон

In [32]:
class MLPRegressorTorch(nn.Module):
    def __init__(self, input_dim: int, layer_dims: tuple[int, ...] = (100,), activation=nn.ReLU()):
        super(MLPRegressor_torch, self).__init__()
        self.layers = [nn.Linear(input_dim, layer_dims[0]), activation]
        for i in range(1, len(layer_dims) - 1):
            self.layers.append(nn.Linear(layer_dims[i], layer_dims[i + 1]))
            self.layers.append(activation)
        else:
            self.layers.append(nn.Linear(layer_dims[-1], 1))
            self.layers.append(activation)
        self.layers = nn.Sequential(*self.layers)
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.layers(x)

    def r2_score(self, x, y):
        from sklearn.metrics import r2_score
        y_pred = self.forward(x).flatten()
        return r2_score(y.detach().cpu().numpy(), y_pred.detach().cpu().numpy())

    def get_loss(self, x, y):
        x = self.forward(x).flatten()
        return self.criterion(y, x)

    def fit(self, x, y, val: list = None, epochs: int = 1, verbose=False):
        self.train()
        optimizer = optim.Adam(self.parameters(), lr=0.0001)
        show_epochs = list(np.unique(np.geomspace(1, epochs, 15, dtype=int)))
        last_improvement = 0
        best_val_loss = 10 ** 10

        for i in range(epochs):
            loss = self.get_loss(x, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Валидация
            if val is not None:
                with torch.no_grad():
                    val_loss = self.get_loss(val[0], val[1]).item()

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    last_improvement = 0
                    best_state = self.state_dict()
                else:
                    last_improvement += 1

            # Печать
            if verbose:
                if i in show_epochs + [epochs - 1]:
                    with torch.no_grad():
                        if val is not None:
                            val_loss = self.get_loss(val[0], val[1]).item()
                            print(f'epoch {i + 1: 6d}  :  loss : {loss.item(): .5f} | val loss : {val_loss: .5f}')
                        else:
                            print(f'epoch {i + 1: 6d}  :  loss : {loss.item(): .5f}')
        else:
            if last_improvement > 0:
                self.load_state_dict(best_state)


X_model_th = torch.FloatTensor(X_model).to(device)
X_bert_th = torch.FloatTensor(X_bert).to(device)
y_th = torch.FloatTensor(y.values).to(device)

In [36]:
kf = KFold(n_splits=6)

model = MLPRegressor_torch(X_model.shape[1], (100,)).to(device)
val_scores = []
for train, val in kf.split(X_model_th):
    model.fit(X_model_th[train], y_th[train], val=[X_model_th[val], y_th[val]], epochs=200)
    val_scores.append(model.r2_score(X_model_th[val], y_th[val]))

print(f'AVG VAL model R2: {np.mean(val_scores): 0.4f}')

AVG VAL model R2: -0.0375


In [35]:
model = MLPRegressor_torch(X_bert.shape[1], (100,)).to(device)
val_scores = []
for train, val in kf.split(X_bert_th):
    model.fit(X_bert_th[train], y_th[train], val=[X_bert_th[val], y_th[val]], epochs=200)
    val_scores.append(model.r2_score(X_bert_th[val], y_th[val]))

print(f'AVG VAL bert R2: {np.mean(val_scores): 0.4f}')

AVG VAL bert R2:  0.0254


In [114]:
bert_scores = []
model_scores = []

kf = KFold(n_splits=6)

for i in tqdm(range(20)):

    model = MLPRegressor_torch(X_model.shape[1], (100,)).to(device)  # эмбеддинги нашей моделью
    val_scores = []
    # Обучаем модель с помощью кросс валидации
    for train, val in kf.split(X_model_th):
        # Во время обучения контролируем переобучение с помощью валидационных данных
        model.fit(X_model_th[train], y_th[train],
                  val=[X_model_th[val], y_th[val]],
                  epochs=200)
        val_scores.append(model.r2_score(X_model_th[val], y_th[val]))

    model_scores.append(np.mean(val_scores))

    model = MLPRegressor_torch(X_bert.shape[1], (100,)).to(device)
    val_scores = []

    for train, val in kf.split(X_bert_th):
        model.fit(X_bert_th[train], y_th[train],
                  val=[X_bert_th[val], y_th[val]],
                  epochs=200)

        val_scores.append(model.r2_score(X_bert_th[val], y_th[val]))

    bert_scores.append(np.mean(val_scores))

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:16<00:00,  3.82s/it]


In [115]:
print(f'Модель: {np.mean(model_scores):.2f} ± {np.std(model_scores):.2f}')
print(f'Берт: {np.mean(bert_scores):.2f} ± {np.std(bert_scores):.2f}')

Модель: 0.02 ± 0.08
Берт: 0.14 ± 0.09
