In [1]:
import plotly
# Initialise Plotly's offline notebook mode before any figure is displayed.
plotly.offline.init_notebook_mode(connected = True)

In [2]:
import pickle
import torch
import numpy
from torch_geometric.loader import HGTLoader
from torch_geometric.nn import Linear, HEATConv
import gc

In [3]:
# Load the pre-built OKVED graph. The file holds five pickled objects written
# back to back, so they are read in exactly that order.
# NOTE(review): pickle.load can execute arbitrary code — only load this file
# when it comes from a trusted source.
with open('./data/okved_graph_without_dgl.pickle', 'rb') as file:
    nodes, u, v, edata, ndata = (pickle.load(file) for _ in range(5))

# Pack the edge endpoints into one 2 x E int64 tensor: row 0 holds `u`, row 1 holds `v`.
uv = torch.zeros((2, u.shape[0]), dtype=torch.int64)
uv[0] = u
uv[1] = v

## Создание графа

In [None]:
from torch_geometric.data import HeteroData

data = HeteroData()
# All nodes share the single node type 'code'; their features are stored as float32.
data['code'].x = ndata['feat'].type(torch.float32)

# Split the edges into three relations by their integer type and attach the
# matching 'norm' values as a one-column float32 edge attribute.
for edge_type_id in range(3):
    relation = ('code', f'type_{edge_type_id}', 'code')
    of_this_type = edata['type'] == edge_type_id
    data[relation].edge_index = uv[:, of_this_type]
    data[relation].edge_attr = edata['norm'][of_this_type].type(torch.float32).reshape(-1, 1)

In [None]:
class NegativeSampler:
    """Samples random "negative" graphs whose edges do not occur in ``graph``.

    At construction time every ordered node pair ``(u, v)`` that is not an
    edge of any relation in ``graph`` is enumerated once. Each call then draws
    (with replacement) a fresh set of such pairs for every relation and
    attaches standard-normal edge attributes to them.

    Parameters
    ----------
    graph : HeteroData
        Positive graph. It must have a 'code' node type with features ``x``
        and at least one edge.
    batch_size : int
        Scales the number of sampled negative edges.
    edges_per_batch_node : int
        Multiplier applied to ``batch_size``; the total number of negative
        edges per sample is about ``batch_size * edges_per_batch_node``,
        split between relations in proportion to their positive edge counts.

    Raises
    ------
    ValueError
        If ``graph`` has no edges or if every node pair is already an edge.

    Notes
    -----
    Candidate enumeration uses a dense ``num_nodes x num_nodes`` boolean mask,
    so memory grows quadratically with the number of nodes.
    """

    def __init__(self, graph, batch_size=1, edges_per_batch_node=32):
        # Relations are remembered in graph order so that __call__ produces
        # exactly the same edge types as the positive graph.
        self.edge_types = list(graph.edge_types)
        edges_per_type = [graph[edge_type]['edge_index'].shape[1] for edge_type in self.edge_types]
        total_edges = sum(edges_per_type)
        if total_edges == 0:
            raise ValueError('graph has no edges, cannot derive negative sample sizes')
        self.sizes = [int(numpy.ceil(count / total_edges * batch_size * edges_per_batch_node))
                      for count in edges_per_type]

        # Positive pairs are taken from the graph itself (not from notebook
        # globals), pooled over all relations.
        positive_pairs = torch.cat([graph[edge_type]['edge_index'] for edge_type in self.edge_types], dim=1)
        self.negative_pairs = self._collect_negative_pairs(positive_pairs, graph.num_nodes)
        if self.negative_pairs.shape[1] == 0:
            raise ValueError('every node pair is an edge, there is nothing to sample from')

        # Template for the sampled graphs: same node features, no edges yet.
        self.negative_data = HeteroData()
        self.negative_data['code'].x = graph['code'].x

    @staticmethod
    def _collect_negative_pairs(edge_index, num_nodes):
        """Return a 2 x M int64 tensor of all (u, v) pairs absent from ``edge_index``.

        Pairs are ordered by ``u`` and then by ``v``. Duplicate edges and nodes
        without outgoing edges are handled naturally by the mask.
        """
        edge_index = edge_index.detach().cpu().to(torch.int64)
        is_negative = torch.ones(num_nodes, num_nodes, dtype=torch.bool)
        is_negative[edge_index[0], edge_index[1]] = False
        return is_negative.nonzero().t().contiguous()

    def __call__(self):
        """Return a new HeteroData with randomly drawn negative edges per relation."""
        negative_data = self.negative_data.clone()
        num_candidates = self.negative_pairs.shape[1]

        # Draw the edge indices for every relation first ...
        for edge_type, size in zip(self.edge_types, self.sizes):
            picked = torch.randint(num_candidates, (size,))
            negative_data[edge_type].edge_index = self.negative_pairs[:, picked]

        # ... then the random edge attributes (one column, like the positive graph).
        for edge_type, size in zip(self.edge_types, self.sizes):
            negative_data[edge_type].edge_attr = torch.randn(size, 1)

        return negative_data

In [None]:
# Number of seed nodes per loader batch: one eighth of the graph's nodes
# (2637 // 8 == 329 for the graph this notebook was executed on).
BATCH_SIZE = data.num_nodes // 8

negative_sampler = NegativeSampler(data, BATCH_SIZE)
negative_sampler()  # smoke test: display one sampled negative graph

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# (Apple's MPS backend is not checked here.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Move the whole graph to the selected device before building the loader.
data = data.to(device)

# Mini-batch loader over the positive graph.
positive_loader = HGTLoader(
    data,
    # Sampling budget per node type: two entries of 256 (presumably 256 nodes
    # in each of two sampling iterations — see the HGTLoader documentation).
    num_samples={node_type: [256, 256] for node_type in data.node_types},
    # Every batch starts from BATCH_SIZE seed nodes.
    batch_size=BATCH_SIZE,
    # All nodes of type 'code' act as seed nodes.
    input_nodes=('code', torch.arange(data.num_nodes)),
)


## Модель HEAT

In [None]:
class HeteroGNN(torch.nn.Module):
    """Stack of HEATConv layers that embeds the nodes of a single-node-type graph.

    The network consists of ``num_layers + 1`` convolutions: one input layer
    (``in_channels -> hidden_channels``), ``num_layers - 1`` hidden layers
    (``hidden_channels -> hidden_channels``) and one output layer
    (``hidden_channels -> emb_size``). A linear head maps each embedding to a
    single score. The layers are configured for one node type, three edge
    types and one-column edge attributes.

    Parameters
    ----------
    hidden_channels : int
        Width of the hidden layers; also used for the edge-type and
        edge-attribute embedding sizes.
    num_layers : int
        Controls the depth as described above (values below 1 behave like 1).
    emb_size : int
        Size of the node embeddings returned by ``forward``.
    in_channels : int
        Size of the input node features.

    Methods
    -------
    forward(graph) - runs the convolutions and returns one embedding per node
    predict(graph) - applies the linear head and a sigmoid to the embeddings,
                     giving one score in (0, 1) per node
    """

    def __init__(self, hidden_channels, num_layers=1, emb_size=256, in_channels=312):
        super().__init__()
        # Settings shared by every convolution.
        heat_kwargs = dict(num_node_types=1, num_edge_types=3, edge_type_emb_dim=hidden_channels,
                           edge_dim=1, edge_attr_emb_dim=hidden_channels)
        # Consecutive entries give the (in, out) sizes of consecutive layers.
        layer_dims = [in_channels] + [hidden_channels] * max(num_layers, 1) + [emb_size]
        self.convs = torch.nn.ModuleList([
            HEATConv(in_channels=dim_in, out_channels=dim_out, **heat_kwargs)
            for dim_in, dim_out in zip(layer_dims[:-1], layer_dims[1:])
        ])

        self.lin = Linear(emb_size, 1)  # linear head producing one score per node

    def forward(self, graph: 'HeteroData'):
        """Return the node embeddings of ``graph`` (one row per node, ``emb_size`` columns)."""
        assert len(graph.node_types) == 1, 'Пока только для одного типа вершин'

        x = graph[graph.node_types[0]].x.clone()

        # Flatten the heterogeneous graph: concatenate the edges of all
        # relations and remember, for every edge, the index of its relation.
        edge_index_parts, edge_type_parts, edge_attr_parts = [], [], []
        for type_id, edge_type in enumerate(graph.edge_types):
            store = graph[edge_type]
            attr = store['edge_attr']
            edge_index_parts.append(store['edge_index'])
            edge_attr_parts.append(attr)
            # One id per edge, independent of the number of attribute columns.
            edge_type_parts.append(
                torch.full((attr.shape[0],), type_id, dtype=torch.int64, device=attr.device))

        edge_indexes = torch.cat(edge_index_parts, dim=1)
        edge_types = torch.cat(edge_type_parts)
        edge_attr = torch.cat(edge_attr_parts)
        # Single node type -> all zeros; created on the same device as the
        # features so the model also works when the graph lives on the GPU.
        node_types = torch.zeros(x.shape[0], dtype=torch.int64, device=x.device)

        for conv in self.convs:
            x = conv(x, edge_indexes, node_types, edge_types, edge_attr)

        return x

    def predict(self, graph: 'HeteroData'):
        """Return a sigmoid score in (0, 1) for every node of ``graph``."""
        return torch.sigmoid(self.lin(self.forward(graph)))


# Build the network on the selected device together with its optimiser and loss.
model = HeteroGNN(hidden_channels=312, num_layers=2, emb_size=256)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
# predict() already applies a sigmoid, so plain binary cross-entropy is used.
criterion = torch.nn.BCELoss()


def train(positive, negative):
    """Run one optimisation step on a positive and a negative graph.

    Every node score predicted for ``positive`` is pushed towards 1 and every
    node score predicted for ``negative`` towards 0, using the notebook-level
    ``model``, ``optimizer`` and ``criterion``.

    Parameters
    ----------
    positive, negative : HeteroData
        Graphs located on the same device as the model. They are not modified
        (``HeteroGNN.forward`` works on a copy of the features), so no
        defensive clone is taken.

    Returns
    -------
    float
        The loss value of this step.
    """
    model.train()
    optimizer.zero_grad()

    out = model.predict(positive)
    neg_out = model.predict(negative)
    pred = torch.cat([out, neg_out]).flatten()

    # Labels are allocated next to the predictions (same device and dtype), so
    # the step does not depend on the global `device` matching the model.
    true = torch.zeros_like(pred)
    true[:out.shape[0]] = 1

    _loss = criterion(pred, true)
    _loss.backward()
    optimizer.step()
    return _loss.item()

In [None]:
# Smoke test: score one positive batch and one negative sample without
# tracking gradients, then show the shapes of both outputs.
with torch.no_grad():
    pos = model.predict(next(iter(positive_loader)))
    neg = model.predict(negative_sampler().to(device))

pos.shape, neg.shape

## Обучение HEAT

In [None]:
%%time
EPOCHS = 20

for epoch in range(EPOCHS):
    losses = []
    for pos in positive_loader:
        neg = negative_sampler().to(device)
        losses.append(train(pos, neg))
        
    if epoch % 3 == 0:
        print(f'epoch: {epoch + 1:2d} | loss: {numpy.mean(losses): .4f}')
    gc.collect()

In [10]:
# Smoke test: score one positive batch and one negative sample without
# tracking gradients, then show the shapes of both outputs.
with torch.no_grad():
    pos = model.predict(next(iter(positive_loader)))
    neg = model.predict(negative_sampler().to(device))

pos.shape, neg.shape

(torch.Size([841, 1]), torch.Size([2637, 1]))

## Обучение HEAT

In [11]:
%%time
EPOCHS = 20

for epoch in range(EPOCHS):
    losses = []
    for pos in positive_loader:
        neg = negative_sampler().to(device)
        losses.append(train(pos, neg))
        
    if epoch % 3 == 0:
        print(f'epoch: {epoch + 1:2d} | loss: {numpy.mean(losses): .4f}')
    gc.collect()

epoch:  1 | loss:  0.5397
epoch:  4 | loss:  0.1861
epoch:  7 | loss:  0.0206
epoch: 10 | loss:  0.0151
epoch: 13 | loss:  0.0134
epoch: 16 | loss:  0.0119
epoch: 19 | loss:  0.0109
Wall time: 35.4 s


In [12]:
# Move the model and the full graph to the CPU for inference.
# NOTE(review): the results of .to('cpu') are not reassigned, so this relies on
# both calls updating the objects in place — confirm for HeteroData.
model.to('cpu')
data.to('cpu')

# Embed every node of the full graph without tracking gradients.
with torch.no_grad():
    embeddings_model = model(data)

In [13]:
# Persist three artefacts: the whole model object, only its state dict, and
# the node embeddings computed above.
torch.save(model, './weights/HEATConv_model.th')
torch.save(model.state_dict(), './weights/HEATConv_weights.th')
torch.save(embeddings_model, './weights/HEATConv_emb.th')

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
import pandas as pd

# Inputs for the evaluation below:
#   embeddings_bert   - pre-computed embeddings loaded from disk (BERT, judging by the name)
#   okved_consumption - table with provider/consumer codes and the regression target
#   okved_data        - OKVED reference table (codes, names, section ids)
embeddings_bert = numpy.load('./data/okved_embeddings.npy')
okved_consumption = pd.read_csv('./data/okved_consumption.csv')
okved_data = pd.read_csv('./data/okved_2014_w_sections.csv')

In [15]:
# 2-D t-SNE projections of both embedding sets. The two runs share one
# parameter set, and the seed is fixed so that re-running the notebook
# reproduces the same layout.
TSNE_PARAMS = dict(n_components=2, init='random', learning_rate=200, random_state=42)

embeddings_model_2d = TSNE(**TSNE_PARAMS).fit_transform(embeddings_model)
embeddings_bert_2d = TSNE(**TSNE_PARAMS).fit_transform(embeddings_bert)

In [16]:
import plotly.express as px
import plotly

# (Re-)initialise Plotly's offline notebook mode.
plotly.offline.init_notebook_mode(connected=True)
# One section label per row of okved_data, converted to strings
# (presumably so that the colours are treated as discrete categories).
sections = list(map(str, okved_data['section_id'].values))

# Scatter plot of the model's t-SNE projection, coloured by section.
# NOTE(review): row 0 of the model projection is skipped here, whereas the
# BERT plot uses every row — presumably the graph has one extra node at
# index 0; confirm that rows 1.. line up with the rows of okved_data.
fig = px.scatter(x=embeddings_model_2d[1:, 0], y=embeddings_model_2d[1:, 1], color=sections,
                 title='<b>TSNE embeddings. OUR MODEL</b>', hover_name=okved_data['name_okved'])

fig

In [17]:
# Scatter plot of the BERT t-SNE projection, coloured by the same section
# labels and using every row of the projection.
fig = px.scatter(x=embeddings_bert_2d[:, 0], y=embeddings_bert_2d[:, 1], color=sections,
                 title='<b>TSNE embeddings. BERT</b>', hover_name=okved_data['name_okved'])

fig

In [18]:
from help.preprocessing_okved import fix_okved

# Row index of okved_data -> native OKVED code; index 0 is overridden with the code '0'.
id_to_code = okved_data['native_code'].to_dict()
id_to_code[0] = '0'
# Reverse lookup: code -> row index.
code_to_id = {code: idx for idx, code in id_to_code.items()}

# Translate the provider / consumer codes of every consumption record into indices.
provider_codes = okved_consumption['okved_provider'].map(fix_okved)
prov_indices = provider_codes.map(code_to_id)
consumer_codes = okved_consumption['okved_consumer'].map(fix_okved)
cons_indices = consumer_codes.map(code_to_id)

# Features of a record: provider embedding next to consumer embedding, standardised.
bert_pairs = numpy.column_stack((embeddings_bert[prov_indices], embeddings_bert[cons_indices]))
X_bert = StandardScaler().fit_transform(bert_pairs)

model_pairs = numpy.column_stack((embeddings_model[prov_indices], embeddings_model[cons_indices]))
X_model = StandardScaler().fit_transform(model_pairs)

# Regression target.
y = okved_consumption['normalized_consumption']

## Линейная модель

In [19]:
# Fit a linear regression on each feature set and report its score on the
# same data it was fitted on (no hold-out split).
model_linear = LinearRegression()
model_linear.fit(X_model, y)
score = model_linear.score(X_model, y)
print(f'Модель: {score: 0.4f}')

model_linear = LinearRegression()
model_linear.fit(X_bert, y)
score = model_linear.score(X_bert, y)
print(f'Берт: {score: 0.4f}')

Модель:  0.1134
Берт:  0.1070
