In [2]:
import torch
import pandas as pd
from torch import nn
from torch.nn import functional as F
from transformers import BertTokenizer, BertModel
import logging
import deeppavlov
logging.basicConfig(level=logging.INFO)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [3]:
tokenizer = BertTokenizer.from_pretrained('./rubert_cased_L-12_H-768_A-12_pt')
print(tokenizer)

INFO:transformers.tokenization_utils:Model name './rubert_cased_L-12_H-768_A-12_pt' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased). Assuming './rubert_cased_L-12_H-768_A-12_pt' is a path or url to a directory containing tokenizer files.
INFO:transformers.tokenization_utils:Didn't find file ./rubert_cased_L-12_H-768_A-12_pt/added_tokens.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file ./rubert_cased_L-12_H-768_A-12_pt/special_tokens_map.json. We won't load it.
INFO:transformers.tokenization_utils:Didn't find file ./ru

<transformers.tokenization_bert.BertTokenizer object at 0x7f119abd4950>


In [4]:
embedder = BertModel.from_pretrained('./rubert_cased_L-12_H-768_A-12_pt')

INFO:transformers.configuration_utils:loading configuration file ./rubert_cased_L-12_H-768_A-12_pt/config.json
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 119547
}

INFO:transformers.modeling_utils:loading weights file ./rubert_cased_L-12_H-768_A-12_pt/pytorch_model.

In [5]:
class TripletNet(nn.Module):
    def __init__(self, embedding_net):
        super().__init__()
        self.embedding_net = embedding_net

    def forward(self, x1, x2, x3):
        output1 = self.embedding_net(x1)
        output2 = self.embedding_net(x2)
        output3 = self.embedding_net(x3)
        return output1, output2, output3

    def get_embedding(self, x):
        return self.embedding_net(x)

In [6]:
class TripletLoss(nn.Module):
    """
    Triplet loss
    Takes embeddings of an anchor sample, a positive sample and a negative sample
    """

    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = (anchor - positive).pow(2).sum(1)  # .pow(.5)
        distance_negative = (anchor - negative).pow(2).sum(1)  # .pow(.5)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()

In [7]:
model = TripletNet(embedder)

In [8]:
# model.cuda()

In [9]:
df = pd.read_csv('./datasets/lenta-ru-news.csv', nrows=500)

In [10]:
df.tail()

Unnamed: 0,url,title,text,topic,tags,date
495,https://lenta.ru/news/1999/09/27/dili/,Индонезийские войска оставляют за собой руины,Выводимые из Восточного Тимора индонезийские в...,Мир,Все,1999/09/27
496,https://lenta.ru/news/1999/09/27/ort/,ОРТ начало вещание в Европе,27 сентября Общественное российское телевидени...,Россия,Все,1999/09/27
497,https://lenta.ru/news/1999/09/27/nasa/,NASA: космос пора продавать частникам,Вслед за случившейся на прошлой неделе потерей...,Мир,Все,1999/09/27
498,https://lenta.ru/news/1999/09/27/shoigu/,Путину жаль расставаться с Шойгу,Сергей Шойгу рассматривает возможность ухода в...,Россия,Все,1999/09/27
499,https://lenta.ru/news/1999/09/27/cherkessk/,Госдума России вмешалась в конфликт в Карачаев...,Государственная Дума России вместе с главой Ка...,Россия,Все,1999/09/27


In [11]:
ner_model = deeppavlov.build_model(deeppavlov.configs.ner.ner_ontonotes_bert_mult, download=True)

2020-04-26 18:32:22.701 INFO in 'deeppavlov.download'['download'] at line 117: Skipped http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip download because of matching hashes
INFO:deeppavlov.download:Skipped http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip download because of matching hashes
2020-04-26 18:32:25.577 INFO in 'deeppavlov.download'['download'] at line 117: Skipped http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_bert_mult_v1.tar.gz download because of matching hashes
INFO:deeppavlov.download:Skipped http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_bert_mult_v1.tar.gz download because of matching hashes





[nltk_data] Downloading package punkt to /home/vsmaxim8/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vsmaxim8/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/vsmaxim8/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/vsmaxim8/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!

2020-04-26 18:32:26.842 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /home/vsmaxim8/.deeppavlov/models/ner_ontonotes_bert_mult/tag.dict]
INFO:deeppavlov.core.data.simple_vocab:[loading vocabulary from /home/vsmaxim8/.deeppavlov/models/ner_ontonotes_bert_mult/tag.dict]
























































The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Please use `layer.__call__` method instead.












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


























Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.














Instructions for updating:
Use standard file APIs to check for files with this prefix.


Instructions for updating:
Use standard file APIs to check for files with this prefix.
2020-04-26 18:32:54.652 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /home/vsmaxim8/.deeppavlov/models/ner_ontonotes_bert_mult/model]
INFO:deeppavlov.core.models.tf_model:[loading model from /home/vsmaxim8/.deeppavlov/models/ner_ontonotes_bert_mult/model]








INFO:tensorflow:Restoring parameters from /home/vsmaxim8/.deeppavlov/models/ner_ontonotes_bert_mult/model


INFO:tensorflow:Restoring parameters from /home/vsmaxim8/.deeppavlov/models/ner_ontonotes_bert_mult/model


In [18]:
# preprocessing dataset
df[['title', 'text']] = df[['title', 'text']].apply(lambda i: i.replace('\xa0', ' '))
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)

In [13]:
# getting named entities to make triplets
ner_titles = ner_model(df['title'])

In [14]:
ner_titles

[[['1914',
   '.',
   'Русские',
   'войска',
   'вступили',
   'в',
   '\xa0',
   'пределы',
   'Венгрии'],
  ['1914',
   '.',
   'Празднование',
   'столетия',
   'М',
   '.',
   'Ю',
   '.',
   'Лермонтова',
   'отложено'],
  ['1914', '.', 'Das', 'ist', 'Nesteroff', '!'],
  ['1914', '.', 'Бульдог', '-', 'гонец', 'под', 'Льежем'],
  ['1914', '.', 'Под', 'Люблином', 'пойман', 'швабский', 'зверь'],
  ['Космонавты', 'сомневаются', 'в', '\xa0', 'надежности', '"', 'Мира', '"'],
  ['Взрыв',
   'в',
   '\xa0',
   'центре',
   'Москвы',
   ':',
   'пострадало',
   '30',
   '\xa0',
   'человек'],
  ['Япония',
   'кредитует',
   'Россию',
   'на',
   '\xa0',
   'полтора',
   'миллиарда',
   'долларов'],
  ['Британцы', 'отмечают', 'двухлетие', 'смерти', 'Дианы'],
  ['Отмытые',
   'через',
   'Bank',
   'of',
   '\xa0',
   'NY',
   'деньги',
   'не',
   '\xa0',
   'имели',
   'отношения',
   'к',
   '\xa0',
   'МВФ'],
  ['C',
   '1',
   '\xa0',
   'сентября',
   'вводится',
   'новая',
   'форма

In [15]:
titles_with_entities = []

for title_tokens, entities in zip(*ner_titles):
    title_entities = []
    current_word = []
    last_entity_desc = None
    
    for token, entity_desc in zip(title_tokens, entities):
        if len(current_word) > 0 and entity_desc == 'O':
            title_entities.append((' '.join(current_word), last_entity_desc))
            current_word = []
        
        if entity_desc != 'O':
            if entity_desc[0] == 'B':
                last_entity_desc = entity_desc

            current_word.append(token)
    
    titles_with_entities.append(title_entities)

In [16]:
titles_with_entities

[[('1914', 'B-DATE'), ('Русские', 'B-NORP')],
 [('1914', 'B-DATE'), ('М . Ю . Лермонтова', 'B-PERSON')],
 [('1914', 'B-DATE'), ('Nesteroff', 'B-PERSON')],
 [('1914', 'B-DATE')],
 [('1914', 'B-DATE'), ('Люблином', 'B-GPE')],
 [],
 [('Москвы', 'B-GPE'), ('30', 'B-CARDINAL')],
 [('Япония', 'B-GPE'), ('Россию', 'B-GPE')],
 [('Британцы', 'B-NORP')],
 [('Bank of \xa0 NY', 'B-ORG')],
 [('1 \xa0 сентября', 'B-DATE')],
 [('ФСБ', 'B-ORG')],
 [('Южно - Сахалинск', 'B-GPE')],
 [],
 [('одно', 'B-CARDINAL'),
  ('Турции', 'B-GPE'),
  ('один', 'B-CARDINAL'),
  ('около 70', 'B-CARDINAL')],
 [('Дагестана', 'B-GPE')],
 [('Карачаево - Черкесия', 'B-GPE')],
 [('Коржаков', 'B-PERSON'), ('Генпрокуратуре', 'B-ORG')],
 [('Российские', 'B-NORP')],
 [('Киргизия', 'B-GPE'), ('Таджикистаном', 'B-GPE')],
 [('Литва', 'B-GPE')],
 [('Восточного Тимора', 'B-LOC')],
 [('США', 'B-GPE'), ('Китаю 4 , 5 \xa0 миллиона долларов', 'B-MONEY')],
 [('Илюхин', 'B-PERSON'), ('Исаков', 'B-PERSON')],
 [('Дель Понти', 'B-PERSON')],
 [

In [19]:
df['date']

0     1914-09-16
1     1914-09-16
2     1914-09-17
3     1914-09-17
4     1914-09-18
         ...    
495   1999-09-27
496   1999-09-27
497   1999-09-27
498   1999-09-27
499   1999-09-27
Name: date, Length: 500, dtype: datetime64[ns]