In [None]:
# Install the notebook's dependencies: fastText and the pyonmttok tokenizer,
# plus torch/torchvision pinned to the +cu101 builds from the PyTorch wheel index.
!pip install fasttext pyonmttok
!pip install torch==1.5.0+cu101 torchvision==0.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Download the `ru_tg_1101_0510` JSONL archive (streamed through tar) and the
# fastText vectors that are later loaded as `ru_vectors_v1.bin`.
# NOTE(review): `tar -xz` without `-O` normally extracts members to disk rather
# than to stdout, so the `>` redirect may capture nothing — confirm the archive
# member is itself named `ru_tg_1101_0510.jsonl`.
!wget -nc https://www.dropbox.com/s/dpz0k0ypbql4rbt/ru_tg_1101_0510.jsonl.tar.gz -O - | tar -xz > ru_tg_1101_0510.jsonl
!wget https://www.dropbox.com/s/rohop0gt3zr2msm/ru_vectors_v1.bin

In [None]:
# Fetch the Lenta.Ru news dataset (v1.0 release) and decompress it.
# Existing copies of the archive and the CSV are removed first so that
# re-running the cell starts from a fresh download.
!rm -f lenta-ru-news.csv.gz
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!rm -f lenta-ru-news.csv
!gzip -d lenta-ru-news.csv.gz

In [4]:
import json

# Load the JSONL dump: one JSON document per line.
# The encoding is explicit because JSON text is UTF-8 while the platform default
# encoding may differ; blank lines are skipped instead of crashing json.loads.
with open("ru_tg_1101_0510.jsonl", "r", encoding="utf-8") as r:
    tg_data = [json.loads(line) for line in r if line.strip()]
# Chronological order matters: get_samples pairs every document with the
# previously seen document of the same site.
tg_data.sort(key=lambda x: x['timestamp'])

In [5]:
import csv
import re

def get_date(url):
    """Return the first ``YYYY/MM/DD`` fragment found in ``url``, or ``None`` when there is none."""
    match = re.search(r"\d\d\d\d\/\d\d\/\d\d", url)
    if match is None:
        return None
    return match.group(0)

# Read the Lenta.Ru CSV into a list of dicts carrying the keys get_samples
# reads ("title", "text", "site_name") plus the publication date taken from the URL.
with open("lenta-ru-news.csv", "r", encoding="utf-8") as r:
    reader = csv.reader(r, delimiter=',')
    next(reader, None)  # skip the header row; tolerates an empty file
    lenta_data = []
    for row in reader:
        # Only the first three columns are used; any further columns are ignored.
        url, title, text, *_ = row
        lenta_data.append({"date": get_date(url), "text": text, "site_name": "lenta", "title": title})

# get_date returns None for URLs without a date; map that to "" so the sort does
# not fail comparing None with str (such rows simply sort first).
lenta_data.sort(key=lambda x: x["date"] or "")

In [6]:
import pyonmttok
import fasttext

# Pre-trained fastText model; passed to get_samples / words_to_embed for word-vector lookup.
ft_model = fasttext.load_model('ru_vectors_v1.bin')
# pyonmttok tokenizer in "conservative" mode without joiner annotations; used by preprocess.
tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=False)

def words_to_embed(model, words):
    """Embed a sequence of words as one fixed-size vector.

    Each word vector is looked up with ``model.get_word_vector`` and scaled to
    unit L2 length; the result is the concatenation of the element-wise mean,
    max and min over those unit vectors (three times the word-vector size).

    Args:
        model: object exposing ``get_word_vector(word)``.
        words: iterable of tokens; must contain at least one token.

    Returns:
        1-D numpy array ``concatenate((mean, max, min))``.

    Raises:
        ValueError: if ``words`` is empty.
    """
    norm_vectors = []
    for word in words:
        vector = np.asarray(model.get_word_vector(word))
        length = np.linalg.norm(vector)
        # A zero vector cannot be normalised: dividing would give NaN and poison
        # the mean/max/min of the whole text, so it is kept as all zeros.
        norm_vectors.append(vector / length if length > 0 else vector)
    if not norm_vectors:
        raise ValueError("words_to_embed needs at least one word")
    avg_wv = np.mean(norm_vectors, axis=0)
    max_wv = np.max(norm_vectors, axis=0)
    min_wv = np.min(norm_vectors, axis=0)
    return np.concatenate((avg_wv, max_wv, min_wv))

def preprocess(tokenizer, text):
    """Normalise ``text`` and return its tokens joined by single spaces.

    The input is converted to ``str``, stripped, has newlines and non-breaking
    spaces turned into plain spaces, and is lower-cased before being passed to
    ``tokenizer.tokenize`` (which must return a ``(tokens, features)`` pair).
    """
    to_space = {ord("\n"): " ", ord("\xa0"): " "}
    cleaned = str(text).strip().translate(to_space).lower()
    tokens, _features = tokenizer.tokenize(cleaned)
    return " ".join(tokens)



In [7]:
import numpy as np

def get_samples(data, ft_model, tokenizer, count, min_words=4, max_words=300):
    """Build (left, right, negative) embedding triplets from ``data``.

    Each of the first ``count`` rows has its title and text preprocessed and
    split into words; the first half and the second half of the (truncated)
    word list are embedded separately. The negative of a triplet is the
    second-half embedding of the previous usable row with the same
    ``site_name``, so the first usable row of each site yields no sample.

    Args:
        data: sequence of dicts with ``"site_name"``, ``"title"`` and ``"text"``.
        ft_model: word-vector model passed to ``words_to_embed``.
        tokenizer: tokenizer passed to ``preprocess``.
        count: number of leading rows of ``data`` to process.
        min_words: rows with fewer words than this are skipped.
        max_words: rows are truncated to this many words.

    Returns:
        List of ``(left_vector, right_vector, negative_right_vector)`` tuples.
    """
    last_host_end = {}
    samples = []
    # The index used to be named `count`, shadowing the parameter it was sliced by.
    for row_index, row in enumerate(data[:count]):
        if row_index % 10000 == 0:
            print(row_index)

        host = row["site_name"]
        text = preprocess(tokenizer, row["title"] + " " + row["text"])
        words = text.split(" ")
        # At least two words are required so that neither half is empty.
        if len(words) < max(min_words, 2):
            continue
        words = words[:max_words]

        border = len(words) // 2
        left_vector = words_to_embed(ft_model, words[:border])
        right_vector = words_to_embed(ft_model, words[border:])

        if host in last_host_end:
            samples.append((left_vector, right_vector, last_host_end[host]))
        last_host_end[host] = right_vector
    return samples

# Build triplets from at most the first 250000 records of each corpus
# (get_samples prints a progress counter every 10000 rows).
tg_samples = get_samples(tg_data, ft_model, tokenizer, 250000)
lenta_samples = get_samples(lenta_data, ft_model, tokenizer, 250000)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000


In [8]:
# Hold out the trailing tenth of each sample list for testing.
tg_test_size = len(tg_samples) // 10
lenta_test_size = len(lenta_samples) // 10
# Split by explicit index: with negative-offset slices a test size of 0
# (fewer than 10 samples) would make `[:-0]` empty and `[-0:]` the whole list.
tg_split = len(tg_samples) - tg_test_size
lenta_split = len(lenta_samples) - lenta_test_size
tg_test_samples = tg_samples[tg_split:]
train_samples = tg_samples[:tg_split] + lenta_samples[:lenta_split]
# Concatenation creates a new list, so shuffling test_samples later does not
# reorder tg_test_samples.
test_samples = tg_test_samples + lenta_samples[lenta_split:]

In [9]:
from sklearn import metrics
from scipy import spatial

# Baseline without any training: score each (left, right) pair by the negated
# cosine distance of the raw embeddings and report ROC AUC over positive (1)
# and negative (0) pairs.
test_y = []
scores = []
for left_vector, pos_right_vector, neg_right_vector in test_samples:
    for label, right_vector in ((1, pos_right_vector), (0, neg_right_vector)):
        test_y.append(label)
        scores.append(-spatial.distance.cosine(left_vector, right_vector))
metrics.roc_auc_score(test_y, scores)

0.8358113239280407

# Model

In [None]:
import torch
import torch.nn as nn

class SiamiseModelTripletLoss(nn.Module):
    """Siamese linear projection trained with a margin-based triplet loss.

    All three inputs go through the same linear layer and are L2-normalised;
    the loss is ``mean(max(d(pivot, positive) - d(pivot, negative) + margin, 0))``
    where ``d`` is the pairwise Euclidean distance.

    Args:
        embedding_dim: size of the input vectors.
        hidden_dim: size of the projected vectors.
        margin: triplet-loss margin (previously hard-coded to 0.3).
    """

    def __init__(self, embedding_dim=384, hidden_dim=50, margin=0.3):
        super().__init__()

        self.mapping_layer = nn.Linear(embedding_dim, hidden_dim)
        self.distance = nn.PairwiseDistance(p=2)
        self.margin = margin

    def build_projections(self, in_vectors):
        """Project a ``(batch, embedding_dim)`` tensor and scale every row to unit L2 norm."""
        projections = self.mapping_layer(in_vectors)
        # normalize() clamps the norm by a small eps, so an all-zero projection
        # stays zero instead of turning into NaN through a 0/0 division.
        return nn.functional.normalize(projections, p=2, dim=1)

    def forward(self, pivot_vectors, positive_vectors, negative_vectors):
        """Return the scalar triplet loss for a batch of (pivot, positive, negative) vectors."""
        pivot = self.build_projections(pivot_vectors)
        positive = self.build_projections(positive_vectors)
        negative = self.build_projections(negative_vectors)
        distances = self.distance(pivot, positive) - self.distance(pivot, negative) + self.margin
        loss = torch.mean(torch.max(distances, torch.zeros_like(distances)))
        return loss

    def apply(self, vectors):
        """Return the normalised projections of ``vectors``.

        NOTE(review): this shadows ``nn.Module.apply(fn)``; the name is kept
        because existing callers use ``model.apply(tensor)``.
        """
        return self.build_projections(vectors)

# Training

In [None]:
import random
import time
import torch.optim as optim

def get_next_gen_batch(samples, batch_size=64, device="cuda"):
    """Yield shuffled mini-batches of triplets as float32 tensors.

    Args:
        samples: sequence of ``(pivot, positive, negative)`` vectors.
        batch_size: maximum number of triplets per batch.
        device: device the tensors are created on (default keeps the old
            CUDA-only behaviour).

    Yields:
        ``(pivot, positive, negative)`` tensors, each ``(batch, dim)``. Order is
        drawn from numpy's global RNG on every call.
    """
    indices = np.arange(len(samples))
    np.random.shuffle(indices)
    for batch_begin in range(0, len(samples), batch_size):
        batch_indices = indices[batch_begin: batch_begin + batch_size]
        columns = zip(*(samples[data_ind] for data_ind in batch_indices))
        # Stack each column into one ndarray first: building a tensor from a
        # Python list of arrays is far slower.
        yield tuple(
            torch.tensor(np.stack(column), dtype=torch.float32, device=device)
            for column in columns
        )


def _average_validation_loss(model, val_samples, device):
    """Return the mean per-batch loss over ``val_samples`` (eval mode, no gradients)."""
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for pivot, positive, negative in get_next_gen_batch(val_samples, device=device):
            batch_losses.append(model(pivot, positive, negative).item())
    if not batch_losses:
        return float("nan")
    return sum(batch_losses) / len(batch_losses)


def train_model(model, train_samples, val_samples, epochs_count=10,
                loss_every_nsteps=10000, lr=0.01, device_name="cuda"):
    """Train ``model`` in place with Adam and print periodic train/validation losses.

    Args:
        model: module whose ``forward(pivot, positive, negative)`` returns a scalar loss.
        train_samples: triplets used for optimisation.
        val_samples: triplets used for the periodic validation loss.
        epochs_count: number of passes over ``train_samples``.
        loss_every_nsteps: report whenever the in-epoch step index is a multiple of this.
        lr: Adam learning rate.
        device_name: device for both the model and the batches.
    """
    device = torch.device(device_name)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    total_loss = 0.0
    steps_since_report = 0
    start_time = time.time()
    for epoch in range(epochs_count):
        model.train()
        for step, (pivot, positive, negative) in enumerate(get_next_gen_batch(train_samples, device=device)):
            optimizer.zero_grad()
            loss = model(pivot, positive, negative)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            steps_since_report += 1
            if step % loss_every_nsteps == 0:
                avg_val_loss = _average_validation_loss(model, val_samples, device)
                model.train()  # validation switched to eval mode; resume training mode
                # Average over the steps actually accumulated since the last
                # report, not over the nominal reporting interval.
                print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, total_loss / steps_since_report, avg_val_loss, time.time() - start_time))
                total_loss = 0.0
                steps_since_report = 0
                start_time = time.time()

# Seed every RNG involved (list shuffles, numpy batch order, torch weight
# initialisation) so that a fresh Restart & Run All repeats the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Both lists are shuffled in place; test_samples doubles as the validation set.
random.shuffle(train_samples)
random.shuffle(test_samples)
model = SiamiseModelTripletLoss()
train_model(model, train_samples, test_samples)

# Testing

In [None]:
# Expand every TG test triplet into a positive pair (label 1) and a negative pair (label 0).
tg_test_left = []
tg_test_right = []
test_y = []
for tg_left, tg_pos_right, tg_neg_right in tg_test_samples:
    tg_test_left += [tg_left, tg_left]
    tg_test_right += [tg_pos_right, tg_neg_right]
    test_y += [1, 0]

eval_batch_size = 32
# Score on whatever device the model currently lives on.
eval_device = next(model.parameters()).device
scores = []
model.eval()
with torch.no_grad():
    for batch_start in range(0, len(tg_test_left), eval_batch_size):
        batch_end = batch_start + eval_batch_size
        left = model.apply(torch.tensor(np.stack(tg_test_left[batch_start: batch_end]),
                                        dtype=torch.float32, device=eval_device))
        right = model.apply(torch.tensor(np.stack(tg_test_right[batch_start: batch_end]),
                                         dtype=torch.float32, device=eval_device))
        # model.apply already returns unit-norm rows, so the row-wise dot product
        # is the cosine similarity. The previous code divided each batch by its
        # Frobenius norm, which scaled scores by the batch size and put a shorter
        # final batch on a different scale than the others.
        cosine = (left * right).sum(dim=1)
        score = (cosine + 1.0) / 2.0 - 1.0
        scores.extend(score.cpu().tolist())
metrics.roc_auc_score(test_y, scores)

# Saving

In [None]:
# Move the trained model to the CPU; the cells below copy its weights into the
# exported embedder and save its state dict.
model = model.cpu()

In [None]:
import torch
import torch.nn as nn

class Embedder(nn.Module):
    """Inference-only projection: a linear layer followed by row-wise L2 normalisation.

    Args:
        embedding_dim: size of the input vectors.
        hidden_dim: size of the output vectors.
    """

    def __init__(self, embedding_dim=384, hidden_dim=50):
        super().__init__()

        self.mapping_layer = nn.Linear(embedding_dim, hidden_dim)

    def forward(self, in_vectors):
        """Map a ``(batch, embedding_dim)`` tensor to unit-norm ``(batch, hidden_dim)`` rows."""
        projections = self.mapping_layer(in_vectors)
        # normalize() clamps the norm by a small eps, so an all-zero projection
        # yields zeros instead of NaN from a 0/0 division.
        return nn.functional.normalize(projections, p=2, dim=1)

# Example input for tracing: one real left-half vector as a (1, dim) float tensor.
# Its width comes from the data rather than a hard-coded 384.
examples = torch.tensor(tg_test_samples[0][0], dtype=torch.float32).unsqueeze(0)
embedder = Embedder()
# Copy weight and bias through the state dict. The previous code aliased the
# weight storage via `.data` and shared the bias Parameter object between the
# two modules, so changing one module silently changed the other.
embedder.mapping_layer.load_state_dict(model.mapping_layer.state_dict())
embedder = embedder.cpu().eval()
traced_embedder = torch.jit.trace(embedder, examples)

In [None]:
# Persist the parameters (state dict only) of the full training model.
torch.save(model.state_dict(), "ru_full_model.pt")

In [None]:
# Save the embedder produced by torch.jit.trace to disk.
traced_embedder.save("ru_sentence_embedder_v1.pt")