In [1]:
from transformers import BertTokenizer
from razdel import sentenize
import torch
from torch import nn
import numpy as np
import pandas as pd
import tqdm
import json
import pickle
import os
import io

In [2]:
DEVICE = 'cuda:0'
MAX_LEN = 200
UNK, PAD = "UNK", "PAD"

## Dataset

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
class TelegramRegressionReader(Dataset):
    def __init__(self, txt_path, vec_path, chunk_size=2048):
        self.txt_path = txt_path
        self.vec_path = vec_path

        self.shift = 3200 # numpy load reads 3200 bytes from file handler which is equal one vector
        
        self.chunk_size = chunk_size
        
        self.size = sum(
            len(el) for el in pd.read_json(
                self.txt_path,
                encoding='utf-8',
                lines=True,
                chunksize=chunk_size)
        )
        
        s = [0]
        with open(txt_path, 'r', encoding='utf-8') as f:
            self.txt_linelocs = [s.append(s[0]+len(n)+1) or s.pop(0) for n in f]
            
    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        with open(self.txt_path, 'r', encoding='utf-8') as f_txt,\
             open(self.vec_path, 'rb') as f_vec:
            f_txt.seek(self.txt_linelocs[idx], 0)
            txt = f_txt.readline()

            f_vec.seek(self.shift * idx, 0)
            vec = np.load(f_vec).reshape(1, -1)
            
            return {'text': txt, 'vector': vec}

In [5]:
train = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/train_texts.jsonl',
                                '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/train_vec.npy')

test = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/test_texts.jsonl',
                                '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/test_vec.npy')

val = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/val_texts.jsonl',
                              '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/val_vec.npy')

In [6]:
len(train), len(val), len(test)

(456939, 14231, 9813)

In [7]:
BATCH_SIZE = 1024

train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)
val_loader = DataLoader(val, batch_size=512, num_workers=8, pin_memory=True)
test_loader = DataLoader(test, batch_size=512, num_workers=8, pin_memory=True)

### Vocabulary building

In [8]:
from collections import Counter
from nltk.tokenize import wordpunct_tokenize

In [9]:
def dict_to_token_list(txt_dict):
    txt = txt_dict['text'] + ' ' + txt_dict['title']
    return wordpunct_tokenize(txt.replace('\xa0', ' ').lower().strip())

def sample_to_token_list(sample):
    return dict_to_token_list(json.loads(sample['text']))

In [10]:
cnt = Counter()
for i in tqdm.trange(len(train)):
    cnt.update(sample_to_token_list(train[i]))

100%|██████████| 456939/456939 [04:45<00:00, 1599.31it/s]


In [11]:
len(cnt)

948538

In [12]:
tokens = [UNK, PAD] + [el[0] for el in cnt.most_common(50000)]
token_to_id = {t: i for i, t in enumerate(tokens)}

### Encoder with pretrained FastText embeddings

In [13]:
import fasttext

In [14]:
ft = fasttext.load_model('/data/alolbuhtijarov/fasttext_pretrained/cc.ru.300.bin')



In [15]:
vocab_token_vectors = torch.FloatTensor([
    ft.get_word_vector('w') for w in tokens
])

vocab_token_vectors.shape

torch.Size([50002, 300])

In [16]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

In [17]:
#vocab_token_vectors[UNK_IX] = 0
vocab_token_vectors[PAD_IX] = 0

In [18]:
def json_txt_to_input_inds(json_dict_txt):
    tokens = dict_to_token_list(json.loads(json_dict_txt))
    if len(tokens) < MAX_LEN:
        tokens = tokens + [PAD] * (MAX_LEN - len(tokens))

    return torch.LongTensor([token_to_id.get(word, UNK_IX) for word in tokens[:MAX_LEN]])

def batch_to_torch_x_y(batch):
    x = list(map(json_txt_to_input_inds, batch['text']))
    x = torch.cat(x).view(-1, MAX_LEN).to(DEVICE)
    y = torch.FloatTensor(batch['vector']).squeeze(1).to(DEVICE)
    return x, y


def raw_txt_to_input_inds(txt):
    tokens = wordpunct_tokenize(txt.replace('\xa0', ' ').lower().strip())
    if len(tokens) < MAX_LEN:
        tokens = tokens + [PAD] * (MAX_LEN - len(tokens))

    return torch.LongTensor([token_to_id.get(word, UNK_IX) for word in tokens[:MAX_LEN]])

In [19]:
def print_metrics(model, data_loader, batch_size=256):
    squared_error = abs_error = num_samples = 0.0
    cos_loss_val = 0
    cos_loss = nn.CosineEmbeddingLoss()
    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            x, y = batch_to_torch_x_y(batch)
            batch_pred = model(x)
            squared_error += torch.sum(torch.square(batch_pred - y))            
            cos_loss_val += cos_loss(batch_pred, y, torch.ones(len(y)).to(DEVICE)).item()
            abs_error += torch.sum(torch.abs(batch_pred - y))
            num_samples += len(y)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    print("Cosine loss: %.5f" % cos_loss_val)
    return mse, mae


In [20]:
class SmallEncoder(nn.Module):
    def __init__(self, n_tokens=len(tokens),
                 hid_size=128):
        super().__init__()
        
        self.embed = nn.Embedding.from_pretrained(vocab_token_vectors, freeze=False)

        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=300, out_channels=300, kernel_size=3),
            nn.AdaptiveAvgPool1d(output_size=1),
            nn.BatchNorm1d(num_features=300),
            nn.ReLU(),
        )
        
        self.ff = nn.Linear(300, 768)

    def forward(self, x):
        x = self.embed(x)
        x = x.permute(0, 2, 1)
        x = self.layers(x).squeeze(-1)
        x = self.ff(x)
        return x

In [21]:
model = SmallEncoder().to(DEVICE)
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
EPOCHS = 5

In [22]:
for epoch in range(EPOCHS):
    print(f"epoch: {epoch + 1}")
    run_loss = None
    model.train()
    for i, batch in tqdm.tqdm(enumerate(train_loader), total=len(train) // BATCH_SIZE):
        x, y = batch_to_torch_x_y(batch)
        pred = model(x)

        loss = criterion(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if run_loss is None:
            run_loss = loss.item()
            
        run_loss = 0.9 * run_loss + 0.1 * loss.item()
    
    print_metrics(model, val_loader)
      

epoch: 1


447it [02:46,  2.69it/s]                         


Mean square error: 294.65074
Mean absolute error: 374.39600
Cosine loss: 23.11614
epoch: 2


447it [02:45,  2.71it/s]                         


Mean square error: 208.58994
Mean absolute error: 314.59602
Cosine loss: 19.32831
epoch: 3


447it [02:51,  2.61it/s]                         


Mean square error: 191.15101
Mean absolute error: 299.37562
Cosine loss: 18.57789
epoch: 4


447it [02:48,  2.65it/s]                         


Mean square error: 139.27867
Mean absolute error: 255.43291
Cosine loss: 16.55982
epoch: 5


447it [02:41,  2.77it/s]                         


Mean square error: 97.44005
Mean absolute error: 215.37186
Cosine loss: 14.48609


In [23]:
print_metrics(model, test_loader);

Mean square error: 97.35272
Mean absolute error: 215.27334
Cosine loss: 10.32175


In [46]:
from evaluation.evaluate_clustering import eval_clustering

In [25]:
model.eval()

SmallEncoder(
  (embed): Embedding(50002, 300)
  (layers): Sequential(
    (0): Conv1d(300, 300, kernel_size=(3,), stride=(1,))
    (1): AdaptiveAvgPool1d(output_size=1)
    (2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): ReLU()
  )
  (ff): Linear(in_features=300, out_features=768, bias=True)
)

In [26]:
def txt_to_vec(txt):
    with torch.no_grad():
        vec = model(raw_txt_to_input_inds(txt).to(DEVICE).unsqueeze(0))
    
    return vec.cpu().numpy().flatten()

In [43]:
import evaluation.evaluate_clustering

In [44]:
import importlib

In [45]:
importlib.reload(evaluation.evaluate_clustering)

<module 'evaluation.evaluate_clustering' from '/home/alolbuhtijarov/PreSumm/src/Clustering/evaluation/evaluate_clustering.py'>

In [47]:
eval_clustering(txt_to_vec)





0it [00:00, ?it/s][A[A[A[A



136it [00:00, 1356.01it/s][A[A[A[A



292it [00:00, 1409.94it/s][A[A[A[A



427it [00:00, 1388.27it/s][A[A[A[A



562it [00:00, 1374.07it/s][A[A[A[A



708it [00:00, 1396.30it/s][A[A[A[A



858it [00:00, 1424.69it/s][A[A[A[A



1006it [00:00, 1438.44it/s][A[A[A[A



1158it [00:00, 1459.89it/s][A[A[A[A



1300it [00:00, 1446.17it/s][A[A[A[A



1446it [00:01, 1449.41it/s][A[A[A[A



1596it [00:01, 1463.45it/s][A[A[A[A



1745it [00:01, 1470.22it/s][A[A[A[A



1893it [00:01, 1470.45it/s][A[A[A[A



2053it [00:01, 1505.11it/s][A[A[A[A



2209it [00:01, 1519.68it/s][A[A[A[A



2368it [00:01, 1537.24it/s][A[A[A[A



2527it [00:01, 1551.57it/s][A[A[A[A



2683it [00:01, 1528.52it/s][A[A[A[A



2836it [00:01, 1456.85it/s][A[A[A[A



2983it [00:02, 1419.75it/s][A[A[A[A



3129it [00:02, 1431.29it/s][A[A[A[A



3280it [00:02, 1453.39it/s][A[A[A[A



3429it [00:02, 1463.71

Best distance = 0.07705595090766043
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      1571
           1       0.78      0.66      0.72      1130

    accuracy                           0.78      2701
   macro avg       0.78      0.76      0.77      2701
weighted avg       0.78      0.78      0.78      2701



#### stuff

https://pytorch.org/docs/stable/optim.html#per-parameter-options

In [None]:
def separate_optimizer(net):
    embed_param = [kv[1] for kv in net.named_parameters() if kv[0] == 'embed.weight']
    model_params = [kv[1] for kv in net.named_parameters() if kv[0] != 'embed.weight']
    opt = torch.optim.Adam([
                {'params': model_params},
                {'params': embed_param, 'lr': 3e-4}
    ], lr=3e-3)
    return opt