In [1]:
from transformers import BertTokenizer
from razdel import sentenize
import torch
from torch import nn
import numpy as np
import pandas as pd
import tqdm
import json
import pickle
import os
import io

In [105]:
DEVICE = 'cuda:4'
MAX_LEN = 200

## Dataset

In [3]:
from torch.utils.data import Dataset, DataLoader

In [110]:
class TelegramRegressionReader(Dataset):
    def __init__(self, txt_path, vec_path, chunk_size=2048):
        self.txt_path = txt_path
        self.vec_path = vec_path

        self.shift = 3200 # numpy load reads 3200 bytes from file handler which is equal one vector
        
        self.chunk_size = chunk_size
        
        self.size = sum(
            len(el) for el in pd.read_json(
                self.txt_path,
                encoding='utf-8',
                lines=True,
                chunksize=chunk_size)
        )
        
        s = [0]
        with open(txt_path, 'r', encoding='utf-8') as f:
            self.txt_linelocs = [s.append(s[0]+len(n)+1) or s.pop(0) for n in f]
            
    def __len__(self):
        return self.size
    
    def _convert_txt(self, txt):
            tokens = dict_to_token_list(json.loads(txt))
            if len(tokens) < MAX_LEN:
                tokens = tokens + [PAD] * (MAX_LEN - len(tokens))
                
            return [token_to_id.get(word, UNK_IX) for word in tokens[:MAX_LEN]]

    def __getitem__(self, idx):
        with open(self.txt_path, 'r', encoding='utf-8') as f_txt,\
             open(self.vec_path, 'rb') as f_vec:
            f_txt.seek(self.txt_linelocs[idx], 0)
            txt = f_txt.readline()

            f_vec.seek(self.shift * idx, 0)
            vec = np.load(f_vec).reshape(1, -1)
            
            return {'tokens': self._convert_txt(txt), 'vector': vec}

In [111]:
train = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/train_texts.jsonl',
                                '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/train_vec.npy')

test = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/test_texts.jsonl',
                                '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/test_vec.npy')

val = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/val_texts.jsonl',
                              '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/val_vec.npy')

In [112]:
len(train), len(val), len(test)

(456939, 14231, 9813)

In [113]:
BATCH_SIZE = 512

train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)
val_loader = DataLoader(val, batch_size=512, num_workers=8, pin_memory=True)
test_loader = DataLoader(test, batch_size=512, num_workers=8, pin_memory=True)

### Vocabulary building

In [7]:
from collections import Counter
from nltk.tokenize import wordpunct_tokenize

In [63]:
def dict_to_token_list(txt_dict):
    txt = txt_dict['text'] + ' ' + txt_dict['title']
    return wordpunct_tokenize(txt.replace('\xa0', ' ').lower().strip())

In [9]:
cnt = Counter()
for i in tqdm.trange(len(train)):
    ind = np.array([i])
    cnt.update(sample_to_token_list(train[ind][0]))

100%|██████████| 456939/456939 [05:52<00:00, 1295.58it/s]


In [10]:
len(cnt)

948538

In [11]:
UNK, PAD = "UNK", "PAD"
tokens = [UNK, PAD] + [el[0] for el in cnt.most_common(50000)]
token_to_id = {t: i for i, t in enumerate(tokens)}

### Encoder with pretrained FastText embeddings

In [12]:
import fasttext

In [13]:
ft = fasttext.load_model('/data/alolbuhtijarov/fasttext_pretrained/cc.ru.300.bin')



In [14]:
vocab_token_vectors = torch.FloatTensor([
    ft.get_word_vector('w') for w in tokens
])

vocab_token_vectors.shape

torch.Size([50002, 300])

In [15]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

In [16]:
#vocab_token_vectors[0] = 0
vocab_token_vectors[1] = 0

In [172]:
def print_metrics(model, data_loader, batch_size=256):
    squared_error = abs_error = num_samples = 0.0
    cos_loss_val = 0
    cos_loss = nn.CosineEmbeddingLoss()
    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            x = torch.cat(batch['tokens']).view(-1, MAX_LEN).to(DEVICE)
            y = torch.FloatTensor(batch['vector']).squeeze(1).to(DEVICE)
            batch_pred = model(x)
            squared_error += torch.sum(torch.square(batch_pred - y))            
            cos_loss_val += cos_loss(batch_pred, y, torch.ones(len(y)).to(DEVICE)).item()
            abs_error += torch.sum(torch.abs(batch_pred - y))
            num_samples += len(y)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    print("Cosine loss: %.5f" % cos_loss_val)
    return mse, mae


In [165]:
class SmallEncoder(nn.Module):
    def __init__(self, n_tokens=len(tokens),
                 hid_size=128):
        super().__init__()
        
        self.embed = nn.Embedding.from_pretrained(vocab_token_vectors, freeze=False)

        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=300, out_channels=300, kernel_size=3),
            nn.AdaptiveAvgPool1d(output_size=1),
            nn.BatchNorm1d(num_features=300),
            nn.ReLU(),
        )
        
        self.ff = nn.Linear(300, 768)

    def forward(self, x):
        x = self.embed(x)
        x = x.permute(0, 2, 1)
        x = self.layers(x).squeeze(-1)
        x = self.ff(x)
        return x

In [166]:
model = SmallEncoder().to(DEVICE)
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
EPOCHS = 5

In [167]:
for epoch in range(EPOCHS):
    print(f"epoch: {epoch}")
    run_loss = None
    model.train()
    for i, batch in tqdm.tqdm(enumerate(train_loader), total=len(train) // BATCH_SIZE):
        x = torch.cat(batch['tokens']).view(-1, MAX_LEN).to(DEVICE)
        y = torch.FloatTensor(batch['vector']).squeeze(1).to(DEVICE)
        
        pred = model(x)

        loss = criterion(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if run_loss is None:
            run_loss = loss.item()
            
        run_loss = 0.9 * run_loss + 0.1 * loss.item()
        
        if i % 40 == 39:
            print(run_loss)

    
    print_metrics(model, val_loader)
      

epoch: 0






  0%|          | 0/892 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/892 [01:09<17:07:57, 69.22s/it][A[A[A[A



  0%|          | 3/892 [01:09<11:58:17, 48.48s/it][A[A[A[A



  1%|          | 5/892 [01:09<8:21:56, 33.95s/it] [A[A[A[A



  1%|          | 7/892 [01:09<5:50:49, 23.78s/it][A[A[A[A



  1%|          | 7/892 [01:19<5:50:49, 23.78s/it][A[A[A[A



  1%|          | 9/892 [02:00<5:57:40, 24.30s/it][A[A[A[A



  1%|          | 10/892 [02:00<4:10:50, 17.06s/it][A[A[A[A



  1%|▏         | 12/892 [02:00<2:55:30, 11.97s/it][A[A[A[A



  2%|▏         | 14/892 [02:01<2:02:51,  8.40s/it][A[A[A[A



  2%|▏         | 16/892 [02:01<1:26:02,  5.89s/it][A[A[A[A



  2%|▏         | 16/892 [02:19<1:26:02,  5.89s/it][A[A[A[A



  2%|▏         | 17/892 [02:50<4:37:57, 19.06s/it][A[A[A[A



  2%|▏         | 19/892 [02:51<3:14:29, 13.37s/it][A[A[A[A



  2%|▏         | 20/892 [02:51<2:17:15,  9.44s/it][A[A[A[A



  2%|▏         | 22/

KeyboardInterrupt: 

In [173]:
print_metrics(model, test_loader);

Mean square error: 76.98941
Mean absolute error: 192.27158
Cosine loss: 15.39472


#### stuff

https://pytorch.org/docs/stable/optim.html#per-parameter-options

In [None]:
def separate_optimizer(net):
    embed_param = [kv[1] for kv in net.named_parameters() if kv[0] == 'embed.weight']
    model_params = [kv[1] for kv in net.named_parameters() if kv[0] != 'embed.weight']
    opt = torch.optim.Adam([
                {'params': model_params},
                {'params': embed_param, 'lr': 3e-4}
    ], lr=3e-3)
    return opt