In [1]:
from transformers import BertTokenizer
from razdel import sentenize
import torch
from torch import nn
import numpy as np
import pandas as pd
import tqdm
import json
import pickle
import os
import io

In [2]:
DEVICE = 'cuda:4'

## Dataset

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
class TelegramRegressionReader(Dataset):
    def __init__(self, txt_path, vec_path, chunk_size=2048):
        self.txt_path = txt_path
        self.vec_path = vec_path

        self.shift = 3200 # numpy load reads 3200 bytes from file handler which is equal one vector
        
        self.chunk_size = chunk_size
        
        self.size = sum(
            len(el) for el in pd.read_json(
                self.txt_path,
                encoding='utf-8',
                lines=True,
                chunksize=chunk_size)
        )
        
        s = [0]
        with open(txt_path, 'r', encoding='utf-8') as f:
            self.txt_linelocs = [s.append(s[0]+len(n)+1) or s.pop(0) for n in f]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        # idx - list of indexes
        assert type(idx) == np.ndarray
        samples = []

        with open(self.txt_path, 'r', encoding='utf-8') as f_txt,\
             open(self.vec_path, 'rb') as f_vec:
            for pos in idx:
                f_txt.seek(self.txt_linelocs[pos], 0)
                txt = f_txt.readline()

                f_vec.seek(self.shift * pos, 0)
                vec = np.load(f_vec).reshape(1, -1)

                samples.append({'text': txt, 'vector': vec})

        return samples

In [5]:
train = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/train_texts.jsonl',
                                 '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/train_vec.npy')

test = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/test_texts.jsonl',
                                '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/test_vec.npy')

val = TelegramRegressionReader('/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/val_texts.jsonl',
                               '/data/alolbuhtijarov/datasets/BertSumAbs_predictions/split/val_vec.npy')

In [6]:
len(train), len(val), len(test)

(456939, 14231, 9813)

In [None]:
train_loader = DataLoader(train, batch_size=256,, sampler=None, 
                          batch_sampler=None, num_workers=8, pin_memory=True)

### Vocabulary building

In [7]:
from collections import Counter
from nltk.tokenize import wordpunct_tokenize

In [8]:
def sample_to_token_list(sample):
    txt_dict = json.loads(sample['text'])
    txt = txt_dict['text'] + ' ' + txt_dict['title']
    return wordpunct_tokenize(txt.replace('\xa0', ' ').lower().strip())

In [9]:
cnt = Counter()
for i in tqdm.trange(len(train)):
    ind = np.array([i])
    cnt.update(sample_to_token_list(train[ind][0]))

100%|██████████| 456939/456939 [05:52<00:00, 1295.58it/s]


In [10]:
len(cnt)

948538

In [11]:
UNK, PAD = "UNK", "PAD"
tokens = [UNK, PAD] + [el[0] for el in cnt.most_common(50000)]
token_to_id = {t: i for i, t in enumerate(tokens)}

### Encoder with pretrained FastText embeddings

In [12]:
import fasttext

In [13]:
ft = fasttext.load_model('/data/alolbuhtijarov/fasttext_pretrained/cc.ru.300.bin')



In [14]:
vocab_token_vectors = torch.FloatTensor([
    ft.get_word_vector('w') for w in tokens
])

vocab_token_vectors.shape

torch.Size([50002, 300])

In [15]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, max_len=200):
    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))
        
    max_len = min(max(map(len, sequences)), max_len or float('inf'))
    
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))
    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix
    
    return matrix

In [16]:
#vocab_token_vectors[0] = 0
vocab_token_vectors[1] = 0

In [17]:
def make_batch(data):
    x = []
    y = []
    for i in range(len(data)):
        tokens = sample_to_token_list(data[i])
        vec = data[i]['vector']
        
        x.append(tokens)
        y.append(vec)
        

    x = np.array(x)
    x = as_matrix(x)
    x = apply_word_dropout(x)
    return torch.LongTensor(x).to(DEVICE), torch.FloatTensor(y).to(DEVICE).squeeze(1)

def apply_word_dropout(matrix, keep_prop=0.9, replace_with=UNK_IX, pad_ix=PAD_IX,):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1 - keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

In [18]:
def iterate_minibatches(data, batch_size=256, shuffle=True):
    """ iterates minibatches of data in random order """
    while True:
        indices = np.arange(len(data))
        if shuffle:
            indices = np.random.permutation(indices)

        for start in range(0, len(indices), batch_size):
            batch = make_batch(data[indices[start: start + batch_size]])
            yield batch
        
        break

In [40]:
def print_metrics(model, data, batch_size=256):
    squared_error = abs_error = num_samples = 0.0
    cos_loss_val = 0
    cos_loss = nn.CosineEmbeddingLoss()
    model.eval()
    with torch.no_grad():
        for x, y in iterate_minibatches(data, batch_size=batch_size, shuffle=False):
            batch_pred = model(x)
            squared_error += torch.sum(torch.square(batch_pred - y))            
            cos_loss_val += cos_loss(batch_pred, y, torch.ones(len(y)).to(DEVICE)).item()
            abs_error += torch.sum(torch.abs(batch_pred - y))
            num_samples += len(y)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    print("Cosine loss: %.5f" % cos_loss_val)
    return mse, mae


In [20]:
class SmallEncoder(nn.Module):
    def __init__(self, n_tokens=len(tokens),
                 hid_size=128):
        super().__init__()
        
        self.embed = nn.Embedding.from_pretrained(vocab_token_vectors, freeze=False)

        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=300, out_channels=300, kernel_size=3),
            nn.AdaptiveAvgPool1d(output_size=1),
            nn.BatchNorm1d(num_features=300),
            nn.ReLU(),
        )
        
        self.ff = nn.Linear(300, 768)

    def forward(self, x):
        x = self.embed(x)
        x = x.permute(0, 2, 1)
        x = self.layers(x).squeeze(-1)
        x = self.ff(x)
        return x

In [24]:
model = SmallEncoder().to(DEVICE)
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
EPOCHS = 5
BATCH_SIZE = 2048

In [25]:
for epoch in range(EPOCHS):
    print(f"epoch: {epoch}")
    run_loss = None
    model.train()
    for i, (x, y) in tqdm.tqdm(enumerate(
            iterate_minibatches(train, batch_size=BATCH_SIZE)),
            total=len(train) // BATCH_SIZE
        ):
        pred = model(x)

        loss = criterion(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if run_loss is None:
            run_loss = loss.item()
            
        run_loss = 0.9 * run_loss + 0.1 * loss.item()
        
        if i % 40 == 39:
            print(run_loss)

    if epoch % 2 == 1:
        print_metrics(model, val)
      


  0%|          | 0/223 [00:00<?, ?it/s][A

epoch: 0



  0%|          | 1/223 [00:58<3:38:12, 58.97s/it][A
  1%|          | 2/223 [02:00<3:40:33, 59.88s/it][A
  1%|▏         | 3/223 [02:53<3:31:53, 57.79s/it][A
  2%|▏         | 4/223 [03:52<3:31:19, 57.90s/it][A
  2%|▏         | 5/223 [04:40<3:19:55, 55.03s/it][A
  3%|▎         | 6/223 [05:28<3:11:52, 53.05s/it][A
  3%|▎         | 7/223 [06:45<3:37:01, 60.28s/it][A
  4%|▎         | 8/223 [07:42<3:32:00, 59.16s/it][A
  4%|▍         | 9/223 [08:30<3:18:34, 55.67s/it][A
  4%|▍         | 10/223 [09:17<3:09:08, 53.28s/it][A
  5%|▍         | 11/223 [10:04<3:01:06, 51.26s/it][A
  5%|▌         | 12/223 [11:12<3:18:26, 56.43s/it][A
  6%|▌         | 13/223 [11:57<3:05:14, 52.93s/it][A
  6%|▋         | 14/223 [12:40<2:53:52, 49.92s/it][A
  7%|▋         | 15/223 [13:34<2:57:43, 51.27s/it][A
  7%|▋         | 16/223 [14:24<2:55:22, 50.84s/it][A
  8%|▊         | 17/223 [15:26<3:06:12, 54.23s/it][A
  8%|▊         | 18/223 [16:24<3:09:03, 55.34s/it][A
  9%|▊         | 19/223 [17:15<3:03:

0.09896427965586789



 18%|█▊        | 41/223 [31:55<1:42:47, 33.89s/it][A
 19%|█▉        | 42/223 [32:27<1:40:06, 33.18s/it][A
 19%|█▉        | 43/223 [32:57<1:37:27, 32.49s/it][A
 20%|█▉        | 44/223 [33:37<1:43:10, 34.58s/it][A
 20%|██        | 45/223 [34:13<1:44:07, 35.10s/it][A
 21%|██        | 46/223 [34:46<1:41:39, 34.46s/it][A
 21%|██        | 47/223 [35:18<1:38:24, 33.55s/it][A
 22%|██▏       | 48/223 [35:47<1:34:33, 32.42s/it][A
 22%|██▏       | 49/223 [36:16<1:30:54, 31.35s/it][A

KeyboardInterrupt: 

In [42]:
print_metrics(model, test);

Mean square error: 79.35949
Mean absolute error: 195.63906
Cosine loss: 29.05058


#### stuff

https://pytorch.org/docs/stable/optim.html#per-parameter-options

In [None]:
def separate_optimizer(net):
    embed_param = [kv[1] for kv in net.named_parameters() if kv[0] == 'embed.weight']
    model_params = [kv[1] for kv in net.named_parameters() if kv[0] != 'embed.weight']
    opt = torch.optim.Adam([
                {'params': model_params},
                {'params': embed_param, 'lr': 3e-4}
    ], lr=3e-3)
    return opt