In [1]:
import re
import sentencepiece as spm

import smart_open as sm
import gensim
import numpy as np

from tqdm import tqdm_notebook

In [2]:
def read_data(path='data/news.txt.gz'):
    with sm.open(path, encoding='utf-8') as f:
        for line in f:
            cat, headline, text = line.strip().split('\t')
            yield cat, headline, text
            
        
def tokenize_text(text):
    text = text.lower()
    words = re.findall(r'\b\w+\b', text.lower())
    return words

def normalize_text(text):
    return ' '.join(tokenize_text(text))

def prepare_spm_file(f_out, data):
    with open(f_out, 'w', encoding='utf-8') as f:
        for cat, headline, text in tqdm_notebook(data):
            f.write(normalize_text(headline))
            f.write('\n')
            
            sents = (sent for sent in re.split(r'[.!?]', text) if len(sent) > 20)
    
            for sent in sents:
                f.write(normalize_text(sent))
                f.write('\n')
        
    

In [4]:
prepare_spm_file('data/spm.txt', read_data())

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
!head -n 5 data/spm.txt

rolex наградит победителей регаты
парусная гонка giraglia rolex cup пройдет в средиземном море в 64 й раз
победители соревнования проводимого с 1953 года yacht club italiano помимо других призов традиционно получают в подарок часы от швейцарского бренда rolex
об этом сообщается в пресс релизе поступившем в редакцию ленты
rolex yacht master 40 фото пресс служба mercury соревнования будут проходить с 10 по 18 июня


In [8]:
spm.SentencePieceTrainer.Train('--input=data/spm.txt \
                               --pad_id=0 \
                               --bos_id=2 \
                               --eos_id=3  \
                               --unk_id=1 \
                               --model_prefix=data/news_spm \
                               --vocab_size=5000')

True

In [9]:
proc = spm.SentencePieceProcessor()
proc.Load('data/news_spm.model')

True

In [10]:
proc.EncodeAsPieces('октября')

['▁октября']

In [12]:
from gensim.models import Word2Vec

sents = [proc.EncodeAsPieces(line.strip()) 
                     for line in open('data/spm.txt', encoding='utf-8')]
w2v = Word2Vec(sents)

w2v.wv.save_word2vec_format('data/w2v_vectors.bin')

In [13]:
w2v.wv.most_similar(proc.EncodeAsPieces('октября'))

[('▁июля', 0.9749369025230408),
 ('▁сентября', 0.9721417427062988),
 ('▁марта', 0.969996988773346),
 ('▁декабря', 0.9690101742744446),
 ('▁августа', 0.9684216976165771),
 ('▁июня', 0.9678225517272949),
 ('▁ноября', 0.9669003486633301),
 ('▁февраля', 0.9666882753372192),
 ('▁апреля', 0.9643102884292603),
 ('▁мая', 0.9626060128211975)]

In [14]:
emb_size = w2v.wv.vector_size

def _piece_id_to_vect(piece_id):
    piece = proc.id_to_piece(piece_id) 
    if piece in w2v.wv:
        return w2v.wv[piece]
    return np.zeros((emb_size,))

emb = np.array([_piece_id_to_vect(piece_id) for piece_id in range(0, len(proc))])
np.save('data/vectors.npy', emb)

In [15]:
proc.EncodeAsPieces(normalize_text('привет мир'))

['▁при', 'вет', '▁мир']

In [16]:
labels = sorted({label for (label, _, _) in read_data()})
label_to_idx = {label:idx for (idx, label) in enumerate(labels)}

In [13]:
max_seq_len = 120

def prepare_text(text):
    text = normalize_text(text)
    pieces = proc.EncodeAsIds(text)
    if len(pieces) > max_seq_len:
        pieces = pieces[:max_seq_len]
    to_add = (max_seq_len - len(pieces))
    pieces = pieces + to_add * [proc.pad_id()]
    
    return np.array(pieces)
    

def prepare_data(label_to_idx):
    X = []
    y = []
    for label, headline, text in read_data():
        label_id = label_to_idx[label]
        X.append(prepare_text(headline + ' ' + text))
        y.append(label_id)
       
    return np.array(X), np.array(y)
    
X, y = prepare_data(label_to_idx)

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split

In [15]:
emb_layer = nn.Embedding.from_pretrained(torch.tensor(emb), padding_idx=proc.pad_id())
emb_layer(torch.tensor(prepare_text('привет мир')))

tensor([[-0.2508,  0.1793,  0.2189,  ..., -1.1806,  2.7978, -0.2542],
        [-1.0755,  0.7262,  0.5747,  ..., -0.4297,  0.7703,  0.1982],
        [ 0.4433,  0.6389, -0.5718,  ..., -0.0887,  0.7056, -0.2806],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)

In [16]:
X = torch.LongTensor(X)
y = torch.LongTensor(y)

l = X.size(0)
l_train, l_test = int(l * 0.7), int(l * 0.2)

data = TensorDataset(X, y)
train_ds, test_ds, val_ds = random_split(data, [l_train, l_test, l - l_train - l_test])

In [17]:
model = nn.Sequential(nn.EmbeddingBag.from_pretrained(torch.FloatTensor(emb)),
                      nn.Linear(emb.shape[1], 20),
                      #nn.ReLU(),
                      #nn.Dropout(0.2),
                      nn.Linear(20, len(labels)),
                      nn.Softmax(dim=1))

In [18]:
def train_model(model, train, val, test, max_epochs=300):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

    loss = nn.CrossEntropyLoss()
    train_loader = DataLoader(dataset=train_ds, batch_size=30)
    val_loader = DataLoader(dataset=val_ds)
    test_loader = DataLoader(dataset=test_ds)

    for epoch in range(max_epochs):
        cur = 0
        total = 0
        for X_batch, y_batch in train_loader:   
            model.train()
            y_pred = model(X_batch)    
            bce = loss(y_pred, y_batch)
                        
            bce.backward()        

            cur += (y_pred.argmax(1) == y_batch).sum().item()
            total += y_batch.size(0)
            
            optimizer.step()
            optimizer.zero_grad()       
           
        if epoch % 10 == 0:        
            acc = cur / total
            print(f'Epoch = {epoch}, acc = {acc:.5f}, loss = {bce}')   

train_model(model, train_ds, val_ds, test_ds)

Epoch = 0, acc = 0.46157, loss = 1.9048439264297485
Epoch = 10, acc = 0.64229, loss = 1.7881412506103516
Epoch = 20, acc = 0.65129, loss = 1.7816226482391357
Epoch = 30, acc = 0.66000, loss = 1.7215139865875244
Epoch = 40, acc = 0.66343, loss = 1.7081806659698486
Epoch = 50, acc = 0.65986, loss = 1.6618688106536865
Epoch = 60, acc = 0.66571, loss = 1.6168361902236938
Epoch = 70, acc = 0.66300, loss = 1.5674864053726196
Epoch = 80, acc = 0.66086, loss = 1.663656234741211
Epoch = 90, acc = 0.67014, loss = 1.736398696899414
Epoch = 100, acc = 0.66729, loss = 1.5614030361175537
Epoch = 110, acc = 0.67100, loss = 1.5611494779586792
Epoch = 120, acc = 0.66729, loss = 1.561150312423706
Epoch = 130, acc = 0.67286, loss = 1.5692418813705444
Epoch = 140, acc = 0.66514, loss = 1.5646708011627197
Epoch = 150, acc = 0.67557, loss = 1.561161994934082
Epoch = 160, acc = 0.67186, loss = 1.5611525774002075
Epoch = 170, acc = 0.66986, loss = 1.5611501932144165
Epoch = 180, acc = 0.67357, loss = 1.561730