In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import datasets

import numpy as np
from sklearn.metrics import classification_report

import random

from gensim.models import FastText
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, f1_score

ModuleNotFoundError: No module named 'torchtext'

In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
train_data, _, test_data = datasets.UDPOS()
train_data = [d for d in train_data]
test_data = [d for d in test_data]

train_tokens = [ [w.lower() for w in d[0]] for d in train_data]
train_tags = [ d[1] for d in train_data]

test_tokens = [[w.lower() for w in d[0]] for d in test_data]
test_tags = [d[1] for d in test_data]

tag2num = { t:i for i, t in enumerate(np.unique([tag for tags in train_tags for tag in tags])) }

100%|██████████| 688k/688k [00:00<00:00, 2.20MB/s]


In [7]:
print(train_data[0][0])
print(train_data[0][1])

print(train_tokens[0])
print(train_tags[0])

['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.']
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']


In [8]:
ft = FastText(sentences=train_tokens, size=100, window=5, min_count=1, min_n=1, workers=4)

In [9]:
max_len = 20
pad_inds = len(tag2num)


def prepare_data(all_tokens, all_tags, ft, tag2num, max_len, pad_tags):
  '''
    Из массива слов all_tokens получим тензор векторов, где каждое слово представлено вектором (ft.wv)
    А целевую переменную классов all_tags преобразуем в числа.
    Все строки не длиннее max_len.
    Пустые значения заполняются нулями или pad_tags
  '''
    
    # укорачиваем токены
  all_tokens = [tokens[:max_len] for tokens in all_tokens]
  all_tags = [tags[:max_len] for tags in all_tags]

  # переводим теги в числа
  all_tags = [np.array([tag2num[tag]  for tag in tags]) for tags in all_tags]

  # all_ids = []
  # for tokens in all_tokens:
  #     ids = prepare_sequence(tokens, word_to_ix)
  #     all_ids.append(ids)
      
  X_vecs = []
  Y_vecs = []

  for tokens, tags in zip(all_tokens, all_tags):
      X_vecs.append(torch.tensor(np.row_stack([ft.wv[w] for w in tokens])))
      Y_vecs.append(torch.tensor(tags, dtype=torch.long))
      
  # в качестве заполнителя X используем новый индекс len(word_to_ix)
  X = pad_sequence(X_vecs, batch_first=True)

  # в качестве заполнителя Y используем pad_tags
  Y = pad_sequence(Y_vecs, batch_first=True, padding_value=pad_tags)

  return X, Y

X_train, Y_train = prepare_data(train_tokens, train_tags, ft, tag2num, max_len, pad_inds)

# X_train.size(), Y_train.size()

X_test, Y_test = prepare_data(test_tokens, test_tags, ft, tag2num, max_len, pad_inds)

# X_test.size(), Y_test.size()

In [10]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

bs = 128
data = TensorDataset(X_train, Y_train)
dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=bs)

In [11]:
class BiLSTMPOSTagger(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        
        super().__init__()

        self.output_dim = output_dim
        self.input_size = input_dim
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, sentence):
    # sentence = [batch size, sent len, emb dim]
      sentence = sentence.view(sentence.shape[1], sentence.shape[0], self.input_size)
    # sentence = [sent len, batch size, emb dim]

      outputs, (hidden, cell) = self.lstm(sentence)

      predictions = self.fc(self.dropout(outputs))

      # predictions = [sent len, batch size, output dim]
      predictions = predictions.view(predictions.shape[1],predictions.shape[0], self.output_dim)
      # predictions = [batch size, sent len, output dim]
      
      # raise NotImplementedException()       
      return predictions

In [12]:
def train_on_epoch(model, dataloader, optimizer):
    model.train()
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input, b_tags = batch
        
        model.zero_grad()
        outputs = model(b_input)  

        # outputs = [batch size, sent len, out dim]
        outputs = outputs.view(-1, outputs.shape[-1])       
        # outputs = [batch size * sent len, out dim]

        # b_tags = [batch size, sent len]
        b_tags = b_tags.view(-1)
        # b_tags = [batch size * sent len]
        
        loss = criterion(outputs, b_tags)
        loss.backward()
        optimizer.step()


def predict_on_dataloader(model, dataloaded):
    model.eval()
        
    all_outputs = []
    all_tags = []
    for batch in dataloaded:
        batch = tuple(t.to(device) for t in batch)
        b_input, b_tags = batch
        outputs = model(b_input)  
        
        outputs = outputs.view(-1, outputs.shape[-1])       
        b_tags = b_tags.view(-1)

        all_outputs.append(outputs)
        all_tags.append(b_tags)

    all_outputs = torch.cat(all_outputs)
    all_tags = torch.cat(all_tags)
    
    return all_outputs, all_tags

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(device)

cuda


In [14]:
INPUT_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = len(tag2num)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BiLSTMPOSTagger(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=pad_inds)
optimizer = optim.Adam(model.parameters())

In [15]:
epochs = 50
for e in range(epochs):
    train_on_epoch(model, dataloader, optimizer)    
    
    all_outputs, all_tags = predict_on_dataloader(model, dataloader)
    loss = criterion(all_outputs, all_tags).item()
    all_outputs = all_outputs.detach().cpu().numpy()
    all_tags = all_tags.detach().cpu().numpy()
    
    mask = all_tags != pad_inds
    loss = loss/len(all_tags[mask]) 
    all_tags = all_tags[mask]
    all_preds = np.argmax(all_outputs, axis=1)[mask]
    
    print(f"{e}:\tLoss {loss}, "
          f"accuracy: {accuracy_score(all_tags, all_preds)}, "
          f"f1-macro: {f1_score(all_tags, all_preds, average='macro')}")

0:	Loss 9.883845083465591e-06, accuracy: 0.49271511774509985, f1-macro: 0.26873855130688656
1:	Loss 8.51163066635808e-06, accuracy: 0.5592231098974766, f1-macro: 0.3677729977123259
2:	Loss 7.6861043017055e-06, accuracy: 0.5918805390013346, f1-macro: 0.4330344111015487
3:	Loss 7.118146602829307e-06, accuracy: 0.6162905834670996, f1-macro: 0.46122097842590876
4:	Loss 6.734568607153403e-06, accuracy: 0.6352823237821116, f1-macro: 0.4849428384815078
5:	Loss 6.4475724460112385e-06, accuracy: 0.6476441754767923, f1-macro: 0.4972731928366662
6:	Loss 6.231278375546207e-06, accuracy: 0.6536713469498208, f1-macro: 0.5046255016381151
7:	Loss 6.043796680894111e-06, accuracy: 0.6602827850452345, f1-macro: 0.5145501487376035
8:	Loss 5.901720926821849e-06, accuracy: 0.6670479775149603, f1-macro: 0.5233966207870386
9:	Loss 5.744919327959877e-06, accuracy: 0.6748710000799523, f1-macro: 0.5335470172933718
10:	Loss 5.620300798765649e-06, accuracy: 0.6802216523060081, f1-macro: 0.5410919734936509
11:	Loss

In [16]:
def count_metrics(model, dataloader):
  y_pred, y_true = predict_on_dataloader(model, dataloader)

  y_pred = y_pred.detach().cpu().numpy()
  y_true = y_true.detach().cpu().numpy()

  mask = y_true != pad_inds
  y_true = y_true[mask]
  y_pred = np.argmax(y_pred, axis=1)[mask]

  print(classification_report(y_true, y_pred))

In [17]:
count_metrics(model, dataloader)

              precision    recall  f1-score   support

           0       0.65      0.33      0.44      9962
           1       0.89      0.85      0.87     13578
           2       0.72      0.65      0.68      8547
           3       0.85      0.95      0.90     10404
           4       0.99      0.98      0.99      5202
           5       0.95      0.98      0.96     13014
           6       0.94      0.62      0.75       649
           7       0.55      0.85      0.67     27080
           8       0.93      0.95      0.94      3339
           9       0.79      0.90      0.84      4484
          10       0.95      0.95      0.95     15619
          11       0.81      0.15      0.25     10523
          12       0.99      1.00      0.99     16990
          13       0.77      0.59      0.67      3134
          14       0.92      0.65      0.76       484
          15       0.71      0.69      0.70     18849
          16       0.99      0.13      0.24       739

    accuracy              

In [19]:
data = TensorDataset(X_test, Y_test)
test_dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=bs)
count_metrics(model, test_dataloader)

              precision    recall  f1-score   support

           0       0.64      0.32      0.42      1466
           1       0.85      0.81      0.83      1656
           2       0.64      0.60      0.62      1066
           3       0.82      0.93      0.87      1336
           4       0.98      0.98      0.98       599
           5       0.95      0.96      0.96      1607
           6       0.86      0.44      0.59       115
           7       0.50      0.81      0.61      3446
           8       0.90      0.96      0.93       448
           9       0.71      0.80      0.75       546
          10       0.92      0.93      0.93      1923
          11       0.67      0.08      0.14      1773
          12       0.98      0.99      0.98      2467
          13       0.67      0.48      0.56       330
          14       0.70      0.43      0.53        81
          15       0.61      0.68      0.64      2306
          16       0.00      0.00      0.00       114

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
