In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
from TorchCRF import CRF
from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder
# from deeppavlov.models.embedders.tfidf_weighted_embedder import TfidfWeightedEmbedder
# from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
# from deeppavlov.models.embedders.transformers_embedder import TransformersBertEmbedder
import re
import json
import nltk
nltk.download("punkt")
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.util import align_tokens
from glob import glob
from functools import partial
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from conlleval import evaluate as prec_rec_f
from brat_format import read_file, BratDoc

ModuleNotFoundError: No module named 'fasttext'

In [49]:
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedde

ModuleNotFoundError: No module named 'deeppavlov.models.embedders.elmo_embedder'

In [50]:
!pip install deeppavlov==0.0.9

Collecting deeppavlov==0.0.9
  Using cached deeppavlov-0.0.9-py3-none-any.whl (438 kB)
Collecting fuzzywuzzy==0.16.0
  Using cached fuzzywuzzy-0.16.0-py2.py3-none-any.whl (14 kB)
Collecting keras==2.2.0
  Using cached Keras-2.2.0-py2.py3-none-any.whl (300 kB)
Collecting scikit-learn==0.19.1
  Using cached scikit-learn-0.19.1.tar.gz (9.5 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting rusenttokenize==0.0.4
  Using cached rusenttokenize-0.0.4-py3-none-any.whl (10 kB)
Collecting flask-cors==3.0.6
  Using cached Flask_Cors-3.0.6-py2.py3-none-any.whl (13 kB)
Collecting pymorphy2-dicts-ru
  Using cached pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
Collecting overrides==1.9
  Using cached overrides-1.9-py3-none-any.whl
Collecting Cython==0.28.5
  Using cached Cython-0.28.5.tar.gz (1.9 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py

  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [316 lines of output]
  Unable to find pgen, not compiling formal grammar.
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-39
  copying cython.py -> build\lib.win-amd64-cpython-39
  creating build\lib.win-amd64-cpython-39\Cython
  copying Cython\CodeWriter.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\Coverage.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\Debugging.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\Shadow.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\StringIOTree.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\TestUtils.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\Utils.py -> build\lib.win-amd64-cpython-39\Cython
  copying Cython\__init__.py -> build\lib.win-amd64-cpython-39\Cython
  creating build\lib.win-a

In [27]:
def span_sentences(text, shift=0):
    """
    Extracts sentences and their spans from text.

    Parameters
    text : str
        Text to extract sentences and spans from.
    shift : int
        Initial position from which to start counting span.

    Returns
    sents : List[str]
        Sentences extracted from text.
    spans : List[Tuple[int, int]]
        Extracted sentences position in text.
    """
    
    sents = sent_tokenize(text, language="russian")
    spans = align_tokens(sents, text)
    spans = [(start + shift, end + shift) for start, end in spans]
    
    return sents, spans


def span_tokens(text, shift=0):
    """
    Extracts tokens and their spans from text.

    Parameters
    text : str
        Text to extract tokens and spans from.
    shift : int
        Initial position from which to start counting span.

    Returns
    tokens : List[str]
        Tokens extracted from text.
    spans : List[Tuple[int, int]]
        Extracted tokens position in text.
    """
    
    tokens, spans = [], []

    for tok in re.finditer(r"([^\W_]+|\S)", text):
        tokens.append(tok.group(1))
        spans.append((shift + tok.start(1), 
                      shift + tok.end(1)))
    
    return tokens, spans


def to_conll(brat_ners, spans):
    """
    Converts named entities from brat to conll format. In conll format every 
    token has a tag:
    B-named_entity_type - for the first token in named entity,
    I-named_entity_type - for a token of named entity that is not first,
    O - for a token out of named entity.

    Parameters
    brat_ners : List[Dict]
        Named entities in brat format.
    spans : List[Tuple[int, int]]
        Position of tokens in reference text.

    Returns
    conll_ners : List[str]
        Conll tags of the tokens corresponding to spans.
    """
    
    conll_ners = []

    for token_start, token_end in spans:
        
        for ner in brat_ners:
            
            if (ner["start"] <= token_start) and (ner["end"] >= token_end):
                prefix = "I" if (ner["start"] < token_start) else "B"
                conll_ners.append(prefix + "-" + ner["ner_type"])
                break
        
        else:
            conll_ners.append("O")  
    
    return conll_ners


def to_brat(conll_ners, spans, ner_id=1):
    """
    Converts named entities from conll to brat format. In brat format every 
    named entity is represented with its id, type, and position in reference 
    text.

    Parameters
    conll_ners : List[str]
        Conll tags of the tokens corresponding to spans.
    spans : List[tuple[int]]
        Position of tokens in reference text.
    ner_id : int
        The initial id from which to start counting ner_ids

    Returns
    brat_ners : List[Dict]
        Named entities in brat format.
    """

    brat_ners = []
    prev = "O"

    for tag, (token_start, token_end)  in zip(conll_ners, spans):
        splitted_tag = tag.split("-")
        
        if len(splitted_tag) > 1:
            prefix, ner_type = splitted_tag
            
            if prefix == "I":
                
                if prev != "O":
                    brat_ners[-1]["end"] = token_end
                    prev = "I"
                    continue
            
            brat_ners.append({"ner_id": ner_id, 
                              "ner_type": ner_type, 
                              "start": token_start, 
                              "end": token_end})
            prev = "B"
            ner_id += 1
        
        else:
            prev = "O"

    return brat_ners


def extract_data(files):
    """
    Given text sequence as tokens, predicts corresponding conll tags.

    Parameters
    files : List[str]
        Paths to .ann files to extract data from.

    Returns
    tokens : List[List[str]]
        Tokenized text sequences.
    tags : List[List[str]]
        Conll tags corresponding to token sequences.
    """

    tokens, tags = [], []

    for file_path in tqdm(files):
        brat_doc = read_file(file_path)
        doc_ners = [{"id": i, 
                    "ner_type": brat_doc.ners[idx][0], 
                    "start": brat_doc.ners[idx][1], 
                    "end": brat_doc.ners[idx][2]} 
                    for i, idx in brat_doc.ner_id_2_idx.items()]
        
        for line in re.finditer(r"[^\n]+(\n+|$)", brat_doc.txt_data):
            sents, sent_spans = span_sentences(line.group(0), shift=line.start())
            
            for sent, (sent_start, _) in zip(sents, sent_spans):
                toks, spans = span_tokens(sent, shift=sent_start)
                tokens.append(toks)
                tags.append(to_conll(doc_ners, spans))

    return tokens, tags

In [28]:
files = glob("data/train/*.ann")
tokens, tags = extract_data(files)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for file_path in tqdm(files):


  0%|          | 0/188 [00:00<?, ?it/s]

In [29]:
files = glob("data/train/*.ann")
tokens, tags = extract_data(files)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for file_path in tqdm(files):


  0%|          | 0/188 [00:00<?, ?it/s]

In [30]:
train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens, tags, 
                                                                  test_size=0.1)
len(train_tokens), len(val_tokens)

(19026, 2115)

In [31]:
class NER_Dataset(Dataset):
    def __init__(self, tag2id, seqs, seq_tags):
        self.tag2id = tag2id
        self.seqs = [[token.lower() for token in seq] for seq in seqs]
        self.seq_tags = [[self.tag2id[tag] for tag in tags] for tags in seq_tags]

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        return self.seqs[idx], self.seq_tags[idx]

In [32]:
# Conll tags encoding
tags = list({tag for sent in train_tags for tag in sent})
tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = {i: tag for i, tag in enumerate(tags)}

with open("tags.json", "w") as f:
    json.dump(tags, f)

tags

['I-CMP',
 'B-ACT',
 'B-CMP',
 'O',
 'I-ECO',
 'B-SOC',
 'I-SOC',
 'I-ACT',
 'I-MET',
 'B-BIN',
 'B-ECO',
 'B-QUA',
 'I-BIN',
 'I-INST',
 'B-INST',
 'B-MET',
 'I-QUA']

In [33]:
train_ds = NER_Dataset(tag2id, train_tokens, train_tags)
val_ds = NER_Dataset(tag2id, val_tokens, val_tags)

In [34]:
val_ds[:2]

([['приложение', '1', '.'],
  ['и',
   'плановых',
   'объемов',
   'финансирования',
   'муниципальной',
   'программы',
   '(',
   'подпро',
   '-']],
 [[7, 7, 7], [8, 8, 3, 7, 3, 3, 3, 15, 8]])

In [35]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, embedding_size, hidden_size, feature_dim, num_classes, 
                 dropout):
        super().__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.feature_dim = feature_dim
        self.num_classes = num_classes
        self.dropout = dropout

        self.lstm = nn.LSTM(embedding_size, hidden_size, 2, bidirectional=True, 
                            batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc_0 = nn.Linear(2 * hidden_size, feature_dim)
        self.Q = nn.Linear(feature_dim, feature_dim)
        self.K = nn.Linear(feature_dim, feature_dim)
        self.V = nn.Linear(feature_dim, feature_dim)
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.fc_1 = nn.Linear(feature_dim, num_classes)
        self.crf = CRF(num_classes)


    def forward(self, x, lengths):
        # LSTM
        x_packed = pack_padded_sequence(x, lengths, batch_first=True)
        seq_out_packed, _ = self.lstm(x_packed)
        seq_out, _ = pad_packed_sequence(seq_out_packed, batch_first=True)
        seq_out = self.drop(seq_out)
        seq_out = self.fc_0(F.relu(seq_out))

        # Attention
        Q, K, V = self.Q(seq_out), self.K(seq_out), self.V(seq_out)
        attn = torch.bmm(Q, K.transpose(1, 2))
        attn /= torch.sqrt(torch.tensor(self.feature_dim, dtype=torch.float))
        attn = F.softmax(attn, dim=-1)
        out = torch.bmm(attn, V)
        out = self.layer_norm(out)

        scores = self.fc_1(out)

        return scores

In [48]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# bert_embedder = TransformersBertEmbedder('rubert_cased_L-12_H-768_A-12_pt_v1')
fasttext_embedder = FasttextEmbedder('/data/embeddings/wiki.ru.bin')

NameError: name 'FasttextEmbedder' is not defined

In [37]:
def collate_fn(batch):
    x, y = list(zip(*batch))
    
    # Sort sequences by length (descending)
    lengths = torch.tensor(list(map(len, y)))
    lengths, indices = lengths.sort(0, descending=True)

    # Pad sequences and create mask
    y = pad_sequence(list(map(torch.tensor, y)), padding_value=-1, batch_first=True)
    y = y[indices]
    mask = (y != -1)
    x = pad_sequence(list(map(torch.tensor, bert_embedder(x))), batch_first=True)
    x = x[indices]
    
    return x.to(device), lengths.to(device), mask.to(device), y.to(device)

In [42]:
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)

In [46]:
train_ds

<__main__.NER_Dataset at 0x25adc23a610>

In [45]:
next(iter(train_loader))

TypeError: __call__() missing 2 required positional arguments: 'startofwords_batch' and 'attention_batch'

In [39]:
def train(loader, model, optimizer):
    model.train()
    total_loss = 0
    
    for x, lengths, mask, y in tqdm(loader):
        scores = model(x, lengths)
        loss = -model.crf(scores, y, mask)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print("Train loss {:.4f}".format(total_loss / len(loader)))


def evaluate(loader, model):
    model.eval()
    total_loss = 0
    pred_tags, tags = [], []

    with torch.no_grad():
        
        for x, lengths, mask, y in tqdm(loader):
            scores = model(x, lengths)
            loss = -model.crf(scores, y, mask)
            total_loss += loss.item()

            y_pred = model.crf.decode(scores, mask)
            pred_tags.extend([id2tag[i] for sent in y_pred for i in sent])
            tags.extend([id2tag[i] for i in y[mask].tolist()])

    print("Loss {:.4f}".format(total_loss / len(loader)))
    res = prec_rec_f(tags, pred_tags, verbose=False)
    print("Precision = {:.2f}%, Recall = {:.2f}%, F-score = {:.2f}%".format(*res))
    return res[-1]

In [40]:
bert_embedder.dim

768

In [41]:
model_params = {"embedding_size": bert_embedder.dim, 
                "hidden_size": 200,  
                "feature_dim": 50, 
                "num_classes": len(tag2id), 
                "dropout": 0.2}
model = BiLSTM_CRF(**model_params)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2 * 1e-4)
best_res = -1

for e in range(5):
    print(f"Epoch {e}")
    train(train_loader, model, optimizer)
    f_score = evaluate(val_loader, model)
    
    if f_score > best_res:
        best_res = f_score
        torch.save(model.state_dict(), "model.pt")
    
    print()

Epoch 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x, lengths, mask, y in tqdm(loader):


  0%|          | 0/298 [00:00<?, ?it/s]

TypeError: __call__() missing 2 required positional arguments: 'startofwords_batch' and 'attention_batch'