In [14]:
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import metrics
from sklearn.utils import shuffle
import torch
import torch.nn as nn

from torchcrf import CRF
import numpy as np
import json

from torch_model_base import TorchModelBase
from torch_rnn_classifier import TorchRNNDataset, TorchRNNClassifier, TorchRNNModel
import utils

In [2]:
# Load the annotated examples: one JSON object per line (JSON Lines).
# note: after running data-preprocessing.ipynb this file already has token-level labels
# The encoding is given explicitly so decoding does not depend on the platform
# default (e.g. cp1252 on Windows), which would garble non-ASCII characters.
with open('annotations2.jsonl', encoding='utf-8') as jsonl_file:
    lines = jsonl_file.readlines()
# Skip blank lines (such as a trailing newline at end of file); json.loads
# raises on an empty string.
annot = [json.loads(line) for line in lines if line.strip()]

In [12]:
# Convert the annotations into the parallel-list format used by the RNN
# sequence labeler: X holds one list of token strings per example and y holds
# the matching list of token labels.
X, y = [], []
for example in annot:
    tokens = example['tokens']
    # Examples whose span list is empty carry no annotation and are left out.
    if example['spans'] == []:
        continue
    X.append([token['text'] for token in tokens])
    y.append([token['label'] for token in tokens])

# Vocabulary: every distinct token in sorted order, followed by the unknown-word symbol.
vocab = sorted({word for sentence in X for word in sentence}) + ["$UNK"]

In [15]:
# Deterministic shuffled hold-out split of the annotated examples.
train_ratio = 0.75
# Number of training examples = floor(train_ratio * len(X)).
# int() truncates, which equals floor for non-negative values. The previous
# round(x - 0.5) was not a floor: Python rounds exact halves to the nearest
# even integer, so e.g. 4 examples gave round(2.5) == 2 instead of 3.
# The count lives in `n_train` so that it no longer overwrites the
# `train_test_split` function imported from sklearn.model_selection.
n_train = int(train_ratio * len(X))

# Shuffle the example indices (fixed random_state => same order every run) and
# apply the same permutation to X and y so tokens and labels stay aligned.
idx = list(range(len(X)))
idx_shuffle = shuffle(idx, random_state=0)
X_shuffle = [X[i] for i in idx_shuffle]
y_shuffle = [y[i] for i in idx_shuffle]

X_train, X_test = X_shuffle[:n_train], X_shuffle[n_train:]
y_train, y_test = y_shuffle[:n_train], y_shuffle[n_train:]

In [16]:
# Re-import the local helper modules so that edits made to them on disk take
# effect without restarting the kernel.
import importlib
import torch_rnn_classifier
import torch_model_base

# Reload order matters: torch_model_base first, then torch_rnn_classifier
# (presumably because the latter builds on the former -- confirm).
for _module in (torch_model_base, torch_rnn_classifier):
    importlib.reload(_module)

# Re-bind the class names so they refer to the freshly reloaded definitions.
from torch_model_base import TorchModelBase
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNModel, TorchRNNDataset

In [5]:
class TorchRNNSequenceLabeler(TorchRNNClassifier):
    """RNN tagger that assigns one label to every token of a sequence.

    Differs from the parent classifier in that each example's target is a
    sequence of labels (one per token) rather than a single label.
    """

    def build_graph(self):
        """Build the network: an RNN encoder feeding a per-token classifier.

        Returns
        -------
        TorchSequenceLabeler
        """
        encoder = TorchRNNModel(
            vocab_size=len(self.vocab),
            embedding=self.embedding,
            use_embedding=self.use_embedding,
            embed_dim=self.embed_dim,
            rnn_cell_class=self.rnn_cell_class,
            hidden_dim=self.hidden_dim,
            bidirectional=self.bidirectional,
            freeze_embedding=self.freeze_embedding)
        labeler = TorchSequenceLabeler(rnn=encoder, output_dim=self.n_classes_)
        # Adopt the embedding dimensionality that the encoder ended up with.
        self.embed_dim = encoder.embed_dim
        return labeler

    def build_dataset(self, X, y=None):
        """Wrap the examples (and labels, when given) in a `TorchRNNDataset`.

        Parameters
        ----------
        X : list of token sequences
        y : list of label sequences aligned with `X`, or None

        Returns
        -------
        TorchRNNDataset

        When `y` is given, `self.classes_` and `self.n_classes_` are set
        from the labels that occur in it.
        """
        X, seq_lengths = self._prepare_sequences(X)
        if y is None:
            return TorchRNNDataset(X, seq_lengths)
        # Labels are sequences here, so the label inventory is collected
        # across every position of every sequence.
        self.classes_ = sorted({label for tags in y for label in tags})
        self.n_classes_ = len(self.classes_)
        label_to_index = {
            label: index for index, label in enumerate(self.classes_)}
        # One index tensor per example; lengths differ between examples
        # (padding is presumably handled by the dataset class -- confirm).
        encoded = [
            torch.tensor([label_to_index[label] for label in tags])
            for tags in y]
        return TorchRNNDataset(X, seq_lengths, encoded)

    def predict_proba(self, X):
        """Per-token class probabilities.

        Returns
        -------
        list of torch.Tensor
            One tensor per example, cut to the example's own length, with
            softmax applied along dimension 1.
        """
        lengths = [len(example) for example in X]
        # Raw model outputs come from the base class's `_predict`.
        raw_outputs = self._predict(X)
        probabilities = []
        for scores, length in zip(raw_outputs, lengths):
            # Drop positions beyond the true length of the example.
            trimmed = scores[: length]
            # The network itself returns logits (see
            # TorchSequenceLabeler.forward), so normalize here.
            probabilities.append(torch.softmax(trimmed, dim=1))
        return probabilities

    def predict(self, X):
        """Most probable label for every token.

        Returns
        -------
        list of list
            Same nesting as `X`, holding entries of `self.classes_`.
        """
        predictions = []
        for seq_probs in self.predict_proba(X):
            best_indices = seq_probs.argmax(axis=1)
            predictions.append([self.classes_[i] for i in best_indices])
        return predictions

    def score(self, X, y):
        """Score token-level predictions against `y`.

        Gold and predicted sequences are each flattened into one list and
        passed to `utils.safe_macro_f1`.
        """
        predicted = self.predict(X)
        gold_flat = [label for tags in y for label in tags]
        predicted_flat = [label for tags in predicted for label in tags]
        return utils.safe_macro_f1(gold_flat, predicted_flat)

In [6]:
class TorchSequenceLabeler(nn.Module):
    """Per-token classifier: a linear layer applied to every RNN output step.

    Parameters
    ----------
    rnn : module exposing `bidirectional` and `hidden_dim`; calling it with
        `(X, seq_lengths)` must return `(packed_outputs, state)`.
    output_dim : int
        Number of classes scored at each position.
    """

    def __init__(self, rnn, output_dim):
        super().__init__()
        self.rnn = rnn
        self.output_dim = output_dim
        # A bidirectional RNN contributes two hidden vectors per position.
        num_directions = 2 if self.rnn.bidirectional else 1
        self.classifier_dim = self.rnn.hidden_dim * num_directions
        self.classifier_layer = nn.Linear(self.classifier_dim, self.output_dim)

    def forward(self, X, seq_lengths):
        """Return per-token logits.

        In training mode dimensions 1 and 2 are swapped, as needed by
        `nn.CrossEntropyLoss`; in eval mode the logits are returned as
        produced by the linear layer.
        """
        packed_outputs, _state = self.rnn(X, seq_lengths)
        # Unpack into a padded, batch-first tensor.
        padded_outputs, _lengths = torch.nn.utils.rnn.pad_packed_sequence(
            packed_outputs, batch_first=True)
        logits = self.classifier_layer(padded_outputs)
        return logits.transpose(1, 2) if self.training else logits

In [7]:
# Hyperparameters passed to the sequence labeler on top of the vocabulary
# (eta is presumably the learning rate -- confirm against the base class).
labeler_kwargs = {
    "early_stopping": True,
    "eta": 0.001,
}
seq_mod = TorchRNNSequenceLabeler(vocab, **labeler_kwargs)

In [8]:
%time _ = seq_mod.fit(X_train, y_train)

Stopping after epoch 25. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.1324784755706787

Wall time: 3.43 s


In [30]:
# Predict one label per token for every sequence in the held-out test split.
y_pred = seq_mod.predict(X_test)

In [32]:
# Token-level evaluation on the test split.
# The label inventory is taken from the full label set `y`, not only from the
# test split.
labels = sorted({tag for tags in y for tag in tags})

# Weighted F1 over all tokens.
weighted_f1 = metrics.flat_f1_score(
    y_test, y_pred, average='weighted', labels=labels)
print(weighted_f1)

# Per-label precision / recall / F1 table.
report = metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3)
print(report)

0.6224368103752443
                     precision    recall  f1-score   support

DATUM_VERBUECHERUNG      0.133     0.273     0.179        55
      DATUM_VERTRAG      0.000     0.000     0.000        62
            FLAECHE      0.000     0.000     0.000        38
        GESAMTPREIS      0.000     0.000     0.000        29
           IMMO_TYP      0.105     0.298     0.156        47
            KAEUFER      0.000     0.000     0.000        33
                  O      0.859     0.722     0.784      1525
                ORT      0.159     0.576     0.249        59
            QMPREIS      0.012     0.048     0.020        21
            STRASSE      0.086     0.273     0.130        44
   TERRASSENGROESSE      0.000     0.000     0.000         8
         VERKAEUFER      0.000     0.000     0.000        62

           accuracy                          0.594      1983
          macro avg      0.113     0.182     0.126      1983
       weighted avg      0.673     0.594     0.622      1983



  _warn_prf(average, modifier, msg_start, len(result))
