In [9]:
import nltk
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn

#from torch_model_base import TorchModelBase
#from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNDataset, TorchRNNClassifier, TorchRNNModel
import utils

import pandas as pd
from collections import Counter

In [3]:
# refresh torch rnn classifier:
import importlib
import torch_rnn_classifier
importlib.reload(torch_rnn_classifier)
from torch_rnn_classifier import TorchRNNDataset

In [10]:
import json

In [4]:
class TorchRNNSequenceLabeler(TorchRNNClassifier):

    def build_graph(self):
        rnn = TorchRNNModel(
            vocab_size=len(self.vocab),
            embedding=self.embedding,
            use_embedding=self.use_embedding,
            embed_dim=self.embed_dim,
            rnn_cell_class=self.rnn_cell_class,
            hidden_dim=self.hidden_dim,
            bidirectional=self.bidirectional,
            freeze_embedding=self.freeze_embedding)
        model = TorchSequenceLabeler(
            rnn=rnn,
            output_dim=self.n_classes_)
        self.embed_dim = rnn.embed_dim
        return model

    def build_dataset(self, X, y=None):
        X, seq_lengths = self._prepare_sequences(X)
        if y is None:
            return TorchRNNDataset(X, seq_lengths)
        else:
            # These are the changes from a regular classifier. All
            # concern the fact that our labels are sequences of labels.
            self.classes_ = sorted({x for seq in y for x in seq})
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            # `y` is a list of tensors of different length. Our Dataset
            # class will turn it into a padding tensor for processing.
            y = [torch.tensor([class2index[label] for label in seq])
                 for seq in y]
            return TorchRNNDataset(X, seq_lengths, y)

    def predict_proba(self, X):
        seq_lengths = [len(ex) for ex in X]
        # The base class does the heavy lifting:
        preds = self._predict(X)
        # Trim to the actual sequence lengths:
        preds = [p[: l] for p, l in zip(preds, seq_lengths)]
        # Use `softmax`; the model doesn't do this because the loss
        # function does it internally.
        probs = [torch.softmax(seq, dim=1) for seq in preds]
        return probs

    def predict(self, X):
        probs = self.predict_proba(X)
        return [[self.classes_[i] for i in seq.argmax(axis=1)] for seq in probs] # seq.argmax(axis=1) gives index of col that maximizes softmax prob
        # see difference vs TorchRNNClassifier.predict

    def score(self, X, y):
        preds = self.predict(X)
        flat_preds = [x for seq in preds for x in seq]
        flat_y = [x for seq in y for x in seq]
        return utils.safe_macro_f1(flat_y, flat_preds)

In [101]:
with open('annotations2.jsonl') as jsonl_file:
    # note: after running data-preprocessing.ipynb this file already has token-level labels
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]

In [102]:
# now get data into format that TorchRNN expects:
X=[] 
y=[]
for j in range(0,len(annot)):
    a = annot[j]['tokens']
    auxX = []
    auxy = []
    if annot[j]['spans']!=[]: # are there annot for this example?
        for i in range(0,len(a)):
            #token_element = (a[i]['text'],a[i]['label'])
            auxX.append(a[i]['text'])
            auxy.append(a[i]['label'])
        X.append(auxX)
        y.append(auxy)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = X[:120], X[120:], y[:120], y[120:]
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]

In [103]:
# check counts for each label in training set
from collections import Counter
# first unroll list of lists
a = [item for auxVec in y_train for item in auxVec]
b=Counter(a)
print(b)

Counter({'O': 4957, 'ORT': 212, 'IMMO_TYP': 200, 'DATUM_VERTRAG': 195, 'VERKAEUFER': 191, 'DATUM_VERBUECHERUNG': 187, 'STRASSE': 150, 'KAEUFER': 135, 'FLAECHE': 124, 'GESAMTPREIS': 92, 'QMPREIS': 67, 'TERRASSENGROESSE': 31})


In [104]:
seq_mod = TorchRNNSequenceLabeler(
    vocab,
    early_stopping=True,
    eta=0.001)

In [105]:
class TorchSequenceLabeler(nn.Module):
    def __init__(self, rnn, output_dim):
        super().__init__()
        self.rnn = rnn
        self.output_dim = output_dim
        if self.rnn.bidirectional:
            self.classifier_dim = self.rnn.hidden_dim * 2
        else:
            self.classifier_dim = self.rnn.hidden_dim
        self.classifier_layer = nn.Linear(
            self.classifier_dim, self.output_dim)

    def forward(self, X, seq_lengths): # seq_lengths is the number of distinct classes we want to split the inputs into
        outputs, state = self.rnn(X, seq_lengths)
        outputs, seq_length = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=True)
        logits = self.classifier_layer(outputs)
        # During training, we need to swap the dimensions of logits
        # to accommodate `nn.CrossEntropyLoss`:
        if self.training:
            return logits.transpose(1, 2)
        else:
            return logits

In [106]:
%time _ = seq_mod.fit(X_train, y_train)

Stopping after epoch 25. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.114985227584839

Wall time: 1.32 s


In [107]:
y_pred = seq_mod.predict(X_test)
print(y_test[0])
print(y_pred[0])

['ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'O', 'O', 'FLAECHE', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERBUECHERUNG', 'DATUM_VERBUECHERUNG', 'O']
['KAEUFER', 'KAEUFER', 'O', 'O', 'KAEUFER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [108]:
labels=seq_mod.classes_
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

In [109]:
# unfold all our data - NOTE: this means we don't care about per sentence results. 
# i.e. each classification is worth same regardless of sentence in which it occurs
y_test_unfold = [y for element in y_test for y in element]
y_pred_unfold = [y for element in y_pred for y in element]

In [82]:
# convert y_test and y_pred into binary formats
#from sklearn.preprocessing import MultiLabelBinarizer

In [110]:
print(classification_report(
    y_test_unfold, y_pred_unfold, labels=sorted_labels, digits=3
))

                     precision    recall  f1-score   support

                  O      0.808     0.888     0.846       643
            KAEUFER      0.070     0.278     0.112        18
DATUM_VERBUECHERUNG      0.000     0.000     0.000        25
      DATUM_VERTRAG      1.000     0.037     0.071        27
         VERKAEUFER      0.333     0.042     0.074        24
   TERRASSENGROESSE      0.000     0.000     0.000         5
        GESAMTPREIS      0.000     0.000     0.000        11
            FLAECHE      0.000     0.000     0.000        15
           IMMO_TYP      0.000     0.000     0.000        19
            QMPREIS      0.000     0.000     0.000        10
                ORT      0.000     0.000     0.000        26
            STRASSE      0.118     0.125     0.121        16

           accuracy                          0.691       839
          macro avg      0.194     0.114     0.102       839
       weighted avg      0.664     0.691     0.657       839



Now try with leading "B-" and "I-"

In [83]:
########## ONLY RUN IF WE WANT TO ADD LEADING "B-" / "I-" TO CLASS LABEL
# now use above code and loop through all items of annot list:
# addLeading=1 for "Yes" (i.e. add leading "B-","I-" to annot); 0 for "No" (i.e. add labels to annot simply as they are)
addLeading = 1

if addLeading == 1:
    for j in range(0,len(annot)):
        a = annot[j]
        # select list of dict of tokens w/ annnotations and add column w/ no. of words to each dict:
        b = a['spans']
        # add noWords to b dict. note: b is list of dicts w/ annotations; tokens not on this list don't have annotations
        if b!=[]: #i.e. only try to add annotations to tokens if there are annotations to begin with
            #print(b)
            for i in range(0,len(annot[j]['tokens'])):
                    # now break-up label into 1st occurrence (leading "B-") and subsequent occurrences (leading "I-") (only for non "O"'s)
                    if annot[j]['tokens'][i]['label'] != "O":
                        if i==0:
                            annot[j]['tokens'][i]['label'] = "B-" + annot[j]['tokens'][i]['label']
                        else: 
                            if annot[j]['tokens'][i]['label'] == annot[j]['tokens'][i-1]['label'][2:]: # need to remove the leading "B-" that we had already been added to c[i-1]
                                annot[j]['tokens'][i]['label'] = "I-" + annot[j]['tokens'][i]['label']
                            else:
                                annot[j]['tokens'][i]['label'] = "B-" + annot[j]['tokens'][i]['label'] 

In [84]:
# now get data into format that TorchRNN expects:
X=[] 
y=[]
for j in range(0,len(annot)):
    a = annot[j]['tokens']
    auxX = []
    auxy = []
    if annot[j]['spans']!=[]: # are there annot for this example?
        for i in range(0,len(a)):
            #token_element = (a[i]['text'],a[i]['label'])
            auxX.append(a[i]['text'])
            auxy.append(a[i]['label'])
        X.append(auxX)
        y.append(auxy)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = X[:120], X[120:], y[:120], y[120:]
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]

In [89]:
print(X_train[0])

['DORNBIRN', 'In', 'der', 'Schulgasse', 'in', 'Dornbirn', 'hat', 'eine', '71,93', 'Quadratmeter', 'große', 'Wohnung', 'für', 'einen', 'Quadratmeterpreis', 'von', '5533,71', 'Euro', 'den', 'Besitzer', 'gewechselt', '.', 'Dieser', 'beinhaltet', 'auch', 'einen', 'Pkw-Abstellplatz', '.', 'Käufer', 'der', 'Wohnung', 'mit', '9,86', 'Quadratmetern', 'Terrasse', 'ist', 'die', 'ValLiLean', 'Beteiligungs-', 'und', 'Immobilienverwaltungs', 'GmbH', 'Beim', 'Verkäufer', 'handelt', 'es', 'sich', 'um', 'die', 'Karrenblick', 'Projekt', 'GmbH', ' ', 'Der', 'Kaufpreis', 'liegt', 'bei', '398.040', 'Euro', '.', 'Unterzeichnet', 'wurde', 'der', 'Kaufvertrag', 'am', '18.', 'September', '.', 'Die', 'Verbücherung', 'datiert', 'mit', 'Oktober', '2020', '.', '.', '.']


In [85]:
%time _ = seq_mod.fit(X_train, y_train)
y_pred = seq_mod.predict(X_test)
print(y_test[0])
print(y_pred[0])

Stopping after epoch 35. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.249159574508667

Wall time: 1.78 s
['B-ORT', 'O', 'O', 'B-STRASSE', 'I-STRASSE', 'O', 'B-ORT', 'O', 'O', 'B-FLAECHE', 'O', 'O', 'B-IMMO_TYP', 'O', 'O', 'O', 'O', 'B-QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'B-GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATUM_VERTRAG', 'I-DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'B-DATUM_VERBUECHERUNG', 'I-DATUM_VERBUECHERUNG', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [90]:
labels=seq_mod.classes_
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

In [91]:
# unfold all our data - NOTE: this means we don't care about per sentence results. 
# i.e. each classification is worth same regardless of sentence in which it occurs
y_test_unfold = [y for element in y_test for y in element]
y_pred_unfold = [y for element in y_pred for y in element]

In [92]:
print(classification_report(
    y_test_unfold, y_pred_unfold, labels=sorted_labels, digits=3
))

                       precision    recall  f1-score   support

                    O      0.773     0.988     0.867       643
B-DATUM_VERBUECHERUNG      0.000     0.000     0.000        13
I-DATUM_VERBUECHERUNG      0.000     0.000     0.000        12
      B-DATUM_VERTRAG      0.000     0.000     0.000        13
      I-DATUM_VERTRAG      0.000     0.000     0.000        14
            B-FLAECHE      0.000     0.000     0.000        15
            I-FLAECHE      0.000     0.000     0.000         0
        B-GESAMTPREIS      0.000     0.000     0.000        11
        I-GESAMTPREIS      0.000     0.000     0.000         0
           B-IMMO_TYP      0.000     0.000     0.000        19
           I-IMMO_TYP      0.000     0.000     0.000         0
            B-KAEUFER      0.000     0.000     0.000        10
            I-KAEUFER      0.000     0.000     0.000         8
                B-ORT      0.300     0.115     0.167        26
            B-QMPREIS      0.000     0.000     0.000  

  _warn_prf(average, modifier, msg_start, len(result))


Remove "B-" and "I-" (in case they are present in labels)

In [119]:
for j in range(0,len(annot)):
    a = annot[j]
    b = a['spans']
    if b!=[]: #i.e. only try to add annotations to tokens if there are annotations to begin with
        for i in range(0,len(annot[j]['tokens'])):
                if annot[j]['tokens'][i]['label'] != "O":
                    if annot[j]['tokens'][i]['label'][:2]=="B-" or annot[j]['tokens'][i]['label'][:2]=="I-":
                        annot[j]['tokens'][i]['label']=annot[j]['tokens'][i]['label'][2:]

Try bi-directional LSTM

In [121]:
seq_mod = TorchRNNSequenceLabeler(
    vocab,
    early_stopping=True,
    eta=0.001,
    bidirectional=True)

In [122]:
%time _ = seq_mod.fit(X_train, y_train)

Stopping after epoch 15. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.213289499282837

Wall time: 768 ms


In [123]:
y_pred = seq_mod.predict(X_test)
print(y_test[0])
print(y_pred[0])

labels=seq_mod.classes_
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

# unfold all our data - NOTE: this means we don't care about per sentence results. 
# i.e. each classification is worth same regardless of sentence in which it occurs
y_test_unfold = [y for element in y_test for y in element]
y_pred_unfold = [y for element in y_pred for y in element]

print(classification_report(
    y_test_unfold, y_pred_unfold, labels=sorted_labels, digits=3
))

['ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'O', 'O', 'FLAECHE', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERBUECHERUNG', 'DATUM_VERBUECHERUNG', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IMMO_TYP', 'O', 'O', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'TERRASSENGROESSE', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                  O      0.760     0.893     0.821       643
            KAEUFER      0.000     0.000     0.000        18
DATUM_VERBUECHERUN