In [1]:
import nltk
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn

#from torch_model_base import TorchModelBase
#from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNDataset, TorchRNNClassifier, TorchRNNModel
import utils

import pandas as pd
from collections import Counter

import json

from torchcrf import CRF
from sklearn.utils import shuffle
from sklearn_crfsuite import metrics

In [2]:
# Load the annotated examples: one JSON document per line.
# note: after running data-preprocessing.ipynb this file already has token-level labels
with open('annotations2.jsonl', encoding='utf-8') as jsonl_file:
    # The encoding is explicit so the read does not depend on the platform default.
    # Blank lines (e.g. a trailing newline at end of file) are skipped, because
    # json.loads raises on an empty string.
    lines = [line for line in jsonl_file if line.strip()]
annot = [json.loads(line) for line in lines]

In [3]:
# Split the annotations into two parallel lists of lists:
#   X[k] holds the token texts of example k, y[k] the matching token labels.
# Examples whose 'spans' list is empty (nothing annotated) are left out.
X = []
y = []
for example in annot:
    tokens = example['tokens']
    if example['spans'] == []:
        continue
    X.append([token['text'] for token in tokens])
    y.append([token['label'] for token in tokens])


In [4]:
# Vocabulary: every distinct token gets an integer id, numbered in order of first appearance.
word_to_ix = {}
for sentence in X:
    for word in sentence:
        # setdefault only inserts unseen tokens; the new id is the current vocabulary size
        word_to_ix.setdefault(word, len(word_to_ix))

In [5]:
# Label inventory: every distinct tag gets an integer id, numbered in order of first appearance.
label_to_ix = {}
for sentence in y:
    for label in sentence:
        # setdefault only inserts unseen labels; the new id is the current inventory size
        label_to_ix.setdefault(label, len(label_to_ix))

# Reverse lookup: integer id -> text label.
ix_to_label = {ix: name for name, ix in label_to_ix.items()}

In [6]:
# Encode the token and label sequences as integer ids via the two lookup tables.
X_ix = [list(map(word_to_ix.__getitem__, seq)) for seq in X]
y_ix = [list(map(label_to_ix.__getitem__, seq)) for seq in y]

In [7]:
# Deterministic train/test split over the integer-encoded sequences.
train_ratio = 0.75
# Number of training examples, rounded down. int() truncates, whereas round(x - 0.5)
# rounds exact halves to the nearest even integer and is therefore not a reliable floor.
# NOTE: this name shadows sklearn's train_test_split imported at the top of the notebook;
# it is kept because the CRF cell reads it.
train_test_split = int(train_ratio * len(X_ix))

# The shuffled example order is read from disk so every run uses the same train / test sets.
auxIdxList = pd.read_csv('idx_shuffle.csv').values.tolist()
idx_shuffle = [el for auxList in auxIdxList for el in auxList]  # flatten rows into one list of indices

# Split the integer ids (X_ix / y_ix), not the raw strings: the later cells stack the
# tags into a torch tensor and look `labels` up in ix_to_label, both of which need ids.
X_shuffle = [X_ix[auxIdx] for auxIdx in idx_shuffle]
y_shuffle = [y_ix[auxIdx] for auxIdx in idx_shuffle]
X_train, X_test = X_shuffle[:train_test_split], X_shuffle[train_test_split:]
y_train, y_test = y_shuffle[:train_test_split], y_shuffle[train_test_split:]

In [8]:
# Show the flat list of example indices whose order defines the train / test split.
print(idx_shuffle)

[94, 130, 26, 8, 30, 92, 110, 101, 44, 104, 66, 100, 43, 97, 22, 116, 96, 89, 7, 24, 62, 10, 109, 45, 16, 2, 59, 129, 107, 125, 33, 68, 56, 85, 126, 13, 51, 84, 54, 50, 15, 61, 40, 76, 52, 3, 48, 6, 71, 60, 105, 122, 119, 99, 63, 93, 108, 27, 18, 113, 11, 111, 73, 127, 98, 41, 90, 1, 86, 118, 42, 4, 17, 38, 5, 53, 124, 78, 0, 34, 28, 55, 75, 35, 23, 74, 31, 91, 57, 106, 131, 32, 115, 14, 95, 19, 29, 49, 112, 82, 64, 132, 79, 69, 80, 20, 128, 72, 77, 25, 37, 81, 120, 46, 123, 39, 102, 65, 58, 12, 121, 88, 70, 87, 36, 114, 21, 83, 9, 103, 67, 117, 47]


In [8]:
# Tag inventory and sequence lengths, both taken from the training split only.
labels = sorted(set(tag for tag_seq in y_train for tag in tag_seq))
numTags = len(labels)
# The CRF mask is built from these lengths, so only the training inputs are measured.
seq_lengths = list(map(len, X_train))
seq_length_max = max(seq_lengths)

In [9]:
def create_mask(seq_length):
    """Build a padding mask for a batch of variable-length sequences.

    Parameters
    ----------
    seq_length : sequence of int
        Length of each sequence in the batch.

    Returns
    -------
    torch.Tensor
        uint8 tensor of shape (len(seq_length), max(seq_length)). Row i holds 1 in
        its first seq_length[i] positions and 0 afterwards. An empty input yields
        a tensor of shape (0, 0) instead of raising.
    """
    lengths = torch.tensor(list(seq_length), dtype=torch.long)
    max_len = int(lengths.max()) if lengths.numel() else 0
    # Compare every position index against every length in one broadcasted operation:
    # (1, max_len) < (batch, 1) -> (batch, max_len).
    positions = torch.arange(max_len).unsqueeze(0)
    return (positions < lengths.unsqueeze(1)).to(torch.uint8)

def fillList_ofLists(y, pad_value=-1):
    """Right-pad every inner list to the length of the longest one.

    Parameters
    ----------
    y : list of lists
        Variable-length sequences (token ids or tag ids).
    pad_value : optional
        Filler appended to the shorter sequences. Defaults to -1, the id this
        notebook uses for padded positions.

    Returns
    -------
    list of lists
        New lists, all of equal length; the input is not modified. An empty
        input returns an empty list instead of raising.
    """
    if len(y) == 0:
        return []
    seq_length_max = max(len(row) for row in y)
    return [list(row) + [pad_value] * (seq_length_max - len(row)) for row in y]

def convert_to_Tensor_andStack(y_train_forCRF):
    """Stack equal-length rows into a single 2-D tensor.

    Parameters
    ----------
    y_train_forCRF : list of lists
        Padded sequences, all of the same length.

    Returns
    -------
    torch.Tensor
        Tensor of shape (len(y_train_forCRF), row_length). A single-row input
        gives shape (1, row_length), so the result is always 2-D. An empty
        input gives an empty (0, 0) integer tensor.
    """
    if len(y_train_forCRF) == 0:
        return torch.empty((0, 0), dtype=torch.long)
    # One vstack call over all rows avoids re-copying the growing matrix on every
    # iteration and also promotes a lone row to 2-D.
    return torch.vstack([torch.tensor(row) for row in y_train_forCRF])

In [10]:
# Training inputs for the CRF, all padded to the longest training sequence.
X_train_forCRF = fillList_ofLists(X_train)  # padded token sequences (still nested lists)
y_train_forCRF = fillList_ofLists(y_train)  # padded tag sequences (still nested lists)
tags = convert_to_Tensor_andStack(y_train_forCRF)  # padded tags stacked into one tensor for the CRF
mask = create_mask(seq_lengths)  # CRF mask: 1 for real tokens, 0 for padding

In [None]:
# TODO: calc emission probabilities. dim = (noExamples, maxSeqLength, noDistinctTags) (incl. bogus tag used for padding)
# represents prob of each word in the sequence being emitted by a given tag/label
# (naive emissions calc - i.e. no more sophisticated notions of word embeddings or neighbour information)

In [11]:
# now create dict of (wordId,tagId); value=no. of co-occurrences
def convertList_ofLists_bivariate(X_train,y_train):
    """Pair two parallel lists of lists element by element.

    Row k of the result is the list of (X_train[k][m], y_train[k][m]) tuples.
    """
    return [list(zip(x_row, y_train[k])) for k, x_row in enumerate(X_train)]

def emissionProbabilities(X_input, y_input):
    """Estimate, from co-occurrence counts, how often each tag emits each word.

    Parameters
    ----------
    X_input, y_input : list of lists
        Parallel word-id and tag-id sequences (after padding).

    Returns
    -------
    dict
        Maps (wordId, tagId) -> count(wordId, tagId) / count(tagId).
        The padding pair (-1, -1) is mapped to 0.
    """
    paired_rows = convertList_ofLists_bivariate(X_input, y_input)
    pair_counts = Counter(pair for row in paired_rows for pair in row)
    tag_counts = Counter(tag for row in y_input for tag in row)
    pad_pair = (-1, -1)
    return {
        pair: 0 if pair == pad_pair else count / tag_counts[pair[1]]
        for pair, count in pair_counts.items()
    }

def emissionMatrix_forCRF(X_input, labels, emissionProbs):
    """Look up the emission score of every (position, tag) pair of a padded batch.

    Parameters
    ----------
    X_input : list of lists
        Padded word-id sequences. The length of the first row is taken as the
        sequence length for the whole batch.
    labels : sequence
        Distinct tag ids. The padding tag -1 is appended as the last column.
    emissionProbs : dict
        Maps (wordId, tagId) to an emission probability.

    Returns
    -------
    torch.Tensor
        Tensor of shape (nExamples, seqLength, len(labels) + 1) in torch's default
        floating dtype. Pairs missing from emissionProbs are left at zero. An
        empty batch yields a (0, 0, len(labels) + 1) tensor instead of raising.
    """
    labels_aux = list(labels) + [-1]  # add bogus tag for padded data
    nExamples = len(X_input)
    seq_length_max = len(X_input[0]) if nExamples else 0
    # Build the whole table as nested Python lists and convert once; this avoids
    # one tensor write per (example, position, tag) cell.
    rows = [
        [
            [emissionProbs.get((seq[j], tag), 0.0) for tag in labels_aux]
            for j in range(seq_length_max)
        ]
        for seq in X_input
    ]
    emiss = torch.tensor(rows, dtype=torch.get_default_dtype())
    # The reshape pins the rank even when a dimension is zero (empty batch or empty rows).
    return emiss.reshape(nExamples, seq_length_max, len(labels_aux))


In [12]:
# Estimate the (wordId, tagId) emission table on the padded training data,
# then expand it into the per-position emission tensor.
allData_prob = emissionProbabilities(X_input=X_train_forCRF, y_input=y_train_forCRF)
emiss = emissionMatrix_forCRF(X_input=X_train_forCRF, labels=labels, emissionProbs=allData_prob)

In [13]:
# RUN CRF MODEL
# Seed torch before the CRF is constructed so that its construction is repeatable.
torch.manual_seed(1)
#seq_length_max = 3  # maximum sequence length in a batch
# `train_test_split` here is the integer split index (number of training examples),
# not sklearn's function of the same name.
# NOTE(review): batch_size is not referenced again in the visible cells.
batch_size = train_test_split  # number of samples in the batch
# numTags+1: one extra tag for the bogus padding label (-1) that the emission tensor
# carries as its last column.
model = CRF(numTags+1,batch_first=True)

#emissions = torch.randn(batch_size, seq_length_max, num_tags)
# NOTE(review): no optimiser / training step appears in this cell, so the value printed
# and the decoding below use the CRF parameters as constructed - confirm this is intended.
print(model(emiss, tags, mask=mask)) # model log likelihood
out=model.decode(emiss,mask=mask) # most likely tag sequences

tensor(-13403.5869, grad_fn=<SumBackward0>)


In [14]:
# Map the integer tag ids back to their text labels: once for the label
# inventory and once for the gold tags of the test split.
labels_text = [ix_to_label[tag_id] for tag_id in labels]
y_test_text = [[ix_to_label[tag_id] for tag_id in tag_seq] for tag_seq in y_test]

In [19]:
# now need to calc emissions of X_test

In [15]:
# Test inputs for the CRF, padded to the longest test sequence.
X_test_forCRF = fillList_ofLists(X_test)  # padded token sequences
# Emission scores are looked up in the table estimated on the training data (allData_prob).
emiss_test = emissionMatrix_forCRF(X_test_forCRF, labels, allData_prob)
seq_lengths_test = list(map(len, X_test))
mask_test = create_mask(seq_lengths_test)  # CRF mask: 1 for real tokens, 0 for padding

In [16]:
# Most likely tag sequence for every test example, then the same sequences as text labels.
out_test = model.decode(emiss_test, mask=mask_test)
out_test_text = [[ix_to_label[tag_id] for tag_id in tag_seq] for tag_seq in out_test]

In [21]:
# Token-level macro F1 over all labels, followed by the share of fully correct sequences.
macro_f1 = metrics.flat_f1_score(
    y_test_text, out_test_text, average='macro', labels=labels_text
)
print(macro_f1)
sequence_accuracy = metrics.sequence_accuracy_score(y_test_text, out_test_text)
print(sequence_accuracy)

0.25206892079328913
0.0


In [22]:
# Per-label precision / recall / F1 on the test split.
report = metrics.flat_classification_report(
    y_test_text, out_test_text, labels=labels_text, digits=3
)
print(report)

                     precision    recall  f1-score   support

                ORT      0.920     0.390     0.548        59
                  O      0.836     0.850     0.843      1525
            STRASSE      0.157     0.318     0.211        44
            FLAECHE      0.000     0.000     0.000        38
           IMMO_TYP      0.522     0.511     0.516        47
            QMPREIS      0.000     0.000     0.000        21
   TERRASSENGROESSE      0.011     0.125     0.019         8
            KAEUFER      0.250     0.030     0.054        33
         VERKAEUFER      0.375     0.290     0.327        62
        GESAMTPREIS      0.045     0.034     0.039        29
      DATUM_VERTRAG      0.000     0.000     0.000        62
DATUM_VERBUECHERUNG      0.481     0.455     0.467        55

           accuracy                          0.708      1983
          macro avg      0.300     0.250     0.252      1983
       weighted avg      0.716     0.708     0.706      1983

