In [1]:
import json # read from json file
import re
import sklearn
from sklearn.utils import shuffle
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy.stats
import eli5

#with open('data.jsonl') as jsonl_file: # not using the full (unannotated) dataset in the below
#    lines = jsonl_file.readlines()
#jsonS = [json.loads(line) for line in lines]
#print(len(jsonS))

In [2]:
### RUN
with open('annotations2.jsonl') as jsonl_file:
    # note: after running data-preprocessing.ipynb this file already has token-level labels
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]
#print(annot[0])

Chose whether or not to add leading "B-" / "I-" to class labels

In [3]:
########## ONLY RUN IF WE WANT TO ADD LEADING "B-" / "I-" TO CLASS LABEL
# now use above code and loop through all items of annot list:
# addLeading=1 for "Yes" (i.e. add leading "B-","I-" to annot); 0 for "No" (i.e. add labels to annot simply as they are)
addLeading = 1

if addLeading == 1:
    for j in range(0,len(annot)):
        a = annot[j]
        # select list of dict of tokens w/ annnotations and add column w/ no. of words to each dict:
        b = a['spans']
        # add noWords to b dict. note: b is list of dicts w/ annotations; tokens not on this list don't have annotations
        if b!=[]: #i.e. only try to add annotations to tokens if there are annotations to begin with
            #print(b)
            for i in range(0,len(annot[j]['tokens'])):
                    # now break-up label into 1st occurrence (leading "B-") and subsequent occurrences (leading "I-") (only for non "O"'s)
                    if annot[j]['tokens'][i]['label'] != "O":
                        if i==0:
                            annot[j]['tokens'][i]['label'] = "B-" + annot[j]['tokens'][i]['label']
                        else: 
                            if annot[j]['tokens'][i]['label'] == annot[j]['tokens'][i-1]['label'][2:]: # need to remove the leading "B-" that we had already been added to c[i-1]
                                annot[j]['tokens'][i]['label'] = "I-" + annot[j]['tokens'][i]['label']
                            else:
                                annot[j]['tokens'][i]['label'] = "B-" + annot[j]['tokens'][i]['label'] 

In [3]:
### RUN
# now convert annotation tokens into list (sentences) of lists (tokens) format for sklearn_crfsuite.CRF
train_sents=[] 
for j in range(0,len(annot)):
    a = annot[j]['tokens']
    train_sentence = []
    for i in range(0,len(a)):
        if 'label' in a[i]: # only add element if this sample sentence has been labelled 
            token_element = (a[i]['text'],a[i]['label'])
            train_sentence.append(token_element)
    train_sents.append(train_sentence)

In [4]:
### RUN
import pandas as pd
df = pd.read_csv('townsVoralberg.csv',names=['towns'])  

In [5]:
### RUN
lCase = lambda a: str(a).lower()

def word2features(sent, i):
    word = sent[i][0]
    word_l = word.lower()
    towns = map(lCase,df['towns'])
    #postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word_l,
        'word[-3:]': word[-3:], # 3 char ending of word (suffix)
        'word[-2:]': word[-2:], # 2 char ending of word (suffix)
        'word.isupper()': word.isupper(), # all letters are caps
        'word.istitle()': word.istitle(), # 1st letter is a capital letter
        'word.isdigit()': word.isdigit(), # is digit?
        'word.isPrice()': bool(re.match(r'(\d\.?)?(\d{3}\.?\d{3})',word)),
        'word.isTown()': word_l in towns,
        #'postag': postag, # don't have PoS data
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0] # enriches feature function w/ immediately preceding word
        word1_l = word1.lower()
        #postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1_l,
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isPrice()': bool(re.match(r'(\d\.?)?(\d{3}\.?\d{3})',word1)),
            '-1.word.isTown()': word1_l in towns,
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True # adds the start token

    if i < len(sent)-1:
        word1 = sent[i+1][0] # ... as well as w/ immediate sucessor word 
        word1_l = word1.lower()
        #postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1_l,
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1word.isPrice()': bool(re.match(r'(\d\.?)?(\d{3}\.?\d{3})',word1)),
            '+1.word.isTown()': word1_l in towns,
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True # adds the end token

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
   # return [label for token, postag, label in sent]
    return [label for token, label in sent]

def sent2tokens(sent):
   # return [token for token, postag, label in sent]
    return [token for token, label in sent]

In [7]:
X = [sent2features(s) for s in train_sents] # both are 140 x # of tokens in each example
y = [sent2labels(s) for s in train_sents]

In [8]:
train_ratio = 0.75
train_test_split = round(0.75*len(X) - 0.5) # -0.5 => floor
idx = [i for i in range(0,len(X))]
idx_shuffle = shuffle(idx,random_state=0)
X_shuffle, y_shuffle = [X[auxIdx] for auxIdx in idx_shuffle], [y[auxIdx] for auxIdx in idx_shuffle]
X_train, X_test, y_train, y_test = X_shuffle[:train_test_split], X_shuffle[train_test_split:], y_shuffle[:train_test_split], y_shuffle[train_test_split:]

In [9]:
%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 0 ns




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [10]:
labels = list(crf.classes_)
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9568049349982446

In [11]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                     precision    recall  f1-score   support

                  O      0.971     0.995     0.983      1472
            KAEUFER      0.513     0.606     0.556        33
DATUM_VERBUECHERUNG      1.000     0.983     0.991        58
      DATUM_VERTRAG      1.000     0.922     0.959        64
         VERKAEUFER      0.841     0.552     0.667        67
   TERRASSENGROESSE      0.818     1.000     0.900         9
        GESAMTPREIS      1.000     0.793     0.885        29
            FLAECHE      1.000     0.974     0.987        38
           IMMO_TYP      0.889     0.800     0.842        50
            QMPREIS      1.000     1.000     1.000        23
                ORT      0.984     1.000     0.992        61
            STRASSE      0.902     0.841     0.871        44

           accuracy                          0.958      1948
          macro avg      0.910     0.872     0.886      1948
       weighted avg      0.958     0.958     0.957      1948





In [12]:
####### RUN THIS ONE
eli5.show_weights(crf, top=30)

From \ To,DATUM_VERBUECHERUNG,DATUM_VERTRAG,FLAECHE,GESAMTPREIS,IMMO_TYP,KAEUFER,O,ORT,QMPREIS,STRASSE,TERRASSENGROESSE,VERKAEUFER
DATUM_VERBUECHERUNG,3.919,0.0,0.0,0.0,0.0,0.0,-0.524,0.0,0.0,-0.2,0.0,0.0
DATUM_VERTRAG,-1.061,5.002,0.0,0.0,-0.087,-0.298,0.0,-0.0,0.0,-0.214,0.0,-0.109
FLAECHE,-0.087,-0.021,1.233,0.0,-0.661,-0.09,1.006,0.0,0.0,-0.067,0.0,-0.159
GESAMTPREIS,-0.0,0.0,0.0,1.275,-0.059,-0.031,0.98,0.0,-0.119,0.0,0.0,-0.05
IMMO_TYP,-0.284,0.0,-0.85,-0.14,4.004,-0.314,-0.013,0.0,0.0,0.0,-0.485,-0.384
KAEUFER,0.0,0.0,0.0,0.0,-0.029,4.633,-0.616,0.0,0.0,-0.816,0.0,-0.69
O,0.166,-0.484,0.986,0.614,0.073,-0.012,2.421,1.082,0.237,-0.25,0.522,-0.215
ORT,0.0,-0.126,0.0,0.0,-0.385,-0.424,1.46,0.0,0.0,0.075,0.0,-0.064
QMPREIS,-0.012,0.0,0.0,0.0,-0.027,0.0,0.935,0.0,1.144,0.0,0.0,0.0
STRASSE,0.0,-0.102,-0.241,0.0,0.0,0.0,0.312,-1.761,0.0,4.158,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11
+2.945,-1:word.lower():mit,,,,,,,,,,
+2.432,-1:word.lower():im,,,,,,,,,,
+2.232,+1:word.lower():.,,,,,,,,,,
+1.772,+1:word.lower():oktober,,,,,,,,,,
+1.702,word.isdigit(),,,,,,,,,,
+1.239,word[-3:]:ber,,,,,,,,,,
+1.208,+1:word.lower():september,,,,,,,,,,
+1.206,+1:word.lower():2021,,,,,,,,,,
+0.915,+1:word.lower():2020,,,,,,,,,,
+0.839,word.istitle(),,,,,,,,,,

Weight?,Feature
+2.945,-1:word.lower():mit
+2.432,-1:word.lower():im
+2.232,+1:word.lower():.
+1.772,+1:word.lower():oktober
+1.702,word.isdigit()
+1.239,word[-3:]:ber
+1.208,+1:word.lower():september
+1.206,+1:word.lower():2021
+0.915,+1:word.lower():2020
+0.839,word.istitle()

Weight?,Feature
+5.990,-1:word.lower():am
+1.733,+1:word.lower():.
+1.609,+1:word.lower():19.
+1.437,+1:word.lower():25.
+1.337,+1:word.lower():23.
+1.295,word[-3:]:ber
+1.089,+1:word.lower():unterzeichnet
+0.857,-1:word.lower():kaufvertrag
+0.799,+1:word.istitle()
+0.586,word.lower():august

Weight?,Feature
+4.306,+1:word.lower():quadratmeter
+2.423,-1:word.lower():von
+2.173,+1:word.lower():quadratmetern
+2.007,+1:word.lower():553
+1.809,word.isdigit()
+1.768,-1:word.lower():eine
+1.540,+1:word.lower():einen
+1.500,-1:word.lower():quadratmeter
+1.079,-1:word.lower():großen
+1.068,word.lower():(

Weight?,Feature
+4.457,word.isPrice()
+3.532,+1:word.lower():euro
+1.694,word.lower():1
+1.694,word[-3:]:1
+1.694,word[-2:]:1
+1.409,word[-2:]:4
+1.409,word[-3:]:4
+1.409,word.lower():4
+1.324,word[-2:]:9
+1.324,word.lower():9

Weight?,Feature
+3.553,word.lower():grundstücksfläche
+2.955,-1:word.lower():große
+2.284,word.istitle()
+2.228,word.lower():boden
+2.180,+1:word.lower():auf
+2.130,-1:word.lower():dazugehörige
+2.004,word[-3:]:ude
+1.996,+1:word.lower():mit
+1.994,-1:word.lower():ein
+1.793,+1:word.lower():freihaltefläche-widmung

Weight?,Feature
+2.682,+1:word.lower():breitenberg
+2.244,-1:word.lower():die
+1.927,+1:word.lower():.
+1.700,-1:word.lower():mehrere
+1.688,word.lower():privatperson
+1.688,word[-3:]:son
+1.571,+1:word.lower():gmbh
+1.540,word.lower():privatpersonen
+1.509,-1:word.lower():eine
+1.432,+1:word.lower():hat

Weight?,Feature
+3.496,bias
+2.991,word.lower():baumgartenstraße
+2.053,"word.lower():153,29"
+1.810,word.lower():quadratmeter
+1.810,-1:word.lower():wohnung
+1.763,-1:word.lower():gebäude
+1.743,word[-2:]:ro
+1.734,-1:word.isupper()
+1.719,-1:word.lower():.
+1.714,word[-2:]:.

Weight?,Feature
+6.440,word.isTown()
+1.961,-1:word.lower():in
+1.513,BOS
+0.850,word.isupper()
+0.344,+1:word.lower():in
+0.333,word.lower():dornbirn
+0.284,word[-2:]:au
+0.271,word.lower():brand
+0.239,word.lower():lochau
+0.209,word[-3:]:and

Weight?,Feature
6.842,+1:word.lower():euro
2.283,-1:word.lower():beträgt
1.593,-1:word.lower():von
1.5,-1:word.lower():bei
1.423,+1:word.lower():266
1.423,-1:word.lower():baufläche-wohngebietwidmung
0.786,+1:word.istitle()
0.633,word.lower():(
0.633,word[-3:]:(
0.633,word[-2:]:(

Weight?,Feature
+3.103,+1:word.lower():in
+3.074,-1:word.lower():der
+2.547,-1:word.lower():im
+2.463,+1:word.lower():nollen
+2.156,+1:word.lower():bleiche
+1.826,-1:word.lower():bereich
+1.784,+1:word.lower():der
+1.783,+1:word.lower():strass
+1.729,-1:word.lower():lech
+1.645,+1:word.lower():altreute

Weight?,Feature
+3.020,+1:word.lower():quadratmetern
+2.719,-1:word.lower():mit
+1.759,+1:word.lower():quadratmeter
+1.449,-1:word.lower():und
+1.227,"word.lower():14,29"
+1.121,word.lower():6
+1.121,word[-3:]:6
+1.121,word[-2:]:6
+0.994,"word.lower():137,49"
+0.994,word[-2:]:49

Weight?,Feature
+2.526,+1:word.lower():greif
+2.293,word.lower():privatpersonen
+2.209,+1:word.lower():hagen
+2.120,+1:word.lower():um
+1.940,word[-2:]:on
+1.683,+1:word.lower():primus
+1.682,-1:word.lower():die
+1.650,word.lower():privatperson
+1.650,word[-3:]:son
+1.610,word[-3:]:nen


In [13]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
DATUM_VERTRAG -> DATUM_VERTRAG 5.001647
KAEUFER -> KAEUFER 4.633315
VERKAEUFER -> VERKAEUFER 4.535609
STRASSE -> STRASSE 4.157877
IMMO_TYP -> IMMO_TYP 4.003745
DATUM_VERBUECHERUNG -> DATUM_VERBUECHERUNG 3.919460
O      -> O       2.420556
ORT    -> O       1.460406
GESAMTPREIS -> GESAMTPREIS 1.275251
FLAECHE -> FLAECHE 1.232595
QMPREIS -> QMPREIS 1.143765
TERRASSENGROESSE -> O       1.095506
O      -> ORT     1.081547
FLAECHE -> O       1.005788
O      -> FLAECHE 0.986099
GESAMTPREIS -> O       0.979653
QMPREIS -> O       0.935153
O      -> GESAMTPREIS 0.614090
O      -> TERRASSENGROESSE 0.521701
STRASSE -> O       0.312111

Top unlikely transitions:
VERKAEUFER -> IMMO_TYP -0.269014
IMMO_TYP -> DATUM_VERBUECHERUNG -0.284437
DATUM_VERTRAG -> KAEUFER -0.298023
IMMO_TYP -> KAEUFER -0.314359
IMMO_TYP -> VERKAEUFER -0.383739
ORT    -> IMMO_TYP -0.385320
VERKAEUFER -> O       -0.408767
ORT    -> KAEUFER -0.423716
O      -> DATUM_VERTRAG -0.483720
IMMO_TYP -> TERRASSEN