In [1]:
import pandas as pd
import os
import json

import numpy as np

from nltk import word_tokenize
from nltk.tag import pos_tag

import spacy

import sklearn_crfsuite

In [2]:
def metrics(confusion_matrix):
    precision = confusion_matrix["B"]["B"]/(confusion_matrix["B"]["B"] + confusion_matrix["O"]["B"])
    recall = confusion_matrix["B"]["B"]/(confusion_matrix["B"]["B"] + confusion_matrix["B"]["O"])
    f_score = (precision*2*recall)/(precision+recall)
    return f_score

In [7]:
def metrics(confusion_matrix):
    precision = confusion_matrix["B"]["B"]/(confusion_matrix["B"]["B"] + confusion_matrix["O"]["B"])
    recall = confusion_matrix["B"]["B"]/(confusion_matrix["B"]["B"] + confusion_matrix["B"]["O"])
    f_score = (precision*2*recall)/(precision+recall)
    return f_score

In [15]:
# custom evaluation function - cross
def evaluate(pred,labl):
    cmatrix = {"O": {"O": 0, "B": 0}, "B": {"O": 0, "B": 0}}
    for l,p in zip(labl,pred):
        for i,j in zip(l,p):
            cmatrix[i][j] += 1
    return cmatrix

## Load data - Features Single
one pos tag - in training set - from ua \ ru model, in testing - from ru model

In [31]:
import json
with open("data_uk_train.json", "r") as f:
    data_uk = json.load(f)
with open("data_ru_train.json", "r") as f:
    data_ru = json.load(f)

In [32]:
import json
with open("data_uk_test.json", "r") as f:
    data_uk_test = json.load(f)
with open("data_ru_test.json", "r") as f:
    data_ru_test = json.load(f)

In [33]:
data_ru[17]

[['O', '«', 'PUNCT'],
 ['O', 'Она', 'PRON'],
 ['O', 'всегда', 'ADV'],
 ['O', 'была', 'AUX'],
 ['O', 'уставшей', 'VERB'],
 ['O', ',', 'PUNCT'],
 ['O', 'периодически', 'ADV'],
 ['O', 'болела', 'VERB'],
 ['O', 'простудой', 'NOUN'],
 ['O', ',', 'PUNCT'],
 ['O', 'а', 'CCONJ'],
 ['O', 'на', 'ADP'],
 ['O', 'ее', 'DET'],
 ['O', 'ногах', 'NOUN'],
 ['O', 'вечно', 'ADV'],
 ['O', 'появлялись', 'VERB'],
 ['O', 'синяки', 'NOUN'],
 ['O', '.', 'PUNCT']]

In [34]:
data_uk[17]

[['O', 'Луки', 'NOUN'],
 ['O', ',', 'PUNCT'],
 ['O', 'ідея', 'NOUN'],
 ['O', 'якого', 'DET'],
 ['O', 'належить', 'VERB'],
 ['O', 'Владиці', 'NOUN'],
 ['O', 'Володимиру', 'PROPN'],
 ['O', '(', 'PUNCT'],
 ['B', 'Війтишину', 'PROPN'],
 ['B', ')', 'PUNCT'],
 ['B', '.', 'PUNCT']]

## Extract Features
use only word token from one model - ukrainian, pos tokens from correct model for training and from ru in testing

In [35]:
def word2features(sent, i):
    word = sent[i][1]
    postag = sent[i][2]

    features = {
        'bias': 1.0,
        'word_': word,
        'word.lower()': word.lower(),
        'word.istitle()': word.istitle(),
        'postag': postag
    }
    if i > 0:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][2]
        features.update({
            '-1word': word1,
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:postag': postag1
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][2]
        features.update({
            '+1word': word1,
            '+1:word.lower()': word1.lower(),
            '+1:postag': postag1
        })
    else:
        features['EOS'] = True
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


In [36]:
X = [sent2features(s) for s in data_uk + data_ru]
Y = [[l for l,text,pos in s] for s in data_uk + data_ru]

In [37]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.5,
    c2=0.5,
    max_iterations=200,
    all_possible_transitions=True
)

## Train the CRF

In [38]:
try:
    crf.fit(X, Y)
except AttributeError:
    pass

## Evaluate on test part

In [39]:
X_test = [sent2features(s) for s in data_uk_test + data_ru_test]
Y_test = [[l for l,text,pos in s] for s in data_uk_test + data_ru_test]

In [40]:
Y_pred = crf.predict(X_test)
# confusion matrix
evaluate(Y_pred,Y_test)

{'O': {'O': 444520, 'B': 1692}, 'B': {'O': 11572, 'B': 6669}}

In [41]:
# f1 score
metrics(evaluate(Y_pred,Y_test))

0.5013908728667017

## See relevant features

In [18]:
from collections import Counter

In [19]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(10))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:])

Top positive:
8.327176 O        BOS
1.863649 O        word_:британском
1.863649 O        word.lower():британском
1.669897 O        word_:Минобороны
1.669897 O        word.lower():минобороны
1.658079 B        -1word:СССР
1.658079 B        -1:word.lower():ссср
1.647327 B        word_:СССР
1.647327 B        word.lower():ссср
1.589936 O        word_:День

Top negative:
-1.099021 B        word.lower():м.
-1.143653 B        word_:села
-1.143653 B        word.lower():села
-1.145578 O        -1word:м.
-1.178023 B        word.lower():конституции
-1.179705 B        word.lower():чемпіонаті
-1.208804 B        word_:Конституции
-1.295684 B        +1:postag:PROPN
-1.427839 O        word_:РФ
-1.427839 O        word.lower():рф


## Use both ru and ua pos tags

In [20]:
import json
with open("data_uk_train_multiple.json", "r") as f:
    data_uk = json.load(f)
with open("data_ru_train_multiple.json", "r") as f:
    data_ru = json.load(f)

In [21]:
import json
with open("data_uk_test_multiple.json", "r") as f:
    data_uk_test = json.load(f)
with open("data_ru_test_multiple.json", "r") as f:
    data_ru_test = json.load(f)

In [22]:
def word2features(sent, i):
    word = sent[i][1]
    word_ru = sent[i][3]
    postag_uk = sent[i][2]
    postag_ru = sent[i][4]

    features = {
        'bias': 1.0,
        'word_': word,
        'word.lower()': word.lower(),
        'word.istitle()': word.istitle(),
        'postag_uk': postag_uk,
        'postag_ru': postag_uk,
    }
    if i > 0:
        word1 = sent[i-1][1]
        postag1_uk = sent[i-1][2]
        postag1_ru = sent[i-1][3]
        features.update({
            '-1word': word1,
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:postag_uk': postag1_uk,
            '-1:postag_ru': postag1_ru
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i-1][1]
        postag1_uk = sent[i-1][2]
        postag1_ru = sent[i-1][3]
        features.update({
            '+1word': word1,
            '+1:word.lower()': word1.lower(),
            '+1:postag_uk': postag1_uk,
            '+1:postag_ru': postag1_ru
        })
    else:
        features['EOS'] = True
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


In [23]:
X = [sent2features(s) for s in data_uk+data_ru]
Y = [[l for l,text_u,pos_u,text_r,pos_r in s] for s in data_uk+data_ru]

In [24]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.5,
    c2=0.5,
    max_iterations=200,
    all_possible_transitions=True
)

## Train the CRF

In [25]:
try:
    crf.fit(X, Y)
except AttributeError:
    pass

## Evaluate on test part

In [26]:
X_test = [sent2features(s) for s in data_uk_test+data_ru_test]
Y_test = [[l for l,text_u,pos_u,text_r,pos_r in s] for s in data_uk_test+data_ru_test]

In [27]:
Y_pred = crf.predict(X_test)
# confusion matrix
evaluate(Y_pred,Y_test)

{'O': {'O': 575598, 'B': 10507}, 'B': {'O': 33465, 'B': 86188}}

In [30]:
# f score
metrics(evaluate(Y_pred,Y_test))

0.7967533788156119

In [28]:
from collections import Counter

In [29]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(10))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:])

Top positive:
10.872601 O        BOS
1.921468 B        -1word:Киев
1.921468 B        -1:word.lower():киев
1.921468 B        -1:postag_ru:Киев
1.903493 B        +1word:Киев
1.903493 B        +1:word.lower():киев
1.903493 B        +1:postag_ru:Киев
1.733833 O        word_:секторе
1.733833 O        word.lower():секторе
1.595281 B        word_:вул.

Top negative:
-1.232685 B        word_:сході
-1.340709 O        word_:России
-1.340709 O        word.lower():россии
-1.360747 O        word_:Украины
-1.360747 O        word.lower():украины
-1.391849 B        +1:postag_ru:при
-1.396699 O        -1:postag_ru:при
-1.401627 O        -1word:при
-1.415792 B        word_:заході
-1.491825 B        +1:word.lower():при
