In [1]:
# load prepared data
import pandas as pd
# Read the prepared annotation table and keep only rows whose Answer is not 'not'.
r = pd.read_csv('result.csv', encoding='utf8').loc[lambda frame: frame['Answer'] != 'not']
# number of rows kept
len(r)

5053

In [2]:
from tokenize_uk import *
import os,sys
# MITIE_HOME must be set in the environment (a KeyError is raised otherwise).
# Its value is appended to the import path before the `from mitie import *` below.
mitie_path = os.environ['MITIE_HOME']
sys.path.append(mitie_path)

from mitie import *
import itertools
import re

In [3]:
# load previous models
# `ner` is read from the existing model file uk_model.dat; `trainer` is a
# binary-relation trainer for the "people.person.parents" relation built on that NER model.
ner = named_entity_extractor("uk_model.dat")
trainer = binary_relation_detector_trainer("people.person.parents", ner)

In [6]:
# tokenize a sample sentence (punctuation becomes its own token, per the recorded output)
tokens = tokenize_words('Несе Галя воду, коромисло гнеться')

['Несе', 'Галя', 'воду', ',', 'коромисло', 'гнеться']

In [7]:
# run the NER model on the sample tokens; the recorded output is a list of
# (token range, tag, score) tuples
ner.extract_entities(tokens)

[(range(1, 2), 'PERS', 0.7061146871904945)]

In [10]:
# tokenize texts and find anchors
def detect_anchor(tokens, anchor):
    """Yield the token ranges where the multi-token `anchor` occurs in `tokens`.

    Each anchor token is compared with `re.match`, which only anchors at the
    start of the text token, so a text token matches when it *starts with* the
    anchor token (inflected endings are therefore tolerated). The trailing
    optional character class in the pattern does not restrict the match.

    Parameters:
        tokens: sequence of text tokens (strings).
        anchor: sequence of anchor tokens (strings) that must match
            consecutive text tokens.

    Yields:
        range(start, end) for every non-overlapping occurrence, where `end`
        is exclusive. Nothing is yielded for an empty anchor.
    """
    # An empty anchor can never match (and would raise IndexError below).
    if not anchor:
        return

    # Compile once per anchor token instead of once per text token.
    # re.L is deliberately not used: it is rejected for str patterns
    # (ValueError on Python 3.6+) and has no effect on this pattern.
    patterns = [re.compile(re.escape(part) + u"[а|ом]?", flags=re.U) for part in anchor]

    found = False
    index = 0
    start = 0
    for i, token in enumerate(tokens):
        match = patterns[index].match(token)

        if found:
            if match is not None:
                index += 1
            else:
                # Partial occurrence broken: restart and retest this token
                # against the first anchor token.
                index = 0
                found = False
                match = patterns[0].match(token)

        if not found and match is not None:
            start = i
            index += 1
            found = True

        if index == len(anchor):
            yield range(start, i + 1)
            index = 0
            found = False
            
def extract_from_tokens_by_range(tokens, r):
    """Join the tokens from index r[0] through r[-1] (inclusive) with single spaces."""
    first, last = r[0], r[-1]
    selected = tokens[first:last + 1]
    return ' '.join(selected)

def convert_answer(answer):
    """Map an annotation answer to a binary label: 1 for 'has' or 'weak', otherwise 0."""
    is_positive = answer == 'has' or answer == 'weak'
    return 1 if is_positive else 0
        
def no_range_overlaps(r1, r2):
    """Return True when one index collection lies entirely before the other.

    Both arguments must be non-empty (min/max raise ValueError otherwise).
    """
    low1, high1 = min(r1), max(r1)
    low2, high2 = min(r2), max(r2)
    # The spans intersect exactly when each one reaches the other's start.
    return not (high1 >= low2 and high2 >= low1)
    

def select_closest(detection1, detection2, max_distance=1000):
    """Pick the non-overlapping pair of ranges with the smallest gap.

    Parameters:
        detection1: iterable of non-empty index ranges (first anchor).
        detection2: iterable of non-empty index ranges (second anchor).
        max_distance: only pairs whose gap is strictly smaller than this are
            considered. The default (1000) is the value previously hard-coded
            here; pass None to accept any distance.

    Returns:
        (d1, d2) for the closest admissible pair, or ([], []) when no pair
        qualifies. On ties the first pair encountered wins.
    """
    min_distance = float('inf') if max_distance is None else max_distance
    min_pair = ([], [])
    for d1 in detection1:
        low1, high1 = min(d1), max(d1)
        for d2 in detection2:
            low2, high2 = min(d2), max(d2)
            if high1 < low2:
                # d1 lies entirely before d2
                distance = low2 - high1
            elif high2 < low1:
                # d2 lies entirely before d1
                distance = low1 - high2
            else:
                # overlapping ranges are never paired
                continue
            if distance < min_distance:
                min_distance = distance
                min_pair = (d1, d2)

    return min_pair
    
def prepare_row(row, only_closest_anchors=True, verbose=False):
    """Tokenize one annotated row and locate its subject and object anchors.

    Parameters:
        row: mapping with 'Text', 'SubjectAnchor', 'ObjectAnchor' and 'Answer'.
        only_closest_anchors: when true, reduce the detections to the single
            closest non-overlapping (subject, object) pair via select_closest.
        verbose: when true, print the text and every detected anchor span.

    Returns:
        (tokens, subject_detection, object_detection, label) where label is
        the result of convert_answer on row['Answer'].
    """
    text = row['Text']
    if verbose:
        print(text)

    # tokenize_text returns a doubly nested structure; flatten it to one token list
    tokens = []
    for block in tokenize_text(text):
        for sentence in block:
            tokens.extend(sentence)

    detections = []
    for column in ('SubjectAnchor', 'ObjectAnchor'):
        anchor_tokens = tokenize_words(row[column])
        spans = list(detect_anchor(tokens, anchor_tokens))
        if verbose:
            for span in spans:
                print(extract_from_tokens_by_range(tokens, span))
        detections.append(spans)
    detection1, detection2 = detections

    if only_closest_anchors:
        detection1, detection2 = select_closest(detection1, detection2)

    label = convert_answer(row['Answer'])
    return (tokens, detection1, detection2, label)

# Build the working frame: one row per annotated text with its token list,
# subject range, object range and binary label.
prepared_rows = list(r.apply(prepare_row, axis=1))
df = pd.DataFrame(prepared_rows, columns=['Text','Subjects','Objects','Answer'])

# Drop rows where either anchor was not found ...
correct = df.apply(lambda row: len(row['Subjects']) > 0 and len(row['Objects']) > 0, axis=1)
df = df[correct]
# ... and rows whose subject and object spans overlap.
no_overlap = df.apply(lambda row: no_range_overlaps(row['Subjects'], row['Objects']), axis=1)
df = df[no_overlap]

# Filtering leaves gaps in the index labels. Renumber so that labels and
# positions coincide, since later cells index this frame with .iloc.
df = df.reset_index(drop=True)
df.head()



Unnamed: 0,Text,Subjects,Objects,Answer
0,"[Ярополк, жив, у, згоді, з, батьком, Володимир...","(11, 12)","(6, 7)",1
1,"[Після, смерті, Ріцімера, Східна, Римська, імп...","(34, 35)","(27, 28)",1
2,"[Був, сином, Сапа, Інки, Уайни, Капака, та, мо...",(26),"(4, 5)",0
3,"[Володимир, Ярославич, —, князь, Новгородський...","(0, 1)","(9, 10, 11)",1
4,"[Відповідно, до, напису, біля, входу, до, прим...","(30, 31)","(24, 25)",0


In [11]:
# number of rows left after the anchor filters
len(df)

5053

In [12]:
# get train-test split
from sklearn.model_selection import StratifiedKFold

X = df[['Subjects', 'Objects']]
y = df['Answer']

state = 42
# split() returns a one-shot generator. Store the folds in a list so the
# cross-validation cell can be re-run without silently iterating over nothing.
sss = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=state).split(X, y))

In [13]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# MITIE training function
def train_mitie_model(X, y, train_index):
    """Train a MITIE binary relation detector on the selected rows.

    Parameters:
        X: DataFrame with 'Text' (token list), 'Subjects' and 'Objects'
            (token ranges) columns.
        y: Series of binary labels aligned positionally with X.
        train_index: iterable of integer positions (used with .iloc).

    Returns:
        The detector produced by the trainer's train() call.
    """
    # Use a fresh trainer for every call. Adding examples to one shared
    # trainer would carry the rows of earlier calls (e.g. other CV folds,
    # including their test rows) into this model.
    fold_trainer = binary_relation_detector_trainer("people.person.parents", ner)

    for i in train_index:
        row = X.iloc[i]
        subject_position = row['Subjects']
        object_position = row['Objects']
        text = row['Text']

        if y.iloc[i] == 1:
            fold_trainer.add_positive_binary_relation(text, subject_position, object_position)
            # the relation is directional: the reversed pair is a negative example
            fold_trainer.add_negative_binary_relation(text, object_position, subject_position)
        else:
            fold_trainer.add_negative_binary_relation(text, subject_position, object_position)
    return fold_trainer.train()


def mitie_predict(model, text, subj, obj):
    """Return `model`'s output for the (subj, obj) pair in the token list `text`.

    The pair is first turned into a relation object by the global `ner`
    extractor, which is then passed to the detector.
    """
    return model(ner.extract_binary_relation(text, subj, obj))

def train_logreg_model(mitie_model, X, y, train_index):
    """Fit a one-feature logistic regression on top of the MITIE detector output.

    Parameters:
        mitie_model: trained detector, called through mitie_predict.
        X: DataFrame with 'Text', 'Subjects' and 'Objects' columns.
        y: Series of binary labels aligned positionally with X.
        train_index: integer positions of the training rows (used with .iloc).

    Returns:
        The fitted LogisticRegression model.
    """
    train_rows = X.iloc[train_index]
    scores = train_rows.apply(
        lambda row: mitie_predict(mitie_model, row['Text'], row['Subjects'], row['Objects']),
        axis=1)
    # single feature column: the MITIE output for each row
    logreg_x = np.array(scores).reshape((len(scores), 1))
    logreg_y = np.array(y.iloc[train_index])

    # The solver is named explicitly: the l1 penalty is not supported by every
    # solver, and the library's default solver has changed between releases.
    model = LogisticRegression(random_state=state, fit_intercept=False,
                               penalty='l1', solver='liblinear')
    model.fit(logreg_x, logreg_y)
    return model

In [None]:
from sklearn.metrics import recall_score, f1_score, precision_score, average_precision_score

# Per-fold metrics and raw predictions, collected for the summary below.
f1_scores = []
precision_scores = []
recall_scores = []
y_trues = []
y_preds = []
y_preds_probas = []
pr_aucs = []

# Decision threshold applied to the positive-class probability
# (used instead of logreg_model.predict).
threshold = 0.2

for train_i, test_i in sss:

    mitie_model = train_mitie_model(df, y, train_i)
    logreg_model = train_logreg_model(mitie_model, df, y, train_i)

    # MITIE output for every test row, shaped as a single feature column
    logreg_x = df.iloc[test_i].apply(lambda row: mitie_predict(mitie_model, row['Text'], row['Subjects'], row['Objects']), axis=1)
    logreg_x = np.array(logreg_x).reshape((len(logreg_x), 1))

    y_pred_proba = logreg_model.predict_proba(logreg_x)[:,1]
    # Build a concrete array: a lazy map() object is a one-shot iterator, not
    # array-like, so it cannot be handed to the metric functions below.
    y_pred = np.array([1.0 if p > threshold else 0.0 for p in y_pred_proba])
    y_true = np.array(y.iloc[test_i].astype(float))

    pr_auc = average_precision_score(y_true, y_pred_proba)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print("PR AUC: " + str(pr_auc))
    print("Recall: " + str(recall))
    print("Precision: " + str(precision))
    print("F1: " + str(f1))
    print("-"*20)

    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)
    pr_aucs.append(pr_auc)
    y_trues.append(y_true)
    y_preds.append(y_pred)
    y_preds_probas.append(y_pred_proba)

# Average over all folds (pr_aucs, not the last fold's pr_auc).
print("Mean PR AUC: " + str(np.mean(pr_aucs)))
print("Mean recall: " + str(np.mean(recall_scores)))
print("Mean precision: " + str(np.mean(precision_scores)))
print("Mean F1: " + str(np.mean(f1_scores)))

In [None]:
# plot PR curve
# from sklearn.metrics import precision_recall_curve
# import matplotlib.pyplot as plt
# %matplotlib inline
# import matplotlib
# matplotlib.rcParams['figure.figsize'] = (10,10)
# from itertools import cycle

# precision, recall, _ = precision_recall_curve(y_trues[0], y_preds[0])

# # setup plot details
# colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
# lw = 2

# # Plot Precision-Recall curve
# plt.clf()
# plt.plot(recall, precision, lw=lw, color='navy',
#          label='Precision-Recall curve')
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.ylim([0.0, 1.05])
# plt.xlim([0.0, 1.0])
# plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision_score(y_true, y_pred_proba)))
# plt.legend(loc="lower left")
# plt.show()

In [None]:
# Re-score the most recently evaluated fold (y_true / y_pred_proba still hold
# the values from the last loop iteration) with a lower decision threshold.
threshold = 0.03
# Concrete array rather than a lazy map(): the predictions are consumed by
# three metric calls below.
y_pred_t = np.array([1.0 if p > threshold else 0.0 for p in y_pred_proba])


print("PR AUC: " + str(average_precision_score(y_true, y_pred_proba)))
print("Recall: " + str(recall_score(y_true, y_pred_t)))
print("Precision: " + str(precision_score(y_true, y_pred_t)))
print("F1: " + str(f1_score(y_true, y_pred_t)))

In [None]:
# Train the final models on every row. The training helpers index with .iloc,
# so pass integer positions rather than index labels (the two differ whenever
# rows were filtered out of df).
train_i = list(range(len(df)))
mitie_model = train_mitie_model(df, y, train_i)
logreg_model = train_logreg_model(mitie_model, df, y, train_i)

In [None]:
# Sanity check on a hand-written sentence: tokenize it and print the tokens
# separated by '|' so that token positions can be read off for the next cell.
sent = u'Ві́ктор Ві́кторович Януко́вич — український політик, Народний депутат України 5-го, 6-го і 7-го скликань, син колишнього президента України Віктора Федоровича Януковича, кандидат наук з державного управління, майстер спорту в категорії автоспорт, чемпіон України з трофі-рейдів 2011 року.'
tokens = tokenize_words(sent)
print('|'.join(tokens))

In [None]:
# Hand-picked token positions of the two mentions in `tokens`.
# NOTE(review): presumably 0-2 is the first full name and 21-23 the second one;
# confirm against the printed tokenization.
subj = [0,1,2]
obj = [21,22,23]
# detector output for the (subj, obj) direction
mitie_result = mitie_predict(mitie_model, tokens, subj, obj)
mitie_result

In [None]:
# Pass the single detector output through the logistic model as a 1x1 feature
# matrix and take column 1 of predict_proba (the positive class for 0/1 labels).
logreg_model.predict_proba(np.array([mitie_result]).reshape(-1, 1))[:,1]

In [None]:
# Same sentence with the two mentions swapped (obj passed first), to see how
# the models score the reversed direction.
mitie_result = mitie_predict(mitie_model, tokens, obj, subj)
logreg_model.predict_proba(np.array([mitie_result]).reshape(-1, 1))[:,1]

In [None]:
# Second hand-written sentence, scored the same way as the first.
sent = u'У часи перебування на посаді Голови облдержадміністрації Віктор Янукович був лобістом деяких проектів у рамках регіону, зокрема Роман Гайовий з Агентства журналістських досліджень приводить такі успішні лобістські проекти губернатора Януковича: введення високого мита на коксівне вугілля, що дало можливість вирівняти'
tokens = tokenize_words(sent)
# NOTE(review): hand-picked token positions; confirm them against the
# tokenization of `sent` before relying on the result.
subj = [7]
obj = [18,19]
# note the argument order: obj is passed first, subj second
mitie_result = mitie_predict(mitie_model, tokens, obj, subj)
logreg_model.predict_proba(np.array([mitie_result]).reshape(-1, 1))[:,1]

In [None]:
# persist the trained MITIE relation detector next to the notebook
mitie_model.save_to_disk('people.person.parents.svm')

In [None]:
import pickle
# Persist the logistic-regression model with pickle. Only unpickle this file
# from a trusted source: loading a pickle can execute arbitrary code.
with open('people.person.parents.pkl', 'wb') as f:
    pickle.dump(logreg_model, f)