### this script sets a baseline for relation extraction using frequency-based BOW model

#### add additional features

In [52]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import FunctionTransformer,LabelEncoder
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer

##### additional imports
import networkx as nx
import spacy
nlp = spacy.load('en')

In [53]:
##################################################################################################
# 1. LOAD DATA
##################################################################################################

PairExample = namedtuple('PairExample',
    'entity_1, entity_2, snippet')
Snippet = namedtuple('Snippet',
    'left, mention_1, middle, mention_2, right, direction')
def load_data(file, verbose=True):
    f = open(file,'r', encoding='utf-8')
    data = []
    labels = []
    for i,line in enumerate(f):
        instance = json.loads(line)

        instance_tuple = PairExample(instance['entity_1'],instance['entity_2'],[])
        
        for snippet in instance['snippet']:
            try:
                snippet_tuple = Snippet(snippet['left'],snippet['mention_1'],snippet['middle'],
                                   snippet['mention_2'],snippet['right'],
                                    snippet['direction'])
                instance_tuple.snippet.append(snippet_tuple)
                
                data.append(instance_tuple)
                labels.append(instance['relation'])
                instance_tuple = PairExample(instance['entity_1'],instance['entity_2'],[])
                
            except:
                print(instance)


    return data,labels
    
train_data, train_labels = load_data('../data/train.json.txt')

In [54]:
# for i in train_data[:10]:
#     print(i)
#     print()

# print(len(train_labels))
# print(len(train_data))


In [55]:
# Statistics over relations
def print_stats(labels):
    labels_counts = Counter(labels)
    print('{:20s} {:>10s} {:>10s}'.format('', '', 'rel_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('relation', 'examples', '/all_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))
    for k,v in labels_counts.items():
        print('{:20s} {:10d} {:10.2f}'.format(k, v, v /len(labels)))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))
    print('{:20s} {:10d} {:10.2f}'.format('Total', len(labels), len(labels) /len(labels)))

print('Train set statistics:')
print_stats(train_labels)

Train set statistics:
                                rel_examples
relation               examples /all_examples
--------               --------    -------
has_spouse                13061       0.31
author                    13113       0.31
NO_REL                     3068       0.07
capital                    9427       0.22
worked_at                  3669       0.09
--------               --------    -------
Total                     42338       1.00


In [56]:
# check that each entity pair is assigned only one relation
pair_dict={}
rel_dict={}
for example, label in zip(train_data,train_labels):
    if (example.entity_1,example.entity_2) not in pair_dict.keys():
        pair_dict[(example.entity_1,example.entity_2)] = [label]
        
    else:
        pair_dict[(example.entity_1,example.entity_2)].append(label)
#         print(example.entity_1,example.entity_2,label)
    if label not in rel_dict.keys():
        rel_dict[label] = [example]
    else:
        rel_dict[label].append(example)
print("Done building dictionary")  
    
# example for each relation
for rel in rel_dict.keys():
    ex = rel_dict[rel][0]
    print(rel,ex.entity_1,ex.entity_2)

Done building dictionary
has_spouse Judy_Garland David_Rose
author Charlie_and_the_Chocolate_Factory Roald_Dahl
NO_REL Sichuan Tibet
capital Andalusia Seville
worked_at Carl-Henric_Svanberg Ericsson


In [87]:
def SelectContext(data, verbose=True):
    """BOW feature extraction"""
    only_context_data = []
    for instance in data:
        
        instance_context = []
        for s in instance.snippet:
            context = s.left + " m_1 " + s.middle + " m_2 " + s.right
            instance_context.append(context)
        only_context_data.append(' '.join(instance_context))
    if verbose:
        print(len(only_context_data))
        print(only_context_data[0])
        print(only_context_data[0])
    return only_context_data

In [88]:
test_feat = SelectContext(train_data[:200])

200
thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old m_1 while she was engaged to composer m_2 . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair
thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old m_1 while she was engaged to composer m_2 . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair


In [95]:
def ExractSimpleFeatures(data, verbose=True):
    featurized_data = []
    for instance in data:
        featurized_instance = {'mid_words': '', 'distance': np.inf}
        for s in instance.snippet:
            if len(s.middle.split()) < featurized_instance['distance']:
                featurized_instance['mid_words'] = s.middle
                featurized_instance['distance'] = len(s.middle.split())
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(featurized_data))
        print(featurized_data[0])
        print(featurized_data[1])
    return featurized_data

In [96]:
test_feat = ExractSimpleFeatures(train_data[:200])

200
{'mid_words': 'while she was engaged to composer', 'distance': 6}
{'mid_words': 'by', 'distance': 1}


In [91]:
def LengthOfEntities(data, verbose=True):
    featurized_data = []
    for instance in data:
        featurized_instance = {
            'entity1_len': len(instance.entity_1.split("_")),
            'entity2_len': len(instance.entity_2.split("_")),
            'combined_len': len(instance.entity_1.split("_")) + len(instance.entity_2.split("_"))
        }
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(featurized_data))
        print(featurized_data[0])
        print(featurized_data[1])
    return featurized_data 

In [92]:
test_feat = LengthOfEntities(train_data[:200])

200
{'entity1_len': 2, 'entity2_len': 2, 'combined_len': 4}
{'entity1_len': 5, 'entity2_len': 2, 'combined_len': 7}


In [125]:
def UseNLP(data, verbose=True, status=True):
    featurized_data = []
    c = 0
    for instance in data:
        
        featurized_instance = {'tagged_context_1': '', 'tagged_context_2': '', 'path_length': 0}
        
        for s in instance.snippet:
                        
            context = s.left + " m_1 " + s.middle + " m_2 " + s.right
            
            document = nlp(context) # spacy pipeline
            
            tagged_context_1 = []
            tagged_context_2 = []
            
            for i, w in enumerate(document):
                if w.orth_ == "m_1":
                    window_1 = document[i-3:i+4]
                    for e in window_1:
                        if e.orth_ == "m_1" or e.orth_ == "m_2":
                            tagged_context_1.append("MENTION")
                        else:
                            tagged_context_1.append(e.pos_)
                
                if w.orth_ == "m_2":
                    window_2 = document[i-3:i+4]
                    if window_2:
                        for e in window_2:
                            if e.orth_ == "m_1" or e.orth_ == "m_2":
                                tagged_context_2.append("MENTION")
                            else:
                                tagged_context_2.append(e.pos_)
            
            featurized_instance['tagged_context_1'] = ' '.join(tagged_context_1)
            featurized_instance['tagged_context_2'] = ' '.join(tagged_context_2)
            
            edges = []
            for w in document: # FYI https://spacy.io/docs/api/token
                for child in w.children:
                    edges.append(('{0}-{1}'.format(w.lower_, w.i),
                                  '{0}-{1}'.format(child.lower_, child.i)))

            graph = nx.Graph(edges)
#             print(graph)
            for w in graph:
#                 print(w)
                if "m_1" in w:
                    s = w
                if "m_2" in w:
                    t = w
            
            try:
                featurized_instance['path_length'] = nx.shortest_path_length(graph, source=s, target=t)
            except nx.NetworkXNoPath: # unrelated?
                featurized_instance['path_length'] = 0
            except nx.NodeNotFound: # problem with mention
                featurized_instance['path_length'] = 0.5

        featurized_data.append(featurized_instance)
        c = 0
        if status:
            if c % 5000 == 0:
                print("{} instances processed.".format(c))
                
    if verbose:
        print(len(featurized_data))
        print(featurized_data[0])
        print(featurized_data[1])
            
    return featurized_data

In [126]:
test_feat = UseNLP(train_data[:1000])

1000
{'tagged_context_1': 'NOUN PUNCT ADJ MENTION ADP PRON VERB', 'tagged_context_2': 'VERB ADP NOUN MENTION PUNCT PROPN VERB', 'path_length': 5}
{'tagged_context_1': 'ADP DET NUM MENTION ADP MENTION PUNCT', 'tagged_context_2': 'NUM MENTION ADP MENTION PUNCT VERB ADP', 'path_length': 4}


In [128]:
class SimpleFeaturizer(BaseEstimator, TransformerMixin):
    def __init__(self, featurizer):
        self.featurizers = featurizer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return ExractSimpleFeatures(X, verbose=False)

In [129]:
class EntityLengthFeaturizer(BaseEstimator, TransformerMixin):
    """Extract features from each isntance for DictVectorizer"""
    def __init__(self, featurizer):
        self.featurizers = featurizer
        
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return LengthOfEntities(X, verbose=False)

In [130]:
class BowFeaturizer(BaseEstimator, TransformerMixin):
    """BOW featurizer"""
    def __init__(self, featurizer):
        self.featurizers = featurizer
        
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return SelectContext(X, verbose=False)

In [131]:
class DependencyPath(BaseEstimator, TransformerMixin):
    """BOW featurizer"""
    def __init__(self, featurizer):
        self.featurizers = featurizer
        
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return UseNLP(X, verbose=False, status=True)

In [133]:
# Transform labels to numeric values
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

length_pipe = make_pipeline(EntityLengthFeaturizer(LengthOfEntities), DictVectorizer())

bow_pipe = make_pipeline(BowFeaturizer(SelectContext), CountVectorizer(ngram_range=(1,3)))

simple_pipe = make_pipeline(SimpleFeaturizer(ExractSimpleFeatures), DictVectorizer())

syntax_pipe = make_pipeline(DependencyPath(FindDepPath), DictVectorizer())

clf = make_pipeline(FeatureUnion(transformer_list=[
    ('length_pipeline', length_pipe),
    ('bow_pipeline', bow_pipe),
    ('simple_pipeline', simple_pipe),
    ('syntax_pipeline', syntax_pipe)]),
    LogisticRegression())

In [134]:
##################################################################################################
# 3. TRAIN CLASSIFIER AND EVALUATE (CV)
##################################################################################################

def print_statistics_header():
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        'relation', 'precision', 'recall', 'f-score', 'support'))
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))

def print_statistics_row(rel, result):
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format(rel, *result))

def print_statistics_footer(avg_result):
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format('macro-average', *avg_result))

def macro_average_results(results):
    avg_result = [np.average([r[i] for r in results.values()]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results.values()]))
    return avg_result

def average_results(results):
    avg_result = [np.average([r[i] for r in results]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results]))
    return avg_result
    
def evaluateCV(classifier, label_encoder, X, y, verbose=True):
    """
    classifier: clf - pipeline with CountVevtorizer and Logistic regression
    label_encoder: le - label encoder
    X: train data featurized
    y: train labels featurized
    """
    results = {}
    for rel in le.classes_:
#         print(rel)
        results[rel] = []
    if verbose:
        print_statistics_header()
        kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
        for train_index, test_index in kfold.split(X, y):
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
            y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
            clf.fit(X_train, y_train)
            pred_labels = classifier.predict(X_test)
            stats = precision_recall_fscore_support(y_test, pred_labels, beta=0.5)
            #print(stats)
            for rel in label_encoder.classes_:
                rel_id = label_encoder.transform([rel])[0]
#             print(rel_id,rel)
                stats_rel = [stat[rel_id] for stat in stats]
                results[rel].append(stats_rel)
        for rel in label_encoder.classes_:
            results[rel] = average_results(results[rel])
            if verbose:
                print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [135]:
evaluateCV(clf, le, train_data, train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
33868
{'tagged_context_1': 'NOUN PUNCT ADJ MENTION ADP PRON VERB', 'tagged_context_2': 'VERB ADP NOUN MENTION PUNCT PROPN VERB', 'path_length': 5}
{'tagged_context_1': 'ADP DET NUM MENTION ADP MENTION PUNCT', 'tagged_context_2': 'NUM MENTION ADP MENTION PUNCT VERB ADP', 'path_length': 4}
8470
{'tagged_context_1': 'ADJ ADP DET MENTION NOUN ADP NOUN', 'tagged_context_2': 'ADP NOUN VERB MENTION CCONJ ADJ NOUN', 'path_length': 5}
{'tagged_context_1': 'PUNCT VERB ADP MENTION PUNCT VERB ADP', 'tagged_context_2': 'ADP ADJ NOUN MENTION PUNCT PRON VERB', 'path_length': 4}
33869
{'tagged_context_1': 'NOUN PUNCT ADJ MENTION ADP PRON VERB', 'tagged_context_2': 'VERB ADP NOUN MENTION PUNCT PROPN VERB', 'path_length': 5}
{'tagged_context_1': 'ADP DET NUM MENTION ADP MENTION PUNCT', 'tagged_context_2': 'NUM MENTION ADP MENTION PUNCT VERB ADP', 'path_length': 4}
8469
{'tagg

0.8721445323326972

In [37]:
# A check for the average F1 score

f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')

def evaluateCV_check(classifier, X, y, verbose=True):
    kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
    scores = cross_val_score(classifier, X, y, cv=kfold, scoring = f_scorer)
    print("\nCross-validation scores (StratifiedKFold): ", scores)
    print("Mean cv score (StratifiedKFold): ", scores.mean())

In [38]:
evaluateCV_check(clf, train_data, train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.78319178 0.77791582 0.78375181 0.78223485 0.78230726]
Mean cv score (StratifiedKFold):  0.7818803012927733


In [39]:
##################################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
##################################################################################################

# Fit final model on the full train data
clf.fit(train_data, train_labels_featurized)

# Predict on test set
test_data, test_labels = load_data('../data/test-covered.json.txt', verbose=False)
print(len(test_labels))
# test_data_featurized = SelectContext(test_data, verbose=False)
test_label_predicted = clf.predict(test_data)
print(len(test_label_predicted))
# Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(len(test_label_predicted_decoded))
print(test_label_predicted_decoded[:2])
f = open("outputs/test_labels.txt", 'w', encoding="utf-8")
for label in test_label_predicted_decoded:
    f.write(label+'\n')

1840
1840
1840
['capital' 'NO_REL']


  if diff:


In [None]:
# Feature analisys - print N most informative
# !! Make changes in this function when you change the pipleine!!
def printNMostInformative(classifier,label_encoder,N):
    """Prints features with the highest coefficient values, per class"""
    feature_names = classifier.named_steps['countvectorizer'].get_feature_names()

    coef = classifier.named_steps['logisticregression'].coef_    
    print(coef.shape)
    for rel in label_encoder.classes_:
        rel_id = label_encoder.transform([rel])[0]
        coef_rel = coef[rel_id]
        coefs_with_fns = sorted(zip(coef_rel, feature_names))
        top_features = coefs_with_fns[-N:]
        print("\nClass {} best: ".format(rel))
        for feat in top_features:
            print(feat)        
        
print("Top features used to predict: ")
# show the top features
printNMostInformative(clf,le,2)