In [1]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import FunctionTransformer,LabelEncoder
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion


## Relation Classification
Our prediction problem in this task is: given an input pair of entities, predict which relation the pair belongs to. This is a simplified version of the general task of relation extraction. Entities are represented by Wiki IDs (that is, suffixes of Wikipedia URLs). The set of possible relations is fixed. We assume that each pair can be in only one relation (i.e., this is multi-class classification). We will build a classifier using supervised machine learning techniques (logistic regression).

### Dataset
Producing labeled data for the general task of relation extraction is expensive. An alternative approach is to use distant supervision (Mintz et al. 2009, "Distant supervision for relation extraction without labeled data"). The dataset we will use was built using this method. First, a corpus was derived from the expanded version of Wikilinks, where each example includes a pair of entities and the context around the entity mentions. (This expanded version was built on the Wikilinks dataset announced by Google in 2013. This dataset contains over 40M mentions of 3M distinct entities spanning 10M webpages. It provides entity resolutions by mapping each entity mention to a Wikipedia URL.) The entities are named in the corpus using Wiki IDs, which you can think of as the last portion of a Wikipedia URL. Thus the Wiki ID Barack_Obama designates the entity described by https://en.wikipedia.org/wiki/Barack_Obama. Next, in order to annotate each pair of entity mentions with a relation (if any), the corpus is connected with an external source of knowledge about relations, i.e. a knowledge base. Our dataset is built using a knowledge base (KB) ultimately derived from Freebase. (Unfortunately, Freebase was shut down in 2016, but the Freebase data is still available from various sources and in various forms, e.g. the Freebase Easy data dump.) Ultimately, the KB provides the labels for a given entity pair in the database, and the corpus provides the features, i.e. all the examples from the corpus where those two entities co-occur are used to generate a feature representation describing the entity pair. Such a dataset is used to train a relation classification system, which can then be used to extract new relations in order to populate the KB.

Our dataset for this task, 'train.json.txt', is in JSON format (one JSON object per line). The dataset includes examples for 4 relations: 'worked_at', 'capital', 'has_spouse' and 'author'. There is no overlap between relations in our dataset, i.e. each pair is in exactly one relation, which makes it possible to apply multi-class classification algorithms (as opposed to multi-label classification). In addition, there are instances which are not related ('NO_REL' label); however, this portion of the data is noisy. These instances are derived from pairs of entities in the corpus which are not in the KB. Since the KB is not complete, some of these supposedly negative instances may be false negatives. A pair of entities might be related in real life even if they don't appear together in the KB.

## Evaluation
We will use precision and recall reported separately for each relation (label). To combine these two metrics into one for each class, we will use a weighted F0.5-score, which gives precision twice as much weight as recall. This is motivated by our problem: if we're (ultimately) extracting new relation triples from (massively abundant) text on the web in order to augment a knowledge base, it's probably more important that the triples we extract are correct (precision) than that we extract all the triples we could (recall). We aggregate our metrics across all relations using macro-averaging, which gives equal weight to all relations, and thus gives lesser weight to problem instances in relations with more instances (the number of instances per relation is, to some degree, an accident of the data collection methodology). Your system should be optimized for the macro-averaged F0.5-score calculated on the dataset using cross-validation.

In [2]:
############################################################################################
# 1. LOAD DATA
############################################################################################

# One entity pair together with every corpus snippet in which the pair co-occurs.
PairExample = namedtuple('PairExample',
    'entity_1, entity_2, snippet')
# One textual context: the two mentions plus the text to the left, between and right of them.
Snippet = namedtuple('Snippet',
    'left, mention_1, middle, mention_2, right, direction')


def load_data(file, verbose=True):
    """Read relation examples from a file holding one JSON object per line.

    Each JSON object must provide the keys 'relation', 'entity_1', 'entity_2'
    and 'snippet' (a list of dicts with the keys 'left', 'mention_1',
    'middle', 'mention_2', 'right' and 'direction').

    Parameters
    ----------
    file : str
        Path of the file to read (decoded as UTF-8).
    verbose : bool, default True
        When True, print the first raw JSON object and its named-tuple form.

    Returns
    -------
    (data, labels) : tuple of two lists of equal length
        ``data[i]`` is a ``PairExample`` whose ``snippet`` field is a list of
        ``Snippet`` tuples; ``labels[i]`` is the value of its 'relation' key.

    Notes
    -----
    A snippet that lacks one of the expected fields is skipped and the whole
    offending instance is printed; the instance itself is still returned.
    """
    data = []
    labels = []
    # `with` closes the handle even if a line fails to parse.
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            instance = json.loads(line)
            if i == 0 and verbose:
                print('json example:')
                print(instance)
            # 'relation, entity_1, entity_2, snippet' fields for each example
            # 'left, mention_1, middle, mention_2, right, direction' for each snippet
            instance_tuple = PairExample(instance['entity_1'], instance['entity_2'], [])
            for snippet in instance['snippet']:
                try:
                    snippet_tuple = Snippet(snippet['left'], snippet['mention_1'],
                                            snippet['middle'],
                                            snippet['mention_2'], snippet['right'],
                                            snippet['direction'])
                except (KeyError, TypeError):
                    # Best effort: report the malformed instance and keep going.
                    # Only lookup failures are caught, so Ctrl-C and genuine
                    # programming errors still propagate.
                    print(instance)
                    continue
                instance_tuple.snippet.append(snippet_tuple)
            if i == 0 and verbose:
                print('\nexample transformed as a named tuple:')
                print(instance_tuple)
            data.append(instance_tuple)
            labels.append(instance['relation'])
    return data, labels
    
# Load the training pairs and their relation labels; verbose defaults to True,
# so the first example is printed in raw and named-tuple form.
train_data, train_labels = load_data('train.json.txt')

json example:
{'relation': 'has_spouse', 'entity_1': 'Judy_Garland', 'entity_2': 'David_Rose', 'snippet': [{'left': 'thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', 'mention_1': 'Judy Garland', 'middle': 'while she was engaged to composer', 'mention_2': 'David Rose', 'right': '. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', 'direction': 'fwd'}]}

example transformed as a named tuple:
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but 

In [3]:
# print(train_data[0])
# print()
# print(train_labels[0])

# print(train_data[0].entity_1)
# print(train_data[0].entity_2)
# print(train_data[0].snippet)

# print(train_data[0].snippet[0].left)
# print(train_data[0].snippet[0].mention_1)
# print(train_data[0].snippet[0].middle)
# print(train_data[0].snippet[0].mention_2)
# print(train_data[0].snippet[0].right)
# print(train_data[0].snippet[0].direction)




In [4]:
# Statistics over relations
def print_stats(labels):
    """Print a table with the count and the share of examples for each label.

    One row is printed per distinct label (in order of first appearance),
    followed by a 'Total' row.
    """
    total = len(labels)
    text_row = '{:20s} {:>10s} {:>10s}'
    count_row = '{:20s} {:10d} {:10.2f}'
    rule = text_row.format('--------', '--------', '-------')

    # Two-line header followed by a dashed rule.
    print(text_row.format('', '', 'rel_examples'))
    print(text_row.format('relation', 'examples', '/all_examples'))
    print(rule)
    for relation, count in Counter(labels).items():
        print(count_row.format(relation, count, count / total))
    print(rule)
    print(count_row.format('Total', total, total / total))

# Show how many training examples each relation label has and its share of the total.
print('Train set statistics:')
print_stats(train_labels)

Train set statistics:
                                rel_examples
relation               examples /all_examples
--------               --------    -------
has_spouse                 3019       0.31
author                     2653       0.27
NO_REL                     2300       0.24
capital                     510       0.05
worked_at                  1178       0.12
--------               --------    -------
Total                      9660       1.00


In [5]:
# check that each entity pair is assigned only one relation
# pair_dict: (entity_1, entity_2) -> list of labels seen for that pair
# rel_dict:  label -> list of examples carrying that label
pair_dict = {}
rel_dict = {}
for example, label in zip(train_data, train_labels):
    pair = (example.entity_1, example.entity_2)
    labels_for_pair = pair_dict.setdefault(pair, [])
    seen_before = bool(labels_for_pair)
    labels_for_pair.append(label)
    if seen_before:
        # A repeated pair is reported so duplicates are visible in the output.
        print(example.entity_1, example.entity_2, label)
    rel_dict.setdefault(label, []).append(example)
print("Done building dictionary\n")

# example for each relation
for rel, examples_for_rel in rel_dict.items():
    ex = examples_for_rel[0]
    print(rel, ex.entity_1, ex.entity_2)

Done building dictionary

has_spouse Judy_Garland David_Rose
author Charlie_and_the_Chocolate_Factory Roald_Dahl
NO_REL Sichuan Tibet
capital Andalusia Seville
worked_at Carl-Henric_Svanberg Ericsson


In [6]:
## inspect pair_dict
# for k, v in pair_dict.items():
#     print(k, v)

In [7]:
# how to reconstruct full context
# ex = train_data[0]
# print(ex)
# print("\n full context:")
# s = ex.snippet[0]
# print(' '.join((s.left, s.mention_1, s.middle, s.mention_2, s.right)))

In [9]:
# https://www.youtube.com/watch?v=aCdg-d_476Y
# vectorizer = CountVectorizer()
# BOW = vectorizer.fit(corpus)

In [10]:
# ### EDIT FOR BOW MODEL

# ###########################################################################################
# # 2. EXTRACT FEATURES and BUILD CLASSIFIER
# ###########################################################################################

# # Extract two simple features
# def ExractSimpleFeatures(data, verbose=True):
#     featurized_data = []
#     for instance in data:
#         featurized_instance = {'left_context': '', 'right_context': '', 'mid_words':''}
#         for s in instance.snippet:
# #             if len(s.middle.split()) < featurized_instance['distance']:
#             featurized_instance['mid_words'] = vectorizer.transform(s.middle.split())
# #                 featurized_instance['distance'] = len(s.middle.split())
#             featurized_instance['left_context'] = vectorizer.transform(s.left.split())
#             featurized_instance['right_context'] = vectorizer.transform(s.right.split())
#         featurized_data.append(featurized_instance)
#     if verbose:
#         print(len(data))
#         print(len(featurized_data))
#         print(data[0])
#         print(featurized_data[0])
#         print(featurized_data[1])
#     return featurized_data

# # Transform dataset to features
# train_data_featurized = ExractSimpleFeatures(train_data)

# # Transform labels to nimeric values
# le = LabelEncoder()
# train_labels_featurized = le.fit_transform(train_labels)

# # Fit model one vs rest logistic regression    
# clf = make_pipeline(DictVectorizer(), LogisticRegression())

In [11]:
# for k, v in train_data_featurized[0].items():
#     print(k, v.shape)

In [12]:
###########################################################################################
# 2. EXTRACT FEATURES and BUILD CLASSIFIER
###########################################################################################

# Extract two simple features
def ExractSimpleFeatures(data, verbose=True):
    """Turn each entity-pair example into a dict of simple features.

    Parameters
    ----------
    data : sequence
        Examples exposing a ``snippet`` attribute: an iterable of objects with
        ``left``, ``middle`` and ``right`` string attributes.
    verbose : bool, default True
        When True, print the input and output sizes, the first raw example and
        up to the first two featurized examples.

    Returns
    -------
    list of dict
        One dict per example with the keys:
        'mid_words'     - the ``middle`` text of the snippet with the fewest
                          whitespace-separated tokens (first one wins on ties);
        'distance'      - that token count;
        'left_context'  - ``left`` text of the LAST snippet visited;
        'right_context' - ``right`` text of the LAST snippet visited.
    """
    featurized_data = []
    for instance in data:
        # NOTE(review): an example without snippets keeps distance == np.inf;
        # confirm downstream estimators never receive such an example.
        featurized_instance = {'left_context': '', 'right_context': '', 'mid_words':'', 'distance':np.inf}
        for s in instance.snippet:
            middle_length = len(s.middle.split())
            if middle_length < featurized_instance['distance']:
                featurized_instance['mid_words'] = s.middle
                featurized_instance['distance'] = middle_length
            # NOTE(review): the contexts are overwritten on every snippet, so
            # they may come from a different snippet than 'mid_words' -
            # confirm this is intended.
            featurized_instance['left_context'] = s.left
            featurized_instance['right_context'] = s.right
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(data))
        print(len(featurized_data))
        # Guarded so inputs with fewer than two examples do not raise IndexError.
        if len(data) > 0:
            print(data[0])
        for featurized_instance in featurized_data[:2]:
            print(featurized_instance)
    return featurized_data

# Transform dataset to features
train_data_featurized = ExractSimpleFeatures(train_data)

# Transform labels to numeric values
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

# Build the classification pipeline: DictVectorizer followed by LogisticRegression.
# The pipeline is only constructed here; it is fitted in later cells.
clf = make_pipeline(DictVectorizer(), LogisticRegression())

9660
9660
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')])
{'left_context': 'thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', 'right_context': '. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', 'mid_words': 'while she was engaged to composer', 'distance': 6}
{'left_context': 'in touch . Follow Letters of Note ... RSS | Email 

In [13]:
##################################################################################################
# 3. TRAIN CLASSIFIER AND EVALUATE (CV)
##################################################################################################

def print_statistics_header():
    """Print the column titles of the results table followed by a dashed rule."""
    layout = '{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'
    titles = ('relation', 'precision', 'recall', 'f-score', 'support')
    dashes = ['-' * 18] + ['-' * 9] * 4
    print(layout.format(*titles))
    print(layout.format(*dashes))

def print_statistics_row(rel, result):
    """Print one table row: the relation name, then precision, recall, f-score and support.

    ``result`` supplies the four values in that order; the first three are
    formatted as floats with three decimals, the last as an integer.
    """
    columns = ['{:20s}', '{:10.3f}', '{:10.3f}', '{:10.3f}', '{:10d}']
    line = ' '.join(columns).format(rel, *result)
    print(line)

def print_statistics_footer(avg_result):
    """Print a dashed rule followed by the 'macro-average' summary row.

    ``avg_result`` supplies precision, recall, f-score and total support, in
    that order.
    """
    dashes = ['-' * 18] + ['-' * 9] * 4
    rule_layout = '{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'
    summary_layout = '{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'
    print(rule_layout.format(*dashes))
    print(summary_layout.format('macro-average', *avg_result))

def macro_average_results(results):
    """Combine per-relation statistics into one macro-averaged row.

    ``results`` maps each relation to a sequence whose first three entries are
    averaged across relations and whose fourth entry (support) is summed.
    Returns a four-element list.
    """
    per_relation = list(results.values())
    combined = []
    for column in range(3):
        combined.append(np.average([stats[column] for stats in per_relation]))
    combined.append(np.sum([stats[3] for stats in per_relation]))
    return combined

def average_results(results):
    """Combine a sequence of statistic rows into a single row.

    The first three entries of the rows are averaged and the fourth entry
    (support) is summed. Returns a four-element list.
    """
    combined = []
    for column in range(3):
        combined.append(np.average([row[column] for row in results]))
    combined.append(np.sum([row[3] for row in results]))
    return combined
    
def evaluateCV(classifier, label_encoder, X, y, verbose=True):
    """Evaluate a classifier with 5-fold stratified cross-validation.

    For every fold the classifier is fitted on the training part and
    precision, recall, F0.5 and support are computed per relation on the
    held-out part. Per relation, the first three statistics are averaged over
    folds and the support is summed over folds; the relations are then
    macro-averaged.

    Parameters
    ----------
    classifier : estimator with ``fit`` and ``predict``
        Fitted in place on every fold.
    label_encoder : fitted LabelEncoder
        Supplies the relation names (``classes_``) and their encoded ids.
    X : sequence
        Featurized examples, indexable by integer position.
    y : sequence
        Encoded labels aligned with ``X``.
    verbose : bool, default True
        When True, print the per-relation table and the macro-average row.
        The evaluation itself runs regardless of this flag.

    Returns
    -------
    float
        The macro-averaged F0.5 score.
    """
    relations = label_encoder.classes_
    # Passing the encoded ids explicitly keeps the per-class statistics in the
    # order of `relations`, even if a class were missing from a fold.
    relation_ids = label_encoder.transform(relations)
    results = {rel: [] for rel in relations}

    if verbose:
        print_statistics_header()

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
        # Use the estimator passed in, not a module-level global.
        classifier.fit(X_train, y_train)
        pred_labels = classifier.predict(X_test)
        stats = precision_recall_fscore_support(
            y_test, pred_labels, beta=0.5, labels=relation_ids)
        for position, rel in enumerate(relations):
            results[rel].append([stat[position] for stat in stats])

    for rel in relations:
        results[rel] = average_results(results[rel])
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [14]:
# Cross-validate the pipeline on the training data; the value shown below the
# table is the function's return value (macro-averaged F0.5 score).
evaluateCV(clf,le,train_data_featurized,train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.430      0.716      0.467       2300
author                    0.668      0.687      0.672       2653
capital                   0.630      0.067      0.212        510
has_spouse                0.886      0.782      0.863       3019
worked_at                 0.757      0.238      0.523       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.674      0.498      0.547       9660


0.5473472778168718

In [15]:
# A check for the macro-averaged F0.5 score (beta=0.5), using a scikit-learn scorer

f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')

def evaluateCV_check(classifier, X, y, verbose=True):
    """Cross-check the macro-averaged F0.5 score with ``cross_val_score``.

    Runs 5-fold stratified cross-validation (shuffled, ``random_state=0``)
    scored with the module-level ``f_scorer``.

    Parameters
    ----------
    classifier : estimator
        Passed to ``cross_val_score``.
    X, y : sequences
        Featurized examples and their encoded labels.
    verbose : bool, default True
        When True, print the per-fold scores and their mean.

    Returns
    -------
    float
        The mean of the per-fold scores.
    """
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_val_score(classifier, X, y, cv=kfold, scoring=f_scorer)
    mean_score = scores.mean()
    if verbose:
        print("\nCross-validation scores (StratifiedKFold): ", scores)
        print("Mean cv score (StratifiedKFold): ", mean_score)
    return mean_score

In [16]:
# Same folds (5 splits, shuffle, random_state=0) as evaluateCV, scored through cross_val_score.
evaluateCV_check(clf,train_data_featurized,train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.55181076 0.57501018 0.55481108 0.49823873 0.55686564]
Mean cv score (StratifiedKFold):  0.5473472778168718


In [17]:
#########################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
#########################################################################################

# Fit final model on the full train data
clf.fit(train_data_featurized, train_labels_featurized)

# Predict on test set
test_data, test_labels = load_data('test-covered.json.txt', verbose=False)
test_data_featurized = ExractSimpleFeatures(test_data, verbose=False)
test_label_predicted = clf.predict(test_data_featurized)
# Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(test_label_predicted_decoded[:10])
# Write one predicted label per line. The `with` block closes the file, so
# every prediction is flushed to disk before the cell finishes.
with open("test_labels.txt", 'w', encoding="utf-8") as f:
    for label in test_label_predicted_decoded:
        f.write(label+'\n')

['NO_REL' 'NO_REL' 'author' 'author' 'has_spouse' 'author' 'author'
 'has_spouse' 'author' 'NO_REL']


  if diff:


In [18]:
# Feature analisys - print N most informative
# !! Make changes in this function when you change the pipleine!!
def printNMostInformative(classifier,label_encoder,N):
    """Print, for every class, the N features with the highest coefficients.

    Parameters
    ----------
    classifier : fitted pipeline
        Must expose ``named_steps`` with a 'dictvectorizer' step (providing
        the feature names) and a 'logisticregression' step (providing
        ``coef_``, indexed by encoded class id).
    label_encoder : fitted LabelEncoder
        Supplies the class names (``classes_``) and their encoded ids.
    N : int
        Number of features to print per class; nothing is listed when N <= 0.

    Prints the shape of the coefficient matrix, then for each class its N
    largest ``(coefficient, feature_name)`` pairs in ascending order.
    """
    vectorizer = classifier.named_steps['dictvectorizer']
    # Newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); support both so the cell keeps running.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
    else:
        feature_names = vectorizer.get_feature_names()

    coef = classifier.named_steps['logisticregression'].coef_
    print(coef.shape)
    for rel in label_encoder.classes_:
        rel_id = label_encoder.transform([rel])[0]
        # tolist() yields plain Python floats, so the printed tuples look the
        # same regardless of the numpy version's scalar repr.
        coefs_with_fns = sorted(zip(coef[rel_id].tolist(), feature_names))
        # A slice of [-0:] would return every feature, so handle N <= 0 explicitly.
        top_features = coefs_with_fns[-N:] if N > 0 else []
        print("\nClass {} best: ".format(rel))
        for feat in top_features:
            print(feat)
        
print("Top features used to predict: ")
# show the top 3 features per class for the pipeline fitted on the full training data
printNMostInformative(clf,le,3)

Top features used to predict: 
(5, 21612)

Class NO_REL best: 
(1.4250601622301795, 'mid_words=or')
(1.5156828233132422, 'mid_words=and the')
(1.9545665885919097, 'mid_words=, and')

Class author best: 
(3.1310845722002716, "mid_words='s novel")
(3.2968166300549115, 'mid_words=, by')
(4.9548165954293335, 'mid_words=by')

Class capital best: 
(1.9743402861695774, 'mid_words=, in')
(2.8272103467885317, 'mid_words=in')
(2.9943715995150084, 'mid_words=after')

Class has_spouse best: 
(3.284436384356054, 'mid_words=married')
(3.339915685104778, 'mid_words=&')
(4.044216459098745, 'mid_words=and his wife')

Class worked_at best: 
(2.4611260642169825, 'mid_words=professor')
(2.6143582378202037, 'mid_words=CEO')
(2.7715834402382065, 'mid_words=of the')
