# PA4: Relation classification

In [580]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from gensim.utils import tokenize
from gensim.models import Word2Vec

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, train_test_split
from sklearn.preprocessing import FunctionTransformer, LabelEncoder

### 1. Load data

In [501]:
PairExample = namedtuple('PairExample',
    'entity_1, entity_2, snippet')
Snippet = namedtuple('Snippet',
    'left, mention_1, middle, mention_2, right, direction')
def load_data(file, verbose=True):
    f = open(file,'r', encoding='utf-8')
    data = []
    labels = []
    for i,line in enumerate(f):
        instance = json.loads(line)

        instance_tuple = PairExample(instance['entity_1'],instance['entity_2'],[])
        
        for snippet in instance['snippet']:
            try:
                snippet_tuple = Snippet(snippet['left'],snippet['mention_1'],snippet['middle'],
                                   snippet['mention_2'],snippet['right'],
                                    snippet['direction'])
                instance_tuple.snippet.append(snippet_tuple)
                
                data.append(instance_tuple)
                labels.append(instance['relation'])
                instance_tuple = PairExample(instance['entity_1'],instance['entity_2'],[])
                
            except:
                print(instance)

    return data,labels
    
train_data, train_labels = load_data('data/train.json.txt')

In [502]:
# Statistics over relations
def print_stats(labels):
    labels_counts = Counter(labels)
    print('{:20s} {:>10s} {:>10s}'.format('', '', 'rel_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('relation', 'examples', '/all_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))

    for k,v in labels_counts.items():
        print('{:20s} {:10d} {:10.2f}'.format(k, v, v /len(labels)))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))
    print('{:20s} {:10d} {:10.2f}'.format('Total', len(labels), len(labels) /len(labels)))

print('Train set statistics:')
print_stats(train_labels)

Train set statistics:
                                rel_examples
relation               examples /all_examples
--------               --------    -------
author                    13113       0.31
worked_at                  3669       0.09
has_spouse                13061       0.31
capital                    9427       0.22
NO_REL                     3068       0.07
--------               --------    -------
Total                     42338       1.00


In [504]:
# get full context
"""def get_context(data, embed_mode=False):
    all_data = []
    for instance in data:
        #s_context = []
        for s in instance.snippet:
            if embed_mode:
                all_data.append(' '.join((s.left, s.mention_1.replace(" ", "_"), s.middle, s.mention_2.replace(" ", "_"), s.right)))
            else:
                all_data.append(' '.join((s.left, "entity1", s.middle, "entity2", s.right)))
                # s_context.append(' '.join((s.left, s.mention_1, s.middle, s.mention_2, s.right)))
                # s_context.append(' '.join((s.left, s.middle, s.right)))
        #all_data.append(' '.join(s_context))

    print(len(all_data))
    return all_data
"""

### 2. EXTRACT FEATURES and BUILD CLASSIFIER

In [538]:
def SelectContext(data, verbose=True):
    """BOW feature extraction"""
    only_context_data = []
    for instance in data:
        
        instance_context = []
        for s in instance.snippet:
            context = s.left + " m_1 " + s.middle + " m_2 " + s.right
            instance_context.append(context)
        only_context_data.append(' '.join(instance_context))
    if verbose:
        print(len(only_context_data))
        print(only_context_data[0])
    return only_context_data

In [539]:
test_feat = SelectContext(train_data[:200])

200
thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old m_1 while she was engaged to composer m_2 . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair


In [540]:
def ExractSimpleFeatures(data, verbose=True):
    featurized_data = []
    for instance in data:
        featurized_instance = {'mid_words': '', 'distance': np.inf}
        for s in instance.snippet:
            if len(s.middle.split()) < featurized_instance['distance']:
                featurized_instance['mid_words'] = s.middle
                featurized_instance['distance'] = len(s.middle.split())
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(featurized_data))
        print(featurized_data[0])
        print(featurized_data[1])
    return featurized_data

In [541]:
test_feat = ExractSimpleFeatures(train_data[:200])

200
{'distance': 6, 'mid_words': 'while she was engaged to composer'}
{'distance': 1, 'mid_words': 'by'}


In [555]:
def LengthOfEntities(data, verbose=True):
    featurized_data = []
    for instance in data:
        featurized_instance = {
            'entity1_len': len(instance.entity_1.split("_")),
            'entity2_len': len(instance.entity_2.split("_")),
            'combined_len': len(instance.entity_1.split("_")) + len(instance.entity_2.split("_"))
        }
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(featurized_data))
        print(featurized_data[0])
        print(featurized_data[1])
    return featurized_data

In [556]:
test_feat = LengthOfEntities(train_data[:200])

200
{'combined_len': 4, 'entity1_len': 2, 'entity2_len': 2}
{'combined_len': 7, 'entity1_len': 5, 'entity2_len': 2}


In [544]:
class SimpleFeaturizer(BaseEstimator, TransformerMixin):
    def __init__(self, featurizer):
        self.featurizers = featurizer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return ExractSimpleFeatures(X, verbose=False)

In [545]:
class EntityLengthFeaturizer(BaseEstimator, TransformerMixin):
    """Extract features from each isntance for DictVectorizer"""
    def __init__(self, featurizer):
        self.featurizers = featurizer
        
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return LengthOfEntities(X, verbose=False)

In [546]:
class BowFeaturizer(BaseEstimator, TransformerMixin):
    """BOW featurizer"""
    def __init__(self, featurizer):
        self.featurizers = featurizer
        
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return SelectContext(X, verbose=False)

In [584]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Transform labels to nimeric values
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

length_pipe = make_pipeline(EntityLengthFeaturizer(LengthOfEntities), DictVectorizer())

bow_pipe = make_pipeline(BowFeaturizer(SelectContext), CountVectorizer(ngram_range=(1,3)))
# bow_pipe = make_pipeline(BowFeaturizer(SelectContext), CountVectorizer())

simple_pipe = make_pipeline(SimpleFeaturizer(ExractSimpleFeatures), DictVectorizer())

#syntax_pipe = make_pipeline(DependencyPath(FindDepPath), DictVectorizer())

#clf = make_pipeline(FeatureUnion(transformer_list=[
    #('length_pipeline', length_pipe),
    #('bow_pipeline', bow_pipe),
    #('simple_pipeline', simple_pipe),
    #('syntax_pipeline', syntax_pipe)
    #]),
    # SelectKBest(chi2, k=100000),
    # SelectFromModel(LinearSVC()),
    #LogisticRegression())

clf = Pipeline([('vect', FeatureUnion(transformer_list=[
                ('length_pipeline', length_pipe),
                ('bow_pipeline', bow_pipe),
                ('simple_pipeline', simple_pipe),
                #('syntax_pipeline', syntax_pipe)
                ])), 
                #('reducer', SelectKBest(chi2, k=100000)),
                ('clf', LogisticRegression())
               ])

In [505]:
# test_data, test_labels = load_data('data/test.json.txt', verbose=False)
# print(dev_labels[:10])

# all_train = get_context(train_data, embed_mode=False)
# all_dev = get_context(dev_data, embed_mode=False)
# all_test = get_context(test_data, embed_mode=False)

# print(all_train[:1])
# DATA ExractSimpleFeatures
#train_simple_featurized = ExractSimpleFeatures(train_data, verbose=False)
#dev_simple_featurized = ExractSimpleFeatures(dev_data, verbose=False)
#test_simple_featurized = ExractSimpleFeatures(test_data, verbose=False)

['author', 'has_spouse', 'has_spouse', 'NO_REL', 'author', 'has_spouse', 'author', 'author', 'worked_at', 'author']
42338
9754


In [506]:
# from sklearn.pipeline import Pipeline, FeatureUnion
# MODEL

# Transform labels to nimeric values
# le = LabelEncoder()
# train_labels_featurized = le.fit_transform(train_labels)
# dev_labels_featurized = le.transform(dev_labels)

# print(le.classes_)

# Fit model one vs rest logistic regression    
# clf = make_pipeline(DictVectorizer(), LogisticRegression())

# if with CountVectorizer
# bow_vectorizer = CountVectorizer(ngram_range=(1, 3))
# TFiDF_vectorizer = TfidfVectorizer()

# clf = make_pipeline(bow_vectorizer, LogisticRegression())
# clf = LogisticRegression()

# pipeline = Pipeline([
#     ('feats', FeatureUnion([
#         ('countvec', bow_vectorizer), # can pass in either a pipeline
#         ('transformer', FeatureTransformer()) # or a transformer
#     ])),
#     ('clf', LogisticRegression())  # classifier
# ])

['NO_REL' 'author' 'capital' 'has_spouse' 'worked_at']


### 3. TRAIN CLASSIFIER AND EVALUATE (CV)

In [536]:
def print_statistics_header():
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        'relation', 'precision', 'recall', 'f-score', 'support'))
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))

def print_statistics_row(rel, result):
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format(rel, *result))

def print_statistics_footer(avg_result):
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format('macro-average', *avg_result))

def macro_average_results(results):
    avg_result = [np.average([r[i] for r in results.values()]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results.values()]))
    return avg_result

def average_results(results):
    avg_result = [np.average([r[i] for r in results]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results]))
    return avg_result
    
def evaluateCV(classifier, label_encoder, X, y, verbose=True):
    results = {}
    for rel in le.classes_:
        results[rel] = []
    if verbose:
        print_statistics_header()
        kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
        for train_index, test_index in kfold.split(X, y):
            # print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
            y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
            clf.fit(X_train, y_train)
            pred_labels = classifier.predict(X_test)
            stats = precision_recall_fscore_support(y_test, pred_labels, beta=0.5)
            # print(stats)
            for rel in label_encoder.classes_:
                rel_id = label_encoder.transform([rel])[0]
                # print(rel_id,rel)
                stats_rel = [stat[rel_id] for stat in stats]
                results[rel].append(stats_rel)
        for rel in label_encoder.classes_:
            results[rel] = average_results(results[rel])
            if verbose:
                print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [585]:
# This is without dependency info and it takes me around 10 minutes to train it.
# But I did not change anything from your model I guess...
evaluateCV(clf, le, train_data, train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.778      0.401      0.654       3068
author                    0.932      0.971      0.939      13113
capital                   0.940      0.973      0.946       9427
has_spouse                0.910      0.977      0.923      13061
worked_at                 0.905      0.816      0.886       3669
------------------    ---------  ---------  ---------  ---------
macro-average             0.893      0.828      0.870      42338


0.8696233397953861

In [422]:
# A check for the average F1 score

f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')

def evaluateCV_check(classifier, X, y, verbose=True):
    kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
    scores = cross_val_score(classifier, X, y, cv=kfold, scoring = f_scorer)
    print("\nCross-validation scores (StratifiedKFold): ", scores)
    print("Mean cv score (StratifiedKFold): ", scores.mean())

In [240]:
evaluateCV_check(clf, all_train, train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.77831464 0.77490296 0.78322361 0.77969274 0.78449379]
Mean cv score (StratifiedKFold):  0.7801255472002809


### 4. TEST PREDICTIONS and ANALYSIS

In [450]:
# Fit final model on the full train data
clf.fit(all_train, train_labels_featurized)

# Predict on test set
# dev_label_predicted = clf.predict(all_dev)
test_label_predicted = clf.predict(all_test)

In [451]:
# FOR DEV TESTING
y_true = dev_labels_featurized
labels = list(set(train_labels))

final_score = f1_score(y_true, dev_label_predicted, average='weighted')
print("Labels: ", labels)
print(y_true[:30])
print(dev_label_predicted[:30])

print(final_score)
print(classification_report(y_true, dev_label_predicted, target_names=labels))

Labels:  ['NO_REL', 'author', 'has_spouse', 'capital', 'worked_at']
[1 3 3 0 1 3 1 1 4 1 4 0 1 1 0 0 1 2 2 2 3 4 1 0 0 3 3 3 3 0]
[1 3 3 0 1 3 1 1 1 1 0 0 1 1 0 4 1 2 2 0 3 4 1 0 0 3 3 3 3 0]
0.7859857572352148
             precision    recall  f1-score   support

     NO_REL       0.66      0.73      0.70       473
     author       0.84      0.82      0.83       529
 has_spouse       0.86      0.57      0.68       111
    capital       0.86      0.89      0.87       593
  worked_at       0.73      0.65      0.69       226

avg / total       0.79      0.79      0.79      1932



In [346]:
# ON TEST DATA: TO UPLOAD
# Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(test_label_predicted_decoded[:5])
f = open("test_labels.txt", 'w', encoding="utf-8")
for label in test_label_predicted_decoded:
    f.write(label+'\n')

['capital' 'NO_REL' 'worked_at' 'NO_REL' 'has_spouse']


  if diff:


In [146]:
# Feature analisys - print N most informative
# !! Make changes in this function when you change the pipleine!!
def printNMostInformative(classifier, label_encoder, N):
    """Prints features with the highest coefficient values, per class"""
    feature_names = classifier.named_steps['dictvectorizer'].get_feature_names()

    coef = classifier.named_steps['logisticregression'].coef_    
    print(coef.shape)
    for rel in label_encoder.classes_:
        rel_id = label_encoder.transform([rel])[0]
        coef_rel = coef[rel_id]
        coefs_with_fns = sorted(zip(coef_rel, feature_names))
        top_features = coefs_with_fns[-N:]
        print("\nClass {} best: ".format(rel))
        for feat in top_features:
            print(feat)        
        
print("Top features used to predict: ")
# show the top features
printNMostInformative(clf, le, 3)

Top features used to predict: 


AttributeError: 'LogisticRegression' object has no attribute 'named_steps'