# PA4: Relation classification

In [293]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from gensim.utils import tokenize
from gensim.models import Word2Vec

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import FunctionTransformer,LabelEncoder

### 1. Load data

In [246]:
PairExample = namedtuple('PairExample',
                         'entity_1, entity_2, snippet')
Snippet = namedtuple('Snippet',
                     'left, mention_1, middle, mention_2, right, direction')

def load_data(file, verbose=True):
    f = open(file,'r', encoding='utf-8')
    data = []
    labels = []
    for i, line in enumerate(f):
        instance = json.loads(line)
        if i == 0:
            if verbose:
                print('json example:')
                print(instance)
        # 'relation, entity_1, entity_2, snippet' fileds for each example
        # 'left, mention_1, middle, mention_2, right, direction' for each snippet
        instance_tuple = PairExample(instance['entity_1'], instance['entity_2'], [])
        for snippet in instance['snippet']:
            try:
                snippet_tuple = Snippet(snippet['left'], snippet['mention_1'],
                                        snippet['middle'], 
                                        snippet['mention_2'], snippet['right'],
                                        snippet['direction'])
                instance_tuple.snippet.append(snippet_tuple)
            except:
                print(instance)
        if i == 0:
            if verbose:
                print('\nexample transformed as a named tuple:')
                print(instance_tuple)
        data.append(instance_tuple)
        labels.append(instance['relation'])

    return data, labels
    
train_data, train_labels = load_data('data/train.json.txt')

json example:
{'entity_1': 'Judy_Garland', 'relation': 'has_spouse', 'entity_2': 'David_Rose', 'snippet': [{'right': '. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', 'direction': 'fwd', 'mention_1': 'Judy Garland', 'left': 'thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', 'mention_2': 'David Rose', 'middle': 'while she was engaged to composer'}]}

example transformed as a named tuple:
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but 

In [247]:
# Statistics over relations
def print_stats(labels):
    labels_counts = Counter(labels)
    print('{:20s} {:>10s} {:>10s}'.format('', '', 'rel_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('relation', 'examples', '/all_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))

    for k,v in labels_counts.items():
        print('{:20s} {:10d} {:10.2f}'.format(k, v, v /len(labels)))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))
    print('{:20s} {:10d} {:10.2f}'.format('Total', len(labels), len(labels) /len(labels)))

print('Train set statistics:')
print_stats(train_labels)

Train set statistics:
                                rel_examples
relation               examples /all_examples
--------               --------    -------
author                     2653       0.27
worked_at                  1178       0.12
has_spouse                 3019       0.31
capital                     510       0.05
NO_REL                     2300       0.24
--------               --------    -------
Total                      9660       1.00


In [248]:
# check that each entity pair is assigned only one relation
pair_dict = {}
rel_dict = {}

for example, label in zip(train_data, train_labels):
    if (example.entity_1, example.entity_2) not in pair_dict.keys():
        pair_dict[(example.entity_1, example.entity_2)] = [label]
    else:
        pair_dict[(example.entity_1, example.entity_2)].append(label)
        print(example.entity_1, example.entity_2, label)

    if label not in rel_dict.keys():
        rel_dict[label] = [example]
    else:
        rel_dict[label].append(example)

print("Done building dictionary: \n")  
    
# example for each relation
for rel in rel_dict.keys():
    ex = rel_dict[rel][0]
    print(rel, ex.entity_1, ex.entity_2)

Done building dictionary: 

author Charlie_and_the_Chocolate_Factory Roald_Dahl
worked_at Carl-Henric_Svanberg Ericsson
has_spouse Judy_Garland David_Rose
capital Andalusia Seville
NO_REL Sichuan Tibet


In [249]:
# print full context
ex = train_data[0]
print(ex)
print("\n full context:")
ex_s = ex.snippet[0]
ex_context = ' '.join((ex_s.left, ex_s.mention_1.replace(" ", "_"), ex_s.middle, ex_s.mention_2.replace(" ", "_"), ex_s.right))
print(ex_context)

PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')])

 full context:
thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old Judy Garland while she was engaged to composer David Rose . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair


In [285]:
# get full context
def get_context(data, embed_mode=False):
    all_data = []
    for instance in data:
        s_context = []
        for s in instance.snippet:
            if embed_mode:
                s_context.append(' '.join((s.left, s.mention_1.replace(" ", "_"), s.middle, s.mention_2.replace(" ", "_"), s.right)))
            else:
                s_context.append(' '.join((s.left, s.mention_1, s.middle, s.mention_2, s.right)))
            #s_context.append(' '.join((s.left, s.middle, s.right)))
        all_data.append(' '.join(s_context))

    print(len(all_data))
    return all_data

### 2. EXTRACT FEATURES and BUILD CLASSIFIER

In [275]:
# Extract two simple features
def ExractSimpleFeatures(data, verbose=True):
    featurized_data = []
    for instance in data:
        featurized_instance = {
            'mid_words':'',
            'distance':np.inf,
            'left':[],
            'right':[],
            'mid':[]
        }
        for s in instance.snippet:
            if len(s.middle.split()) < featurized_instance['distance']:
                featurized_instance['mid_words'] = s.middle
                featurized_instance['distance'] = len(s.middle.split())
            featurized_instance['left'] = s.left
            featurized_instance['right'] = s.right
            featurized_instance['mid'] = s.middle
            # context = [s.left + s.right + s.middle]
            # vec_context = vectorizer.transform(context)
            # featurized_instance['left'] = vectorizer.transform([s.left])
            # featurized_instance['right'] = vectorizer.transform([s.right])
            # featurized_instance['mid'] = vectorizer.transform([s.middle])
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(data))
        print(len(featurized_data))
        print(data[0])
        print(featurized_data[0])

    return featurized_data

In [299]:
def train_word2vec(data):
    ''' Input: list of contexts (each context is a string).
        Prepares data for training embeddings: tokenize with simple_preprocessing.
        Returns: embedding model
    '''
    data_tokenised = [doc.lower().split(" ") for doc in data]
    # path_model = Path("models") / "Word2Vec.model"

    # if path_model.exists():
    #     model = Word2Vec.load(str(path_model))
    # else:
    #     if not path_model.parent.exists():
    #         path_model.parent.mkdir(parents=True)

    model = Word2Vec(data_tokenised, size=100, min_count=1, sg=1)
        # model.save(str(path_model))

    return model

In [324]:
def extract_embeddings_feature(data, embed_model, verbose=True):
    DIMEN_SIZE = 100
    vectorized_data = np.zeros(shape=(len(data), DIMEN_SIZE))
    for i, instance in enumerate(data):
        instance_vectors = []
        for s in instance.snippet:
            word1 = s.mention_1.lower().replace(" ", "_")
            word2 = s.mention_2.lower().replace(" ", "_")
            word1_vector = embed_model.wv[word1]
            word2_vector = embed_model.wv[word2]
            instance_vectors.append(word1_vector)
            instance_vectors.append(word2_vector)
            
        mean_vector = np.mean(np.array([vec for vec in instance_vectors]), axis=0)
        vectorized_data[i] = mean_vector
        
    if verbose:
        print(len(data))
        print(len(vectorized_data))

    return vectorized_data

In [305]:
test_data, test_labels = load_data('data/test.json.txt', verbose=False)

all_train = get_context(train_data, embed_mode=True)
all_test = get_context(test_data, embed_mode=True)

# DATA ExractSimpleFeatures
train_simple_featurized = ExractSimpleFeatures(train_data, verbose=False)
test_simple_featurized = ExractSimpleFeatures(test_data, verbose=False)

9660
1840


In [306]:
# train embedding model
emb_model = train_word2vec(all_train)

In [327]:
# EMBEDDINGS
train_embed_vectorized = extract_embeddings_feature(train_data, emb_model)
# test_embed_vectorized = extract_embeddings_feature(test_data, emb_model)

9660
9660


In [328]:
from scipy.sparse import csr_matrix
# MODEL

# Transform labels to nimeric values
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

# Fit model one vs rest logistic regression    
#clf = make_pipeline(DictVectorizer(), LogisticRegression())

# if with CountVectorizer
bow_vectorizer = CountVectorizer(ngram_range=(2, 3))
TFiDF_vectorizer = TfidfVectorizer()

# clf = make_pipeline(bow_vectorizer, LogisticRegression())
clf = LogisticRegression()

### 3. TRAIN CLASSIFIER AND EVALUATE (CV)

In [329]:
def print_statistics_header():
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        'relation', 'precision', 'recall', 'f-score', 'support'))
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))

def print_statistics_row(rel, result):
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format(rel, *result))

def print_statistics_footer(avg_result):
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format('macro-average', *avg_result))

def macro_average_results(results):
    avg_result = [np.average([r[i] for r in results.values()]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results.values()]))
    return avg_result

def average_results(results):
    avg_result = [np.average([r[i] for r in results]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results]))
    return avg_result
    
def evaluateCV(classifier, label_encoder, X, y, verbose=True):
    results = {}
    for rel in le.classes_:
        results[rel] = []
    if verbose:
        print_statistics_header()
        kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
        for train_index, test_index in kfold.split(X, y):
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
            y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
            clf.fit(X_train, y_train)
            pred_labels = classifier.predict(X_test)
            stats = precision_recall_fscore_support(y_test, pred_labels, beta=0.5)
            #print(stats)
            for rel in label_encoder.classes_:
                rel_id = label_encoder.transform([rel])[0]
            #print(rel_id,rel)
                stats_rel = [stat[rel_id] for stat in stats]
                results[rel].append(stats_rel)
        for rel in label_encoder.classes_:
            results[rel] = average_results(results[rel])
            if verbose:
                print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [330]:
evaluateCV(clf, le, train_embed_vectorized, train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.536      0.629      0.552       2300
author                    0.832      0.822      0.830       2653
capital                   0.705      0.498      0.650        510
has_spouse                0.782      0.851      0.795       3019
worked_at                 0.773      0.450      0.676       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.726      0.650      0.700       9660


0.7004649820878264

In [214]:
# BASELINE
evaluateCV(clf, le, train_embed_vectorized, train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.669      0.721      0.679       2300
author                    0.828      0.836      0.829       2653
capital                   0.888      0.653      0.828        510
has_spouse                0.874      0.902      0.879       3019
worked_at                 0.755      0.643      0.730       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.803      0.751      0.789       9660


0.7890200518748698

In [239]:
# A check for the average F1 score

f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')

def evaluateCV_check(classifier, X, y, verbose=True):
    kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
    scores = cross_val_score(classifier, X, y, cv=kfold, scoring = f_scorer)
    print("\nCross-validation scores (StratifiedKFold): ", scores)
    print("Mean cv score (StratifiedKFold): ", scores.mean())

In [240]:
evaluateCV_check(clf, train_embed_vectorized, train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.77831464 0.77490296 0.78322361 0.77969274 0.78449379]
Mean cv score (StratifiedKFold):  0.7801255472002809


### 4. TEST PREDICTIONS and ANALYSIS

In [163]:
# Fit final model on the full train data
clf.fit(train_embed_vectorized, train_labels_featurized)

# Predict on test set
test_label_predicted = clf.predict(test_embed_vectorized)

# Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(test_label_predicted_decoded[:5])
f = open("test_labels.txt", 'w', encoding="utf-8")
for label in test_label_predicted_decoded:
    f.write(label+'\n')

['capital' 'capital' 'worked_at' 'NO_REL' 'has_spouse']


  if diff:


In [146]:
# Feature analisys - print N most informative
# !! Make changes in this function when you change the pipleine!!
def printNMostInformative(classifier, label_encoder, N):
    """Prints features with the highest coefficient values, per class"""
    feature_names = classifier.named_steps['dictvectorizer'].get_feature_names()

    coef = classifier.named_steps['logisticregression'].coef_    
    print(coef.shape)
    for rel in label_encoder.classes_:
        rel_id = label_encoder.transform([rel])[0]
        coef_rel = coef[rel_id]
        coefs_with_fns = sorted(zip(coef_rel, feature_names))
        top_features = coefs_with_fns[-N:]
        print("\nClass {} best: ".format(rel))
        for feat in top_features:
            print(feat)        
        
print("Top features used to predict: ")
# show the top features
printNMostInformative(clf, le, 3)

Top features used to predict: 


AttributeError: 'LogisticRegression' object has no attribute 'named_steps'