In [1]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import FunctionTransformer,LabelEncoder
import numpy as np

#### additional imports
from sklearn.feature_extraction.text import CountVectorizer
# import spacy
# nlp = spacy.load('en')

In [2]:
############################################################################################
# 1. LOAD DATA
############################################################################################

# One training/test example: an entity pair plus the list of text snippets
# in which the two entities are mentioned together.
PairExample = namedtuple('PairExample',
    'entity_1, entity_2, snippet')
# One textual context: the two mentions and the text to the left of, between,
# and to the right of them, plus the 'direction' field taken from the JSON.
Snippet = namedtuple('Snippet',
    'left, mention_1, middle, mention_2, right, direction')
def load_data(file, verbose=True):
    """Load examples and labels from a JSON-lines file.

    Each non-blank line must be a JSON object with the keys 'relation',
    'entity_1', 'entity_2' and 'snippet' (a list of objects with the keys
    'left', 'mention_1', 'middle', 'mention_2', 'right', 'direction').

    Parameters
    ----------
    file : str
        Path of the UTF-8 encoded JSON-lines file.
    verbose : bool, default True
        If true, print the first example both as raw JSON and as the
        resulting named tuple.

    Returns
    -------
    (data, labels) : tuple of two lists of equal length
        ``data[i]`` is a ``PairExample`` and ``labels[i]`` is the value of
        the 'relation' field of the same line.

    Notes
    -----
    A snippet that lacks a required field (or is not a JSON object) is
    skipped after printing the enclosing instance, so an example may end up
    with an empty ``snippet`` list. Blank lines are ignored.
    """
    data = []
    labels = []
    # The context manager closes the file even if a line fails to parse.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            instance = json.loads(line)
            # Only the first parsed example is echoed in verbose mode.
            show_example = verbose and not data
            if show_example:
                print('json example:')
                print(instance)
            #'relation, entity_1, entity_2, snippet' fields for each example
            #'left, mention_1, middle, mention_2, right, direction' for each snippet
            instance_tuple = PairExample(instance['entity_1'], instance['entity_2'], [])
            for snippet in instance['snippet']:
                try:
                    snippet_tuple = Snippet(snippet['left'], snippet['mention_1'],
                                            snippet['middle'],
                                            snippet['mention_2'], snippet['right'],
                                            snippet['direction'])
                except (KeyError, TypeError):
                    # Best effort: report the offending instance and keep going,
                    # without hiding unrelated errors such as KeyboardInterrupt.
                    print(instance)
                    continue
                instance_tuple.snippet.append(snippet_tuple)
            if show_example:
                print('\nexample transformed as a named tuple:')
                print(instance_tuple)
            data.append(instance_tuple)
            labels.append(instance['relation'])
    return data, labels
    
# Load the training examples and their relation labels; verbose=False
# suppresses the printout of the first example.
train_data, train_labels = load_data('train.json.txt', verbose=False)

In [3]:
# Sanity check: show the first parsed example and the number of examples loaded.
print(train_data[0])
print(len(train_data))

PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')])
9660


In [4]:
def rebuild_corpus(data):
    """Rebuild one text document per example, with entity placeholders.

    For each example only its FIRST snippet is used: the left, middle and
    right contexts are joined with single spaces, and the two mentions are
    replaced by the literal tokens 'ENTITY_1' and 'ENTITY_2'.

    Parameters
    ----------
    data : iterable of PairExample-like objects
        Each item must expose a ``snippet`` list whose elements have
        ``left``, ``middle`` and ``right`` string attributes.

    Returns
    -------
    list of str
        Exactly one document per input example, in input order, so the
        result stays aligned row-for-row with the label list.
    """
    corpus = []

    for ex in data:
        if not ex.snippet:
            # An example can have no usable snippet (malformed snippets are
            # skipped at load time). Emit a placeholder-only document instead
            # of raising IndexError, so rows stay aligned with the labels.
            corpus.append('ENTITY_1 ENTITY_2')
            continue
        s = ex.snippet[0]
        corpus.append(
            ' '.join((s.left, 'ENTITY_1', s.middle, 'ENTITY_2', s.right))
        )

    return corpus


In [5]:
# Build the placeholder-annotated training corpus (one string per example),
# then show the first document and the corpus size as a sanity check.
corpus = rebuild_corpus(train_data)
    
print(corpus[0])
print(len(corpus))  

thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old ENTITY_1 while she was engaged to composer ENTITY_2 . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair
9660


In [6]:
# def context_windows(data):
#     train_data_windows = []
#     for ex in train_data:
#         ex_windows = []
#         ex_windows.append(ex.snippet[0].left)
#         ex_windows.append(ex.snippet[0].middle)
#         ex_windows.append(ex.snippet[0].right)
#         train_data_windows.append(ex_windows)
#         del ex_windows
        
#     return train_data_windows

In [7]:
# contexts = context_windows(train_data)

In [8]:
# print(contexts[0])
# print(contexts[-1])

In [9]:
# https://www.youtube.com/watch?v=aCdg-d_476Y
# Fit a default CountVectorizer on the training corpus and, in the same call,
# turn the corpus into its bag-of-words matrix (one row per document).
vectorizer = CountVectorizer()
BOW = vectorizer.fit_transform(corpus)


In [10]:
# Inspect the fitted vectorizer and the bag-of-words matrix: one vocabulary
# lookup, the vocabulary size, and the types/shapes of the matrix and its rows.
# print(vectorizer)

# Column index assigned to the token "thirty" (None if it is not in the vocabulary).
print("ID for word thirty:", vectorizer.vocabulary_.get("thirty"))
# print(BOW[0])

print("length of vocab:", len(vectorizer.vocabulary_))

# print(vectorizer.vocabulary_.get("affair"))
# print(vectorizer.get_feature_names())
# Type of the whole matrix vs. the type of a single row converted with .toarray().
print(type(BOW))
print(type(BOW[0].toarray()))
# Shapes of two individual rows and of the full matrix.
print(BOW[0].shape)
print(BOW[1].shape)
print(BOW.shape)

ID for word thirty: 37523
length of vocab: 41671
<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>
(1, 41671)
(1, 41671)
(9660, 41671)


In [11]:
# def build_BOW_features(train_data, vectorizer):
#     train_data_vectors = []
#     for text in train_data:
#         left = vectorizer.transform(text.snippet[0].left.split())
#         mid = vectorizer.transform(text.snippet[0].middle.split())
#         right = vectorizer.transform(text.snippet[0].right.split())
#         train_data_vectors.append((left, mid, right))
#     return train_data_vectors

# train_data_featurized = build_BOW_features(train_data, vectorizer)

In [12]:
# print(train_data_featurized[0][0].shape)
# print(train_data_featurized[0][1].shape)
# print(train_data_featurized[0][2].shape)

# print(train_data_featurized[1][0].shape)
# print(train_data_featurized[1][1].shape)
# print(train_data_featurized[1][2].shape)

In [13]:
# The bag-of-words matrix is used directly as the training feature matrix.
train_data_featurized = BOW

# Transform labels to numeric values; `le` is kept so predictions can later be
# mapped back to the original label strings with inverse_transform.
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

In [14]:
# Sanity check: the encoded labels' type, and that features and labels have
# the same number of rows.
print(type(train_labels_featurized))
print(train_data_featurized.shape, train_labels_featurized.shape)

<class 'numpy.ndarray'>
(9660, 41671) (9660,)


In [15]:
# Train a logistic-regression classifier with the library's default settings
# on the bag-of-words features and the integer-encoded labels.
lr = LogisticRegression()
lr.fit(train_data_featurized, train_labels_featurized)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
#########################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
#########################################################################################

# Predict on test set
# Note: `test_labels` is loaded here but not used further in this cell.
test_data, test_labels = load_data('test-covered.json.txt', verbose=False)
# Rebuild the test documents with the same placeholder scheme as for training.
test_corpus = rebuild_corpus(test_data)
print(len(test_corpus))
print(test_corpus[0])

# Reuse the vectorizer fitted on the training corpus (transform only, no
# refit), so the test matrix has the same columns as the training matrix.
test_corpus_featurized = vectorizer.transform(test_corpus)
print(test_corpus_featurized.shape)

# Predictions are integer-encoded labels, in the encoding learned by `le`.
test_label_predicted = lr.predict(test_corpus_featurized)

1840
Geronimo Mercuriali and Giovanni Battista Morgagni . Ronchi dei Legionari ( Gorizia ) ( 2011-present ) is a comune ( municipality ) in the Province of Gorizia in the Italian region ENTITY_1 , located about 30 km northwest of ENTITY_2 and about 14 km southwest of Gorizia . Follow Us on Twitter ! Categories Events Multimedia Neural network Operating system Weather Wii Tag Cloud 3d 2008/09 Aircraft
(1840, 41671)


In [1]:
# NOTE: this cell needs `test_label_predicted` and `le` from the earlier cells;
# running it on a fresh kernel without them raises NameError.
# Show the integer-encoded predictions as produced by the classifier.
print(test_label_predicted)
# # Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
# Map the encoded predictions back to the original label strings.
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(test_label_predicted_decoded[:10])
# Write one predicted label per line. The context manager flushes and closes
# the file, so the output is complete on disk when the cell finishes.
with open("test_labels.txt", 'w', encoding="utf-8") as f:
    for label in test_label_predicted_decoded:
        f.write(label+'\n')

NameError: name 'test_label_predicted' is not defined