### this script sets a baseline for relation extraction using a frequency-based BOW model

#### add additional features

In [69]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import FunctionTransformer,LabelEncoder
import numpy as np

##### additional imports
import spacy
nlp = spacy.load('en')

In [70]:
##################################################################################################
# 1. LOAD DATA
##################################################################################################

PairExample = namedtuple('PairExample',
    'entity_1, entity_2, snippet')
Snippet = namedtuple('Snippet',
    'left, mention_1, middle, mention_2, right, direction')
def load_data(file, verbose=True):
    """Read a JSON-lines file of entity pairs and their text snippets.

    Every line of ``file`` is parsed as one JSON object with the keys
    'relation', 'entity_1', 'entity_2' and 'snippet'; each snippet is expected
    to carry 'left', 'mention_1', 'middle', 'mention_2', 'right', 'direction'.

    Args:
        file: path of the file to read (opened as UTF-8 text).
        verbose: when true, print the first record both as raw JSON and as the
            named tuple built from it.

    Returns:
        A ``(data, labels)`` pair: a list of ``PairExample`` tuples and the list
        of their 'relation' values, in file order.

    A snippet that lacks one of the expected fields is not added to the
    example; the whole offending record is printed instead.
    """
    data = []
    labels = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse.
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            instance = json.loads(line)
            show_example = verbose and i == 0
            if show_example:
                print('json example:')
                print(instance)
            # 'relation, entity_1, entity_2, snippet' fields for each example
            # 'left, mention_1, middle, mention_2, right, direction' for each snippet
            instance_tuple = PairExample(instance['entity_1'], instance['entity_2'], [])
            for snippet in instance['snippet']:
                try:
                    snippet_tuple = Snippet(snippet['left'], snippet['mention_1'],
                                            snippet['middle'], snippet['mention_2'],
                                            snippet['right'], snippet['direction'])
                except (KeyError, TypeError):
                    # Best effort: report the malformed record and keep going.
                    print(instance)
                else:
                    instance_tuple.snippet.append(snippet_tuple)
            if show_example:
                print('\nexample transformed as a named tuple:')
                print(instance_tuple)
            data.append(instance_tuple)
            labels.append(instance['relation'])
    return data, labels
    
# Load the training split; with the default verbose=True the first record is echoed below.
train_data, train_labels = load_data('../data/train.json.txt')

json example:
{'relation': 'has_spouse', 'entity_1': 'Judy_Garland', 'entity_2': 'David_Rose', 'snippet': [{'left': 'thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', 'mention_1': 'Judy Garland', 'middle': 'while she was engaged to composer', 'mention_2': 'David Rose', 'right': '. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', 'direction': 'fwd'}]}

example transformed as a named tuple:
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but 

In [71]:
# Statistics over relations
def print_stats(labels):
    """Print a table with the count and relative frequency of every label.

    Args:
        labels: sized collection of hashable labels (its ``len`` is used as
            the denominator of the frequency column).

    Rows appear in the order in which labels are first encountered, followed
    by a 'Total' row. For an empty collection only the header and a 'Total'
    row with 0 / 0.00 are printed.
    """
    labels_counts = Counter(labels)
    total = len(labels)
    text_row = '{:20s} {:>10s} {:>10s}'
    number_row = '{:20s} {:10d} {:10.2f}'
    rule = text_row.format('--------', '--------', '-------')

    print(text_row.format('', '', 'rel_examples'))
    print(text_row.format('relation', 'examples', '/all_examples'))
    print(rule)
    for k, v in labels_counts.items():
        print(number_row.format(k, v, v / total))
    print(rule)
    # With no labels there is nothing to divide by; report a share of 0.00
    # rather than raising ZeroDivisionError.
    total_share = total / total if total else 0.0
    print(number_row.format('Total', total, total_share))

# Show how the training labels are distributed over the relation types.
print('Train set statistics:')
print_stats(train_labels)

Train set statistics:
                                rel_examples
relation               examples /all_examples
--------               --------    -------
has_spouse                 3019       0.31
author                     2653       0.27
NO_REL                     2300       0.24
capital                     510       0.05
worked_at                  1178       0.12
--------               --------    -------
Total                      9660       1.00


In [72]:
# Group labels by entity pair and examples by relation.
# Every repeated (entity_1, entity_2) pair is printed together with its label,
# so an empty listing means each pair occurs exactly once in the training set.
pair_dict = {}
rel_dict = {}
for example, label in zip(train_data, train_labels):
    pair = (example.entity_1, example.entity_2)
    if pair in pair_dict:
        pair_dict[pair].append(label)
        print(example.entity_1, example.entity_2, label)
    else:
        pair_dict[pair] = [label]
    rel_dict.setdefault(label, []).append(example)
print("Done building dictionary")

# Print the first example seen for each relation.
for rel in rel_dict:
    ex = rel_dict[rel][0]
    print(rel, ex.entity_1, ex.entity_2)

Done building dictionary
has_spouse Judy_Garland David_Rose
author Charlie_and_the_Chocolate_Factory Roald_Dahl
NO_REL Sichuan Tibet
capital Andalusia Seville
worked_at Carl-Henric_Svanberg Ericsson


In [116]:
# how to reconstruct full context

# ex = train_data[0]
# print(ex)
# print("\n full context:")
# s = ex.snippet[0]
# print(' '.join((s.left, s.mention_1, s.middle, s.mention_2, s.right)))

In [117]:
def lemmatize(doc):
    """Return ``doc`` with every token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. Tokens whose
    lemma is the placeholder "-PRON-" keep their original surface form. The
    resulting tokens are joined with single spaces.
    """
    parsed = nlp(doc)
    return " ".join(
        tok.orth_ if tok.lemma_ == "-PRON-" else tok.lemma_
        for tok in parsed
    )

In [118]:
def rebuild_text(ex):
    """Return the full text of every snippet of ``ex`` as a list of strings.

    Each string is the snippet's left context, first mention, middle context,
    second mention and right context joined by single spaces.
    """
    return [
        ' '.join((snip.left, snip.mention_1, snip.middle, snip.mention_2, snip.right))
        for snip in ex.snippet
    ]

In [140]:
def build_text_from_snippet(s):
    """Join the five text fields of snippet ``s`` into one space-separated string."""
    pieces = [s.left, s.mention_1, s.middle, s.mention_2, s.right]
    return ' '.join(pieces)

In [141]:
def rebuild_corpus(data):
    """Return, for every example in ``data``, the list produced by ``rebuild_text``."""
    return [rebuild_text(example) for example in data]

In [135]:
# Rebuild the text of the first 100 training examples and print how the
# pipeline splits the first snippet of each one into sentences.
mini_corpus = rebuild_corpus(train_data[:100])

for ex in mini_corpus:
    # ex is a list with one string per snippet; only the first one is parsed.
    # NOTE(review): ex[0] raises IndexError for an example without snippets.
    doc = nlp(ex[0])
    print([sent.string for sent in doc.sents])
#     key_sent = sent.string for sent in doc.sents if sent.string 
#     sentences = [sent.string.strip() for sent in doc.sents]

['thirty and his life and career were riding high . ', 'In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old Judy Garland while she was engaged to composer David Rose . ', 'Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . ', 'Their affair']
['Latest activity 4 Other Sweet-Treat Wikia Summary of a Sweet Adventure Edit Charlie and the Chocolate Factory is a 2005 film adaptation of the 1964 book of the same name by Roald Dahl . ', 'Directed by Tim Burton , the film stars Freddie Highmore as Charlie Bucket and Johnny Depp as Willy Wonka . ', 'The storyline concerns a']
['various places claim the title , such as parts of southern Kham in northwestern Yunnan province , including the tourist destinations of Lijiang and Zhongdian . ', 'Places like Sichuan and Tibet also claim the real Shangri-La was in its territory . ', 'In 2001 , Tibet Autonomous Region pu

["that `` Lacedemonia [ meaning Sparta ] , [ was ] the noblest and best city governed that ever was '' . ", 'He commended it as a model for England . ', 'The Swiss-French philosopher Jean-Jacques Rousseau contrasted Sparta favourably with Athens in his Discourse on the Arts and Sciences , arguing that its austere constitution was preferable to the more cultured nature of Athenian life . ', 'Sparta was also used as a model of social']
['whom are also descendants of Hussaini Brahmins and mourn the death of Imam Hussain . ', 'There is also a significant migrant population of Bhumihars in Mauritius , Suriname , Trinidad and Tobago , Guyana and others . ', 'Bhumihars are commonly called Babhans which is the Pali word for Brahmins and is used to refer to Brahmins in Buddhist']
['he is a ghost ? ', "For more info about this book , visit the author 's official site at http : //www.jessicaverday.com/ or her blog at http : //jessicaverday.blogspot.com/ . ", 'Based on The Legend of Sleepy Hollow 

['extra-curricular fucking… because I would love him and at the end of the day , he would love me . ', 'We would be some urban version of Larry and Althea Flynt . ', 'I was ready for that . ', 'And in the meantime , until we exchange vows… ', 'I would have some fun', '… ', 'A couple of days later , I ’ m']
['Mihan Productions . ', 'Over his career as an animator , Hoover worked on such shows as Fat Albert and the Cosby Kids , The Archie Show , Tarzan , Flash Gordon , He-Man and the Masters of the Universe , ', 'She-Ra : Princess of Power , The Super Friends , The Smurfs , Men in Black : The Series']
['Films , Books , Film Reviews , Movies Film Review : ', 'The Shawshank Redemption *', '*Remember , spoilers abound ! ', '*', '* A few years ago , I read a great novella by Stephen King , called Rita Hayworth and the Shawshank Redemption . ', 'I knew a movie had been made about it , and I knew that it was a really big deal , but I guess that all']
['90s Show : Film , Moment by Moment 1978 F

['Harken Energy Corporation , of which he was a director . ', 'The sale raised the issue of whether it constituted illegal insider trading . ', "In House of Bush , House of Saud , Craig Unger asserts that at the time of Bush 's sale , Harken Energy `` was expected to run out of money in just three days '' ( ", 'p. 123 ) . ', 'In a']
['including Wiktionary ( a wiki dictionary ) , Wikibooks ( textbooks ) , and others , and owns all of their domain names . ', 'Previously , the site was hosted on the servers of Bomis , Inc. , a company mostly owned by Jimmy Wales . ', 'With the announcement of the Wikimedia Foundation on June 20 , 2003 , the ownership of all domain names was transferred to the Foundation . ', 'The site']
['and the study of myth . ', "He describes his writing and speaking style as `` mind- jazz on ancient texts '' . ", 'He is an astute reader of science , social science , history , and literature . ', 'He is the founder of the Lindisfarne Association . ', 'Tags : Politics ,

In [151]:
def extract_key_sents(data):
    """Collect the sentences that contain both mentions of each example.

    Only the first snippet of every example is inspected: its text is rebuilt
    with ``build_text_from_snippet``, parsed with the module-level ``nlp``
    pipeline, and every sentence whose text contains both ``mention_1`` and
    ``mention_2`` as substrings is kept.

    Args:
        data: iterable of examples exposing a ``snippet`` list.

    Returns:
        A flat list of sentence spans (as yielded by ``doc.sents``); an
        example can contribute zero, one or several sentences.
    """
    key_sents = []
    for ex in data:
        if not ex.snippet:
            # load_data can leave an example without snippets when its records
            # are malformed; there is nothing to parse in that case.
            continue
        first = ex.snippet[0]
        m1 = first.mention_1
        m2 = first.mention_2
        doc = nlp(build_text_from_snippet(first))
        for sent in doc.sents:
            sent_text = sent.string
            if m1 in sent_text and m2 in sent_text:
                key_sents.append(sent)
    return key_sents

In [157]:
# Inspect the syntactic analysis of the key sentences of the first 100 examples.
key_sents = extract_key_sents(train_data[:100])
# NOTE(review): key_sents[0] raises IndexError if no sentence matched at all.
print(type(key_sents[0]))
for sent in key_sents:
    # Noun chunks: chunk text, its root token, the root's dependency label and the root's head.
    for chunk in sent.noun_chunks:
        print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)
    # Tokens: text, dependency label, head text, head part of speech and the token's children.
    for token in sent: 
        print(token.text, token.dep_, token.head.text, token.head.pos_,
              [child for child in token.children])

<class 'spacy.tokens.span.Span'>
the death death pobj after
his father father pobj of
Mercer Mercer nsubj began
an intense affair affair dobj began
nineteen-year-old Judy Garland Garland pobj with
she she nsubjpass engaged
David Rose Rose dobj composer
In prep began VERB [1941]
1941 pobj In ADP []
, punct began VERB []
shortly advmod after ADP []
after prep began VERB [shortly, death]
the det death NOUN []
death pobj after ADP [the, of]
of prep death NOUN [father]
his poss father NOUN []
father pobj of ADP [his]
, punct began VERB []
Mercer nsubj began VERB []
began ROOT began VERB [In, ,, after, ,, Mercer, affair, engaged, .]
an det affair NOUN []
intense amod affair NOUN []
affair dobj began VERB [an, intense, with]
with prep affair NOUN [Garland]
nineteen nummod year NOUN []
- punct year NOUN []
year npadvmod old ADJ [nineteen, -]
- punct old ADJ []
old amod Garland PROPN [year, -]
Judy compound Garland PROPN []
Garland pobj with ADP [old, Judy]
while mark engaged VERB []
she nsubjp

real amod estate NOUN []
estate compound venture NOUN [real]
venture pobj in ADP [the, Corporation, estate, with]
with prep venture NOUN [Jim]
Jim pobj with ADP [and, McDougal]
and cc Jim PROPN []
Susan compound McDougal PROPN []
McDougal conj Jim PROPN [Susan]
at prep began VERB [time]
this det time NOUN []
time pobj at ADP [this]
. punct began VERB []
a jam-session segment segment attr was
the following all-star musicians musicians pobj of
Chet Atkins Atkins appos musicians
Boots Randolph Randolph conj Atkins
Roy Clark Clark conj Randolph
Floyd Cramer Cramer conj Clark
Charlie McCoy McCoy conj Cramer
Danny Davis Davis conj McCoy
Jethro Burns Burns conj Davis
Johnny Gimble Gimble conj Burns
This nsubj was VERB []
was ROOT was VERB [This, segment, ,, airing, composed, .]
a det segment NOUN []
jam compound session NOUN []
- punct session NOUN []
session compound segment NOUN [jam, -]
segment attr was VERB [a, session]
, punct was VERB []
airing advcl was VERB [from, ,]
from prep airing 

acquaintances conj friends NOUN [many, as]
as prep acquaintances NOUN [people]
the det people NOUN []
people pobj as ADP [the]
on prep friends NOUN []
Chris Robinson Robinson ROOT Robinson
Kate Hudson Hudson conj Robinson
13 years Artie Shaw Shaw conj Hudson
Ava Gardner Gardner conj Shaw
12 years years appos Hudson
Chris compound Robinson PROPN []
Robinson ROOT Robinson PROPN [Chris, and, Hudson]
and cc Robinson PROPN []
Kate compound Hudson PROPN []
Hudson conj Robinson PROPN [Kate, ,, Shaw, ,, years, .]
, punct Hudson PROPN []
13 nummod years NOUN []
years compound Shaw PROPN [13]
Artie compound Shaw PROPN []
Shaw conj Hudson PROPN [years, Artie, and, Gardner]
and cc Shaw PROPN []
Ava compound Gardner PROPN []
Gardner conj Shaw PROPN [Ava]
, punct Hudson PROPN []
12 nummod years NOUN []
years appos Hudson PROPN [12]
. punct Hudson PROPN []
I I nsubj am
a regular occasion occasion pobj on
both John Astin Astin pobj with
Patty Duke Duke conj Astin
» punct am VERB []
I nsubj am VERB []


Vaughan pobj as ADP [Sarah, ,, Reeves]
, punct Vaughan PROPN []
Dianne compound Reeves PROPN []
Reeves conj Vaughan PROPN [Dianne, ,, Monk]
, punct Reeves PROPN []
Thelonious compound Monk PROPN []
Monk conj Reeves PROPN [Thelonious, ,, Peterson]
, punct Monk PROPN []
Oscar compound Peterson PROPN []
Peterson conj Monk PROPN [Oscar, ,, Davis]
, punct Peterson PROPN []
Miles compound Davis PROPN []
Davis conj Peterson PROPN [Miles, ,, Tjader]
, punct Davis PROPN []
Cal compound Tjader PROPN []
Tjader conj Davis PROPN [Cal, ,, McRae]
, punct Tjader PROPN []
Carmen compound McRae PROPN []
McRae conj Tjader PROPN [Carmen, ,, Hancock]
, punct McRae PROPN []
Herbie compound Hancock PROPN []
Hancock conj McRae PROPN [Herbie, ,, Adderley]
, punct Hancock PROPN []
Nat compound Adderley PROPN []
Adderley conj Hancock PROPN [Nat, ,, and, Gillespie]
, punct Adderley PROPN []
and cc Adderley PROPN []
Dizzy compound Gillespie PROPN []
Gillespie conj Adderley PROPN [Dizzy]
. punct Festival PROPN []
T

tumultuous amod romance NOUN []
romance dobj had VERB [a, brief, ,, tumultuous, with]
with prep romance NOUN [Shearer]
Hollywood compound queen NOUN []
queen compound Shearer PROPN [Hollywood]
Norma compound Shearer PROPN []
Shearer pobj with ADP [queen, Norma, ,, died]
, punct Shearer PROPN []
whose poss husband NOUN []
husband nsubj died VERB [whose, ,, Thalberg]
, punct husband NOUN []
Irving compound Thalberg PROPN []
Thalberg conj husband NOUN [Irving, ,, head]
, punct Thalberg PROPN []
head appos Thalberg PROPN [of, at]
of prep head NOUN [production]
production pobj of ADP []
at prep head NOUN [MGM]
MGM pobj at ADP []
, punct died VERB []
had aux died VERB []
died relcl Shearer PROPN [husband, ,, had, earlier]
two nummod years NOUN []
years npadvmod earlier ADV [two]
earlier advmod died VERB [years]
. punct had VERB []
The ad ad nsubj features
Freddie Mac employees employees dobj targets
a small picture picture dobj features
the face face pobj of
CEO Charles Haldeman Haldeman pob

Rose pobj by ADP [founder, Kevin, ,, with]
, punct Rose PROPN []
with prep Rose PROPN [Culver]
Leah compound Culver PROPN []
Culver pobj with ADP [Leah, ,, Burka]
, punct Culver PROPN []
Daniel compound Burka PROPN []
Burka conj Culver PROPN [Daniel, ,, and, Allen]
, punct Burka PROPN []
and cc Burka PROPN []
Shawn compound Allen PROPN []
Allen conj Burka PROPN [Shawn]
. punct created VERB []
Nigeria Nigeria pobj in
April April pobj in
an event event dobj staged
Ghana Ghana pobj in
celebrities celebrities dobj brought
Damon Dash Dash pobj like
his wife wife appos Dash
designer Rachel Roy Roy conj wife
Ghanaian-born , London -based designer Ozwald Boateng Boateng dobj brought
” intj began VERB []
, punct began VERB []
began ROOT began VERB [”, ,, in, in, and, staged, .]
in prep began VERB [Nigeria]
Nigeria pobj in ADP []
in prep began VERB [April]
April pobj in ADP [2006]
2006 nummod April PROPN []
and cc began VERB []
recently advmod staged VERB []
staged conj began VERB [recently, eve

spread conj worked VERB [liberalism, there]
liberalism dobj spread VERB []
there advmod spread VERB []
. punct mentioned VERB []
71 ] Iraq Reuters Reuters nsubj reported
hundreds hundreds nsubj protested
the film film pobj against
Baghdad Baghdad pobj in
Sadr City City attr s
Basra Basra pobj in
[ punct reported VERB []
71 nummod Reuters PROPN []
] punct Reuters PROPN []
Iraq compound Reuters PROPN []
Reuters nsubj reported VERB [71, ], Iraq]
reported ROOT reported VERB [[, Reuters, protested, .]
that mark protested VERB []
hundreds nsubj protested VERB []
protested ccomp reported VERB [that, hundreds, against, s]
against prep protested VERB [film]
the det film NOUN []
film pobj against ADP [the, in]
in prep film NOUN [Baghdad]
Baghdad pobj in ADP []
‘ punct s NOUN []
s prep protested VERB [‘, City, and, in]
Sadr compound City PROPN []
City attr s NOUN [Sadr]
and cc s NOUN []
in conj s NOUN [Basra]
Basra pobj in ADP []
. punct reported VERB []
guests guests nsubj waiting
Garbo Garbo ns

the det Kingdom PROPN []
United compound Kingdom PROPN []
Kingdom dobj ruled VERB [the, United]
from prep ruled VERB [1714]
1714 pobj from ADP []
until prep ruled VERB [1901]
1901 pobj until ADP []
. punct is VERB []
praise praise dobj won
Christine Mannon Mannon dobj playing
`` glows glows conj won
mature sexual allure allure pobj with
Daily Telegraph Telegraph appos glows
a revival revival pobj in
Eugene O'Neill 's Mourning Becomes Electra Electra pobj of
Howard Davies Davies pobj by
won ROOT won VERB [praise, (, ,, Standard, ;, glows, .]
praise dobj won VERB [playing]
playing acl praise NOUN [Mannon]
Christine compound Mannon PROPN []
Mannon dobj playing VERB [Christine]
( punct won VERB [cool]
`` punct cool ADJ []
defiantly advmod cool ADJ []
cool intj ( PUNCT [``, defiantly, ,, camp, '']
, punct cool ADJ []
camp conj cool ADJ [and, skittish]
and cc camp NOUN []
skittish conj camp NOUN []
'' punct cool ADJ []
, punct won VERB []
Evening compound Standard PROPN []
Standard npadvmod 

fiction compound modes NOUN [documentary, -]
modes pobj through ADP [mixed, fiction, (, Santos, )]
( punct modes NOUN []
Nelson compound Santos PROPN []
Pereira compound Santos PROPN []
dos compound Santos PROPN []
Santos appos modes NOUN [Nelson, Pereira, dos]
) punct modes NOUN []
, punct feature VERB []
to aux tropical ADJ []
tropical advcl feature VERB [to, allegory, ,, and, reflexivity]
allegory dobj tropical ADJ [(, Rocha, )]
( punct allegory NOUN []
Glauber compound Rocha PROPN []
Rocha appos allegory NOUN [Glauber]
) punct allegory NOUN []
, punct tropical ADJ []
and cc tropical ADJ []
absurdist amod reflexivity NOUN []
reflexivity conj tropical ADJ [absurdist, (, Ruiz, )]
( punct reflexivity NOUN []
Raul compound Ruiz PROPN []
Ruiz appos reflexivity NOUN [Raul]
) punct reflexivity NOUN []
. punct feature VERB []
Wikipedia Wikipedia pobj In
Kurt Vonnegut 's book Wikipedia Wikipedia nsubjpass depopulated
Deadeye Dick Dick appos Wikipedia
an American town town conj Dick
Midland C

by prep Pageant PROPN [Robinson]
Barbara compound Robinson PROPN []
Robinson pobj by ADP [Barbara]
. punct based VERB []
He He nsubj brought
400 Christians Christians dobj brought
Baghdad Baghdad pobj from
Nineveh Nineveh conj Baghdad
Jerusalem Jerusalem conj Nineveh
Kodungallur Kodungallur pobj to
He nsubj brought VERB []
brought ROOT brought VERB [He, Christians, from, to, .]
400 nummod Christians PROPN []
Christians dobj brought VERB [400]
from prep brought VERB [Baghdad]
Baghdad pobj from ADP [,, Nineveh]
, punct Baghdad PROPN []
Nineveh conj Baghdad PROPN [,, and, Jerusalem]
, punct Nineveh PROPN []
and cc Nineveh PROPN []
Jerusalem conj Nineveh PROPN []
to prep brought VERB [Kodungallur]
Kodungallur pobj to ADP []
. punct brought VERB []
Jonathon Jonathon nsubjpass interviewed
Robert Scoble Scoble pobj by
Rackspace Rackspace pobj of
Jonathon nsubjpass interviewed VERB []
is auxpass interviewed VERB []
interviewed ROOT interviewed VERB [Jonathon, is, here, by, .]
here advmod inter

In [83]:
##################################################################################################
# 2. EXTRACT FEATURES and BUILD CLASSIFIER
##################################################################################################

# Turn data into numerical features - simple BOW pipeline

def SelectContext(data, use_lemmas=False, verbose=True):
    """Turn every example into one string with the entity mentions masked.

    For each snippet the left, middle and right contexts are kept and the two
    mentions are replaced by the placeholder tokens "m1" and "m2"; the
    snippets of one example are then joined with single spaces.

    Args:
        data: sequence of examples exposing a ``snippet`` list.
        use_lemmas: when true, each context part is passed through
            ``lemmatize`` before being joined.
        verbose: when true, print the input and output sizes and, if there is
            at least one example, the first example before and after the
            transformation.

    Returns:
        A list of strings, one per example, in input order (an example
        without snippets yields an empty string).
    """
    # Pick the per-part normaliser once; lemmatize is only looked up when requested.
    normalise = lemmatize if use_lemmas else (lambda text: text)

    only_context_data = []
    for instance in data:
        instance_context = [
            ' '.join((normalise(s.left), "m1", normalise(s.middle), "m2", normalise(s.right)))
            for s in instance.snippet
        ]
        only_context_data.append(' '.join(instance_context))
    if verbose:
        print(len(data))
        print(len(only_context_data))
        # Guard the preview so that an empty dataset does not raise IndexError.
        if only_context_data:
            print(data[0])
            print(only_context_data[0])
    return only_context_data


In [84]:
# Transform dataset to features (one mention-masked context string per example)
train_data_featurized = SelectContext(train_data, use_lemmas=False)

# Transform labels to numeric values
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

# Build the bag-of-words + logistic regression pipeline.
# It is only constructed here; fitting happens later (cross-validation and final fit).
clf = make_pipeline(CountVectorizer(), LogisticRegression())

9660
9660
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')])
thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old m1 while she was engaged to composer m2 . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair


In [85]:
##################################################################################################
# 3. TRAIN CLASSIFIER AND EVALUATE (CV)
##################################################################################################

def print_statistics_header():
    """Print the column titles of the results table followed by a dashed rule."""
    row_format = '{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'
    titles = ('relation', 'precision', 'recall', 'f-score', 'support')
    rule = ('-' * 18,) + ('-' * 9,) * 4
    for cells in (titles, rule):
        print(row_format.format(*cells))

def print_statistics_row(rel, result):
    """Print one table row: the relation name, three floats (3 decimals) and an integer."""
    row_format = '{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'
    line = row_format.format(rel, *result)
    print(line)

def print_statistics_footer(avg_result):
    """Print a dashed rule followed by the 'macro-average' summary row."""
    rule_format = '{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'
    summary_format = '{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'
    dashes = ['-' * 18] + ['-' * 9] * 4
    print(rule_format.format(*dashes))
    print(summary_format.format('macro-average', *avg_result))

def macro_average_results(results):
    """Aggregate per-relation rows into a single summary row.

    ``results`` maps each relation to a row whose first three entries are
    averaged across relations and whose fourth entry is summed.
    Returns a four-element list.
    """
    per_relation = list(results.values())
    summary = []
    for column in range(3):
        summary.append(np.average([row[column] for row in per_relation]))
    summary.append(np.sum([row[3] for row in per_relation]))
    return summary

def average_results(results):
    """Aggregate a list of rows into a single row.

    The first three entries of the rows are averaged and the fourth entry is
    summed. Returns a four-element list.
    """
    summary = []
    for column in range(3):
        summary.append(np.average([row[column] for row in results]))
    summary.append(np.sum([row[3] for row in results]))
    return summary
    
def evaluateCV(classifier, label_encoder, X, y, verbose=True):
    """Evaluate ``classifier`` with 5-fold stratified cross-validation.

    For every fold the classifier is refitted on the training part (so it is
    modified in place) and precision, recall, F0.5 and support are computed
    per relation on the held-out part. Per relation, the three scores are
    averaged and the support is summed over the folds; the macro average over
    relations is then derived from those rows.

    Args:
        classifier: estimator exposing ``fit`` and ``predict``.
        label_encoder: fitted encoder whose ``classes_`` are the relation
            names and whose ``transform`` maps them to the ids used in ``y``.
        X: indexable sequence of samples.
        y: indexable sequence of encoded labels, aligned with ``X``.
        verbose: when true, print the per-relation table and the macro average.

    Returns:
        The macro-averaged F0.5 score.
    """
    classes = label_encoder.classes_
    # Passing the ids explicitly pins the order of the per-class metric arrays,
    # even if some class happens to be missing from a fold.
    class_ids = label_encoder.transform(classes)
    results = {rel: [] for rel in classes}

    if verbose:
        print_statistics_header()

    # The folds are evaluated regardless of `verbose`; only printing is optional.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
        # Use the estimator passed in, not a module-level global.
        classifier.fit(X_train, y_train)
        pred_labels = classifier.predict(X_test)
        stats = precision_recall_fscore_support(
            y_test, pred_labels, beta=0.5, labels=class_ids)
        for position, rel in enumerate(classes):
            results[rel].append([stat[position] for stat in stats])

    for rel in classes:
        results[rel] = average_results(results[rel])
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [86]:
# Cross-validate the baseline; the returned macro-averaged F0.5 is shown as the cell result.
evaluateCV(clf,le,train_data_featurized,train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.662      0.717      0.672       2300
author                    0.817      0.815      0.817       2653
capital                   0.879      0.639      0.817        510
has_spouse                0.859      0.903      0.868       3019
worked_at                 0.732      0.609      0.703       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.790      0.736      0.775       9660


0.7753647029950688

In [None]:
# Cross-check of the macro-averaged F0.5 score using scikit-learn's own scoring machinery

f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')

def evaluateCV_check(classifier, X, y, verbose=True):
    """Print the per-fold and mean ``f_scorer`` values from ``cross_val_score``.

    Uses a 5-fold stratified, shuffled split with random_state=0.
    The ``verbose`` argument is accepted but not used.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = cross_val_score(classifier, X, y, scoring=f_scorer, cv=splitter)
    print("\nCross-validation scores (StratifiedKFold): ", fold_scores)
    print("Mean cv score (StratifiedKFold): ", fold_scores.mean())

In [None]:
# Compare the hand-rolled cross-validation result with cross_val_score on the same data.
evaluateCV_check(clf,train_data_featurized,train_labels_featurized)

In [None]:
##################################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
##################################################################################################

# Fit final model on the full train data
clf.fit(train_data_featurized, train_labels_featurized)

# Predict on test set
test_data, test_labels = load_data('../data/test-covered.json.txt', verbose=False)
print(len(test_labels))
test_data_featurized = SelectContext(test_data, verbose=False)
test_label_predicted = clf.predict(test_data_featurized)
print(len(test_label_predicted))
# Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(len(test_label_predicted_decoded))
print(test_label_predicted_decoded[:2])

# Write one decoded label per line. The directory is created if missing, and
# the context manager flushes and closes the file so the output is complete
# on disk as soon as the cell finishes.
os.makedirs("outputs", exist_ok=True)
with open("outputs/test_labels.txt", 'w', encoding="utf-8") as f:
    for label in test_label_predicted_decoded:
        f.write(label+'\n')

In [None]:
# Feature analysis - print the N most informative features
# !! Make changes in this function when you change the pipeline !!
def printNMostInformative(classifier,label_encoder,N):
    """Prints features with the highest coefficient values, per class.

    Args:
        classifier: fitted pipeline whose ``named_steps`` contain a
            'countvectorizer' and a 'logisticregression' step.
        label_encoder: fitted encoder; ``classes_`` gives the class names and
            ``transform`` their row index in the coefficient matrix.
        N: number of top features to print per class; values <= 0 print none.

    The coefficient matrix shape is printed first, then for every class its N
    highest-weighted (coefficient, feature) pairs in ascending order.
    """
    vectorizer = classifier.named_steps['countvectorizer']
    # Newer scikit-learn releases expose get_feature_names_out and no longer
    # provide get_feature_names; support whichever is available.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
    else:
        feature_names = vectorizer.get_feature_names()

    coef = classifier.named_steps['logisticregression'].coef_
    print(coef.shape)
    for rel in label_encoder.classes_:
        rel_id = label_encoder.transform([rel])[0]
        coef_rel = coef[rel_id]
        coefs_with_fns = sorted(zip(coef_rel, feature_names))
        # A plain [-N:] slice would return *all* features for N == 0.
        top_features = coefs_with_fns[-N:] if N > 0 else []
        print("\nClass {} best: ".format(rel))
        for feat in top_features:
            print(feat)
        
print("Top features used to predict: ")
# show the top 2 features per class of the fitted pipeline
printNMostInformative(clf,le,2)