## Load processed dataframe

In [1]:
import pandas as pd
# Load the preprocessed corpus from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# that comes from a trusted source.
load_processed_corpus_df = pd.read_pickle("./processed_corpus.pkl")
# Print the frame dimensions, then preview the first 10 rows as the cell output.
print(load_processed_corpus_df.shape)
load_processed_corpus_df.head(10)

(25248397, 3)


Unnamed: 0,page_identifier,sentence_number,sentence_text
0,"Bedadi,_Ethiopia",0,bedadi be a village in south western ethiopia
1,"Bedadi,_Ethiopia",1,locate in seka chekorsa a woreda in the jimma ...
2,"Bedadi,_Ethiopia",4,the central statistical agency have not publis...
3,"Bedadi,_Ethiopia",7,category populate place in the oromia region
4,Big_Apple_Circus,0,the big apple circus be a circus base in new y...
5,Big_Apple_Circus,1,open in 1977 later become a nonprofit organiza...
6,Big_Apple_Circus,2,the circus have be know for it community outre...
7,Big_Apple_Circus,3,big apple circus file for chapter 11 bankruptc...
8,Big_Apple_Circus,4,the circus will be renew in october 2017 for i...
9,Bayete,0,bayete may refer to


## Build a dictionary mapping each page identifier to its row indexes in the dataframe

In [5]:
# Map each page identifier to the list of dataframe index labels of its rows,
# in the order the rows appear in the dataframe.
page_dictionary = {}

# Walk the index and the single column we need side by side instead of using
# iterrows(), which builds a full Series object for every row and is far
# slower on a corpus of this size.
for index, page_identifier in zip(load_processed_corpus_df.index,
                                  load_processed_corpus_df['page_identifier']):
    page_dictionary.setdefault(page_identifier, []).append(index)
    

In [9]:
# Materialise the page identifiers (the keys of page_dictionary) as a list,
# preserving the dictionary's insertion order.
page_keys = [*page_dictionary]

In [10]:
# Sanity check: number of distinct pages, a sample of the keys, and the
# stored row indexes for two example pages.
print("Length of keys = {}".format(len(page_keys)))
print(page_keys[:10])
for sample_page in ('Big_Apple_Circus', 'Alexander_McNair'):
    print(page_dictionary[sample_page])

Length of keys = 5396106
['Bedadi,_Ethiopia', 'Big_Apple_Circus', 'Bayete', 'Barkhalbina', 'Bihovo', 'Barry_Ditewig', 'Bawbee', 'Battle_of_Kenapacomaqua', 'Bertram,_California', 'Benjamin_D._Pritchard']
[4, 5, 6, 7, 8]
[15590516, 15590517, 15590518, 15590519, 15590520, 15590521, 15590522, 15590523, 15590524, 15590525, 15590526, 15590527, 15590528, 15590529, 15590530, 15590531, 15590532, 15590533]


## Use dictionary to retrieve the sentence text.

In [15]:
def retrieve_sentenceText(claim_word,page_keys,page_dictionary,df):
    """Return every sentence text stored for one page identifier.

    Args:
        claim_word: page identifier string to look up.
        page_keys: the keys of ``page_dictionary``. Kept so existing callers
            keep working; membership is now checked against the dictionary
            itself, because scanning a list of keys is linear in its length.
        page_dictionary: dict mapping a page identifier to the list of index
            labels of that page's rows in ``df``.
        df: dataframe with a ``'sentence_text'`` column.

    Returns:
        A list with the ``'sentence_text'`` value of each stored index, in the
        stored order. An empty list when ``claim_word`` is not a known page.
    """
    retrieved_index = page_dictionary.get(claim_word)
    if not retrieved_index:
        # Unknown page (or a page with no rows): nothing to retrieve.
        return []
    # One label-based lookup for all rows instead of one .loc call per row.
    return df.loc[retrieved_index, 'sentence_text'].tolist()

In [16]:
# Example lookup: all stored sentences for the 'Big_Apple_Circus' page.
big_apple_sentences = retrieve_sentenceText(
    'Big_Apple_Circus', page_keys, page_dictionary, load_processed_corpus_df
)
print(big_apple_sentences)

['the big apple circus be a circus base in new york city', 'open in 1977 later become a nonprofit organization it become a tourist attraction', 'the circus have be know for it community outreach program include clown care a well a it humane treatment of animal', 'big apple circus file for chapter 11 bankruptcy protection in november 2016 and exit bankruptcy in february 2017 after it asset be buy by compass partner', 'the circus will be renew in october 2017 for it 40th anniversary season']


## Load the dev set, dev result, and unlabelled test data.

In [17]:
import os
import json

# Read each JSON file with an explicit UTF-8 encoding so the result does not
# depend on the platform's default text encoding.

# Development set.
with open('devset.json', 'r', encoding='utf-8') as f1:
    dev_data = json.load(f1)
print("Length of the dev data is: " + str(len(dev_data)))

# Dev-set result file (the structure results are stored in).
with open('devset_result.json', 'r', encoding='utf-8') as f2:
    res_data = json.load(f2)
print("Length of the dev result data is: " + str(len(res_data)))

# Unlabelled test set.
with open('test-unlabelled.json', 'r', encoding='utf-8') as f3:
    test_data = json.load(f3)
print("Length of the test data is: " + str(len(test_data)))


Length of the dev data is: 5001
Length of the dev result data is: 5001
Length of the test data is: 14997


## Build TF IDF Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import datetime
import gc

# Run a garbage collection pass before the expensive fit, then time the cell.
gc.collect()
start1 = datetime.datetime.now()

# Fit a TF-IDF vocabulary, capped at 5000 features, on every corpus sentence.
tfidf_vectorizer = TfidfVectorizer(max_features = 5000)
# NOTE: fit() returns the fitted vectorizer, so tfidf_list is the vectorizer
# object itself (not a list); the name is kept for later cells that use it.
tfidf_list = tfidf_vectorizer.fit(load_processed_corpus_df['sentence_text'].tolist())

# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("tfidf5000list.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf_list, tfidf_file)

end1 = datetime.datetime.now()
print(end1-start1)

0:08:04.874127


## Rules on claims: find the capitalized words.

Rule 1: take the first run of two or more consecutive capitalized words.

In [79]:
from itertools import groupby

def ranges(lst):
    """Split a list of integers into runs of consecutive values.

    Each element's value minus its position is constant within a run of
    consecutive integers, so grouping on that difference isolates the runs.

    Args:
        lst: list of integers (e.g. increasing word positions).

    Yields:
        One list per run, rebuilt as ``range(first, first + run_length)``.
    """
    offsets = (value - position for position, value in enumerate(lst))
    run_start = 0  # position in lst where the current run begins
    for _offset, members in groupby(offsets):
        run_length = sum(1 for _member in members)
        first_value = lst[run_start]
        run_start += run_length
        yield list(range(first_value, first_value + run_length))

def find_upper_word(claim):
    """Return the first run of two or more consecutive capitalized words.

    The claim is split on whitespace; a word counts as capitalized when its
    first character is upper case. The words of the first qualifying run are
    joined with underscores.

    Args:
        claim: the claim sentence as a string.

    Returns:
        The underscore-joined words of the first run of length >= 2, or an
        empty string when no such run exists.
    """
    tokens = claim.split()
    capital_positions = [
        position for position, token in enumerate(tokens) if token[0].isupper()
    ]

    for run in ranges(capital_positions):
        # Skip isolated capitalized words; only runs of two or more qualify.
        if len(run) < 2:
            continue
        return '_'.join(tokens[position] for position in run)

    # No run of two or more consecutive capitalized words was found.
    return ""

In [81]:
# Try the rule on three sample claims.
test_claim = "Brad Wilk helped co-found Rage in 1962."
print(find_upper_word(test_claim))
test_claim2 = "The Faroe Islands are no longer part of the Kingdom of Mercia."
print(find_upper_word(test_claim2))
test_claim3 = "Down With Love is a 2003 comedy film."
# Pass test_claim3 here; the original re-used test_claim2, so the third claim
# was never actually exercised.
print(find_upper_word(test_claim3))

Brad_Wilk
The_Faroe_Islands
