## Load File

In [2]:
import os
import json
import pandas as pd

with open('train.json', 'r') as f:  # load training dataset
        train_data = json.load(f)   
print("Length of the train data is: " + str(len(train_data)))

with open('devset.json', 'r') as f1:  # load dev dataset
        dev_data = json.load(f1) 
print("Length of the dev data is: " + str(len(dev_data)))

with open('devset_result.json', 'r') as f2:  # store result 
        res_data = json.load(f2) 
print("Length of the dev result data is: " + str(len(res_data)))

with open('test-unlabelled.json', 'r') as f3:  # store result 
     test_data = json.load(f3) 
print("Length of the test data is: " + str(len(test_data)))
        
# appeand all the wiki txt sentences to one document
def loadfile(folder): 
    corpus = []
    list_of_files = os.listdir(folder)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    corpus.append(line) 
        except Exception as e:
            print("No files found here!")
            raise e
    return corpus

def loadTextFile(folder): 
    corpus = []
    list_of_files = os.listdir(folder)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    corpus.append(' '.join(line.split()[2:])) 
        except Exception as e:
            print("No files found here!")
            raise e
    return corpus

corpus = loadfile("wiki-pages-text")
text_corpus = loadTextFile("wiki-pages-text")
print("Length of the corpus is: " + str(len(corpus)))
print("Length of the corpus is: " + str(len(text_corpus)))

Length of the train data is: 145449
Length of the dev data is: 5001
Length of the dev result data is: 5001
Length of the test data is: 14997
Length of the corpus is: 25248397
Length of the corpus is: 25248397


## Pandas split data.

In [149]:
corpus_df = pd.DataFrame(corpus[:100])
corpus_df.columns = ['text']

corpus_df['page_identifier'] = corpus_df.text.apply(lambda x: x.split(' ')[0])  
corpus_df['sentence_number'] = corpus_df.text.apply(lambda x: x.split(' ')[1]) 
corpus_df['sentence_text'] = corpus_df.text.apply(lambda x: x.split(' ')[2:])  
corpus_df['sentence_text'] = [','.join(map(str, l)) for l in corpus_df['sentence_text']]
corpus_df["sentence_text"] = corpus_df['sentence_text'].str.replace(',',' ')
corpus_df = corpus_df.drop('text', 1)

print(corpus_df.shape)
corpus_df.head(10)

(100, 3)


Unnamed: 0,page_identifier,sentence_number,sentence_text
0,Alexander_McNair,0,Alexander McNair -LRB- May 5 1775 -- March 1...
1,Alexander_McNair,1,He was the first Governor of Missouri from its...
2,Alexander_McNair,4,McNair was born in Lancaster in the Province o...
3,Alexander_McNair,5,His grandfather David McNair Sr. immigra...
4,Alexander_McNair,6,David McNair Jr. Alexander 's father -LRB-...
5,Alexander_McNair,7,Alexander went to school as a child and atte...
6,Alexander_McNair,8,He reached an agreement with his mother and br...
7,Alexander_McNair,9,Alexander was defeated .\n
8,Alexander_McNair,10,He became a member of the Pennsylvania militia...
9,Alexander_McNair,13,In 1804 McNair traveled to what is now Misso...


## Preprocess

In [150]:
import nltk

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

nltk.download('wordnet')
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process(comment) -> str:
    # lower cased
    comment = comment.lower()
    # tokenize
    words = tokenizer.tokenize(comment)
    # lemmatize 
    words = [lemmatize(w) for w in words]
    processed_comment = " ".join(words)
    return processed_comment

def process_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    processed_corpus = dataset['sentence_text'].apply(lambda text: pre_process(text))
    return processed_corpus

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhangyiming/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Preprocess the corpus dataframe

In [151]:
import datetime
start = datetime.datetime.now()

processed_corpus_df = pd.DataFrame(process_dataset(corpus_df))
processed_corpus_df.to_pickle("./processed_corpus.pkl")

end = datetime.datetime.now()
print(end-start)
processed_corpus_df.head(10)

0:00:00.052291


Unnamed: 0,sentence_text
0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,he be the first governor of missouri from it e...
2,mcnair be bear in lancaster in the province of...
3,his grandfather david mcnair sr immigrate to p...
4,david mcnair jr alexander s father lrb b 1736 ...
5,alexander go to school a a child and attend on...
6,he reach an agreement with his mother and brot...
7,alexander be defeat
8,he become a member of the pennsylvania militia...
9,in 1804 mcnair travel to what be now missouri ...


## Load processed corpus

In [152]:
load_processed_corpus_df = pd.read_pickle("./processed_corpus.pkl")
print(load_processed_corpus_df.shape)
load_processed_corpus_df.head(10)

(100, 1)


Unnamed: 0,sentence_text
0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,he be the first governor of missouri from it e...
2,mcnair be bear in lancaster in the province of...
3,his grandfather david mcnair sr immigrate to p...
4,david mcnair jr alexander s father lrb b 1736 ...
5,alexander go to school a a child and attend on...
6,he reach an agreement with his mother and brot...
7,alexander be defeat
8,he become a member of the pennsylvania militia...
9,in 1804 mcnair travel to what be now missouri ...


## Use Sklearn to build TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

start1 = datetime.datetime.now()

tfidf_vectorizer = TfidfVectorizer(max_features = 20000)
#tfidf = tfidf_vectorizer.fit(load_processed_corpus_df['sentence_text'])
tfidf = tfidf_vectorizer.fit(text_corpus)

pickle.dump(tfidf, open("tfidf.pickle","wb"))

end1 = datetime.datetime.now()
print(end1-start1)

NameError: name 'datetime' is not defined

## Build a dictionary for page identifier.

In [138]:
page_dictionary = {}

for id, doc in enumerate(corpus):
    page_dictionary.setdefault((doc.split()[0].replace("_", " ")), []).append(id)

# keys are the page identifier
page_keys = list(page_dictionary.keys())
print("Length of keys = {}".format(len(page_keys)))
print(page_keys[:10])


Length of keys = 5396106
['Alexander McNair', 'Alatskivi', 'An American Girl Story - Maryellen 1955-COLON- Extraordinary Christmas', 'Alta Outcome Document', 'Al ‘Urban', 'Albano buoy system', 'Amuka', 'Aleksandr Abdulkhalikov', 'Amalia Ciardi Duprè', 'Akbil']


## Build a word dictionary for each word in page identifier.

In [139]:
import numpy as np

def create_keywords(keys, dic):
    keywords = {}
    for key in keys:
        words = key.split()
        for word in words:
            if word not in keywords:
                keywords[word] = [dic[key]]
            else:
                keywords[word].append(dic[key])
    return keywords

# keywords should then remove stop words
keywords = create_keywords(page_keys, page_dictionary)
print(len(keywords))

1841142


In [140]:
flatten = lambda l: [item for sublist in l for item in sublist]
print(len(flatten(keywords['Aleksandr'])))
print(len(flatten(keywords['Kaepernick'])))
print(len(flatten(keywords['Colin'])))

4156
8
7661


In [141]:
# use retrieved sentence index to retrieve the raw sentence
# only return the sentence text part

retrieved_corpus = []
for sentence in flatten(keywords['Kaepernick']):
    #print(sentence)
    #print(corpus[sentence])
    retrieved_corpus.append(' '.join(corpus[sentence].split()[2:]))

print(len(retrieved_corpus))
print(retrieved_corpus[:5])

8
['Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .', 'Kaepernick played collegiate football at the University of Nevada where he was named the Western Athletic Offensive Player of the Year twice and was the Most Valuable Player of the 2008 Humanitarian Bowl .', 'Kaepernick was selected by the San Francisco 49ers in the second round of the 2011 NFL Draft .', "Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion .", "He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens ."]


In [142]:
corpus_keywords = pd.DataFrame(list(keywords.items()))
corpus_keywords.columns = ['Word','Index']
corpus_keywords.head()

Unnamed: 0,Word,Index
0,Alexander,"[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
1,McNair,"[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
2,Alatskivi,"[[18, 19, 20, 21], [1291710, 1291711, 1291712,..."
3,An,"[[22, 23, 24, 25], [2305, 2306], [3514, 3515, ..."
4,American,"[[22, 23, 24, 25], [80, 81, 82, 83], [143], [1..."


In [143]:
corpus_keywords.loc[corpus_keywords['Word'] == 'Colin']

Unnamed: 0,Word,Index
63373,Colin,"[[317928, 317929, 317930, 317931, 317932, 3179..."


## Find corpus index for all the claim tokens.

In [39]:
retrieved_sentences = []  # results raw sentences contains the page identifiers that is in the claim

test_claims = []
test_claim1 = "Aaron Burr killed Alexander Alexander Hamilton in Seaside Heights, New Jersey."
test_claim2 = "CHiPs is an American romance film."
test_claims.append(test_claim1)
test_claims.append(test_claim2)

for test_claim in test_claims:
    for token in tokenizer.tokenize(test_claim):
        print(token)
        if token in keywords:
            print(keywords[token])
            retrieved_sentences.append(keywords[token])
        
print("\n")      
flatten = lambda l: [item for sublist in l for item in sublist]
# retrieved sentence
print(retrieved_sentences[0])
print(retrieved_sentences[1])
#print(flatten(retrieved_sentences))
    

Aaron
Burr
killed
Alexander
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Hamilton
in
Seaside
Heights
New
Jersey
CHiPs
is
an
American
[80, 81, 82, 83]
romance
film


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[80, 81, 82, 83]


## Find all the corpus text for the claim tokens.

In [89]:
# use retrieved sentence index to retrieve the raw sentence
# only return the sentence text part

retrieved_corpus = []
for sentence in retrieved_sentences[0]:
    #print(sentence)
    #print(corpus[sentence])
    retrieved_corpus.append(' '.join(corpus[sentence].split()[2:]))

print(len(retrieved_corpus))
print(retrieved_corpus[:5])


18
['Alexander McNair -LRB- May 5 , 1775 -- March 18 , 1826 -RRB- was an American frontiersman and politician .', 'He was the first Governor of Missouri from its entry as a state in 1820 , until 1824 .', 'McNair was born in Lancaster in the Province of Pennsylvania and grew up in Mifflin County .', 'His grandfather , David McNair , Sr. , immigrated to Pennsylvania from Donaghmore , County Donegal , Ireland around 1733 and had Scottish ancestors from Loch Lomond .', "David McNair , Jr. , Alexander 's father -LRB- b. 1736 -RRB- , fought with General George Washington in the Trenton and Princeton campaigns in the winter of 1776 -- 77 , and died in February 1777 as a result of wounds received in battle and exposure when Alexander was less than two years old ."]


## Calculate cosine between claim and retrieved text.

In [None]:
corpus_test_claim = pd.DataFrame(list(test))
corpus_test_claim.columns = ['Word','Index']
corpus_test_claim.head()

In [157]:
from scipy.spatial.distance import cosine

print(test_claim1)
print("\n")
print(retrieved_corpus)

test_claim1_list = []
test_claim1_list.append(test_claim1)
tfidf_claim1 = tfidf.transform(test_claim1_list)
print(tfidf_claim1.shape)

tfidf_retrieved = tfidf.transform(retrieved_corpus)
print(tfidf_retrieved.shape)
print(tfidf_retrieved[0].shape)

print(1 - cosine(tfidf_claim1.toarray(),tfidf_retrieved[0].toarray()))
print(1 - cosine(tfidf_claim1.toarray(),tfidf_retrieved[1].toarray()))
print(1 - cosine(tfidf_claim1.toarray(),tfidf_retrieved[2].toarray()))
#cosine_value = 1 - cosine(tfidf_claim1,tfidf_retrieved)
#print(cosine_value)



Aaron Burr killed Alexander Hamilton in Seaside Heights, New Jersey.


['Colin Rand Kaepernick -LRB- -LSB- ` kæpərnɪk -RSB- ; born November 3 , 1987 -RRB- is an American football quarterback who is currently a free agent .', 'Kaepernick played collegiate football at the University of Nevada where he was named the Western Athletic Offensive Player of the Year twice and was the Most Valuable Player of the 2008 Humanitarian Bowl .', 'Kaepernick was selected by the San Francisco 49ers in the second round of the 2011 NFL Draft .', "Kaepernick began his professional career as a backup to Alex Smith , but became the 49ers ' starter in the middle of the 2012 season after Smith suffered a concussion .", "He remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .", 'During the 2013 season , his first full season as a starter , Kaepernick helped the 49ers reach the NFC Cha

## Top 5 Score

In [None]:
from scipy.spatial.distance import cosine

query_rep = tfidf_claim1.toarray()
docs_rep = tfidf_retrieved_corpus.toarray()

query_doc_cos_dist = [cosine(query_rep, doc_rep) for doc_rep in docs_rep]
query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))

print_count = 0
for rank, sort_index in enumerate(query_doc_sort_index):
    print ('Rank : ', rank, ' Consine : ', 1 - query_doc_cos_dist[sort_index],' Review : ', datax['Review Text'][sort_index])
    if print_count == 4 :
        break
    else:
        print_count += 1

In [133]:
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download("brown")

brown_docs = []
lemmatizer = WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

# iterate brown corpus and save words as documents
for paragraph in brown.paras():
    brown_word = set()
    for sentence in paragraph:
        for word in sentence:
            # remove words that are not alphabetic
            if word.isalpha():
                # lower-cased
                word = word.lower()
                # lemmatized
                # word = lemmatizer.lemmatize(word)
                word = lemmatize(word)
                brown_word.add(word)
                    
    brown_docs.append(brown_word)


[nltk_data] Downloading package brown to
[nltk_data]     /Users/zhangyiming/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [136]:
print(len(brown_docs))
print(brown_docs[0])

15667
{'county', 'take', 'an', 'any', 'grand', 'say', 'recent', 'evidence', 'jury', 'irregularity', 'of', 'produce', 'that', 'place', 'fulton', 'no', 'friday', 'the', 'election', 'investigation', 'primary'}


## Tokenize the claim.

In [23]:
 for key in list(res_data)[:10]:
    res_data[key]["evidence"] = []
    print(res_data[key]["claim"])
    claim_query = res_data[key]['claim']
    claim_tokens = tokenizer.tokenize(claim_query)
    print(claim_tokens)
    print("\n")

    for claim_token in claim_tokens:
        if claim_token in keywords:
            print(keywords[claim_token])
        
    

Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.
['Colin', 'Kaepernick', 'became', 'a', 'starting', 'quarterback', 'during', 'the', '49ers', '63rd', 'season', 'in', 'the', 'National', 'Football', 'League']


Tilda Swinton is a vegan.
['Tilda', 'Swinton', 'is', 'a', 'vegan']


Fox 2000 Pictures released the film Soul Food.
['Fox', '2000', 'Pictures', 'released', 'the', 'film', 'Soul', 'Food']


Anne Rice was born in New Jersey.
['Anne', 'Rice', 'was', 'born', 'in', 'New', 'Jersey']


Telemundo is a English-language television network.
['Telemundo', 'is', 'a', 'English', 'language', 'television', 'network']


Damon Albarn's debut album was released in 2011.
['Damon', 'Albarn', 's', 'debut', 'album', 'was', 'released', 'in', '2011']


There is a capital called Mogadishu.
['There', 'is', 'a', 'capital', 'called', 'Mogadishu']


Savages was exclusively a German film.
['Savages', 'was', 'exclusively', 'a', 'German', 'film']


Happin

# Bert

In [None]:
import numpy as np
from bert_serving.client import BertClient
from termcolor import colored

topk = 5

questions = retrieved_corpus
with BertClient(port=4000, port_out=4001) as bc:
    doc_vecs = bc.encode(questions)

    while True:
        query = input(colored('your question: ', 'green'))
        query_vec = bc.encode([query])[0]
        # compute normalized dot product as score
        score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1)
        topk_idx = np.argsort(score)[::-1][:topk]
        print('top %d questions similar to "%s"' % (topk, colored(query, 'green')))
        for idx in topk_idx:
            print('> %s\t%s' % (colored('%.1f' % score[idx], 'cyan'), colored(questions[idx], 'yellow')))