## Library

In [1]:
import os
import gc
import json
import pickle
import datetime
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import groupby
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix

## Load  dataframe

In [2]:
# Load the corpus from CSV; the pickle-based loader on the next line is kept only for reference.
# load_processed_corpus_df = pd.read_pickle("./processed_corpus.pkl")
load_processed_corpus_df = pd.read_csv("./new_wiki.csv")
# Report (rows, columns), then preview the first ten rows as the cell's displayed value.
print(load_processed_corpus_df.shape)
load_processed_corpus_df.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


(25248397, 3)


Unnamed: 0,page_identifier,sentence_number,text
0,Alexander_McNair,0,Alexander_McNair 0 Alexander McNair Alexander ...
1,Alexander_McNair,1,Alexander_McNair 1 Alexander McNair He was the...
2,Alexander_McNair,4,Alexander_McNair 4 Alexander McNair McNair was...
3,Alexander_McNair,5,Alexander_McNair 5 Alexander McNair His grandf...
4,Alexander_McNair,6,Alexander_McNair 6 Alexander McNair David McNa...
5,Alexander_McNair,7,Alexander_McNair 7 Alexander McNair Alexander ...
6,Alexander_McNair,8,Alexander_McNair 8 Alexander McNair He reached...
7,Alexander_McNair,9,Alexander_McNair 9 Alexander McNair Alexander ...
8,Alexander_McNair,10,Alexander_McNair 10 Alexander McNair He became...
9,Alexander_McNair,13,"Alexander_McNair 13 Alexander McNair In 1804 ,..."


## Load the dev, dev-result, and test datasets (the training data is loaded later)

In [3]:
def _load_json(path):
    """Read a JSON file as UTF-8 text and return the decoded object.

    The encoding is given explicitly so the result does not depend on the
    platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Dev claims.
dev_data = _load_json('devset.json')
print("Length of the dev data is: " + str(len(dev_data)))

# Copy of the dev set whose "evidence" fields are filled in later in the notebook.
res_data = _load_json('devset_result.json')
print("Length of the dev result data is: " + str(len(res_data)))

# Unlabelled test claims.
test_data = _load_json('test-unlabelled.json')
print("Length of the test data is: " + str(len(test_data)))

Length of the dev data is: 5001
Length of the dev result data is: 5001
Length of the test data is: 14997


## Preprocess

In [4]:
# Make sure the NLTK stop-word list used by pre_process is available locally.
nltk.download('stopwords')

# Shared WordNet lemmatizer instance used by lemmatize().
# NOTE(review): WordNetLemmatizer and word_tokenize presumably need the 'wordnet' and 'punkt'
# NLTK resources, which this cell does not download — confirm they are installed.
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize(word):
    """Return the lemma of ``word``, preferring its verb reading over its noun reading.

    The word is first lemmatized as a verb; the noun lemma is used only when
    the verb lemmatization leaves the word unchanged.
    """
    verb_form = lemmatizer.lemmatize(word, 'v')
    if verb_form != word:
        return verb_form
    return lemmatizer.lemmatize(word, 'n')

# Cache for the stop-word set so it is loaded once instead of on every pre_process call.
_STOP_WORD_CACHE = {}


def pre_process(comment) -> str:
    """Normalise a piece of text for bag-of-words matching.

    The text is lower-cased, tokenized with NLTK, each token is lemmatized
    with ``lemmatize``, English stop words are removed (the filter is applied
    to the lemmas, not the raw tokens) and the remaining lemmas are joined
    with single spaces.

    Args:
        comment: raw text (a claim or a corpus sentence).

    Returns:
        The processed text as one space-separated string.
    """
    stop_words = _STOP_WORD_CACHE.get('english')
    if stop_words is None:
        # Loading the list is comparatively expensive and list membership is
        # O(n); keep one frozenset for all later calls.
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        _STOP_WORD_CACHE['english'] = stop_words

    tokens = nltk.tokenize.word_tokenize(comment.lower())
    lemmas = [lemmatize(token) for token in tokens]
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangyiming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Build a dictionary mapping each page identifier to its row indexes in the dataframe (only run once)
## Merge the `-LRB-` (disambiguated) pages into their base page in the dictionary (only run once)

In [5]:
# Maps each page identifier to the list of DataFrame index labels of its sentences.
page_dictionary = {}

# Walk the index and the single column that is needed. DataFrame.iterrows()
# builds a full Series for every row, which is very slow on a corpus this large.
for index, page_identifier in zip(load_processed_corpus_df.index,
                                  load_processed_corpus_df['page_identifier']):
    # Float identifiers are skipped (presumably NaN from empty cells — TODO confirm).
    if isinstance(page_identifier, float):
        continue
    page_dictionary.setdefault(page_identifier, []).append(index)

def solve_LRB(dictionary):
    """Merge the sentence indexes of ``-LRB-`` pages into their base page.

    For every key that contains ``'_-LRB'`` (for example
    ``'Down_-LRB-film-RRB-'``) and whose prefix before that marker is itself a
    key (``'Down'``), the key's indexes are appended to the base key's list.

    The dictionary is modified in place and the same object is returned.
    Indexes already present in the base list are not appended again, so
    calling the function more than once does not create duplicates. Indexes
    must therefore be hashable.

    Args:
        dictionary: mapping of page identifier -> list of row indexes.

    Returns:
        The same ``dictionary`` object, after merging.
    """
    # Only list contents are mutated while iterating; no keys are added or removed.
    for key, indexes in dictionary.items():
        base, marker, _ = key.partition('_-LRB')
        if not marker:
            continue
        base_indexes = dictionary.get(base)
        if base_indexes is None:
            continue
        already_present = set(base_indexes)
        for row_index in indexes:
            if row_index not in already_present:
                already_present.add(row_index)
                base_indexes.append(row_index)

    return dictionary

# solve_LRB modifies and returns the dictionary it is given, so lrb_dictionary
# and page_dictionary refer to the same object after this line.
lrb_dictionary = solve_LRB(page_dictionary)

## Delete keys in dictionary.

In [230]:
# Page titles removed from the lookup dictionary (presumably because they are too
# generic to be useful evidence pages — TODO confirm the selection).
# 'California' was previously misspelled, which made that entry a silent no-op.
delete_words_list = ['The','Part', 'Most','Water', 'How', 'Love','Speech','American','President','German','Irish',
                     'Indian','Spanish','Japan','California','Americans','Chinese','British','Monday','Tuesday',
                    'Wednesday','Thursday', 'Friday','Saturday', 'Sunday','January','February','March','April',
                     'May', 'June', 'July', 'August', 'September', 'October', 'November','December']

for word in delete_words_list:
    # pop() with a default ignores titles that are not in the dictionary.
    lrb_dictionary.pop(word, None)
    

In [229]:
# Page identifiers known to the lookup dictionary. A dict keys view is used instead
# of a list copy: membership tests (`x in page_keys`) are hash lookups rather than a
# scan over every key, and the view always reflects keys deleted from lrb_dictionary.
# The view cannot be indexed by position; wrap it in list() where that is needed.
page_keys = lrb_dictionary.keys()

## Use dictionary to retrieve the sentence text.

In [231]:
def retrieve_sentenceText(claim_word,page_keys,page_dictionary,df):
    """Return the text of every corpus sentence that belongs to one page.

    Args:
        claim_word: page identifier string to look up.
        page_keys: collection of known page identifiers; the page must be in it.
        page_dictionary: mapping of page identifier -> list of row index labels of ``df``.
        df: corpus DataFrame with a ``'text'`` column.

    Returns:
        A list with the ``'text'`` value of each of the page's rows, in the
        order stored in ``page_dictionary``. Callers parse the leading page
        identifier and sentence number out of each text, so the strings are
        returned unmodified. An empty list is returned when the page is not in
        ``page_keys`` or not in ``page_dictionary``.
    """
    # Check the dictionary first: it is a hash lookup, so misses never pay for a
    # scan of page_keys, and a key list that is out of sync with the dictionary
    # yields an empty result instead of a KeyError.
    retrieved_index = page_dictionary.get(claim_word)
    if retrieved_index is None or claim_word not in page_keys:
        return []
    # One label-based selection for all rows instead of one .loc call per row.
    return df.loc[list(retrieved_index), 'text'].tolist()

## Rule on Claim: find the Upper cased words.

Rule 1: start at each capitalized word and extend it to the longest following word sequence that matches a page title.

In [232]:
def find_upper_word(claim, page_keys):
    """Extract candidate page titles from a claim.

    Scanning left to right, every word that starts with an upper-case letter
    opens a candidate. The candidate is extended word by word (joined with
    ``'_'``) and the longest extension found in ``page_keys`` is kept; when no
    extension matches, the single word itself is kept whether or not it is a
    known page. Scanning resumes after the last word of a matched title.
    Extension stops once four words in a row after the start (or after the
    last match) have failed to produce a match.

    Returns the candidates in claim order.
    """
    tokens = claim.split()
    total = len(tokens)
    candidates = []

    pos = 0
    while pos < total:
        if tokens[pos][0].isupper():
            best = tokens[pos]
            phrase = best
            # The range is fixed from the starting position even though `pos`
            # may advance inside the loop when a longer title matches.
            for nxt in range(pos + 1, total):
                phrase = "_".join((phrase, tokens[nxt]))
                if phrase in page_keys:
                    best = phrase      # keep the longest title seen so far
                    pos = nxt          # resume scanning after this word
                if nxt - pos > 3:
                    break
            candidates.append(best)
        pos += 1
    return candidates


In [233]:
# Smoke-test find_upper_word on a few hand-picked claims; one result list is printed per claim.
test_claim1 = "International students come to the University of Mississippi from 90 cities"
test_claim2 = "Home for the Holidays stars an American actress"
test_claim3 = "Water is part of the History of Earth"
test_claim4 = "How to Train Your Dragon 2 used real dragons"

for sample_claim in (test_claim1, test_claim2, test_claim3, test_claim4):
    print(find_upper_word(sample_claim, page_keys))

['International', 'University_of_Mississippi']
['Home_for_the_Holidays', 'American']
['Water', 'History_of_Earth']
['How_to_Train_Your_Dragon_2']


In [21]:
# TESTING LRB
# print(find_upper_word("Savages was exclusively a German film."))
# print(retrieve_sentenceText(find_upper_word("Savages was exclusively a German film.")[0], page_keys,lrb_dictionary,load_processed_corpus_df))

## Method1 :

### TFIDF Vectorizer and SVD

In [234]:
def retrieval_evidence_func(query,page_keys,page_dictionary,load_processed_corpus_df,top_k=4):
    """Retrieve evidence sentences for a claim with TF-IDF and a rank-2 SVD.

    The claim is normalised, candidate page titles are extracted with
    ``find_upper_word``, all sentences of those pages are collected with
    ``retrieve_sentenceText``, and the sentences are ranked by cosine distance
    to the claim in a 2-dimensional latent space obtained from an SVD of the
    TF-IDF term-document matrix.

    Args:
        query: the claim text.
        page_keys: collection of known page identifiers.
        page_dictionary: mapping of page identifier -> row indexes in the corpus.
        load_processed_corpus_df: corpus DataFrame with a ``'text'`` column whose
            values start with ``"<page_identifier> <sentence_number> "``.
        top_k: maximum number of evidence entries to return (non-negative). The
            default of 4 is what the previous hard-coded loop returned.
            NOTE(review): the old comment said "top 3" — confirm the intended value.

    Returns:
        A list of ``[page_identifier, sentence_number]`` pairs, best match
        first. Empty when the claim is empty, no capitalized word is found or
        no sentence could be retrieved.
    """
    def _to_evidence(sentence):
        # Corpus text starts with the page identifier and the sentence number.
        parts = sentence.split()
        return [parts[0], int(parts[1])]

    tokens = query.split()
    if not tokens:
        return []

    # A leading "There"/"A" is only capitalized because it starts the sentence.
    # Lower-case that first word only: replacing the substring in the whole
    # claim would also lower-case names such as "America" or "Al".
    if tokens[0] in ("There", "A"):
        tokens[0] = tokens[0].lower()

    # Strip trailing "'s", "'" and "." from each word, in that order, token by
    # token so that no other word is altered.
    for suffix in ("'s", "'", "."):
        tokens = [tok[:-len(suffix)] if tok.endswith(suffix) else tok for tok in tokens]
    query = " ".join(tok for tok in tokens if tok)

    candidate_pages = find_upper_word(query, page_keys)
    if not candidate_pages:
        return []

    query_corpus = []
    for query_word in candidate_pages:
        query_corpus.extend(retrieve_sentenceText(query_word, page_keys, page_dictionary, load_processed_corpus_df))

    if not query_corpus:
        # None of the candidates is a known page; fitting a vectorizer on an
        # empty corpus would raise.
        return []
    if len(query_corpus) == 1:
        return [_to_evidence(query_corpus[0])]

    # Term-document TF-IDF matrix (terms as rows) of the retrieved sentences.
    tfidf_vectorizer = TfidfVectorizer()
    term_doc_matrix = tfidf_vectorizer.fit_transform(query_corpus).T.toarray()

    # Rank-K latent representation. Only the leading K singular triplets are
    # used, so the economy-size SVD is sufficient and avoids building a
    # terms x terms matrix.
    K = 2
    U, s, VT = np.linalg.svd(term_doc_matrix, full_matrices=False)
    terms_rep = np.dot(U[:, :K], np.diag(s[:K]))
    docs_rep = np.dot(np.diag(s[:K]), VT[:K, :]).T

    # The claim is represented by the mean of the latent vectors of its known terms.
    vocabulary = tfidf_vectorizer.vocabulary_
    term_ids = [vocabulary[term] for term in pre_process(query).split() if term in vocabulary]

    if term_ids:
        query_rep = np.mean(terms_rep[term_ids], axis=0)
        # Cosine distance: smaller is better, so ascending argsort gives the ranking.
        distances = [cosine(query_rep, doc_rep) for doc_rep in docs_rep]
        ranking = np.argsort(np.array(distances))
    else:
        # No claim term occurs in the corpus vocabulary, so there is no ranking
        # signal (the mean of an empty selection is NaN); keep corpus order.
        ranking = range(len(query_corpus))

    return [_to_evidence(query_corpus[doc_index]) for doc_index in ranking[:top_k]]

## Method2: 
### Count Vectorizer

In [235]:
def retrieval_evidence_func2(query,page_keys,page_dictionary,load_processed_corpus_df,top_k=3):
    """Retrieve evidence sentences for a claim with bag-of-words cosine distance.

    The claim is normalised, candidate page titles are extracted with
    ``find_upper_word``, all sentences of those pages are collected with
    ``retrieve_sentenceText``, and the sentences are ranked by cosine distance
    between the count vectors of the pre-processed claim and sentence.

    Args:
        query: the claim text.
        page_keys: collection of known page identifiers.
        page_dictionary: mapping of page identifier -> row indexes in the corpus.
        load_processed_corpus_df: corpus DataFrame with a ``'text'`` column whose
            values start with ``"<page_identifier> <sentence_number> "``.
        top_k: maximum number of evidence entries to return (non-negative).
            The default of 3 matches the previous hard-coded limit.

    Returns:
        A list of ``[page_identifier, sentence_number]`` pairs, best match
        first. Empty when the claim is empty, no capitalized word is found or
        no sentence could be retrieved.
    """
    def _to_evidence(sentence):
        # Corpus text starts with the page identifier and the sentence number.
        parts = sentence.split()
        return [parts[0], int(parts[1])]

    tokens = query.split()
    if not tokens:
        return []

    # A leading "There"/"A" is only capitalized because it starts the sentence.
    # Lower-case that first word only: replacing the substring in the whole
    # claim would also lower-case names such as "America" or "Al".
    if tokens[0] in ("There", "A"):
        tokens[0] = tokens[0].lower()

    # Strip trailing "'s", "'", "." and "," from each word, in that order, token
    # by token so that no other word is altered.
    for suffix in ("'s", "'", ".", ","):
        tokens = [tok[:-len(suffix)] if tok.endswith(suffix) else tok for tok in tokens]
    query = " ".join(tok for tok in tokens if tok)

    candidate_pages = find_upper_word(query, page_keys)
    if not candidate_pages:
        return []

    query_corpus = []
    for query_word in candidate_pages:
        query_corpus.extend(retrieve_sentenceText(query_word, page_keys, page_dictionary, load_processed_corpus_df))

    if not query_corpus:
        return []
    if len(query_corpus) == 1:
        return [_to_evidence(query_corpus[0])]

    # Row 0 is the pre-processed claim, rows 1.. are the pre-processed sentences,
    # so claim and sentences share one vocabulary.
    documents = [pre_process(query)]
    documents.extend(pre_process(sentence) for sentence in query_corpus)
    count_matrix = CountVectorizer().fit_transform(documents)

    # Use plain 1-D ndarrays: scipy's cosine() is defined for 1-D vectors, and
    # np.matrix rows (from .todense()) are always 2-D.
    query_rep = count_matrix[0].toarray().ravel()
    docs_rep = count_matrix[1:].toarray()

    # Cosine distance: smaller is better, so ascending argsort gives the ranking.
    query_doc_cos_dist = [cosine(query_rep, doc_rep) for doc_rep in docs_rep]
    query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))

    return [_to_evidence(query_corpus[doc_index]) for doc_index in query_doc_sort_index[:top_k]]

## Testing

In [236]:
# Smoke tests for retrieval_evidence_func2 on hand-picked claims.
test_claim1 = "Brad Wilk helped co-found Rage in 1962."
test_claim2 = "The Faroe Islands are no longer part of the Kingdom of Mercia."
test_claim3 = "Down With Love is a 2003 comedy film."
test_claim4 = "Telemundo is a English-language television network."
# Exercises the -LRB- page merging.
test_claim5 = 'Hourglass is performed by a Russian singer-songwriter.'
# Exercises a claim whose first word is "There".
test_claim6 = "There are no musical or creative works in existence that have been created by Phillip Glass."
# Exercises a word ending in 's.
test_claim7 = "Damon Albarn's debut album was released in 2011."

# Defined but not run below.
test_claim8 = "Temple of the Dog celebrated the 37th anniversary of their self-titled album."
test_claim9 = "Savages was exclusively a German film."

for sample_claim in (test_claim1, test_claim2, test_claim3, test_claim4,
                     test_claim5, test_claim6, test_claim7):
    print(retrieval_evidence_func2(sample_claim, page_keys, lrb_dictionary, load_processed_corpus_df))

[['Brad_Wilk', 4], ['Brad_Wilk', 0], ['Rage', 0]]
[['Faroe_Islands', 9], ['Faroe_Islands', 1], ['Faroe_Islands', 11]]
[['With_Love', 0], ['Down_-LRB-film-RRB-', 2], ['Down_-LRB-film-RRB-', 9]]
[['Telemundo', 0], ['Telemundo', 9], ['Telemundo', 8]]
[['Russian', 18], ['Russian', 23], ['Russian', 16]]
[['Glass', 20], ['Glass', 14], ['Glass', 10]]
[['Damon_Albarn', 4], ['Damon_Albarn', 13], ['Damon_Albarn', 12]]


In [None]:
# Still to test: claims for which evidence cannot be found.
test1 = "Boyhood is about Mason Evans, Jr's childhood."
test2 = "In 1986, Tatum O'Neal got married."


## Write result to  devset file.

In [237]:
# Retrieve evidence for every dev claim, printing a running count of processed claims.
i = 0
for key, entry in list(res_data.items()):
    entry["evidence"] = []
    query = entry["claim"]
    entry["evidence"] = retrieval_evidence_func2(query, page_keys, lrb_dictionary, load_processed_corpus_df)
    i += 1
    print(i)

# The file is written only after the whole loop has finished.
with open('result_dev_520_4_tf5.json', 'w') as f:
    json.dump(res_data, f, indent=4)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208


KeyboardInterrupt: 

## Write result to test file.

In [None]:
# Retrieve evidence for every test claim, printing a running count of processed claims.
i = 0
for key, entry in list(test_data.items()):
    entry["evidence"] = []
    query = entry["claim"]
    entry["evidence"] = retrieval_evidence_func2(query, page_keys, lrb_dictionary, load_processed_corpus_df)
    i += 1
    print(i)

# The file is written only after the whole loop has finished.
with open('result_test_519_tf3.json', 'w') as f:
    json.dump(test_data, f, indent=4)

## DataFrame for BERT Training

In [None]:
with open('train.json', 'r') as f4:  # load the training claims
        train_data = json.load(f4) 

# Module-level accumulators; bert_data() below appends to claim_list and evidence_list.
claim_list = []
evidence_list = []
result_list = []

def bert_data(train_data,page_keys,lrb_dictionary,load_processed_corpus_df):
    for key in list(train_data):
        claim = (train_data[key]["claim"])
        claim_list.append(claim)
        
        res = []
        # Determine if the first word in query is "There" and "A"
        if claim.split()[0] == "There":
            claim = claim.replace("There","there") 

        if claim.split()[0] == "A":
            claim = claim.replace("A","a") 

        # remove all 's 
        for word in claim.split():
            if (word[-2:len(word)]) == "'s":
                claim = claim.replace(word,word[:-2])

         # remove all '
        for word in claim.split():
            if (word[-1:len(word)]) == "'":
                claim = claim.replace(word,word[:-1])

        # remove all .
        for word in claim.split():
            if (word[-1:len(word)]) == ".":
                claim = claim.replace(word,word[:-1])

        if len(find_upper_word(claim,page_keys)) >= 1:
            for claim_word in find_upper_word(claim,page_keys):
                claim_corpus = retrieve_sentenceText(claim_word, page_keys,lrb_dictionary,load_processed_corpus_df)        
                    
                evidence_list.append(claim_corpus)
        