### Information Retrieval

This is the document retrieval and sentence retrieval part of the project.

####  1. Unpack the zip file.
    
Unpack the 'wiki-pages-text.zip' in the current directory.

In [5]:
import zipfile
def unpack():
    with zipfile.ZipFile('wiki-pages-text.zip') as file:
        file.extractall()
unpack()

#### 2. Load file. 

Load the training dataset and the wiki txt file.

In [117]:
import os
import json
with open('train.json', 'r') as f:  # load training dataset
        train_data = json.load(f)   
print("Length of the train data is: " + str(len(train_data)))

with open('devset.json', 'r') as f1:  # load dev dataset
        dev_data = json.load(f1) 
        
# appeand all the wiki txt sentences to one document
def loadfile(folder): 
    document = []
    list_of_files = os.listdir(folder)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    document.append(line)     
        except Exception as e:
            print("No files found here!")
            raise e
    return document
document = loadfile("wiki-pages-text")

print("Length of the document is: " + str(len(document)))

Length of the train data is: 145449


Length of the document is: 25248397


#### 3. Preprocess 
Preprocess includes: strip punctuations, tokenize,lemma, lower case, remove stop words.

In [113]:
import nltk
import re
from nltk.corpus import stopwords
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

stop_words = set(stopwords.words('english')) 

def preprocess(sentence):
    norm_sentence = []
    sentence = re.sub(r'[^\w\s]', '', sentence) # strip punctuations: remove ',' '.',etc.
    tokens = nltk.tokenize.word_tokenize(sentence)  
                
    for token in tokens:
        token = lemmatize(token)
        token = token.lower() 
        if (token == "no" or token == "not"):  # keep no from stop words as it is useful for analysis
            norm_sentence.append(token)  
        if token not in stop_words:         # remove stop words
            norm_sentence.append(token)                  
    return norm_sentence

#### 4. Build inverted index.
Compute document term frequency, build <b> inverted index </b> and then uses BM25 to rank.



In [114]:
from collections import Counter

def doc_term_freq(doc):
    doc_term_freqs = []
    for sentence in doc:
        doc_term_freqs.append(Counter(sentence)) 
    return doc_term_freqs

class InvertedIndex:
    def __init__(self, vocab, doc_term_freqs):
        self.vocab = vocab
        self.doc_len = [0] * len(doc_term_freqs)
        self.doc_term_freqs = [[] for i in range(len(vocab))]
        self.doc_ids = [[] for i in range(len(vocab))]
        self.doc_freqs = [0] * len(vocab)
        self.total_num_docs = 0
        self.total_doc_len = 0
        for docid, term_freqs in enumerate(doc_term_freqs):
            doc_len = sum(term_freqs.values())
            self.total_doc_len += doc_len
            self.doc_len[docid] = doc_len
            self.total_num_docs += 1
            for term, freq in term_freqs.items():
                term_id = vocab[term]
                self.doc_ids[term_id].append(docid)
                self.doc_term_freqs[term_id].append(freq)
                self.doc_freqs[term_id] += 1

    def num_terms(self):
        return len(self.doc_ids)

    def num_docs(self):
        return self.total_num_docs

    def docids(self, term):
        term_id = self.vocab[term]
        return self.doc_ids[term_id]

    def freqs(self, term):
        term_id = self.vocab[term]
        return self.doc_term_freqs[term_id]

    def f_t(self, term):
        term_id = self.vocab[term]
        return self.doc_freqs[term_id]

#### 5. BM25 
Okapi BM25 function is used to rank the sentences.

In [122]:
from math import log, sqrt

def BM25(query, index, k=5):
    k1 = 1.2
    k3 = 1.5
    b = 0.75
    # scores stores doc ids and their scores
    scores = Counter()
    length_average = index.total_doc_len/index.total_num_docs  # average length of document 
    
    for term in query:
        if term not in list(vocab.keys()):     # skip if the query word is not in the vocab
            continue
        N = index.num_docs()      # N: total number of documents
        ft = index.f_t(term)       # ft: document frequency of term
        docs = index.docids(term)    # docs: all doc ids that contain the term 
        dft = index.freqs(term)      # dft: all document freqs of the term 
        fqt = Counter(query)[term]   # fqt: frequency of tern t in query
        
        for num, docid in enumerate(docs):                                                                       
            fdt = dft[num]                                #fdt: frequency of term t in document d   
            length = sqrt(abs(index.doc_len[docid]))      # length: length of the doc
            #BM25 consists of three parts
            idf = log((N - ft + 0.5)/(ft + 0.5))
            tf = ((k1 + 1) * fdt)/(k1 * ((1-b) + b * length/length_average) + fdt)
            query_tf = (k3 + 1) * fqt / (k3 + fqt)
            
            scores[docid] +=  idf * tf * query_tf           
    return scores.most_common(k)

#### 6. Use page identifier to reduce the search space.

Build a dictionary that consists the <b>page identifier</b> as the key and the document index(ranges form 0 to 25248397 ） as value.

Examples looks like: "Alexander McNair" : [1,2,3,4...18] 

<b>To do:</b> the keywords list may also consider synonyms for each keyword.


In [118]:
final_dic = {}

for id, doc in enumerate(document):
    final_dic.setdefault((doc.split()[0].replace("_", " ")), []).append(id)

keys = list(final_dic.keys())
print("Length of keys = {}".format(len(keys)))

Length of keys = 5396106


#### 7. Retrieval Evidence
<b>7.1</b> Given a query, tokenize it first, then for each token in the query, try to find it in the keys list. 

In [149]:
# Given a query, returns a list that contains all the document index values.
def extract_sentences(query, keys, final_dic):
    retrievaled_sentences = []
    for word in  nltk.tokenize.word_tokenize(query):
        for key in keys:
            if re.search(word, key):
                retrievaled_sentences.append(final_dic[key])
            
    retrievaled_sentences = [item for sublist in retrievaled_sentences for item in sublist]
    return retrievaled_sentences


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 216, 217, 260, 538, 670, 671, 684, 685, 686, 687, 776, 777, 896, 897, 898, 992, 993, 994, 995, 1065, 1066, 1067, 1068, 1069, 1123, 1143, 1144, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1393, 1394, 1395, 1484, 1588, 1589, 1590, 1591, 1592, 1593, 1654, 1655, 2221, 2222, 2223, 2224, 2225, 2268, 2269, 2270, 2271, 2272, 2273, 2279, 2280, 883, 884, 885, 886, 887, 888]
108


<b>7.2</b> If there is a match or substring match, then retrieve that sentence by using index value to find the raw sentence in txt file. 

<b>7.3</b> Find all the related sentences by this way and then builds a inverted index and 
uses BM25 to rank and to retrieval top K sentences as the evidence.


In [158]:
# Use the result from above step, find all the raw sentences in txt file.
def retrieval_evidence(query, keys, final_dic):
    processed_doc = [] # processed_docs stores the list of processed docs
    vocab = {}
    unique_id = 0
    
    retrievaled_sentences = extract_sentences(query,keys,final_dic)
    
    # find the row sentences and save them in processed_doc
    for retrievaled_sentence in retrievaled_sentences:
        norm_sentence = preprocess(document[retrievaled_sentence])
        for token in norm_sentence:
            if token not in vocab:
                vocab.update({token: unique_id})     
                unique_id = unique_id + 1
        processed_doc.append(norm_sentence) 
    
    # calculate doc term freqs and build an inverted index
    doc_term_freqs = doc_term_freq(processed_doc)
    invindex = InvertedIndex(vocab, doc_term_freqs)
    bm25_results = BM25(processed_query, invindex)
    
    for rank, res in enumerate(bm25_results):
        print("RANK {:2d} DOCID {:8d} SCORE {:.3f} CONTENT {:}".format(rank+1,res[0],res[1],document[res[0]]))
    # return processed_doc,vocab

# test_query = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."
test_query = "Alexander Alatskivi"
retrieval_evidence(test_query,keys[:100],final_dic)

RANK  1 DOCID        4 SCORE 1.477 CONTENT Alexander_McNair 6 David McNair , Jr. , Alexander 's father -LRB- b. 1736 -RRB- , fought with General George Washington in the Trenton and Princeton campaigns in the winter of 1776 -- 77 , and died in February 1777 as a result of wounds received in battle and exposure when Alexander was less than two years old .

RANK  2 DOCID        7 SCORE 1.386 CONTENT Alexander_McNair 9 Alexander was defeated .

RANK  3 DOCID       14 SCORE 1.282 CONTENT Alexander_McNair 20 Later in September , he enlisted as a private in the First Regiment of Mounted Militia commanded by Colonel Alexander McNair .

RANK  4 DOCID        0 SCORE 1.274 CONTENT Alexander_McNair 0 Alexander McNair -LRB- May 5 , 1775 -- March 18 , 1826 -RRB- was an American frontiersman and politician .

RANK  5 DOCID        5 SCORE 1.274 CONTENT Alexander_McNair 7 Alexander went to school as a child , and attended one term at the College of Philadelphia -LRB- now the University of Pennsylvania

'\nprocessed_doc = [] \nvocab = {}\nretrieval_sen = extract_sentences(test_query,keys[:500],final_dic)\nprocessed_doc,vocab = retrieval_evidence(test_query,retrieval_sen)\n\nprint("Number of sentences = {}".format(len(processed_doc)))\nprint("Number of unique terms = {}".format(len(vocab)))\n\n\n\ndoc_term_freqs = doc_term_freq(processed_doc)\n\ninvindex = InvertedIndex(vocab, doc_term_freqs)\n\n# print inverted index stats\nprint("documents = {}".format(invindex.num_docs()))\nprint("number of terms = {}".format(invindex.num_terms()))\n\nquery = "Alexander Alatskivi"\nprocessed_query = preprocess(query)\n\nbm25_results = BM25(processed_query, invindex)\nfor rank, res in enumerate(bm25_results):\n    print("RANK {:2d} DOCID {:8d} SCORE {:.3f} CONTENT {:}".format(rank+1,res[0],res[1],document[res[0]]))\n'

<b>Predict evidence for 'train.json' dataset  </b>



In [124]:
def get_evidence(train_data):
    for key in list(train_data)[:10]:
        train_data[key]["predicted_evidence"] = []
        
        bm25_result = BM25(preprocess(train_data[key]["claim"]), invindex)
        for rank, res in enumerate(bm25_result):
            train_data[key]["predicted_evidence"].append([document[res[0]].split()[0], document[res[0]].split()[1]])
    return train_data

predicted_train = get_evidence(train_data)

for key in list(predicted_train)[:10]:
    print(predicted_train[key])
    print("\n")

{'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'label': 'SUPPORTS', 'evidence': [['Fox_Broadcasting_Company', 0], ['Nikolaj_Coster-Waldau', 7]], 'predicted_evidence': [['Alexander_McNair', '24']]}


{'claim': 'Roman Atwood is a content creator.', 'label': 'SUPPORTS', 'evidence': [['Roman_Atwood', 1], ['Roman_Atwood', 3]], 'predicted_evidence': []}


{'claim': 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.', 'label': 'SUPPORTS', 'evidence': [['History_of_art', 2]], 'predicted_evidence': []}


{'claim': 'Adrienne Bailon is an accountant.', 'label': 'REFUTES', 'evidence': [['Adrienne_Bailon', 0]], 'predicted_evidence': []}


{'claim': 'System of a Down briefly disbanded in limbo.', 'label': 'NOT ENOUGH INFO', 'evidence': [], 'predicted_evidence': []}


{'claim': 'Homeland is an American television spy thriller based on the Israeli television series Prisoners 