### Information Retrieval

This is the document retrieval and sentence retrieval part of the project.
Method 

#### 1. Unpack the zip file.
    
Unpack the 'wiki-pages-text.zip' in the current directory.

In [5]:
import zipfile

def unpack():
    with zipfile.ZipFile('wiki-pages-text.zip') as file:
        file.extractall()
unpack()

#### 2. Load file. 

Load the training dataset and the wiki txt file.

In [1]:
import os
import json
with open('train.json', 'r') as f:  # load training dataset
        train_data = json.load(f)   

print("Length of the train data is: " + str(len(train_data)))

# Top 3 instances in train data
print("Top 3 instances in train data")
for key in list(train_data)[:3]:
    print(key, train_data[key])
print("\n")

# appeand all the wiki txt sentences to one document
def loadfile(folder): 
    document = []
    list_of_files = os.listdir(folder)
    print(list_of_files)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    document.append(line)     
        except Exception as e:
            print("No files found here!")
            raise e
            
    return document

document = loadfile("wiki-pages-text")

print("\n")
print("Length of the document is: " + str(len(document)))
print("\n")

# index0 wiki file, index1 sentence
print(document[0])
print(document[1])
print(document[2])
print("\n")
        


Length of the train data is: 145449
Top 3 instances in train data
75397 {'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'label': 'SUPPORTS', 'evidence': [['Fox_Broadcasting_Company', 0], ['Nikolaj_Coster-Waldau', 7]]}
150448 {'claim': 'Roman Atwood is a content creator.', 'label': 'SUPPORTS', 'evidence': [['Roman_Atwood', 1], ['Roman_Atwood', 3]]}
214861 {'claim': 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.', 'label': 'SUPPORTS', 'evidence': [['History_of_art', 2]]}


['wiki-009.txt', 'wiki-021.txt', 'wiki-035.txt', 'wiki-034.txt', 'wiki-020.txt', 'wiki-008.txt', 'wiki-036.txt', 'wiki-022.txt', 'wiki-023.txt', 'wiki-037.txt', 'wiki-033.txt', 'wiki-027.txt', 'wiki-026.txt', 'wiki-032.txt', 'wiki-024.txt', 'wiki-030.txt', 'wiki-018.txt', 'wiki-019.txt', 'wiki-031.txt', 'wiki-025.txt', 'wiki-042.txt', 'wiki-056.txt', 'wiki-081.txt', 'wiki-095.txt', 'wiki-09

<b>3. Preprocess </b>

3.1 Preprocess the sentence: strip punctuations, tokenize,lemma, lower case, remove stop words.

In [22]:
import nltk
import re
from nltk.corpus import stopwords
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

stop_words = set(stopwords.words('english')) 

processed_doc = [] # processed_docs stores the list of processed docs
vocab = {}
unique_id = 0

def preprocess(sentence):
    norm_sentence = []
    sentence = re.sub(r'[^\w\s]', '', sentence) # strip punctuations: remove ',' '.',etc.
    tokens = nltk.tokenize.word_tokenize(sentence)  
                
    for token in tokens:
        token = lemmatize(token)
        token = token.lower() 
        if (token == "no" or token == "not"):  # keep no from stop words as it is useful for analysis
            norm_sentence.append(token)  
        if token not in stop_words:         # remove stop words
            norm_sentence.append(token)                  
    return norm_sentence

 # testing 10000 sentences
for sentence in document[:10000]:
    norm_sentence = preprocess(sentence)
    
    for token in norm_sentence:
        if token not in vocab:
            vocab.update({token: unique_id})     
            unique_id = unique_id + 1
                
    processed_doc.append(norm_sentence) 

print("\n")
print("Number of sentences = {}".format(len(processed_doc)))
print("Number of unique terms = {}".format(len(vocab)))

print("\n")
print(processed_doc[0])
print(processed_doc[1])
print(processed_doc[2])



Number of sentences = 10000
Number of unique terms = 25147


['alexander_mcnair', '0', 'alexander', 'mcnair', 'lrb', 'may', '5', '1775', 'march', '18', '1826', 'rrb', 'american', 'frontiersman', 'politician']
['alexander_mcnair', '1', 'first', 'governor', 'missouri', 'entry', 'state', '1820', '1824']
['alexander_mcnair', '4', 'mcnair', 'bear', 'lancaster', 'province', 'pennsylvania', 'grow', 'mifflin', 'county']


3.2 Calculate document term frequence.

In [29]:
from collections import Counter

def doc_term_freq(doc):
    doc_term_freqs = []
    
    for sentence in doc:
        doc_term_freqs.append(Counter(sentence)) 
    
    return doc_term_freqs

doc_term_freqs = doc_term_freq(processed_doc)

print("Number of doc term freqs = {}".format(len(doc_term_freqs)))
print("\n")
print(doc_term_freqs[0])
print(doc_term_freqs[0]['lrb'])
print(doc_term_freqs[1])
print(doc_term_freqs[2])

Number of doc term freqs = 10000


Counter({'alexander_mcnair': 1, '0': 1, 'alexander': 1, 'mcnair': 1, 'lrb': 1, 'may': 1, '5': 1, '1775': 1, 'march': 1, '18': 1, '1826': 1, 'rrb': 1, 'american': 1, 'frontiersman': 1, 'politician': 1})
1
Counter({'alexander_mcnair': 1, '1': 1, 'first': 1, 'governor': 1, 'missouri': 1, 'entry': 1, 'state': 1, '1820': 1, '1824': 1})
Counter({'alexander_mcnair': 1, '4': 1, 'mcnair': 1, 'bear': 1, 'lancaster': 1, 'province': 1, 'pennsylvania': 1, 'grow': 1, 'mifflin': 1, 'county': 1})


3.3 Build Inverted Index.

In [27]:
class InvertedIndex:
    def __init__(self, vocab, doc_term_freqs):
        self.vocab = vocab
        self.doc_len = [0] * len(doc_term_freqs)
        self.doc_term_freqs = [[] for i in range(len(vocab))]
        self.doc_ids = [[] for i in range(len(vocab))]
        self.doc_freqs = [0] * len(vocab)
        self.total_num_docs = 0
        self.total_doc_len = 0
        for docid, term_freqs in enumerate(doc_term_freqs):
            doc_len = sum(term_freqs.values())
            self.total_doc_len += doc_len
            self.doc_len[docid] = doc_len
            self.total_num_docs += 1
            for term, freq in term_freqs.items():
                term_id = vocab[term]
                self.doc_ids[term_id].append(docid)
                self.doc_term_freqs[term_id].append(freq)
                self.doc_freqs[term_id] += 1

    def num_terms(self):
        return len(self.doc_ids)

    def num_docs(self):
        return self.total_num_docs

    def docids(self, term):
        term_id = self.vocab[term]
        return self.doc_ids[term_id]

    def freqs(self, term):
        term_id = self.vocab[term]
        return self.doc_term_freqs[term_id]

    def f_t(self, term):
        term_id = self.vocab[term]
        return self.doc_freqs[term_id]

invindex = InvertedIndex(vocab, doc_term_freqs)

# print inverted index stats
print("documents = {}".format(invindex.num_docs()))
print("number of terms = {}".format(invindex.num_terms()))

documents = 10000
number of terms = 25147


3.4 Uses score function to rank the sentences.

\begin{equation*}
Score(Q,d) = \frac{1}{\sqrt{|d|}} \times \sum_{i=1}^q \log(1 + f_{d,t}) * \log( \frac{N}{f_t} ) 
\end{equation*}

In [28]:
from math import log, sqrt

# given a query and an index returns a list of the k highest scoring documents as tuples containing <docid,score>
def query_tfidf(query, index, k=5):
    
    # scores stores doc ids and their scores
    scores = Counter()
    for term in query:   
        N = index.num_docs()      # N: total number of documents
        ft = index.f_t(term)       # ft: document frequency of term
        docs = index.docids(term)    # docs: all doc ids that contain the term 
        dft = index.freqs(term)      # dft: all document freqs of the term         
        
        for num, docid in enumerate(docs):                # num: index used for iterate the docs                                                        
            fdt = dft[num]                                #fdt: frequency of term t in document d   
            length = sqrt(abs(index.doc_len[docid]))      # length: length of the doc
            tfidf = log(1 + fdt)*log(N/ft)                 # tfidf: construct the score formula 
            scores[docid] += tfidf/length                  # score: the final score

    return scores.most_common(k)

query = "Alexander_McNair"

processed_query = preprocess(query)
print("Processed query is = {}".format(processed_query))

results = query_tfidf(processed_query, invindex)
for rank, res in enumerate(results):
    print("RANK {:2d} DOCID {:8d} SCORE {:.3f} CONTENT {:}".format(rank+1,res[0],res[1],document[res[0]]))
    # print("RANK "+ str(rank+1) + " Evidence: " + str(document[res[0]][:100]))



Processed query is = ['alexander_mcnair']
RANK  1 DOCID        7 SCORE 2.190 CONTENT Alexander_McNair 9 Alexander was defeated .

RANK  2 DOCID       16 SCORE 1.549 CONTENT Alexander_McNair 24 After his time as governor , he worked in the Indian Department until his death .

RANK  3 DOCID        1 SCORE 1.460 CONTENT Alexander_McNair 1 He was the first Governor of Missouri from its entry as a state in 1820 , until 1824 .

RANK  4 DOCID       17 SCORE 1.460 CONTENT Alexander_McNair 25 He died of influenza , and is buried in Calvary Cemetery in St. Louis .

RANK  5 DOCID        2 SCORE 1.385 CONTENT Alexander_McNair 4 McNair was born in Lancaster in the Province of Pennsylvania and grew up in Mifflin County .



<b>BM25  </b>

Use BM25 to rank score. 

In [9]:
def BM25(query, index, k=5):
    k1 = 1.2
    k3 = 1.5
    b = 0.75
    # scores stores doc ids and their scores
    scores = Counter()
    length_average = index.total_doc_len/index.total_num_docs  # average length of document 
    
    for term in query:
        N = index.num_docs()      # N: total number of documents
        ft = index.f_t(term)       # ft: document frequency of term
        docs = index.docids(term)    # docs: all doc ids that contain the term 
        dft = index.freqs(term)      # dft: all document freqs of the term 
        fqt = Counter(query)[term]   # fqt: frequency of tern t in query
        
        for num, docid in enumerate(docs):                                                                       
            fdt = dft[num]                                #fdt: frequency of term t in document d   
            length = sqrt(abs(index.doc_len[docid]))      # length: length of the doc
            
            #BM25 consists of three parts
            idf = log((N - ft + 0.5)/(ft + 0.5))
            tf = ((k1 + 1) * fdt)/(k1 * ((1-b) + b * length/length_average) + fdt)
            query_tf = (k3 + 1) * fqt / (k3 + fqt)
            
            scores[docid] +=  idf * tf * query_tf           
    return scores.most_common(k)
    
bm25_results = BM25(processed_query, invindex)
for rank, res in enumerate(bm25_results):
    print("RANK {:2d} DOCID {:8d} SCORE {:.3f} CONTENT {:}".format(rank+1,res[0],res[1],document[res[0]]))


RANK  1 DOCID        0 SCORE 25147.000 CONTENT Alexander_McNair 0 Alexander McNair -LRB- May 5 , 1775 -- March 18 , 1826 -


