### Information Retrieval

This is the document retrieval and sentence retrieval part of the project.
Method 

#### 1. Unpack the zip file.
    
Unpack the 'wiki-pages-text.zip' in the current directory.

In [5]:
import zipfile

def unpack():
    with zipfile.ZipFile('wiki-pages-text.zip') as file:
        file.extractall()
unpack()

#### 2. Load file. 

Load the training dataset and the wiki txt file.

In [1]:
import os
import json

# load training dataset
with open('train.json', 'r') as f:
        train_data = json.load(f)  

print("Length of the train data is: " + str(len(train_data)))

# Top 3 instances in train data
print("Top 3 instances in train data")
for key in list(train_data)[:3]:
    print(key, train_data[key])

print("\n")

# appeand all the wiki txt sentences to one document
def loadfile(folder): 
    document = []
    list_of_files = os.listdir(folder)
    print(list_of_files)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    document.append(line)
                 
        except Exception as e:
            print("No files found here!")
            raise e
            
    return document

document = loadfile("wiki-pages-text")

print("\n")
print("Length of the document is: " + str(len(document)))
print("\n")

# index0 wiki file, index1 sentence
print(document[0])
print(document[1])
print(document[2])
print("\n")

        


Length of the train data is: 145449
Top 3 instances in train data
75397 {'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'label': 'SUPPORTS', 'evidence': [['Fox_Broadcasting_Company', 0], ['Nikolaj_Coster-Waldau', 7]]}
150448 {'claim': 'Roman Atwood is a content creator.', 'label': 'SUPPORTS', 'evidence': [['Roman_Atwood', 1], ['Roman_Atwood', 3]]}
214861 {'claim': 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.', 'label': 'SUPPORTS', 'evidence': [['History_of_art', 2]]}


['wiki-009.txt', 'wiki-021.txt', 'wiki-035.txt', 'wiki-034.txt', 'wiki-020.txt', 'wiki-008.txt', 'wiki-036.txt', 'wiki-022.txt', 'wiki-023.txt', 'wiki-037.txt', 'wiki-033.txt', 'wiki-027.txt', 'wiki-026.txt', 'wiki-032.txt', 'wiki-024.txt', 'wiki-030.txt', 'wiki-018.txt', 'wiki-019.txt', 'wiki-031.txt', 'wiki-025.txt', 'wiki-042.txt', 'wiki-056.txt', 'wiki-081.txt', 'wiki-095.txt', 'wiki-09

<b>3. Method one : TF-IDF  </b>

3.1 Preprocess the sentence: strip punctuations, tokenize,stem, lower case, remove stop words.

In [2]:
import nltk
import re
from nltk.corpus import stopwords

stemmer = nltk.stem.PorterStemmer()
stop_words = set(stopwords.words('english')) 

# processed_docs stores the list of processed docs
preprocess = []
processed_doc = []
vocab = {}

def preprocess(document):
    # vocab contains (term, term id) pairs
    vocab = {}
    # unique id for each term
    unique_id = 0
    
    # for each sentence in the document
    for sentence in document: 
        
        norm_sentence = []
        
        # strip punctuations: remove ',' '.',etc.
        sentence = re.sub(r'[^\w\s]', '', sentence)
        
        # tokenize words
        tokens = nltk.tokenize.word_tokenize(sentence)  
                
        for token in tokens:
            # stem the token
            token = stemmer.stem(token)  
            # lower case the token
            token = token.lower()  
        
            # remove stop words
            if token not in stop_words:
                norm_sentence.append(token)  
                if token not in vocab:
                    vocab.update({token: unique_id})     
                    unique_id = unique_id + 1
                
        processed_doc.append(norm_sentence)
        
    return processed_doc, vocab

# testing: only first 100 sentences
preprocess = preprocess(document[:100])

# processed_doc is the document that all the sentences after preprocessed.
processed_doc = preprocess[0]
# vocab is all the unique tokens of the document
vocab = preprocess[1]

print("Number of sentences = {}".format(len(processed_doc)))
print("Number of unique terms = {}".format(len(vocab)))

print("\n")
print(processed_doc[0])
print(processed_doc[1])
print(processed_doc[2])

Number of sentences = 100
Number of unique terms = 800


['alexander_mcnair', '0', 'alexand', 'mcnair', 'lrb', 'may', '5', '1775', 'march', '18', '1826', 'rrb', 'wa', 'american', 'frontiersman', 'politician']
['alexander_mcnair', '1', 'wa', 'first', 'governor', 'missouri', 'entri', 'state', '1820', '1824']
['alexander_mcnair', '4', 'mcnair', 'wa', 'born', 'lancast', 'provinc', 'pennsylvania', 'grew', 'mifflin', 'counti']


3.2 Calculate document term frequence.

In [3]:
from collections import Counter

def doc_term_freq(doc):
    doc_term_freqs = []
    
    for sentence in doc:
        doc_term_freqs.append(Counter(sentence)) 
    
    return doc_term_freqs

doc_term_freqs = doc_term_freq(processed_doc)

# first 100 sentences
print("Number of doc term freqs = {}".format(len(doc_term_freqs)))
print("\n")

print(doc_term_freqs[0])
print(doc_term_freqs[1])
print(doc_term_freqs[2])

Number of doc term freqs = 100


Counter({'alexander_mcnair': 1, '0': 1, 'alexand': 1, 'mcnair': 1, 'lrb': 1, 'may': 1, '5': 1, '1775': 1, 'march': 1, '18': 1, '1826': 1, 'rrb': 1, 'wa': 1, 'american': 1, 'frontiersman': 1, 'politician': 1})
Counter({'alexander_mcnair': 1, '1': 1, 'wa': 1, 'first': 1, 'governor': 1, 'missouri': 1, 'entri': 1, 'state': 1, '1820': 1, '1824': 1})
Counter({'alexander_mcnair': 1, '4': 1, 'mcnair': 1, 'wa': 1, 'born': 1, 'lancast': 1, 'provinc': 1, 'pennsylvania': 1, 'grew': 1, 'mifflin': 1, 'counti': 1})


3.3 Build Inverted Index.

In [12]:
class InvertedIndex:
    def __init__(self, vocab, doc_term_freqs):
        self.vocab = vocab
        self.doc_len = [0] * len(doc_term_freqs)
        self.doc_term_freqs = [[] for i in range(len(vocab))]
        self.doc_ids = [[] for i in range(len(vocab))]
        self.doc_freqs = [0] * len(vocab)
        self.total_num_docs = 0
        self.max_doc_len = 0
        for docid, term_freqs in enumerate(doc_term_freqs):
            doc_len = sum(term_freqs.values())
            self.max_doc_len = max(doc_len, self.max_doc_len)
            self.doc_len[docid] = doc_len
            self.total_num_docs += 1
            for term, freq in term_freqs.items():
                term_id = vocab[term]
                self.doc_ids[term_id].append(docid)
                self.doc_term_freqs[term_id].append(freq)
                self.doc_freqs[term_id] += 1

    def num_terms(self):
        return len(self.doc_ids)

    def num_docs(self):
        return self.total_num_docs

    def docids(self, term):
        term_id = self.vocab[term]
        return self.doc_ids[term_id]

    def freqs(self, term):
        term_id = self.vocab[term]
        return self.doc_term_freqs[term_id]

    def f_t(self, term):
        term_id = self.vocab[term]
        return self.doc_freqs[term_id]

invindex = InvertedIndex(vocab, doc_term_freqs)

# print inverted index stats
print("documents = {}".format(invindex.num_docs()))
print("number of terms = {}".format(invindex.num_terms()))
print("longest document length = {}".format(invindex.max_doc_len))

print("\n")
print(vocab)

documents = 100
number of terms = 800
longest document length = 37


{'alexander_mcnair': 0, '0': 1, 'alexand': 2, 'mcnair': 3, 'lrb': 4, 'may': 5, '5': 6, '1775': 7, 'march': 8, '18': 9, '1826': 10, 'rrb': 11, 'wa': 12, 'american': 13, 'frontiersman': 14, 'politician': 15, '1': 16, 'first': 17, 'governor': 18, 'missouri': 19, 'entri': 20, 'state': 21, '1820': 22, '1824': 23, '4': 24, 'born': 25, 'lancast': 26, 'provinc': 27, 'pennsylvania': 28, 'grew': 29, 'mifflin': 30, 'counti': 31, 'hi': 32, 'grandfath': 33, 'david': 34, 'sr': 35, 'immigr': 36, 'donaghmor': 37, 'doneg': 38, 'ireland': 39, 'around': 40, '1733': 41, 'scottish': 42, 'ancestor': 43, 'loch': 44, 'lomond': 45, '6': 46, 'jr': 47, 'father': 48, 'b': 49, '1736': 50, 'fought': 51, 'gener': 52, 'georg': 53, 'washington': 54, 'trenton': 55, 'princeton': 56, 'campaign': 57, 'winter': 58, '1776': 59, '77': 60, 'die': 61, 'februari': 62, '1777': 63, 'result': 64, 'wound': 65, 'receiv': 66, 'battl': 67, 'exposur': 68, 'less': 69, 

3.4 Uses score function to rank the sentences.

\begin{equation*}
Score(Q,d) = \frac{1}{\sqrt{|d|}} \times \sum_{i=1}^q \log(1 + f_{d,t}) * \log( \frac{N}{f_t} ) 
\end{equation*}

In [17]:
from math import log, sqrt

# given a query and an index returns a list of the k highest scoring documents as tuples containing <docid,score>
def query_tfidf(query, index, k=10):
    
    # scores stores doc ids and their scores
    scores = Counter()

    for term in query:   
        # N: total number of documents
        N = index.num_docs()    
        # ft: document frequency of term 
        ft = index.f_t(term) 
        # docs: all doc ids that contain the term 
        docs = index.docids(term)
        # dft: all document freqs of the term
        dft = index.freqs(term)                 
        
        # num: index used for iterate the docs
        for num, docid in enumerate(docs):                   
            #fdt: frequency of term t in document d                                                   
            fdt = dft[num]
            # length: length of the doc
            length = sqrt(abs(index.doc_len[docid]))        
            
            # tfidf: construct the score formula 
            tfidf = log(1 + fdt)*log(N/ft) 
            # score: the final score
            scores[docid] += tfidf/length 

    return scores.most_common(k)


# testing: query is a claim from the train dataset

# query = "Alexander Lukashenko is a head of state"
query = "Alexander princeton is a of state"
# stemmed_query = nltk.stem.PorterStemmer().stem(query).split()

# strip punctuations: remove ',' '.',etc.
stemmed_query = []
query = re.sub(r'[^\w\s]', '', query)      
tokens = nltk.tokenize.word_tokenize(query)  
                
for token in tokens:
    token = stemmer.stem(token)  
    token = token.lower()  
    if token not in stop_words:
        stemmed_query.append(token)  
print("stemmed query is = {}".format(stemmed_query))

      
results = query_tfidf(stemmed_query, invindex)
for rank, res in enumerate(results):
    print("RANK {:2d} DOCID {:8d} SCORE {:.3f} CONTENT {:}".format(rank+1,res[0],res[1],document[res[0]][:75]))


stemmed query is = ['alexand', 'princeton', 'state']
RANK  1 DOCID        4 SCORE 1.096 CONTENT Alexander_McNair 6 David McNair , Jr. , Alexander 's father -LRB- b. 1736 -
RANK  2 DOCID        7 SCORE 0.929 CONTENT Alexander_McNair 9 Alexander was defeated .

RANK  3 DOCID        1 SCORE 0.617 CONTENT Alexander_McNair 1 He was the first Governor of Missouri from its entry as 
RANK  4 DOCID       64 SCORE 0.588 CONTENT Alabama_elections,_2018 0 A general election will be held in the U.S. state
RANK  5 DOCID        9 SCORE 0.563 CONTENT Alexander_McNair 13 In 1804 , McNair traveled to what is now Missouri , the
RANK  6 DOCID       14 SCORE 0.555 CONTENT Alexander_McNair 20 Later in September , he enlisted as a private in the Fi
RANK  7 DOCID        5 SCORE 0.536 CONTENT Alexander_McNair 7 Alexander went to school as a child , and attended one t
RANK  8 DOCID       38 SCORE 0.521 CONTENT Alta_Outcome_Document 26 Indigenous peoples ' increasing involvement with U
RANK  9 DOCID       65 SCO