## Information Retrieval

This is the document retrieval and sentence retrieval part of the project.

###  1. Unpack the zip file.
    
Unpack the 'wiki-pages-text.zip' in the current directory.

In [2]:
import zipfile
def unpack():
    with zipfile.ZipFile('wiki-pages-text.zip') as file:
        file.extractall()
# unpack()

### 2. Load file. 

Load the training dataset and the wiki txt file.

In [3]:
import os
import json
import pandas as pd

with open('train.json', 'r') as f:  # load training dataset
        train_data = json.load(f)   
print("Length of the train data is: " + str(len(train_data)))

with open('devset.json', 'r') as f1:  # load dev dataset
        dev_data = json.load(f1) 
print("Length of the dev data is: " + str(len(dev_data)))

with open('devset_result.json', 'r') as f2:  # store result 
        res_data = json.load(f2) 
print("Length of the dev result data is: " + str(len(res_data)))

with open('test-unlabelled.json', 'r') as f3:  # store result 
     test_data = json.load(f3) 
print("Length of the test data is: " + str(len(test_data)))
        
# appeand all the wiki txt sentences to one document
def loadfile(folder): 
    corpus = []
    list_of_files = os.listdir(folder)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    corpus.append(line)     
        except Exception as e:
            print("No files found here!")
            raise e
    return corpus
corpus = loadfile("wiki-pages-text")
print("Length of the corpus is: " + str(len(corpus)))


Length of the train data is: 145449
Length of the dev data is: 5001
Length of the dev result data is: 5001
Length of the test data is: 14997
Length of the corpus is: 25248397


#### Pandas process the data.

In [24]:
corpus_df = pd.DataFrame(corpus[:1000])
corpus_df.columns = ['text']

corpus_df['page_identifier'] = corpus_df.text.apply(lambda x: x.split(' ')[0])  
corpus_df['sentence_number'] = corpus_df.text.apply(lambda x: x.split(' ')[1]) 
corpus_df['sentence_text'] = corpus_df.text.apply(lambda x: x.split(' ')[2:])  
corpus_df['sentence_text'] = [','.join(map(str, l)) for l in corpus_df['sentence_text']]
corpus_df["sentence_text"] = corpus_df['sentence_text'].str.replace(',',' ')
corpus_df = corpus_df.drop('text', 1)

print(corpus_df.shape)
corpus_df.head(10)

(1000, 3)


Unnamed: 0,page_identifier,sentence_number,sentence_text
0,Alexander_McNair,0,Alexander McNair -LRB- May 5 1775 -- March 1...
1,Alexander_McNair,1,He was the first Governor of Missouri from its...
2,Alexander_McNair,4,McNair was born in Lancaster in the Province o...
3,Alexander_McNair,5,His grandfather David McNair Sr. immigra...
4,Alexander_McNair,6,David McNair Jr. Alexander 's father -LRB-...
5,Alexander_McNair,7,Alexander went to school as a child and atte...
6,Alexander_McNair,8,He reached an agreement with his mother and br...
7,Alexander_McNair,9,Alexander was defeated .\n
8,Alexander_McNair,10,He became a member of the Pennsylvania militia...
9,Alexander_McNair,13,In 1804 McNair traveled to what is now Misso...


### 3. Preprocess 

Preprocess includes: strip punctuations, tokenize,lemma, lower case, remove stop words.

In [25]:
import nltk

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

nltk.download('wordnet')
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process(comment) -> str:
    # lower cased
    comment = comment.lower()
    # tokenize
    words = tokenizer.tokenize(comment)
    # lemmatize 
    words = [lemmatize(w) for w in words]
    processed_comment = " ".join(words)
    return processed_comment

def process_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    processed_corpus = dataset['sentence_text'].apply(lambda text: pre_process(text))
    return processed_corpus

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhangyiming/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Save the pre-processed corpus dataframe.

In [26]:
# import datetime
# start = datetime.datetime.now()
processed_corpus_df = pd.DataFrame(process_dataset(corpus_df))
processed_corpus_df.to_pickle("./processed_corpus.pkl")
processed_corpus_df.head(10)
# end = datetime.datetime.now()
# print(end-start)

Unnamed: 0,sentence_text
0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,he be the first governor of missouri from it e...
2,mcnair be bear in lancaster in the province of...
3,his grandfather david mcnair sr immigrate to p...
4,david mcnair jr alexander s father lrb b 1736 ...
5,alexander go to school a a child and attend on...
6,he reach an agreement with his mother and brot...
7,alexander be defeat
8,he become a member of the pennsylvania militia...
9,in 1804 mcnair travel to what be now missouri ...


#### Load pre-processed corpus dataframe.

In [8]:
load_processed_corpus_df = pd.read_pickle("./processed_corpus.pkl")
print(load_processed_corpus_df.shape)
load_processed_corpus_df.head(10)

(1000, 1)


Unnamed: 0,sentence_text
0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,he be the first governor of missouri from it e...
2,mcnair be bear in lancaster in the province of...
3,his grandfather david mcnair sr immigrate to p...
4,david mcnair jr alexander s father lrb b 1736 ...
5,alexander go to school a a child and attend on...
6,he reach an agreement with his mother and brot...
7,alexander be defeat
8,he become a member of the pennsylvania militia...
9,in 1804 mcnair travel to what be now missouri ...


### 4. Use Sklearn to build tf-idf.

tfidf_vectorizer is the tf-idf model.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(load_processed_corpus_df['sentence_text'])
tfidf = tfidf.T

print('Vocabulary Size : ', len(tfidf_vectorizer.get_feature_names()))
print('Shape of Matrix : ', tfidf.shape)

pickle.dump(tfidf, open("tfidf.pickle","wb"))

Vocabulary Size :  4650
Shape of Matrix :  (4650, 1000)


#### Apply SVD 

In [44]:
import numpy as np

# Applying SVD
K= 20 # number of desirable features 
U, s, VT = np.linalg.svd(tfidf.toarray())
tfidf_reduced = np.dot(U[:,:K], np.dot(np.diag(s[:K]), VT[:K, :]))

# Getting document and term representation
terms_rep = np.dot(U[:,:K], np.diag(s[:K])) # M X K matrix where M = Vocabulary Size and N = Number of documents
docs_rep = np.dot(np.diag(s[:K]), VT[:K, :]).T # N x K matrix 

print(terms_rep.shape)
print(docs_rep.shape)

(4650, 20)
(1000, 20)


####  Test tfidf with query.

In [54]:
import math

load_tfidf = pickle.load(open("tfidf.pickle","rb"))

def lsa_query(query):
    query_rep = [vectorizer.vocabulary_[x] for x in pre_process(query).split()]
    query_rep = np.mean(terms_rep[query_rep],axis=0)
    return query_rep

# cosine distance of two vectors
def cosine_similarity(v1, v2):
    dot_product = 0.0
    magnitude_a = 0.0
    magnitude_b = 0.0
    for a, b in zip(v1, v2):
        dot_product += a * b
        magnitude_a += a ** 2
        magnitude_b += b ** 2
    if magnitude_a == 0.0 or magnitude_b == 0.0:
        return None
    else:
        return dot_product / (math.sqrt(magnitude_a * magnitude_b))

query_rep = "Alexander  Alatskivi"
pre_query= lsa_query(query_rep)
print(pre_query)
print(docs_rep[0])
print(cosine_similarity(pre_query,docs_rep[0]))

query_doc_cos_dist = [cosine_similarity(query_rep, doc_rep) for doc_rep in docs_rep]
query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))

print_count = 0
for rank, sort_index in enumerate(query_doc_sort_index):
    print ('Rank : ', rank, ' Consine : ', query_doc_cos_dist[sort_index],' Sentence : ', load_processed_corpus_df['sentence_text'][sort_index])
    if print_count == 4 :
        break
    else:
        print_count += 1


[-0.08030084  0.07131535 -0.02114174 -0.01343357  0.00086819 -0.00232813
 -0.10948688 -0.00657705 -0.03682033 -0.01810879 -0.02005786  0.02662247
 -0.04509342  0.0063877  -0.03200761  0.03458823  0.02123711  0.05318392
  0.02635511 -0.02561251]
[-0.13757185  0.19350078 -0.02065905  0.12258203 -0.02870684  0.02541737
 -0.18322714  0.01045996 -0.11076183  0.03505023 -0.06895175  0.04991295
 -0.05618872  0.05047644 -0.0666606   0.09385609 -0.0368271   0.00063961
 -0.07268559 -0.12724088]
0.7501418340639319


TypeError: can't multiply sequence by non-int of type 'numpy.float64'

### Use page identifier to reduce the search space.

Build a dictionary that consists the <b>page identifier</b> as the key and the document index(ranges form 0 to 25248397 ） as value.

Examples looks like: "Alexander McNair" : [1,2,3,4...18] 

<b>To do:</b> the keywords list may also consider synonyms for each keyword.


In [36]:
final_dic = {}

for id, doc in enumerate(corpus):
    final_dic.setdefault((doc.split()[0].replace("_", " ")), []).append(id)

keys = list(final_dic.keys())
print("Length of keys = {}".format(len(keys)))

Length of keys = 5396106


###  Build a dictionary for all the unique words in the keys.

Keywords is a dictionary that stores all the unique page identifier word and its corresponing document index list.

In [71]:

def create_keywords(keys, final_dic):
    keywords = {}
    for key in keys:
        words = key.split()
        for word in words:
            print(word)
            keywords[word] = final_dic[key]
    return keywords

# keywords should then remove stop words
print(keys[:20])
keywords = create_keywords(keys[:20], final_dic)
#print("Length of keywords = {}".format(len(keywords)))
#print(keywords)
#print('\n')
#print(keywords.get('Alexander'))

['Alexander McNair', 'Alatskivi', 'An American Girl Story - Maryellen 1955-COLON- Extraordinary Christmas', 'Alta Outcome Document', 'Al ‘Urban', 'Albano buoy system', 'Amuka', 'Aleksandr Abdulkhalikov', 'Amalia Ciardi Duprè', 'Akbil', 'All These Years', 'Ambulance services of Victoria', 'Alabama elections, 2018', 'Akarsu, Ardanuç', 'Albertinovac', 'All-India Yadav Mahasabha', 'Alejandro Rodríguez López', 'Aleksandr Luzin', 'American Thighs', 'Alan Brown -LRB-Australian politician-RRB-']
Alexander
McNair
Alatskivi
An
American
Girl
Story
-
Maryellen
1955-COLON-
Extraordinary
Christmas
Alta
Outcome
Document
Al
‘Urban
Albano
buoy
system
Amuka
Aleksandr
Abdulkhalikov
Amalia
Ciardi
Duprè
Akbil
All
These
Years
Ambulance
services
of
Victoria
Alabama
elections,
2018
Akarsu,
Ardanuç
Albertinovac
All-India
Yadav
Mahasabha
Alejandro
Rodríguez
López
Aleksandr
Luzin
American
Thighs
Alan
Brown
-LRB-Australian
politician-RRB-


In [67]:
import numpy as np
def calculate_cosines(claim_tfidf, evi_tfidf) -> np.ndarray:
    cosines = np.zeros((claim_tfidf.shape[0], 1))
    for i in range(len(cosines)):
        claim_vector = claim_tfidf[i]
        evi_vector = evi_tfidf[i]
        cosine_matrix = cosine_similarity([claim_vector.toarray()[0], evi_vector.toarray()[0]])
        cosines[i][0] = cosine_matrix[0][1]
    return cosines

print(keywords.get('Alexander'))
# testQuery = "Alexander Alatskivi"
# query_tfidf = tfidf_vectorizer.transform(testQuery)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



###  Retrieval Evidence
Given a query, tokenize it first, then for each token in the query, find it in the keywords dictionary. 


To do: smater select the in the range of its alphabet.

In [64]:
# Given a query, returns a list that contains all the document index values.
def extract_sentences(query, keys, final_dic):
    retrievaled_sentences = []
    for word in  nltk.tokenize.word_tokenize(query):
        retrievaled_sentences.append([keywords.get(word)])
            
    retrievaled_sentences = [item for sublist in retrievaled_sentences for item in sublist]
    return retrievaled_sentences

testQuery = "Alexander Alatskivi"
print(extract_sentences(testQuery,keys[:100],final_dic))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



If there is a match or substring match, then retrieve that sentence by using index value to find the raw sentence in txt file. 

Find all the related sentences by this way and then builds a inverted index and 
uses BM25 to rank and to retrieval top K sentences as the evidence.


In [14]:
# Use the result from above step, find all the raw sentences in txt file.
# return a list of the doc id orderd by rnaking tfidf or bm25 score.

def retrieval_evidence(query, keys, final_dic):
    processed_doc = [] # processed_docs stores the list of processed docs
    vocab = {}
    unique_id = 0
    rank_result = []
    
    retrievaled_sentences = extract_sentences(query,keys,final_dic)
    
    # find the row sentences and save them in processed_doc
    for retrievaled_sentence in retrievaled_sentences:
        norm_sentence = preprocess(document[retrievaled_sentence])
        for token in norm_sentence:
            if token not in vocab:
                vocab.update({token: unique_id})     
                unique_id = unique_id + 1
        processed_doc.append(norm_sentence) 
    
    # calculate doc term freqs and build an inverted index
    doc_term_freqs = doc_term_freq(processed_doc)
    invindex = InvertedIndex(vocab, doc_term_freqs)
    
    processed_query = preprocess(query)
    bm25_results = BM25(processed_query, invindex, vocab)
    tfidf_results = tfidf(processed_query, invindex, vocab)
    
    for rank, res in enumerate(tfidf_results):
        # print("RANK {:2d} DOCID {:8d} SCORE {:.3f} CONTENT {:}".format(rank+1,res[0],res[1],document[res[0]]))
        rank_result.append((res[0]))
    return rank_result

# test_query = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."
# test_query = "Alexander Alatskivi"
# print(retrieval_evidence(test_query,keys[:100],final_dic))

### Write the result to json file.

The dataset used is 'devset.json', the predicted result is 'devset_result.json'.


In [18]:
def get_evidence(res_data):
    for key in list(res_data)[:10]:
        res_data[key]["evidence"] = []
        tfidf_result = retrieval_evidence(res_data[key]["claim"],keys[:5000],final_dic)
        for res in tfidf_result:
            res_data[key]["evidence"].append([document[res].split()[0], document[res].split()[1]])
    return res_data

#testing, for top 10 instances in the dev, and only consider top 100 sentences in the documents.
predicted_train = get_evidence(res_data)

#for key in list(predicted_train)[:10]:
 #   print(predicted_train[key])
  #  print("\n")

### Gensim library.

Use gensim library to calculate the coscine similarity in IR.

In [30]:
import gensim

raw_documents = ["I'm taking the show on the road.",
                 "My socks are a force multiplier.",
                 "I am the barber who cuts everyone's hair who doesn't cut their own.",
                 "Legend has it that the mind is a mad monkey.",
                 "I make my own fun."]

from nltk.tokenize import word_tokenize
gen_docs = [[w.lower() for w in word_tokenize(text)] for text in raw_documents]

dictionary = gensim.corpora.Dictionary(gen_docs)
print(gen_docs)
print(dictionary)
print(dictionary.token2id['socks'])
print("Number of words in dictionary:",len(dictionary))
for i in range(len(dictionary)):
    print(i, dictionary[i])

corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
print(corpus)

tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)












[['i', "'m", 'taking', 'the', 'show', 'on', 'the', 'road', '.'], ['my', 'socks', 'are', 'a', 'force', 'multiplier', '.'], ['i', 'am', 'the', 'barber', 'who', 'cuts', 'everyone', "'s", 'hair', 'who', 'does', "n't", 'cut', 'their', 'own', '.'], ['legend', 'has', 'it', 'that', 'the', 'mind', 'is', 'a', 'mad', 'monkey', '.'], ['i', 'make', 'my', 'own', 'fun', '.']]
Dictionary(36 unique tokens: ["'m", '.', 'i', 'on', 'road']...)
13
Number of words in dictionary: 36
0 'm
1 .
2 i
3 on
4 road
5 show
6 taking
7 the
8 a
9 are
10 force
11 multiplier
12 my
13 socks
14 's
15 am
16 barber
17 cut
18 cuts
19 does
20 everyone
21 hair
22 n't
23 own
24 their
25 who
26 has
27 is
28 it
29 legend
30 mad
31 mind
32 monkey
33 that
34 fun
35 make
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2)], [(1, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(1, 1), (2, 1), (7, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2)], [(1, 