## Information Retrieval

This is the document retrieval and sentence retrieval part of the project.

###  1. Unpack the zip file.
    
Unpack the 'wiki-pages-text.zip' in the current directory.

In [2]:
import zipfile
def unpack():
    with zipfile.ZipFile('wiki-pages-text.zip') as file:
        file.extractall()
# unpack()

### 2. Load file. 

Load the training dataset and the wiki txt file.

In [1]:
import os
import json
import pandas as pd

with open('train.json', 'r') as f:  # load training dataset
        train_data = json.load(f)   
print("Length of the train data is: " + str(len(train_data)))

with open('devset.json', 'r') as f1:  # load dev dataset
        dev_data = json.load(f1) 
print("Length of the dev data is: " + str(len(dev_data)))

with open('devset_result.json', 'r') as f2:  # store result 
        res_data = json.load(f2) 
print("Length of the dev result data is: " + str(len(res_data)))

with open('test-unlabelled.json', 'r') as f3:  # store result 
     test_data = json.load(f3) 
print("Length of the test data is: " + str(len(test_data)))
        
# appeand all the wiki txt sentences to one document
def loadfile(folder): 
    corpus = []
    list_of_files = os.listdir(folder)
    
    for file in list_of_files:
        try:
            filename = os.path.join(folder, file)
            with open(filename, 'r') as doc:
                for line in doc:
                    corpus.append(line)     
        except Exception as e:
            print("No files found here!")
            raise e
    return corpus
corpus = loadfile("wiki-pages-text")
print("Length of the corpus is: " + str(len(corpus)))


Length of the train data is: 145449
Length of the dev data is: 5001
Length of the dev result data is: 5001
Length of the test data is: 14997
Length of the corpus is: 25248397


#### Pandas process the data.

In [80]:
corpus_df = pd.DataFrame(corpus[:1000])
corpus_df.columns = ['text']

corpus_df['page_identifier'] = corpus_df.text.apply(lambda x: x.split(' ')[0])  
corpus_df['sentence_number'] = corpus_df.text.apply(lambda x: x.split(' ')[1]) 
corpus_df['sentence_text'] = corpus_df.text.apply(lambda x: x.split(' ')[2:])  
corpus_df['sentence_text'] = [','.join(map(str, l)) for l in corpus_df['sentence_text']]
corpus_df["sentence_text"] = corpus_df['sentence_text'].str.replace(',',' ')
corpus_df = corpus_df.drop('text', 1)

print(corpus_df.shape)
corpus_df.head(10)

(1000, 3)


Unnamed: 0,page_identifier,sentence_number,sentence_text
0,Alexander_McNair,0,Alexander McNair -LRB- May 5 1775 -- March 1...
1,Alexander_McNair,1,He was the first Governor of Missouri from its...
2,Alexander_McNair,4,McNair was born in Lancaster in the Province o...
3,Alexander_McNair,5,His grandfather David McNair Sr. immigra...
4,Alexander_McNair,6,David McNair Jr. Alexander 's father -LRB-...
5,Alexander_McNair,7,Alexander went to school as a child and atte...
6,Alexander_McNair,8,He reached an agreement with his mother and br...
7,Alexander_McNair,9,Alexander was defeated .\n
8,Alexander_McNair,10,He became a member of the Pennsylvania militia...
9,Alexander_McNair,13,In 1804 McNair traveled to what is now Misso...


### 3. Preprocess 

Preprocess includes: strip punctuations, tokenize,lemma, lower case, remove stop words.

In [70]:
import nltk

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

nltk.download('wordnet')
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process(comment) -> str:
    # lower cased
    comment = comment.lower()
    # tokenize
    words = tokenizer.tokenize(comment)
    # lemmatize 
    words = [lemmatize(w) for w in words]
    processed_comment = " ".join(words)
    return processed_comment

def process_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    corpus = dataset['sentence_text']
    processed_corpus = corpus.apply(lambda text: pre_process(text))
    dataset['sentence_text'] = processed_corpus.iloc[0: len(dataset)]
    return dataset

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhangyiming/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Save the pre-processed corpus dataframe.

In [81]:
# import datetime
# start = datetime.datetime.now()
processed_corpus_df = pd.DataFrame(process_dataset(corpus_df))
processed_corpus_df.to_pickle("./processed_corpus.pkl")
processed_corpus_df.head(10)
# end = datetime.datetime.now()
# print(end-start)

Unnamed: 0,page_identifier,sentence_number,sentence_text
0,Alexander_McNair,0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,Alexander_McNair,1,he be the first governor of missouri from it e...
2,Alexander_McNair,4,mcnair be bear in lancaster in the province of...
3,Alexander_McNair,5,his grandfather david mcnair sr immigrate to p...
4,Alexander_McNair,6,david mcnair jr alexander s father lrb b 1736 ...
5,Alexander_McNair,7,alexander go to school a a child and attend on...
6,Alexander_McNair,8,he reach an agreement with his mother and brot...
7,Alexander_McNair,9,alexander be defeat
8,Alexander_McNair,10,he become a member of the pennsylvania militia...
9,Alexander_McNair,13,in 1804 mcnair travel to what be now missouri ...


#### Load pre-processed corpus dataframe.

In [72]:
load_processed_corpus_df = pd.read_pickle("./processed_corpus.pkl")
print(load_processed_corpus_df.shape)
load_processed_corpus_df.head(10)

(1000, 3)


Unnamed: 0,page_identifier,sentence_number,sentence_text
0,Alexander_McNair,0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,Alexander_McNair,1,he be the first governor of missouri from it e...
2,Alexander_McNair,4,mcnair be bear in lancaster in the province of...
3,Alexander_McNair,5,his grandfather david mcnair sr immigrate to p...
4,Alexander_McNair,6,david mcnair jr alexander s father lrb b 1736 ...
5,Alexander_McNair,7,alexander go to school a a child and attend on...
6,Alexander_McNair,8,he reach an agreement with his mother and brot...
7,Alexander_McNair,9,alexander be defeat
8,Alexander_McNair,10,he become a member of the pennsylvania militia...
9,Alexander_McNair,13,in 1804 mcnair travel to what be now missouri ...


### 4. Use Sklearn to build tf-idf.

tfidf_vectorizer is the tf-idf model.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(load_processed_corpus_df['sentence_text'])
tfidf = tfidf.T

print('Vocabulary Size : ', len(tfidf_vectorizer.get_feature_names()))
print('Shape of Matrix : ', tfidf.shape)

pickle.dump(tfidf, open("tfidf.pickle","wb"))

Vocabulary Size :  4650
Shape of Matrix :  (4650, 1000)


### 5. Apply SVD 

In [59]:
import numpy as np

load_tfidf = pickle.load(open("tfidf.pickle","rb"))
# Applying SVD
K= 20 # number of desirable features 
U, s, VT = np.linalg.svd(load_tfidf.toarray())
# tfidf_reduced = np.dot(U[:,:K], np.dot(np.diag(s[:K]), VT[:K, :]))

print(U.shape)
print(s.shape)
print(VT.shape)

# Getting document and term representation
terms_rep = np.dot(U[:,:K], np.diag(s[:K])) # M X K matrix where M = Vocabulary Size and N = Number of documents
docs_rep = np.dot(np.diag(s[:K]), VT[:K, :]).T # N x K matrix 

print(terms_rep.shape)
print(docs_rep.shape)
print(terms_rep[0])

(4650, 4650)
(1000,)
(1000, 1000)
(4650, 20)
(1000, 20)
[-0.07261009 -0.00454805  0.03801582  0.01416727  0.05270882 -0.00769401
  0.01417197 -0.04272573 -0.06352088  0.00386057  0.01474623  0.01431363
  0.12041152 -0.0422148   0.05369463  0.02702167  0.01134609  0.01666593
  0.00552886  0.00071745]


### PCA , TurncatedSVD, Scipy SVD, SparseSVD

In [60]:
from sklearn.decomposition import PCA 
from sklearn.decomposition import TruncatedSVD
from sparsesvd import sparsesvd
import numpy
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import randomized_svd

# pca = PCA(n_components='mle')
# pca_tfidf = pca.fit_transform(load_tfidf)
# print(pca_tfidf)

k1= 20 # number of desirable features 
svd = TruncatedSVD(n_components=k1, n_iter=7, random_state=42)
svd_tfidf = svd.fit_transform(load_tfidf)
# print(svd_tfidf.shape)
# smat = scipy.sparse.csc_matrix(load_tfidf.toarray()) # convert to sparse CSC format
# U1, s1, VT1 = sparsesvd(smat, k1)
# U1 = U1.T

U1, s1, VT1 = randomized_svd(load_tfidf.toarray(), k1)

print(U1.shape)
print(s1.shape)
print(VT1.shape)
# Getting document and term representation
terms_rep1 = np.dot(U1[:,:k1], np.diag(s1[:k1])) # M X K matrix where M = Vocabulary Size and N = Number of documents
docs_rep1 = np.dot(np.diag(s1[:k1]), VT1[:k1, :]).T # N x K matrix 
print(terms_rep1.shape)
print(docs_rep1.shape)
print(terms_rep1[0])

(4650, 20)
(20,)
(20, 1000)
(4650, 20)
(1000, 20)
[ 0.07261009 -0.00454648  0.03795112  0.01444103  0.05371103 -0.00810654
  0.01351493  0.04319767 -0.06231174 -0.00734784  0.02139308  0.00312849
  0.12001401 -0.02885562  0.0619723  -0.01373013 -0.00822798 -0.00194202
 -0.05417265 -0.05236209]


###  6. Test tfidf with query.

In [64]:
import math
from scipy.spatial.distance import cosine

def lsa_query(query):
    query_rep = []
    for q in pre_process(query).split():
        if q in tfidf_vectorizer.vocabulary_:
            query_rep.append(tfidf_vectorizer.vocabulary_[q])
        else:
            continue
    query_rep = np.mean(terms_rep[query_rep],axis=0)
    return query_rep

query_rep = "Alexander  Alatskivi"
pre_query= lsa_query(query_rep)
print(pre_query)
print(docs_rep[0])
print(1- cosine(pre_query,docs_rep[0]))

query_doc_cos_dist = []

for doc_rep in docs_rep:
    query_doc_cos_dist.append(cosine(pre_query, doc_rep))

print(len(query_doc_cos_dist))
print(query_doc_cos_dist[0])
print(query_doc_cos_dist[1])

query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))

count = 0
for rank, sort_index in enumerate(query_doc_sort_index):
    print ('Rank : ', rank, ' Consine : ', 1 - query_doc_cos_dist[sort_index], 'Page Identifier: ',load_processed_corpus_df['page_identifier'][sort_index], 'Sentence number: ', load_processed_corpus_df['sentence_number'][sort_index], ' Sentence : ', load_processed_corpus_df['sentence_text'][sort_index])
    if count == 4 :
        break
    else:
        count += 1


[-0.08030084  0.07131535 -0.02114174 -0.01343357  0.00086819 -0.00232813
 -0.10948688 -0.00657705 -0.03682033 -0.01810879 -0.02005786  0.02662247
 -0.04509342  0.0063877  -0.03200761  0.03458823  0.02123711  0.05318392
  0.02635511 -0.02561251]
[-0.13757185  0.19350078 -0.02065905  0.12258203 -0.02870684  0.02541737
 -0.18322714  0.01045996 -0.11076183  0.03505023 -0.06895175  0.04991295
 -0.05618872  0.05047644 -0.0666606   0.09385609 -0.0368271   0.00063961
 -0.07268559 -0.12724088]
0.750141834063932
1000
0.24985816593606802
0.9036018061784973
Rank :  0  Consine :  0.8558589155525097 Page Identifier:  Alexander_McNair Sentence number:  9  Sentence :  alexander be defeat
Rank :  1  Consine :  0.8481061499565726 Page Identifier:  Alexander_Carson_-LRB-filmmaker-RRB- Sentence number:  0  Sentence :  alexander carson be a canadian filmmaker
Rank :  2  Consine :  0.8476412327038444 Page Identifier:  Alexander_Frederick,_Landgrave_of_Hesse Sentence number:  0  Sentence :  alexander frederi

  dist = 1.0 - uv / np.sqrt(uu * vv)


###  7. Retrieval Evidence and write the result to json file.


In [56]:
from scipy.spatial.distance import cosine

load_tfidf = pickle.load(open("tfidf.pickle","rb"))

def lsa_query(query):
    query_rep = []
    for q in pre_process(query).split():
        if q in tfidf_vectorizer.vocabulary_:
            query_rep.append(tfidf_vectorizer.vocabulary_[q])
        else:
            continue
    query_rep = np.mean(terms_rep1[query_rep],axis=0)
    return query_rep

def retrieval_evidence(res_data):
    for key in list(res_data)[:10]:
        res_data[key]["evidence"] = []
        lsa_query(res_data[key]["claim"])
        
        query_doc_cos_dist = []
        for doc_rep in docs_rep1:
            query_doc_cos_dist.append(cosine(pre_query, doc_rep))
        query_doc_sort_index = np.argsort(np.array(query_doc_cos_dist))
        
        # retrieval top 5 evidence
        count = 0
        for rank, sort_index in enumerate(query_doc_sort_index):
            res_data[key]["evidence"].append([load_processed_corpus_df['page_identifier'][sort_index],int(load_processed_corpus_df['sentence_number'][sort_index])])
            if count == 4 :
                break
            else:
                count += 1
                
    return res_data

#testing, for top 10 instances in the dev, and only consider top 100 sentences in the documents.
predicted_train = retrieval_evidence(res_data)

for key in list(predicted_train)[:10]:
    print(predicted_train[key])
    print("\n")

  dist = 1.0 - uv / np.sqrt(uu * vv)


{'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.', 'label': 'NOT ENOUGH INFO', 'evidence': [['Bavarian_Film_Awards_-LRB-Production_Design-RRB-', 0], ['Beverly_Lynne', 0], ['Billy_Hogg_-LRB-Scottish_footballer-RRB-', 5], ['Baumgarten_-LRB-surname-RRB-', 0], ['Benedetta_of_Cagliari', 12]]}


{'claim': 'Tilda Swinton is a vegan.', 'label': 'NOT ENOUGH INFO', 'evidence': [['Bavarian_Film_Awards_-LRB-Production_Design-RRB-', 0], ['Beverly_Lynne', 0], ['Billy_Hogg_-LRB-Scottish_footballer-RRB-', 5], ['Baumgarten_-LRB-surname-RRB-', 0], ['Benedetta_of_Cagliari', 12]]}


{'claim': 'Fox 2000 Pictures released the film Soul Food.', 'label': 'SUPPORTS', 'evidence': [['Bavarian_Film_Awards_-LRB-Production_Design-RRB-', 0], ['Beverly_Lynne', 0], ['Billy_Hogg_-LRB-Scottish_footballer-RRB-', 5], ['Baumgarten_-LRB-surname-RRB-', 0], ['Benedetta_of_Cagliari', 12]]}


{'claim': 'Anne Rice was born in New Jersey.', 'label': 'NOT ENOUG

### 8. Another method: Gensim library tfidf model and LSA .

Use gensim library to calculate the coscine similarity in IR.

In [30]:
import gensim

raw_documents = ["I'm taking the show on the road.",
                 "My socks are a force multiplier.",
                 "I am the barber who cuts everyone's hair who doesn't cut their own.",
                 "Legend has it that the mind is a mad monkey.",
                 "I make my own fun."]

from nltk.tokenize import word_tokenize
gen_docs = [[w.lower() for w in word_tokenize(text)] for text in raw_documents]

dictionary = gensim.corpora.Dictionary(gen_docs)
print(gen_docs)
print(dictionary)
print(dictionary.token2id['socks'])
print("Number of words in dictionary:",len(dictionary))
for i in range(len(dictionary)):
    print(i, dictionary[i])

corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
print(corpus)

tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)












[['i', "'m", 'taking', 'the', 'show', 'on', 'the', 'road', '.'], ['my', 'socks', 'are', 'a', 'force', 'multiplier', '.'], ['i', 'am', 'the', 'barber', 'who', 'cuts', 'everyone', "'s", 'hair', 'who', 'does', "n't", 'cut', 'their', 'own', '.'], ['legend', 'has', 'it', 'that', 'the', 'mind', 'is', 'a', 'mad', 'monkey', '.'], ['i', 'make', 'my', 'own', 'fun', '.']]
Dictionary(36 unique tokens: ["'m", '.', 'i', 'on', 'road']...)
13
Number of words in dictionary: 36
0 'm
1 .
2 i
3 on
4 road
5 show
6 taking
7 the
8 a
9 are
10 force
11 multiplier
12 my
13 socks
14 's
15 am
16 barber
17 cut
18 cuts
19 does
20 everyone
21 hair
22 n't
23 own
24 their
25 who
26 has
27 is
28 it
29 legend
30 mad
31 mind
32 monkey
33 that
34 fun
35 make
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2)], [(1, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(1, 1), (2, 1), (7, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2)], [(1, 

In [84]:
print(load_processed_corpus_df.shape)
processed_corpus_df.head(10)

(1000, 3)


Unnamed: 0,page_identifier,sentence_number,sentence_text
0,Alexander_McNair,0,alexander mcnair lrb may 5 1775 march 18 1826 ...
1,Alexander_McNair,1,he be the first governor of missouri from it e...
2,Alexander_McNair,4,mcnair be bear in lancaster in the province of...
3,Alexander_McNair,5,his grandfather david mcnair sr immigrate to p...
4,Alexander_McNair,6,david mcnair jr alexander s father lrb b 1736 ...
5,Alexander_McNair,7,alexander go to school a a child and attend on...
6,Alexander_McNair,8,he reach an agreement with his mother and brot...
7,Alexander_McNair,9,alexander be defeat
8,Alexander_McNair,10,he become a member of the pennsylvania militia...
9,Alexander_McNair,13,in 1804 mcnair travel to what be now missouri ...


### Whoosh Library.

#### Build Schema.

In [87]:
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED,NUMERIC
from whoosh.analysis import StemmingAnalyzer,StandardAnalyzer

def init_search():
    schema = Schema(page_identifier=TEXT(stored=True),
                    sentence_number=NUMERIC(stored=True),
                    sentence_text=TEXT(stored=True),
                   )
    return schema
    
schema = init_search()


#### Index document

In [90]:
from whoosh import index
import os, os.path

def create_index(schema):
    #to create an index in a dictionary
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
    #open an existing index object
    ix = index.open_dir("indexdir")
    return ix
ix = create_index(schema)

def write_index(index):
    #create a writer object to add documents to the index
    writer = ix.writer()
    
    writer.add_document(page_identifier=u"Alexander_McNair",
                sentence_number=u"0",
                sentence_text=u"Alexander McNair -LRB- May 5 , 1775 -- March 18 , 1826 -RRB- was an American frontiersman and politician.")
    
    writer.add_document(page_identifier=u"Alexander_McNair",
                sentence_number=u"1",
                sentence_text=u"He was the first Governor of Missouri from its entry as a state in 1820 , until 1824 .")
    writer.commit()
       
write_index(ix)
    

#### Parse the query

In [93]:
#parsing the query, simple parser with default field
parser=QueryParser("sentence_text",schema=schema) 
result=parser.parse(u"alexander")
print(result)


sentence_text:alexander


#### Searcher

In [None]:
#searcher object is used for searching the matched documents
#you can open the searcher using a with statement so the searcher is automatically closed when you’re done with it
#ix is the document index we created before

with ix.searcher() as searcher:
    results=searcher.search(result)#The Results object acts like a list of the matched documents.
    print (results[0])