# Inverted Index and Query Processing
<author>&copy; Professor Yuefeng Li (QUT) </author>


## Task 1. 
For the given two XML documents (you can download them from week 3 workshop and then save them in a folder, e.g., ‘data’), design a python function index_docs() to index them (please remove stop words and index stems only). 

The returned index should be a dictionary {term:{docID1:freq1, DocID2:freq2}, …}

In [1]:
import glob, os
import string
from stemming.porter2 import stem

def index_docs(inputpath,stop_words):
    Index = {}    # initialize the index
    os.chdir(inputpath)
    for file_ in glob.glob("*.xml"):
        start_end = False
        for line in open(file_):
            line = line.strip()
            if(start_end == False):
                if line.startswith("<newsitem "):
                    for part in line.split():
                        if part.startswith("itemid="):
                            docid = part.split("=")[1].split("\"")[1]
                            break  
                if line.startswith("<text>"):
                    start_end = True  
            elif line.startswith("</text>"):
                break
            else:
                line = line.replace("<p>", "").replace("</p>", "")
                line = line.translate(str.maketrans('','', string.digits)).translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
                for term in line.split():
                    term = stem(term.lower())
                    if len(term) > 2 and term not in stop_words:
                        try:
                            try:
                                Index[term][docid] += 1
                            except KeyError:
                                Index[term][docid]=1
                        except KeyError:  
                            Index[term] = {docid:1} 
    return Index

In [2]:
# Note that text preprocessing happens before terms are indexed, where terms are stemmed.
stopwords_f = open('../week4/prac/common-english-words.txt', 'r') # wk3
stop_words = stopwords_f.read().split(',')
Index = index_docs("data", stop_words)
Index

{'pro': {'809495': 4},
 'iranian': {'809495': 1},
 'hizbollah': {'809495': 7},
 'fighter': {'809495': 1},
 'lebanon': {'809495': 11},
 'rain': {'809495': 1, '741299': 1},
 'score': {'809495': 1},
 'katyusha': {'809495': 4},
 'rocket': {'809495': 5},
 'northern': {'809495': 3},
 'israel': {'809495': 13},
 'tuesday': {'809495': 3, '809481': 1},
 'prompt': {'809495': 1},
 'threat': {'809495': 1},
 'prime': {'809495': 1},
 'minist': {'809495': 2, '6146': 1},
 'benjamin': {'809495': 1},
 'netanyahu': {'809495': 5},
 'tough': {'809495': 2},
 'isra': {'809495': 9},
 'respons': {'809495': 3},
 'quot': {'809495': 13, '741299': 4, '6146': 2},
 'quiet': {'809495': 3},
 'side': {'809495': 4},
 'border': {'809495': 2},
 'told': {'809495': 2},
 'report': {'809495': 2},
 'stand': {'809495': 2, '741299': 1},
 'damag': {'809495': 1},
 'hous': {'809495': 1},
 'dure': {'809495': 1, '741299': 1, '6146': 1},
 'tour': {'809495': 2},
 'town': {'809495': 3},
 'kiryat': {'809495': 1},
 'shmona': {'809495': 1},

## Task 2. 
Design a python function **doc_at_a_time(I, Q)**, where index I is a Dictionary of term:Directionary of (itemId:freq), which returns a dictionary of docId:relevance for the given query Q (a term:freq dictionary).

In [5]:
def doc_at_a_time(I, Q):  # index I is a Dirctionary of term:Directionary of (itemId:freq)
    L={}    # L is the selected inverted list
    R={}    # R is a directionary of docId:relevance
    for list in I.items():
        for id in list[1].items(): # get all document IDs with value 0
            R[id[0]]=0
        if (list[0] in Q):     # select inverted lists based on the query
                L[list[0]]= I[list[0]]
    for (d, sd) in R.items():
        for (term, f) in L.items():
            if (d in f):
                sd = sd + f[d]*Q[term]
        R[d] = sd
    return R

In [8]:
Query = {'formula':1, 'one':1} 
result1 = doc_at_a_time(Index, Query)
result1

{'809495': 1, '741299': 2, '809481': 0, '6146': 0}

## Task 3. 
Design a python function **term_at_a_time(I, Q)**, where index I is a Dictionary of term:Directionary of (itemId:freq), which returns a dictionary of docId:relevance for the given query Q (a term:freq dictionary).

In [9]:
def term_at_a_time(I, Q):  # index I is a Dirctionary of term:Directionary of (itemId:freq)
    L={}    # L is the selected inverted list
    R={}    # R is a directionary of docId:relevance
    for list in I.items():
        for id in list[1].items(): # get all document IDs with value 0
            R[id[0]]=0
        if (list[0] in Q):     # select inverted lists based on the query
                L[list[0]]= I[list[0]]
    for (term, li) in L.items():  # traversal of the selected inverted list
        for (d, f) in li.items(): # for each occurence of doc, update R 
                R[d] = R[d]  + f*Q[term]
    return R

In [10]:
Query = {'formula':1, 'one':1} 
result2 = term_at_a_time(Index, Query)
result2

{'809495': 1, '741299': 2, '809481': 0, '6146': 0}

## Task 4. 
Design a python main program to call the above three functions for a query, e.g., Query = {'formula':1, 'one':1}.

In [3]:
#if __name__ == '__main__':

import sys

    #if len(sys.argv) != 2:
     #   sys.stderr.write("USAGE: %s <coll-file>\n" % sys.argv[0])
      #  sys.exit()

curr_path=os.getcwd()
print(curr_path)
    
stopwords_f = open('common-english-words.txt', 'r')
stop_words = stopwords_f.read().split(',')
stopwords_f.close()
#Index = index_docs(sys.argv[1], stop_words) #create an index for all terms in <text>, data structure {'w1':{'ID1':2, 'ID2':1}, 'w2':{'ID3':1, 'ID1':3}}
"""    for term in coll.items():
        print "Term --- %s" % (term[0])
        for id in coll[term[0]].items(): 
            print "   Document ID: %s and frequency: %d" % (id[0], id[1]) """
#Query = {'leaderboard':1, 'british':1}
#print(Index)

data_path = curr_path+'/data'
Index = index_docs(data_path, stop_words) #create an index for all terms in <text>, data structure {'w1':{'ID1':2, 'ID2':1}, 'w2':{'ID3':1, 'ID1':3}}
os.chdir(curr_path)
print(Index)

Query = {'formula':1, 'one':1} 
result1 = doc_at_a_time(Index, Query)
result2 = term_at_a_time(Index, Query)
x1 = sorted(result1.items(), key=lambda x: x[1],reverse=True)
x2 = sorted(result2.items(), key=lambda x: x[1],reverse=True)
print('Document_at_a_time result--------')
for (id, w) in x1:
    if w>0:
        print('Document ID: '+id + ' and relevance weight: ' + str(w))
print('Term_at_a_time result --------')
for (id, w) in x2:
    if w>0:
        print('Document ID: ' + id + ' and relevance weight: ' + str(w))

/workspaces/cab420-workspace/work/search-engine-technology/week6/data


FileNotFoundError: [Errno 2] No such file or directory: 'common-english-words.txt'

In [6]:
# We assume the jupyter starts from your working directory.
# We use os methods to find the current working directory 'curre_path', then data's directory 'data_path'.
# Note we need to go back to the current working directory after call index_docs as it changed the working directory.
# You may change the 'Query' to test more queries.  