In [5]:
'''
This program does the following work - 

1. Builds an inverted index from the data files in the /data folder and
saves the index to an index.docx file in the same working directory

2. Returns document IDs for Boolean queries.
Query format - ( term1 op term2 )
Operators - and, or, not (all lower case)

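Stemmer - Porter Stemmer from the NLTK package (imported; the indexing code below does not apply it)
Lemmatizer - WordNetLemmatizer from the NLTK package
Tokenizer - NLTK tokenizers are imported; document and query text is currently split on whitespace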

Short Flow of events - 

1. From the given path of input docs, generate the Inverted index and store it in a data structure.
Also store it in a file index.docx.

2. Take the input query from the user in format - ( term op term ).

3. Show the list of doc ID which contain the term

Test Queries and output - test.txt

'''


''' Imports'''
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from docx import Document
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import glob
import errno
import collections
import os

'''
__download the stop words__
__Uncomment and run if the terms aren't present__
nltk.download('stopwords')
nltk.download('wordnet')

'''
# Module-level state shared by the indexing and query functions
dictG = [['', 0]]  # Final dictionary: [term, docID, docID, ...] entries, seeded with a placeholder
queryG = []        # Tokens of the query currently being processed
listC = [['', 0]]  # Dictionary entries that match the current query terms
docsG = []         # Docs list

def buildIndex(fileName, docID):
    '''Add the terms of one .docx document to the global inverted index.

    The text of every paragraph is split on whitespace, English stop words
    are dropped, the remaining tokens are lemmatized, and each resulting
    term gets docID added to its entry in the module-level dictG.

    Args:
        fileName: path of the .docx file to read.
        docID: identifier recorded in the posting list of every term
            found in the document.

    Side effects:
        Mutates dictG in place. Each entry has the form
        [term, docID, docID, ...]; a docID is stored at most once per term.
    '''
    # Unique whitespace-separated tokens over all paragraphs of the document
    words = set()
    document = Document(fileName)
    for p in document.paragraphs:
        words.update(p.text.split())

    # Removing stop words. The set gives O(1) membership tests, and comparing
    # the lower-cased token also drops capitalised stop words ("The", "And").
    # The token itself keeps its original case.
    stop_words = set(stopwords.words('english'))
    filteredQ = [w for w in words if w.lower() not in stop_words]

    # Lemmatizing. Distinct tokens can share a lemma, so de-duplicate again;
    # sorting makes the order of new index entries reproducible between runs
    # (set iteration order of strings is not).
    lemmatizer = WordNetLemmatizer()
    lemmas = sorted({lemmatizer.lemmatize(w) for w in filteredQ})

    # One pass over the index to build a term -> entry lookup, instead of
    # scanning the whole index again for every term of the document.
    entryByTerm = {}
    for entry in dictG:
        entryByTerm.setdefault(entry[0], entry)

    for term in lemmas:
        entry = entryByTerm.get(term)
        if entry is None:
            # First time this term is seen in any document
            entry = [term, docID]
            dictG.append(entry)
            entryByTerm[term] = entry
        elif docID not in entry[1:]:
            # Known term: record the document only once
            entry.append(docID)

def processQuery():
    '''Read one Boolean query from the user and print the matching doc IDs.

    The raw input is split on whitespace and every token is lemmatized.
    The tokens are stored in the module-level queryG, after which
    obtainTermsFromDictionary() and getRequiredDocs() are called to look
    the terms up and evaluate the query.
    '''
    queryRaw = input('Enter your query - ')

    # Lemmatize the query tokens. queryG is replaced in place rather than
    # appended to, so tokens left behind by an earlier evaluation that
    # raised part-way through cannot leak into this query.
    lemmatizer = WordNetLemmatizer()
    queryG[:] = [lemmatizer.lemmatize(word) for word in queryRaw.split()]

    obtainTermsFromDictionary()
    getRequiredDocs()

def obtainTermsFromDictionary():
    '''Collect the index entries for the terms of the current query.

    Rebinds the module-level listC to a new list that holds, for every
    token in queryG (in query order), each dictG entry whose term equals
    that token. Entries have the form ['term', id, id, ...].
    '''
    global dictG
    global listC
    listC = [entry
             for token in queryG
             for entry in dictG
             if token == entry[0]]

def getRequiredDocs():
    '''Evaluate the query in queryG and print the resulting doc IDs.

    The query is evaluated with a stack. Operators ('and', 'or', 'not')
    and the posting lists of terms are pushed as they are read. Every
    token containing ')' pops "operand operator operand" and pushes the
    result of And / Or / Not. Posting lists are taken from the
    module-level listC, whose entries have the form ['term', id, id, ...].

    A term that has no entry in listC contributes an empty posting list,
    so the operand stack stays aligned and the query still evaluates.

    Side effects:
        Prints the query and the result; empties queryG.

    Returns:
        The list of doc IDs left at the bottom of the stack, or [] when
        nothing matched.

    Raises:
        IndexError: if a ')' is reached without a complete
            "operand operator operand" triple on the stack.
    '''
    print(queryG) #given the current query

    # Take the tokens and leave queryG empty, also when evaluation fails
    tokens = list(queryG)
    del queryG[:]

    # First posting list recorded for each term. listC holds one entry per
    # occurrence of a term in the query, so a repeated term appears twice.
    postings = {}
    for entry in listC:
        postings.setdefault(entry[0], entry[1:])

    operators = ('and', 'or', 'not')
    stack = []  # stack to manage the query

    for token in tokens:
        if token in operators:
            stack.append(token)
        elif token not in ('(', ')'):
            # Exactly one operand per term, empty when the term is unknown
            stack.append(list(postings.get(token, ())))

        if ')' in token:
            if len(stack) < 3:
                raise IndexError('Malformed query: expected ( term op term )')
            num2 = stack.pop()
            op = stack.pop()
            num1 = stack.pop()

            if op == "and":
                stack.append(And(num1, num2))
            elif op == "or":
                stack.append(Or(num1, num2))
            elif op == "not":
                stack.append(Not(num1, num2))
            else:
                raise IndexError('Malformed query: expected ( term op term )')

    result = stack[0] if stack else []
    print('The Docs against the query are - ')
    print(result)
    return result

def Not(list1, list2):
    '''Logical Not operation ("list1 not list2").

    Args:
        list1: doc IDs of the left-hand operand.
        list2: doc IDs to exclude.

    Returns:
        A new list with the elements of list1 that are not in list2, in
        the order they appear in list1. The operands are never swapped,
        whatever their lengths.
    '''
    return [x for x in list1 if x not in list2]

def Or(list1, list2):
    '''Logical Or operation.

    Args:
        list1: doc IDs of the left-hand operand.
        list2: doc IDs of the right-hand operand.

    Returns:
        A new sorted list holding every distinct element that occurs in
        list1 or in list2. Either operand may be empty.
    '''
    # A plain set union also covers an empty list1, where the previous
    # nested loop never looked at list2 at all.
    return sorted(set(list1) | set(list2))

def And(list1, list2):
    '''Logical And operation.

    Args:
        list1: doc IDs of the left-hand operand.
        list2: doc IDs of the right-hand operand.

    Returns:
        A new list with one element of list1 for every equal pair
        (element of list1, element of list2), in list1 order.
    '''
    return [left for left in list1 for right in list2 if left == right]

# Final running of code
# Replace the base directory according to your working directory structure
baseDir = os.path.dirname('C:\\Users\\MARS-Utkarsh_Mankad\\Downloads\\IR-Assignment-2017HT12262\\Assignment\\')
# Join the components separately so the platform's own separator is used
path = os.path.join(baseDir, 'data', '*.docx')
# glob returns files in arbitrary order; sorting keeps doc IDs stable between runs
files = sorted(glob.glob(path))
docID = 0

# Doc IDs start at 1 and follow the sorted file order
for docID, name in enumerate(files, start=1):
    buildIndex(name, docID)

'''
__Code to create the index file__

document = Document()
document.add_heading('Index', 0)

for element in dictG:
    document.add_paragraph(str(element),style='ListBullet')

#save index to a file
document.save('index.docx')

print(len(dictG))

'''
# Banner printed before the user is prompted for a query
print('*****Inverted Index and Boolean retrival *********')
# Prompt for one Boolean query, evaluate it and print the matching doc IDs
processQuery()   

731
*****Inverted Index and Boolean retrival *********
Enter your query - radiology
['radiology']
The Docs against the query are - 
[1]


In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True