In [23]:
import os
import nltk
import pandas as pd
import time
import string
import re
import tkinter
import sys
from tqdm import tqdm
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from IPython.display import clear_output

### 0.0 List the Files in the specific directory (Section 0.0)

In [2]:
def listFile(d):    
    path = [os.path.abspath(os.path.join(d,i)) for i in os.listdir(d)]
    return path

### 0.1 Read the file and return the content (Section 0.1)

In [3]:
def readFile(d):
    file = open(d,"r",encoding='utf-8')
    content = file.read()
    return content

### 1. Create token (Section 1)
- input (content, document id)
- output (pairs of token and document id)

In [4]:
#unoptimized (input - content, document number)
def createToken(content, dnum):
    tokens = list()
    tokenizer = WhitespaceTokenizer()
    for t in tokenizer.tokenize(content):
        tokens = tokens + [[t,dnum]]
    return tokens

In [5]:
#optimized (input takes only content)
def createTokenOptimized(content):
    tokens = list()
    tokenizer = WhitespaceTokenizer()
    for token in tokenizer.tokenize(content):
        tokens.append(token)
    return tokens

### 2. Linguistic (lower, stemming)

In [6]:
def linguisticToken(token_list):
    stemmer = PorterStemmer() #stem
    #lemmer = WordNetLemmatizer()
    tokens = list()
    regex = "[!@#$%^&*()-_=+'`~ \":;|/.,?[]{}<>]"
    for [t,d] in token_list:
        token = t.translate(str.maketrans('', '', regex)) #remove punctuations
        if token == '': #if the token is only punctuation
            continue
        token = token.lower()
        token = stemmer.stem(token)
        #token = lemmer.lemmatize(token)
        tokens += [[token,d]]
    return tokens

In [7]:
def linguisticTokenOptimized(token_list):
    stemmer = PorterStemmer() #stem
    #lemmer = WordNetLemmatizer()
    tokens = list()
    regex = "[!@#$%^&*()-_=+'`~ \":;|/.,?[]{}<>]"
    for t in token_list:
        token = t.translate(str.maketrans('', '', regex)) #remove punctuations
        if token == '': #if the token is only punctuation
            continue
        token = token.lower()
        token = stemmer.stem(token)
        #token = lemmer.lemmatize(token)
        tokens.append(token)
    return tokens

### 3. Sorting

In [8]:
#unoptimized (sort by term, document id)
def sortToken(token_list):
    token_list.sort(key=lambda e: (e[0],e[1]))
    return token_list

In [9]:
#optimized (sort only by term)
def sortTokenOptimized(listStr):
    newList=[]
    for i in range(len(listStr)):
        for j in range(len(listStr[i][0])):
            newList.append([listStr[i][0][j],i])
    sortedToken = sorted(newList, key = lambda x: x[0])
    return sortedToken

### 4. Transform into posting

In [10]:
def transformPosting(sorted_list):
    postDictionary = {}
    for term,docId in sorted_list: #add terms into posting
        postDictionary.setdefault(term,[]).append(docId)
    for key in postDictionary:
        post = list(dict.fromkeys(postDictionary[key]))
        post.sort(key=int)
        postDictionary[key] = [len(post),post]
    return postDictionary

### 5. Merge the Postings (Intersecting)

In [11]:
#implementation of the algorithm from lecutre (for AND operations)
def mergePostings(postingList):
    result = []
    posting1 = postingList[0]
    
    for i in range(1,len(postingList),1):
        merged = []
        posting2 = postingList[i]
        p = 0
        q = 0
        while p < len(posting1) and q < len(posting2):
            if int(posting1[p]) == int(posting2[q]):
                merged.append(posting1[p])
                p += 1
                q += 1
            elif int(posting1[p]) < int(posting2[q]):
                p += 1
            else:
                q += 1
        posting1 = merged
    return posting1

### Merge (for OR)

In [12]:
#for OR operations
def mergeOrPostings(postingList):
    posting1 = postingList[0]
    
    for i in range(1,len(postingList),1):
        merged = []
        posting2 = postingList[i]
        p = 0
        q = 0
        while p < len(posting1) and q < len(posting2):
            if int(posting1[p]) == int(posting2[q]):
                merged.append(posting1[p])
                p += 1
                q += 1
            elif int(posting1[p]) < int(posting2[q]):
                merged.append(posting1[p])
                p += 1
            else:
                merged.append(posting2[q])
                q += 1
        if p < len(posting1):
            merged += posting1[p:]
        elif q < len(posting2):
            merged += posting2[q:]
        posting1 = merged
    return posting1  

In [13]:
#get all the synnonnumys of a term
def getSynon(word):
    syns = wn.synsets(word)
    synslist = [i.lemmas()[0].name() for i in syns] #put evely synnonnyms to list
    synslist = list(dict.fromkeys(synslist)) #remove duplicates
    return synslist

### 6. Create Index

In [14]:
#create inverted index for the directory
def non_optimized(directory):
    start_time = time.time()
    
    print("Reading files from ", directory)
    files = listFile(directory) #list dir
    tokens = list()
    num = 0
    did = {}
    for docs in tqdm(files):
        try:
            file_content = readFile(docs) #read content
            token = createToken(file_content,num) #create token
            token = linguisticToken(token) #stemming
            tokens += token
            did[num] = docs
            num +=1
        except:
            print(docs)
    
    print("Sorting the tokens ...")
    tokens = sortToken(tokens) #token from all files
    
    print("Transforming into postings ...")
    posting = transformPosting(tokens) #create posting from these files

    end_time = time.time()
    time_to_index = end_time - start_time
    print("Finished indexing.\nTime taken to index: " , round(time_to_index,3))
    return posting, did, time_to_index

In [15]:
#optimized inverted index
def optimized(directory):
    start_time = time.time()
    
    print("Reading files from " , directory)
    files = listFile(directory) #list dir
    tokens = list()
    num = 0
    did = {}
    for docs in tqdm(files):
        #print(docs,num)
        try:
            file_content = readFile(docs) #read content
            token = createTokenOptimized(file_content) #create token
            token = linguisticTokenOptimized(token) #stemming
            token_id = [token,num]
            tokens += [token_id]
            did[num] = docs
            num +=1
        except:
            print(docs)
    
    print("Sorting the tokens ...")
    tokens = sortTokenOptimized(tokens) #token from all files
    
    print("Transforming into postings ...")
    posting = transformPosting(tokens)
    end_time = time.time()
    time_to_index = end_time - start_time
    print("Finished indexing.\nTime taken to index: " , round(time_to_index,3))
    return posting, did, time_to_index

### Process the Query (transform into token, search, merge and return)

In [27]:
def processQuery(query):
    q_token = createTokenOptimized(query)
    q_token = linguisticTokenOptimized(q_token)
    posting_list = []
    
    for token in q_token:
        try:
            posting_list.append(posting[token][1])
        except:
            print(token)
            posting_list.append([])
            break
    result = mergePostings(posting_list)
    return result

### Run the indexer (Normal)

In [25]:
posting, docId, time_to_index = non_optimized("HillaryEmails")
print("Size of index: ", sys.getsizeof(posting), " bytes")

Reading files from  HillaryEmails


100%|█████████████████████████████████████████████████████████████████████████████| 7945/7945 [01:05<00:00, 121.99it/s]


Sorting the tokens ...
Transforming into postings ...
Finished indexing.
Time taken to index:  70.536
Size of index:  2621544  bytes


### Run the indexer (Optimized)

In [24]:
posting, docId, time_to_index = optimized("HillaryEmails")
print("Size of index: ", sys.getsizeof(posting), " bytes")

Reading files from  HillaryEmails


100%|█████████████████████████████████████████████████████████████████████████████| 7945/7945 [00:53<00:00, 148.46it/s]


Sorting the tokens ...
Transforming into postings ...
Finished indexing.
Time taken to index:  59.297
Size of index:  2621544  bytes


## Run the Search Engine (without GUI)

In [None]:
while True:
    query = input("Enter a query (type q to exit) : ")
    if query == "q":
        break
    clear_output()
    start_time = time.time()
    result = processQuery(query)
    
    print( len(result), "documents found for ", query)
    print("Time taken to search: ", time.time()-start_time)
    
    for doc in result:
        print(os.path.basename(docId[int(doc)]))

12 documents found for  place where good foods are available
Time taken to search:  0.009974479675292969
1541.txt
1893.txt
2561.txt
4212.txt
4292.txt
5490.txt
5789.txt
6037.txt
6258.txt
6478.txt
6707.txt
6708.txt


## Run the Search Engine (with GUI)

In [32]:
window = tkinter.Tk()

def processBtn():
    start_time = time.time()
    query = inputEntry.get()
    result = processQuery(query)
    toShow = str(len(result)) + " documents found for " + query + "\n"
    toShow += "Time taken to search: " + str(time.time()-start_time) +"\n"
    rs.config(text = toShow)
    rsFiles = ""
    for num, doc in enumerate(result):
        rsFiles += str(num) + ". " + os.path.basename(docId[int(doc)]) + "\n"
    rsdetail.config(text = rsFiles)  

window.title("NTU Search Engine")
window.geometry("600x1024")
label = tkinter.Label(window, text = "Welcome to our Search Engine", width = 60, fg="red")
label.grid(row = 0)
inputEntry = tkinter.Entry(window,text = "Type here to search", width= 60)
inputEntry.grid(row=1, column = 0)
button_widget = tkinter.Button(window,text="Search", command = processBtn)
button_widget.grid(row=1, column = 1)
rs = tkinter.Label(window, text="", fg="red")
rs.grid(row=3, column = 0, columnspan=2)

rsdetail = tkinter.Message(window, bg="white")
rsdetail.grid(row=4, column = 0, columnspan=2, sticky = "w")

window.mainloop()

### Test the time comparison between Normal and Optimized (average of 10 runs with data sample of 100 files)

In [21]:
time_normal = 0
time_opt = 0
for i in range(10):
    print("Running ", i, " iteration for optimized one")
    pO, dO, time_to_index_opt = optimized("data_sample")
    time_opt += time_to_index_opt
    clear_output()
    print("Running ", i, " iteration for normal one")
    p, d, time_to_index_normal = non_optimized("data_sample")
    time_normal += time_to_index_normal
    clear_output()
print("Normal: " , time_normal/10, "seconds average")
print("Optimized: ", time_opt/10, "seconds average")
print(((time_normal-time_opt)/time_opt)*100, "% faster")

Normal:  2.983490324020386 seconds average
Optimized:  2.357524847984314 seconds average
26.551808205596345 % faster
