In [76]:
import os
import nltk
import pandas as pd
import time
import string
import re
import tkinter
import sys
from pympler import asizeof
from tqdm import tqdm,tqdm_notebook
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from IPython.display import clear_output
import pickle

# !pip install pympler

### 0.0 List the files in the specified directory

In [2]:
def listFile(d):
    """Return the absolute paths of all entries in directory ``d``.

    ``os.listdir`` returns names in arbitrary order. The indexers in this
    notebook number documents in the order they iterate over this list, so
    the names are sorted first to keep document ids reproducible across runs.

    Parameters
    ----------
    d : str
        Path of the directory to list.

    Returns
    -------
    list of str
        Absolute path of every entry in ``d``, ordered by entry name.

    Raises
    ------
    OSError
        If ``d`` cannot be listed (propagated from ``os.listdir``).
    """
    return [os.path.abspath(os.path.join(d, name)) for name in sorted(os.listdir(d))]

### 0.1 Read the file and return the content

In [3]:
def readFile(d):
    """Read the text file at path ``d`` and return its entire content.

    The file is decoded as UTF-8. It is opened in a ``with`` block so the
    handle is closed as soon as the content has been read, even when reading
    or decoding fails.

    Parameters
    ----------
    d : str
        Path of the file to read.

    Returns
    -------
    str
        The full decoded content of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    with open(d, "r", encoding='utf-8') as file:
        return file.read()

### 1. Create token

In [4]:
#unoptimized (input - content, document number)
def createToken(content, dnum):
    """Split ``content`` on whitespace and tag every token with ``dnum``.

    This is the variant the notebook labels "unoptimized": the result list is
    rebuilt by concatenation for each token, and that is kept as-is here.

    Parameters
    ----------
    content : str
        Raw text of one document.
    dnum : int
        Document number attached to every token.

    Returns
    -------
    list of [str, int]
        One ``[token, dnum]`` pair per whitespace-separated token, in order.
    """
    pairs = []
    for word in WhitespaceTokenizer().tokenize(content):
        # Concatenation (not append) is the baseline behaviour being measured.
        pairs = pairs + [[word, dnum]]
    return pairs

In [5]:
#optimized (input takes only content)
def createTokenOptimized(content):
    """Split ``content`` into whitespace-separated tokens.

    Unlike ``createToken`` no document number is attached to the tokens.

    Parameters
    ----------
    content : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    splitter = WhitespaceTokenizer()
    return [piece for piece in splitter.tokenize(content)]

### 2. Linguistic (lower, stemming)

In [6]:
def linguisticToken(token_list):
    """Normalise ``[token, doc_id]`` pairs: strip punctuation, lowercase, stem.

    Tokens that consist only of stripped characters are dropped.

    Parameters
    ----------
    token_list : iterable of [str, doc_id]
        Pairs as produced by ``createToken``.

    Returns
    -------
    list of [str, doc_id]
        The normalised pairs, in input order, with the doc id unchanged.
    """
    stemmer = PorterStemmer() #stem
    # A WordNetLemmatizer could be used here instead of the stemmer.

    # Although it looks like a regex character class, this string is used as a
    # plain set of characters to delete (the brackets, '-', '_' and the space
    # are themselves members of that set).
    strip_chars = "[!@#$%^&*()-_=+'`~ \":;|/.,?[]{}<>]"
    # The translation table is identical for every token, so build it once.
    strip_table = str.maketrans('', '', strip_chars)

    tokens = list()
    for t, d in token_list:
        token = t.translate(strip_table) #remove punctuations
        if token == '': #if the token is only punctuation
            continue
        token = stemmer.stem(token.lower())
        tokens.append([token, d])
    return tokens

In [7]:
#optimized (input takes only the token list, without document ids)
def linguisticTokenOptimized(token_list):
    """Normalise plain tokens: strip punctuation, lowercase, stem.

    Tokens that consist only of stripped characters are dropped.

    Parameters
    ----------
    token_list : iterable of str
        Tokens as produced by ``createTokenOptimized``.

    Returns
    -------
    list of str
        The normalised tokens, in input order.
    """
    stemmer = PorterStemmer() #stem
    # A WordNetLemmatizer could be used here instead of the stemmer.

    # Although it looks like a regex character class, this string is used as a
    # plain set of characters to delete (the brackets, '-', '_' and the space
    # are themselves members of that set).
    strip_chars = "[!@#$%^&*()-_=+'`~ \":;|/.,?[]{}<>]"
    # The translation table is identical for every token, so build it once.
    strip_table = str.maketrans('', '', strip_chars)

    tokens = list()
    for t in token_list:
        token = t.translate(strip_table) #remove punctuations
        if token == '': #if the token is only punctuation
            continue
        tokens.append(stemmer.stem(token.lower()))
    return tokens

### 3. Sorting

In [8]:
#unoptimized (sort by term, document id)
def sortToken(token_list):
    """Sort ``[term, doc_id]`` pairs in place by term, then by doc id.

    Parameters
    ----------
    token_list : list of [term, doc_id]
        Pairs to sort; the list itself is reordered.

    Returns
    -------
    list
        The same list object, now sorted.
    """
    def by_term_then_doc(pair):
        return pair[0], pair[1]

    token_list.sort(key=by_term_then_doc)
    return token_list

In [9]:
#sort by term only
def sortTokenFirstOpt(token_list):
    """Sort ``[term, doc_id]`` pairs in place by term only.

    The sort is stable, so pairs sharing a term keep their input order.

    Parameters
    ----------
    token_list : list of [term, doc_id]
        Pairs to sort; the list itself is reordered.

    Returns
    -------
    list
        The same list object, now sorted by term.
    """
    def term_of(pair):
        return pair[0]

    token_list.sort(key=term_of)
    return token_list

In [10]:
#sort by term and append docID
def sortTokenOptimized(listStr):
    """Flatten per-document term lists into sorted ``[term, doc_id]`` pairs.

    Parameters
    ----------
    listStr : iterable of [terms, doc_id]
        One entry per document: the document's terms and its id.

    Returns
    -------
    list of [term, doc_id]
        A new list with one pair per term occurrence, sorted by term. The sort
        is stable, so pairs with equal terms stay in document order.
    """
    flattened = [[term, entry[1]] for entry in listStr for term in entry[0]]
    return sorted(flattened, key=lambda pair: pair[0])

### 4. Transform into posting

In [12]:
def transformPosting(sorted_list):
    """Group ``(term, doc_id)`` pairs into a posting dictionary.

    Duplicate doc ids of a term are removed and the remaining ids are ordered
    by their string form.

    NOTE(review): string ordering puts 10 before 2, whereas ``mergePostings``
    compares ids with ``int``; confirm this ordering is intended before using
    these postings for merging.

    Parameters
    ----------
    sorted_list : iterable of (term, doc_id)
        Term/document pairs.

    Returns
    -------
    dict
        Maps each term to ``[number_of_distinct_doc_ids, doc_id_list]``.
    """
    grouped = {}
    for term, docId in sorted_list:
        if term not in grouped:
            grouped[term] = []
        grouped[term].append(docId)

    for term, doc_ids in grouped.items():
        # dict.fromkeys drops duplicates while keeping first-seen order.
        distinct = sorted(dict.fromkeys(doc_ids), key=str)
        grouped[term] = [len(distinct), distinct]
    return grouped

In [11]:
def transformPostingOptimized(sorted_list):
    """Group ``(term, doc_id)`` pairs into a posting dictionary.

    Duplicate doc ids of a term are removed and the remaining ids are ordered
    numerically (each id is converted with ``int`` for comparison only).

    Parameters
    ----------
    sorted_list : iterable of (term, doc_id)
        Term/document pairs; every doc id must be convertible with ``int``.

    Returns
    -------
    dict
        Maps each term to ``[number_of_distinct_doc_ids, doc_id_list]``.
    """
    grouped = {}
    for term, docId in sorted_list:
        if term not in grouped:
            grouped[term] = []
        grouped[term].append(docId)

    for term, doc_ids in grouped.items():
        # dict.fromkeys drops duplicates while keeping first-seen order.
        distinct = sorted(dict.fromkeys(doc_ids), key=int)
        grouped[term] = [len(distinct), distinct]
    return grouped

### 5. Merge the Postings (Intersecting)

In [14]:
#implementation of the algorithm from the lecture (for AND operations)
def mergePostings(postingList):
    """Intersect several posting lists (boolean AND).

    The lists are intersected pairwise, left to right, by walking two cursors
    forward. Each list must be in ascending numeric order; ids are compared
    via ``int`` so they may be ints or numeric strings.

    Parameters
    ----------
    postingList : list of list
        The posting lists to intersect. May be empty.

    Returns
    -------
    list
        Doc ids present in every posting list, in ascending order. An empty
        ``postingList`` yields ``[]``; a single list is returned unchanged.
    """
    # No posting lists (e.g. a query with no usable terms) means no matches,
    # rather than an IndexError on postingList[0].
    if not postingList:
        return []

    posting1 = postingList[0]
    for posting2 in postingList[1:]:
        merged = []
        p = 0
        q = 0
        while p < len(posting1) and q < len(posting2):
            left = int(posting1[p])
            right = int(posting2[q])
            if left == right:
                merged.append(posting1[p])
                p += 1
                q += 1
            elif left < right:
                p += 1
            else:
                q += 1
        # The running intersection becomes the left operand of the next merge.
        posting1 = merged
    return posting1

### Merge Postings (for OR operations) - Not in Use

In [15]:
#for OR operations
def mergeOrPostings(postingList):
    """Union several posting lists (boolean OR).

    The lists are merged pairwise, left to right. Each list must be in
    ascending numeric order; ids are compared via ``int``.

    Parameters
    ----------
    postingList : list of list
        The posting lists to merge. May be empty.

    Returns
    -------
    list
        Every doc id that occurs in at least one list, ascending, without
        duplicates across lists. An empty ``postingList`` yields ``[]``.
    """
    # Nothing to merge: return an empty result instead of raising IndexError.
    if not postingList:
        return []

    posting1 = postingList[0]
    for posting2 in postingList[1:]:
        merged = []
        p = 0
        q = 0
        while p < len(posting1) and q < len(posting2):
            left = int(posting1[p])
            right = int(posting2[q])
            if left == right:
                merged.append(posting1[p])
                p += 1
                q += 1
            elif left < right:
                merged.append(posting1[p])
                p += 1
            else:
                merged.append(posting2[q])
                q += 1
        # At most one list still has unread ids; the other slice is empty.
        merged += posting1[p:]
        merged += posting2[q:]
        posting1 = merged
    return posting1

### 6. Create Index

In [18]:
#create inverted index for the directory
def non_optimized(directory):
    """Build an inverted index over every file in ``directory`` (baseline).

    Pipeline: tokenize each file into ``[token, doc_id]`` pairs, normalise
    them, sort all pairs by (term, doc id) and turn them into postings.
    Files that fail to read or process are reported and skipped; doc ids are
    only assigned to files that were processed successfully.

    Parameters
    ----------
    directory : str
        Directory whose files are indexed.

    Returns
    -------
    tuple
        ``(posting, did, time_to_index)``: the posting dictionary, a dict
        mapping doc id to file path, and the elapsed seconds.
    """
    start_time = time.time()

    print("Reading files from ", directory)
    files = listFile(directory) #list dir
    tokens = list()
    num = 0
    did = {}

    for docs in tqdm(files):
        try:
            file_content = readFile(docs) #read content
            token = createToken(file_content, num) #create token
            token = linguisticToken(token) #stemming
        except Exception as err:
            # Best effort: report the file and why it failed, then move on.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt
            # this long-running loop.
            print("Skipped", docs, "-", repr(err))
            continue
        tokens += token
        did[num] = docs
        num += 1

    print("Sorting the tokens ...")
    tokens = sortToken(tokens) #token from all files

    print("Transforming into postings ...")
    posting = transformPostingOptimized(tokens) #create posting from these files

    end_time = time.time()
    time_to_index = end_time - start_time
    print("Finished indexing.\nTotal Time taken to index: " , round(time_to_index,3))
    return posting, did, time_to_index

In [17]:
#create inverted index for the directory
def first_optimized(directory):
    """Build an inverted index over ``directory``, sorting by term only.

    Same pipeline as the baseline indexer except that the collected
    ``[token, doc_id]`` pairs are sorted with ``sortTokenFirstOpt`` (term as
    the only sort key). Files that fail to read or process are reported and
    skipped; doc ids are only assigned to successfully processed files.

    Parameters
    ----------
    directory : str
        Directory whose files are indexed.

    Returns
    -------
    tuple
        ``(posting, did, time_to_index)``: the posting dictionary, a dict
        mapping doc id to file path, and the elapsed seconds.
    """
    start_time = time.time()

    print("Reading files from ", directory)
    files = listFile(directory) #list dir
    tokens = list()
    num = 0
    did = {}

    for docs in tqdm(files):
        try:
            file_content = readFile(docs) #read content
            token = createToken(file_content, num) #create token
            token = linguisticToken(token) #stemming
        except Exception as err:
            # Best effort: report the file and why it failed, then move on.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt
            # this long-running loop.
            print("Skipped", docs, "-", repr(err))
            continue
        tokens += token
        did[num] = docs
        num += 1

    print("Sorting the tokens ...")
    tokens = sortTokenFirstOpt(tokens) #token from all files

    print("Transforming into postings ...")
    posting = transformPostingOptimized(tokens) #create posting from these files

    end_time = time.time()
    time_to_index = end_time - start_time
    print("Finished indexing.\nTotal Time taken to index: " , round(time_to_index,3))
    return posting, did, time_to_index

In [19]:
#optimized inverted index
def optimized(directory):
    """Build an inverted index over ``directory``, tagging doc ids late.

    Each file is tokenized and normalised as a plain token list; the doc id
    is stored once per document and only expanded to ``[term, doc_id]`` pairs
    inside ``sortTokenOptimized``. Files that fail to read or process are
    reported and skipped; doc ids are only assigned to successful files.

    Parameters
    ----------
    directory : str
        Directory whose files are indexed.

    Returns
    -------
    tuple
        ``(posting, did, time_to_index)``: the posting dictionary, a dict
        mapping doc id to file path, and the elapsed seconds.
    """
    start_time = time.time()

    print("Reading files from " , directory)
    files = listFile(directory) #list dir
    tokens = list()
    num = 0
    did = {}

    for docs in tqdm(files):
        try:
            file_content = readFile(docs) #read content
            token = createTokenOptimized(file_content) #create token
            token = linguisticTokenOptimized(token) #stemming
        except Exception as err:
            # Best effort: report the file and why it failed, then move on.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt
            # this long-running loop.
            print("Skipped", docs, "-", repr(err))
            continue
        tokens.append([token, num])
        did[num] = docs
        num += 1

    print("Sorting the tokens ...")
    tokens = sortTokenOptimized(tokens) #token from all files

    print("Transforming into postings ...")
    posting = transformPostingOptimized(tokens)

    end_time = time.time()
    time_to_index = end_time - start_time
    print("Finished indexing.\nTotal Time taken to index: " , round(time_to_index,3))
    return posting, did, time_to_index

### Process the Query (transform into token, search, merge and return)

In [20]:
def processQuery(query):
    """Return the doc ids that contain every term of ``query`` (AND search).

    The query goes through the same tokenizing and normalising steps as the
    indexed documents. Postings are looked up in the module-level ``posting``
    dictionary, which must have been built or loaded beforehand.

    Parameters
    ----------
    query : str
        Free-text query.

    Returns
    -------
    list
        Doc ids matching all query terms. Empty when the query has no usable
        terms or when a term is not in the index (that term is printed).
    """
    q_token = createTokenOptimized(query)
    q_token = linguisticTokenOptimized(q_token)
    posting_list = []

    for token in q_token:
        # Explicit membership test instead of a bare except, so unrelated
        # errors (e.g. ``posting`` never defined) are not silently hidden.
        if token not in posting:
            print(token)
            # One missing term makes the whole intersection empty.
            return []
        posting_list.append(posting[token][1])

    # Blank or punctuation-only query: nothing to intersect.
    if not posting_list:
        return []
    return mergePostings(posting_list)

### Run the indexer (Unoptimized)

In [21]:
# Build the baseline index, then report the in-memory size of the posting
# dictionary and of the doc-id -> path dictionary.
posting, docId, time_to_index = non_optimized("datafull-lean")
print(
    "Size of index: ",
    asizeof.asizeof(posting)/1000000,
    "MB. \nSize of dictionary",
    asizeof.asizeof(docId)/1000000,
    "MB",
)

Reading files from  datafull-lean


100%|████████████████████████████████████| 47938/47938 [35:40<00:00, 33.53it/s]


Sorting the tokens ...
Transforming into postings ...
Finished indexing.
Total Time taken to index:  2600.666
Size of index:  628.085488 MB. 
Size of dictionary 15.494416 MB


### Run the indexer (Optimized - Single Sort)

In [38]:
# Build the single-sort index, then report the in-memory size of the posting
# dictionary and of the doc-id -> path dictionary.
posting, docId, time_to_index = first_optimized("datafull-lean")
# Measure the index that was just built; no variable ``posting1`` is defined
# anywhere in this notebook, so the old call fails on a fresh kernel.
print("Size of index: ", asizeof.asizeof(posting)/1000000,"MB. \nSize of dictionary", asizeof.asizeof(docId)/1000000, "MB")

Reading files from  datafull-lean


100%|████████████████████████████████████| 47938/47938 [34:08<00:00, 23.41it/s]


Sorting the tokens ...
Transforming into postings ...
Finished indexing.
Total Time taken to index:  2140.685
Size of index:  628.085488 MB. 
Size of dictionary 15.494416 MB


### Run the indexer (Optimized Appending of DocID and Single Sort)

In [21]:
# Build the fully optimized index, then report the in-memory size of the
# posting dictionary and of the doc-id -> path dictionary.
posting, docId, time_to_index = optimized("datafull-lean")
# Measure the index that was just built; no variable ``posting2`` is defined
# anywhere in this notebook, so the old call fails on a fresh kernel.
print("Size of index: ", asizeof.asizeof(posting)/1000000,"MB. \nSize of dictionary", asizeof.asizeof(docId)/1000000, "MB")

Reading files from  datafull-lean


100%|████████████████████████████████████| 47938/47938 [22:29<00:00, 35.52it/s]


Sorting the tokens ...
Transforming into postings ...
Finished indexing.
Total Time taken to index:  1470.042
Size of index:  628.085488 MB. 
Size of dictionary 15.494416 MB


### Final IR System Query

In [None]:
# Interactive search loop: read a query, run it against the index and list
# the file names of the matching documents. Typing "q" ends the loop.
while True:
    query = input("Enter a query (type q to exit) : ")
    if query == "q":
        break
    clear_output()
    start_time = time.time()
    result = processQuery(query)

    print( len(result), "documents found for '"+query+"'")
    print("Time taken to search: ", time.time()-start_time)

    for doc in result:
        # docId maps a document id back to the path it was indexed from.
        print(os.path.basename(docId[int(doc)]))

### Saving the posting list, docID and time_to_index as backups

In [69]:
# pickle.dump(posting, open("backup/posting.p","wb"))
# pickle.dump(docId,open("backup/docId.p","wb"))
# pickle.dump(time_to_index,open("backup/time_to_index.p","wb"))

### Loading the posting list, docID and time_to_index from disk

In [None]:
# Restore the index from the pickled backups.
# NOTE: unpickling can execute arbitrary code -- only load backup files that
# were written by this notebook.
# Each file is opened in a ``with`` block so its handle is closed after loading.
with open("backup/posting.p","rb") as backup_file:
    posting = pickle.load(backup_file)
with open("backup/docId.p","rb") as backup_file:
    docId = pickle.load(backup_file)
with open("backup/time_to_index.p","rb") as backup_file:
    time_to_index = pickle.load(backup_file)

# Code for Size Optimization
## Includes:
- Code required for Variable Bytes Compression of the posting lists
- Dictionary-as-a-String Compression for terms
- Combination of Dictionary-as-a-String Compression and Variable Bytes Compression

In [80]:
# code required for variable bytes compression of the posting lists

def encodeNumber(n):
    """Variable-byte encode one non-negative integer.

    The number is split into 7-bit groups, most significant group first.
    The last byte has 128 added to mark the end of the number, which is the
    marker ``decode`` looks for.

    Parameters
    ----------
    n : int
        Non-negative integer to encode.

    Returns
    -------
    bytearray
        The encoded bytes of ``n``.

    Raises
    ------
    ValueError
        If ``n`` is negative. Such values cannot be represented and would
        otherwise be encoded silently as a different number.
    """
    if n < 0:
        raise ValueError("variable-byte encoding requires a non-negative integer, got %r" % (n,))
    bytes_list = []
    while True:
        # Prepend so the most significant 7-bit group ends up first.
        bytes_list.insert(0, n % 128)
        if n < 128:
            break
        n = n // 128
    bytes_list[-1] += 128  # terminator flag on the final byte
    return bytearray(bytes_list)
    
def encode(numbers):
    """Variable-byte encode a sequence of integers into one byte string.

    Parameters
    ----------
    numbers : iterable of int
        Values to encode, each passed to ``encodeNumber``.

    Returns
    -------
    bytes
        The per-number encodings concatenated in input order.
    """
    encoded_chunks = [encodeNumber(value) for value in numbers]
    return b"".join(encoded_chunks)
    
def decode(bytes_list):
    """Decode a variable-byte encoded byte sequence back into integers.

    A byte below 128 contributes seven more bits to the current number; a
    byte of 128 or above contributes its low part and completes the number.

    Parameters
    ----------
    bytes_list : iterable of int
        Byte values, e.g. a ``bytes`` or ``bytearray`` object.

    Returns
    -------
    list of int
        The decoded numbers in order. Trailing bytes without a terminating
        byte are not emitted.
    """
    decoded = []
    current = 0
    for value in bytes_list:
        ends_number = value >= 128
        payload = value - 128 if ends_number else value
        current = current * 128 + payload
        if ends_number:
            decoded.append(current)
            current = 0
    return decoded

def variableBytesCompression(posting):
    """Compress every posting list with gap plus variable-byte encoding.

    Each doc-id list is turned into its first id followed by the differences
    between consecutive ids, and that gap list is passed to ``encode``.

    Parameters
    ----------
    posting : dict
        Maps each term to ``[docfreq, doc_id_list]``.

    Returns
    -------
    dict
        Maps each term to ``[docfreq, encoded_gaps]``.
    """
    compressed = {}
    for term in tqdm_notebook(posting):
        docfreq, doc_ids = posting[term][0], posting[term][1]
        # First id is stored as-is (slice keeps an empty list empty); every
        # later id is stored as the distance from its predecessor.
        gaps = doc_ids[:1] + [later - earlier for earlier, later in zip(doc_ids, doc_ids[1:])]
        compressed[term] = [docfreq, encode(gaps)]
    return compressed

def search_variableBytesCompression(ptr,t):
    """Look up term ``t`` in a variable-byte compressed index and expand it.

    Parameters
    ----------
    ptr : dict
        Maps each term to ``[docfreq, encoded_gaps]``.
    t : str
        Term to look up; a missing term raises ``KeyError``.

    Returns
    -------
    list
        ``[docfreq, doc_id_list]`` with the gaps summed back into doc ids.
    """
    entry = ptr[t]
    gaps = decode(entry[1])
    # Running sum of the gaps restores the original doc ids.
    doc_ids = [gaps[0]]
    for gap in gaps[1:]:
        doc_ids.append(doc_ids[-1] + gap)
    return [entry[0], doc_ids]

def dict_as_string(posting):
    """Store all terms as one concatenated string plus an offset-keyed index.

    Terms are appended to the string in dictionary order. Each posting entry
    is keyed by the offset at which its term ends in that string.

    Parameters
    ----------
    posting : dict
        Maps each term to ``[docfreq, doc_id_list]``.

    Returns
    -------
    tuple
        ``(longstr, ptr)``: the concatenated terms and a dict mapping each
        term's end offset to ``[docfreq, doc_id_list]``.
    """
    pieces = []
    ptr = {}
    end_offset = 0
    for term in tqdm_notebook(posting):
        pieces.append(term)
        end_offset = end_offset + len(term)
        entry = posting[term]
        ptr[end_offset] = [entry[0], entry[1]]
    return "".join(pieces), ptr

def search_dict_as_string(longstr,ptr,t):
    """Find term ``t`` in a dictionary-as-a-string index.

    The keys of ``ptr`` are walked in order as term end offsets; each term is
    sliced out of ``longstr`` and compared with ``t``.

    Parameters
    ----------
    longstr : str
        All terms concatenated.
    ptr : dict
        Maps each term's end offset to its posting entry.
    t : str
        Term to look up.

    Returns
    -------
    object or None
        The posting entry stored for ``t``, or ``None`` if it is not found.
    """
    term_start = 0
    for term_end, entry in ptr.items():
        if longstr[term_start:term_end] == t:
            return entry
        # The next term begins where this one ended.
        term_start = term_end
    return None
        
def combined_compress(posting):
    """Apply dictionary-as-a-string and variable-byte compression together.

    Terms are concatenated into one string. Each posting list is stored as
    variable-byte encoded gaps, keyed by the end offset of its term.

    Parameters
    ----------
    posting : dict
        Maps each term to ``[docfreq, doc_id_list]``.

    Returns
    -------
    tuple
        ``(longstr, ptr)``: the concatenated terms and a dict mapping each
        term's end offset to ``[docfreq, encoded_gaps]``.
    """
    pieces = []
    ptr = {}
    end_offset = 0
    for term in tqdm_notebook(posting):
        pieces.append(term)
        end_offset = end_offset + len(term)

        docfreq, doc_ids = posting[term][0], posting[term][1]
        # First id is stored as-is (slice keeps an empty list empty); every
        # later id is stored as the distance from its predecessor.
        gaps = doc_ids[:1] + [later - earlier for earlier, later in zip(doc_ids, doc_ids[1:])]
        ptr[end_offset] = [docfreq, encode(gaps)]
    return "".join(pieces), ptr

def combined_search(longstr,ptr,t):
    """Find term ``t`` in the combined compressed index and expand its posting.

    The keys of ``ptr`` are walked in order as term end offsets. When the
    term sliced from ``longstr`` equals ``t``, its encoded gaps are decoded
    and summed back into doc ids.

    Parameters
    ----------
    longstr : str
        All terms concatenated.
    ptr : dict
        Maps each term's end offset to ``[docfreq, encoded_gaps]``.
    t : str
        Term to look up.

    Returns
    -------
    list or None
        ``[docfreq, doc_id_list]`` for ``t``, or ``None`` if it is not found.
    """
    term_start = 0
    for term_end, entry in ptr.items():
        if longstr[term_start:term_end] == t:
            gaps = decode(entry[1])
            # Running sum of the gaps restores the original doc ids.
            doc_ids = [gaps[0]]
            for gap in gaps[1:]:
                doc_ids.append(doc_ids[-1] + gap)
            return [entry[0], doc_ids]
        # The next term begins where this one ended.
        term_start = term_end
    return None

In [81]:
# Variable Bytes Compression on Posting List
# Variable Bytes Compression on Posting List
ptrVariableBytes = variableBytesCompression(posting) 
print('Size after Variable Bytes Compression:',asizeof.asizeof(ptrVariableBytes)/1000000,'MB')

# Dict-as-String Compression for Terms
longstr,ptrDaS= dict_as_string(posting)
print('Size after Dict-as-String Compression:',asizeof.asizeof(ptrDaS)/1000000,'MB')
print('Dictionary String Size:',asizeof.asizeof(longstr)/1000000,'MB')
print('Total Size:',asizeof.asizeof(longstr)/1000000+asizeof.asizeof(ptrDaS)/1000000,'MB')

# Combination of Dict-as-String and Variable Bytes Compression
longstr2,ptrCombined = combined_compress(posting)
# This figure is for the combined scheme, so it gets its own label instead of
# repeating the Dict-as-String one.
print('Size after Combined Compression:',asizeof.asizeof(ptrCombined)/1000000,'MB')
print('Dictionary String Size:',asizeof.asizeof(longstr2)/1000000,'MB')
print('Total Size:',asizeof.asizeof(longstr2)/1000000+asizeof.asizeof(ptrCombined)/1000000,'MB')

HBox(children=(IntProgress(value=0, max=1477855), HTML(value='')))

Size after Variable Bytes Compression: 382.797928 MB


HBox(children=(IntProgress(value=0, max=1477855), HTML(value='')))

Size after Dict-as-String Compression: 576.971088 MB
Dictionary String Size: 55.763536 MB
Total Size: 632.734624 MB


HBox(children=(IntProgress(value=0, max=1477855), HTML(value='')))

Size after Dict-as-String Compression: 331.683528 MB
Dictionary String Size: 55.763536 MB
Total Size: 387.447064 MB
