# CI6226 Assignment

## Team: GoogleX
- Dwivedi Vijay Prakash (G1902961C)  
- Shakya Manoj (G1902549)  
- Emadeldeen Ahmed (G1900649D)  

In [1]:
import os
from os.path import isfile, join
import re
import string
from nltk.stem.porter import *

## 0. Toolkit Util Functions

### Function for Directory Listing
- Input: string (path to directory)  
- Output: list of strings (full paths to files in the directory)

In [2]:
def list_all_files(path='HillaryEmails'):
    root = join(os.getcwd(), path)
    file_paths = [join(root, file) for file in os.listdir(path) if isfile(join(path, file))]
    return file_paths

### Function for File Reading
- Input: string (full path to file)  
- Output: string/text (full contents of a file)

In [17]:
def read_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
        content = content.replace("\n", " ")     # removing \n and replacing with space
        content = content.replace("\t", " ")     # removing \t and replacing with space
        return content

## 1. Tokenization

### Fucntion for tokenization based on whitespace characters (space, newline, tab)
- Input: text (file contents), string (document id = path to file)  
- Output: list of pairs < string (token) , string (document id) >  

The tokenize function defined below also takes care of multiple whitespace characters in a row.

In [4]:
def tokenize(file_content, file_path):
    return [(x, file_path) for x in re.split(r"([.,!?]+)?\s+", file_content) if x]

In [24]:
# D E M O    O F    T O K E N I Z E     F U N C T I O N

file1 = list_all_files()[23]
file_content = read_file(file1)

tokenize(file_content, file1)[136:141]    # displaying only 5 for demo

[('that', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('she', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('will', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('host', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('a', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt')]

## 2. Linguistic Modules

### Function for removing punctuation, lowercasing, and stemming
- Input: list of pairs < token , document id >
- Output: list of pairs < modified token , document id >

In [25]:
def linguistic_clean(token_file_path_pairs):
    modified_list = []
    for pair in token_file_path_pairs:
        token, file_path = pair

        # Removing punctuations
        punctuations = '''!@#$%^&*()-_=+'`~ ":;|/.,?[]{}<>'''
        for char in token.lower(): 
            if char in punctuations: 
                token = token.replace(char, "") 

        # Lowercasing the token
        token = token.lower()

        # Stemming
        stemmer = PorterStemmer()
        token = stemmer.stem(token)
        
        modified_list.append((token, file_path))
    
    return modified_list

In [90]:
# D E M O    O F    L I N G U I S T I C      C L E A N     F U N C T I O N

file1 = list_all_files()[23]
file_content = read_file(file1)

token_docid_pairs = tokenize(file_content, file1)[136:141]    # displaying only 5 for demo

linguistic_clean(token_docid_pairs)

[('that', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('she', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('will', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('host', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('a', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt')]

## 3. Sorting the Tokens

### Function for sorting the tokens doc id pair, first by token, and then by doc id (file paths)
- Input: list of pairs < token , document id >
- Output: sorted list of pairs < token , document id >

In [91]:
def sort_tokens(token_file_path_pairs):
    token_file_path_pairs.sort(key=lambda tup: tup[1])    # then by doc id (file paths)
    token_file_path_pairs.sort(key=lambda tup: tup[0])    # sorting first by token
    return token_file_path_pairs

In [92]:
# D E M O    O F    S O R T     T O K E N S     F U N C T I O N

linguistic_cleaned_token_docid_pairs = linguistic_clean(token_docid_pairs)
sort_tokens(linguistic_cleaned_token_docid_pairs)

[('a', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('host', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('she', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('that', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'),
 ('will', '/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt')]

## 4. Transformation into Postings

### Function for converting the sorted list of token doc id pairs to inverted index
- Input: sorted list of pairs < token , document id >
- Output: inverted index

In [93]:
def create_posting(sorted_token_file_path_pairs):
    inv_index, token_frequency = {}, {}
    for pair in sorted_token_file_path_pairs:
        token, file_path = pair
        if not token in inv_index:
            token_frequency[token] = 1
            inv_index[token] = [file_path]
        else:
            if file_path != inv_index[token][-1]:
                inv_index[token].append(file_path)
                token_frequency[token] += 1
    inv_index['__freq__'] = token_frequency
    return inv_index

In [94]:
# D E M O    O F    C R E A T E     P O S T I N G     F U N C T I O N

sorted_token_file_path_pair = sort_tokens(linguistic_cleaned_token_docid_pairs)
create_posting(sorted_token_file_path_pair)

{'a': ['/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'],
 'host': ['/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'],
 'she': ['/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'],
 'that': ['/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'],
 'will': ['/Users/Vijay/Desktop/IR_Project/HillaryEmails/4117.txt'],
 '__freq__': {'a': 1, 'host': 1, 'she': 1, 'that': 1, 'will': 1}}

## 5. Postings List Merge

### Function for merging posting lists of two or more tokens
- Input: list of postings lists
- Output: merged postings list

In [None]:
# def func_call(temp_dict)


# key1
# key2
# key3

# temp_dict = {}
# temp_dict[key1] = inv_index[key1]
# temp_dict[key2] = inv_index[key2]
# temp_dict[key2] = inv_index[key3]



In [62]:
# D E M O      R U N

# all_pairs = []
# for file in list_all_files()[23:28]:
#     text = read_file(file)
#     all_pairs.extend(tokenize(text, file))

# inv_index = create_posting(sort_tokens(linguistic_clean(all_pairs)))

# inv_index['xpress']
# inv_index['__freq__']['xpress']