# Import Statements

In [1]:
import pandas as pd
import numpy as np
import string
import os
import re
import pickle

In [2]:
from sortedcontainers import SortedDict, SortedList, SortedSet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Read From Files

In [3]:
def getListOfFiles(directory):
    '''
    Parameters:
        directory: type(string)
        
    returns: list of all files in directory with the full path of file
    '''
    
    list_of_files = []
    
    for file_path in os.listdir(directory):
        full_path = os.path.join(directory, file_path)
        if os.path.isfile(full_path):
            list_of_files.append(full_path)
    
    return list_of_files

# Preprocessing Functions

In [4]:
def lowercase(data):
    '''
    Parameters:
        data: type(string)
    
    returns: lowercase of data
    '''
    
    return data.lower()

In [5]:
def perform_word_tokenize(corpus):
    '''
    Parameters:
        corpus: type(string)
    
    returns word-level tokenization of corpus
    '''
    
    return word_tokenize(corpus)

In [6]:
def remove_stopwords_from_tokens(tokens, stopwords_set):
    '''
    Parameters:
        tokens: type(list)
        stopwords_set: type(set)
    
    returns: tokens without stopwords
    '''
    tokens_sans_stopwords = [x for x in tokens if x not in stopwords_set]
    
    return tokens_sans_stopwords

In [7]:
def remove_punctuation_from_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: tokens without punctuation
    '''
    tokens_sans_punctuation = [x.translate(str.maketrans('', '', string.punctuation)) for x in tokens]
    
    return tokens_sans_punctuation

In [8]:
def remove_blank_space_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: tokens without blank tokens
    '''
    tokens_sans_blank_space = [x for x in tokens if x!='']
    
    return tokens_sans_blank_space

In [9]:
def lemmatize_tokens(tokens):
    '''
    Parameters:
        tokens: type(list)
    
    returns: returns unique tokens after lemmatization
    '''
    lemmatizer = WordNetLemmatizer()
    lemmatize_tokens = [lemmatizer.lemmatize(x) for x in tokens]
    unique_lemmatize_tokens = list(dict.fromkeys(lemmatize_tokens))
    
    return unique_lemmatize_tokens

In [10]:
def preprocess(corpus, stopwords_set, preprocess_type):
    # Convert the text to lower case
    lowercase_corpus = lowercase(corpus)
    #print(len(lowercase_corpus))
    
    # Perform word tokenization (word_tokenize also takes care of whitespace)
    word_tokens = perform_word_tokenize(lowercase_corpus)
    #print(len(word_tokens))
    
    # Remove stopwords from tokens
    word_tokens_sans_stopwords = remove_stopwords_from_tokens(word_tokens, stopwords_set)
    #print(len(word_tokens_sans_stopwords))
    
    # Remove punctuation marks from tokens
    word_tokens_sans_punctuation = remove_punctuation_from_tokens(word_tokens_sans_stopwords)
    #print(len(word_tokens_sans_punctuation))
    
    # Remove blank space tokens
    word_tokens_sans_blank_tokens = remove_blank_space_tokens(word_tokens_sans_punctuation)
    #print(len(word_tokens_sans_blank_tokens))
    
    # Lemmatize tokens
    #word_tokens_final = lemmatize_tokens(word_tokens_sans_blank_tokens)
    #print(len(word_tokens_final))
    
    return word_tokens_sans_blank_tokens

# Helper Functions

In [11]:
def create_file_dictionary(list_of_files):
    '''
    Paramteres:
        list_of_files: type(string)
    
    returns: file_dictionary with integer key and path_of_file as value
    '''
    file_dictionary = {}
    for i in range(len(list_of_files)):
        file_dictionary[i] = list_of_files[i]
    
    return file_dictionary

In [36]:
def create_positional_index(file_dictionary, stopwords_set):
    # initialize positional index
    positional_index = SortedDict()
    
    # positional index
    for doc_ID in range(1):
        file = open(file_dictionary[doc_ID], 'r', encoding='utf-8', errors='ignore')
        file_corpus = file.read()
        file.close()
        doc_tokens = preprocess(file_corpus, stopwords_set, 'doc')
        print(doc_tokens)
        for index in range(len(doc_tokens)):
            print(doc_tokens[index])
            if doc_tokens[index] in positional_index:
                continue
            else:
                positional_index = [1, {doc_ID:[index]}]
#         for token in doc_tokens:
#             if token in positional_index:
#                 positional_index[token][0] += 1
#                 if doc_ID in positional_index[token][1]:
#                     positional_index[token][1][doc_ID] = doc_tokens.index(token)
#                 else:
#                     positional_index[token][1] = SortedDict({doc_ID:positional_index[token]})
#             else:
#                 positional_index[token] = [1, SortedDict()]
#                 positional_index[token][1] = SortedDict({doc_ID:positional_index[token]})
    
    print(positional_index)
    # Storing positional index
    pi_file = open('positional_index_pickle_file', 'wb')
    pickle.dump(positional_index, pi_file)
    pi_file.close()

# Main

In [37]:
def main():
    # create set of stop words for preprocessing
    stopwords_set = set(stopwords.words('english'))
    
    # Get List of Files in Dataset
    list_of_files = getListOfFiles('Dataset/Humor,Hist,Media,Food/')
    
    # create dictionary of file with docID (integer) as key and full_path of file as value
    file_dictionary = create_file_dictionary(list_of_files)
    
    # create positional index once and then load pickle file afterwards
    create_positional_index(file_dictionary, stopwords_set)
    
    #Loading pre-processed files
    pi_file = open('positional_index_pickle_file', 'rb')
    pi_inverted_index = pickle.load(pi_file)
    pi_file.close()
    
    print(pi_inverted_index)

In [38]:
if __name__ == "__main__":
    main()

['herbalherb1st', 'aidcalendulacomfreyremediessickmedicine', 'herbal', 'first', 'aid', 'kit', 'herbal', 'first', 'aid', 'kit', 'calendula', 'ointment', 'use', 'minor', 'cuts', 'grazes', 'red', 'rashes', 'minor', 'skin', 'rash', 'comfrey', 'ointment', 'suitable', 'bruises', 'minor', 'damage', 'external', 'blood', 'vessels', 'veins', 'st', 'johnswort', 'oil', 'beneficial', 'itchy', 'skin', 'irritable', 'psoriasis', 'also', 'good', 'sunburn', 'applied', 'night', 'liver', 'mixture', 'mild', 'laxative', 'properties', 'helps', 'digestion', 'rich', 'food', 'take', 'one', 'teaspoon', 'night', '30', 'minutes', 'main', 'meal', 'parasite', 'mixture', 'effective', 'common', 'internal', 'parasites', 'infestation', 'suspected', 'abstain', 'food', '24', 'hours', 'take', 'one', 'tablespoon', 'mixture', 'little', 'water', 'repeat', 'dose', 'four', 'hours', 'another', 'four', 'hours', 'parasites', 'died', 'able', 'recommence', 'eating', 'four', 'hours', 'last', 'dose', 'gasp', 'may', 'also', 'used', 'sk