In [1]:
import nltk
#For file reading
import os
from os import listdir
#For pre-processing 
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string
# import re
# from collections import Counter

In [2]:
class InvertedIndex:
    """
    Construct Inverted Index
    """
    def __init__(self):
        self.index = {}
    
    def read_data(self, path: str) -> list:
        """
        Read files from a directory and then append the data of each file into a list.
        """
        self.path = path
        folder = listdir(self.path)
        res = []
        for files in folder:
            # check if current path is a file
            if files.endswith(".txt"):
                filePath = os.path.join(self.path, files)
                if os.path.isfile(filePath):
                    with open(filePath, "r") as file:
                        lines = file.readlines()
                    res.append(lines)
        return res
 
    def process_document(self, document: str) -> list:
        """
        pre-process a document and return a list of its terms
        str->list"""
        
        #Tokenization
        token_list = nltk.word_tokenize(document)
        # Remove punctuations.
        table = str.maketrans('', '', '\t')
        token_list = [word.translate(table) for word in token_list]
        punctuations = (string.punctuation).replace("'", "")
        trans_table = str.maketrans('', '', punctuations)
        stripped_words = [word.translate(trans_table) for word in token_list]
        token_list = [str for str in stripped_words if str]
        # Change to lowercase.
        token_list =[word.lower() for word in token_list]
        return token_list
    
    def index_corpus(self, documents: list) -> None:
        """
        index given documents
        list->None"""
        sequencePair = {}
        listPair = []
        #Store the term and docID in a sequence pair
        for i, doc in enumerate(documents):
            for data in doc:
                tokenList = self.process_document(data)
                for token in tokenList:
                    sequencePair[token] = i+1
                    listPair.append(sequencePair)
        print("ListPairs:\n",listPair)
        #Sort the sequence pair first by term then by 
        sortedDict = {val[0] : val[1] for val in sorted(sequencePair.items(), key = lambda x: (x[1], x[0]))}
         
        for key, value in sortedDict.items():
            # Initialize the stemmer.
            stemmer = PorterStemmer()
            # First stem the term.
#             key = stemmer.stem(key)
            if key not in self.index.keys():
                # Initialize the list.
                self.index[key] = []
                # The total frequency is 1.
                self.index[key].append(1)
                # Add docID to posting list.
                # The postings list is initially empty.
                self.index[key].append({})     
                # Add doc ID to postings list.
                self.index[key][1] = value
            else:
                # If key is in the dicctionary, check if the docID already exsist
                if value not in self.index.values():
                    # if same key occurs in a different document, add the document frequency 
                    self.index[key][0] = self.index[key][0] +1
                    self.index[key][1].append(value)
                else:
                    continue
     
    def proximity_search(self, term1: str, term2: str) -> dict:
        """
        1) check whether given two terms appear within a window
        2) calculate the number of their co-existance in a document
        3) add the document id and the number of matches into a dict
        return the dict"""
    
    

In [3]:
inde = InvertedIndex()
path = "/Users/rubyli/Desktop/UoM/UoM-Year3/COMP34711-Natural Language Processing/CWK/Simpsons2022"
res = inde.read_data(path)
# docList = inde.process_document(str(res))
# inde.index_corpus(res)

In [4]:
testDoc = res[4]
print("TEST DOC: \n", testDoc)

TEST DOC: 
 ['Marge vs. the Monorail\n', 'From Wikipedia, the free encyclopedia\n', 'Jump to navigationJump to search\n', '"Marge vs. the Monorail"\n', 'The Simpsons episode\n', 'Marge vs. the Monorail (promo card).png\n', 'Promotional artwork for the episode\n', 'Episode no.\tSeason 4\n', 'Episode 12\n', 'Directed by\tRich Moore[1]\n', "Written by\tConan O'Brien[1]\n", 'Featured music\t"The Monorail Song"\n', "by Conan O'Brien and Al Jean\n", 'Production code\t9F10\n', 'Original air date\tJanuary 14, 1993[2]\n', 'Guest appearances\n', 'Phil Hartman as Lyle Lanley[1]\n', 'Leonard Nimoy as himself[2]\n', 'Episode features\n', 'Chalkboard gag\t"I will not eat things for money"[3]\n', "Couch gag\tThe Simpsons sit on the couch, followed by four rows of Springfield's residents sitting in front of the family.[1]\n", 'Commentary\tMatt Groening\n', 'Al Jean\n', 'Mike Reiss\n', 'Rich Moore\n', 'David Silverman\n', "Conan O'Brien\n", 'Episode chronology\n', '← Previous\n', '"Homer\'s Triple Bypa

In [5]:
inde.index_corpus(testDoc)
print(inde.index)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
def main():
    "main call function"
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data("/Users/rubyli/Desktop/UoM/UoM-Year3/COMP34711-Natural Language Processing/CWK/Simpsons2022") # specify the directory path in which files are located
    index.index_corpus(corpus) # index documents/corpus
    
#     search_term = input("Enter your query: ") # insert a query
    # write a demo to check entered search terms against the inverted index
        # 1) len(search _term) == one --> return the following: 
            # a) the number of documents in which a term appears.
            # b) all document ids in which a term appears.
            
        # 2) len(search_term) == 2 --> return the following: 
            # a) the number of documents in which the entered terms appear within a pre-defined window.
            # b) all document ids in which the terms appear within that window.
    return index
    
index = main()