In [48]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
#For file reading
import os
from os import listdir
#For pre-processing 
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
import time

class InvertedIndex:
    """
    Construct Inverted Index
    """
    def __init__(self):
        self.index = {}
        # List of file names
        self.docList = []
        
    def read_data(self, path: str) -> list:
        """
        Read files from a directory and then append the data of each file into a list.
        """
        folder = listdir(path)
        res = []
        for files in folder:
            # check if current path is a file
            if files.endswith(".txt"):
                filePath = os.path.join(path, files)
                if os.path.isfile(filePath):
                    # Store the file name
                    self.docList.append(files)
                    with open(filePath, "r") as file:
                        lines = file.readlines()
                    res.append(lines)
        return res

    def process_document(self, document: str, mode: str) -> list:
        """
        pre-process a document and return a list of its terms
        str->list"""
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\w+(?:[-']\w+)*   #word-internal hyphens
                    |'
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(document, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        tokenList = list(filter(lambda word: word not in stopwords.words("english"),tokenList))
        if mode.lower() == "lemmatisation":
          # Lemmatisation 
          lemma = WordNetLemmatizer()
          tokenList = [lemma.lemmatize(word) for word in tokenList]
        elif mode.lower() == "stemming":
          #Stemming 
          porter = nltk.PorterStemmer()
          tokenList = [porter.stem(word) for word in tokenList]
        else:
          print("Not a valid parameter, choose between lemmatisation and stemming.")

        return tokenList  
    
    def process_string(self, text: str) -> str:
        "Pre-process string"
        #To lower case
        text = text.lower()
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        text = lemma.lemmatize(text)
        
        return text
    
    def index_corpus(self, documents: list, mode: str) -> None:
        """
        index given documents
        list->None"""
        t1 = time.time()
        docName = ''
        #Store the term and docID in a sequence pair
        for i, doc in enumerate(documents):
            docName = self.docList[i]
            doc = ' '.join(doc)
            tokenList = self.process_document(doc, mode)
            for j, token in enumerate(tokenList):
                if token not in self.index:
                    # Initialise the index, Document frequency set to 1
                    self.index[token] = [1]
                    # Append the document name and term frequency(set to 1)
                    self.index[token].append({docName:[j]})
                    
                else:
                    # If the term has appared in the index already
                    if docName not in self.index[token][1].keys():
                        # If term is in a new doc, increase the document frequency, add new document
                        # name and set new term frequency to 1
                        self.index[token][0] += 1
                        self.index[token][1][docName] = [j]
                    else:
                        # If term is in a exsisted document, increase the term frequency only
                        self.index[token][1][docName].append(j)
        t2 = time.time()
        totalt = t2-t1
        
        print("\n Normalization Mode:",mode,"\n Size of inverted index: ", len(self.index),"\n Time Consuming: ", totalt)
        print("\n Starting time:",t1,"\n Ending time:",t2)
     
    def dump(self, path: str) -> None:
        """
        provide a dump function to show index entries for a given set of terms        
        """
        with open(path, "r") as file:
            lines = file.readlines()
            for word in lines:
                tokenList = self.process_document(word, 'lemmatisation')
                for token in tokenList:
                    try:
                        if token in self.index.keys():
                            res = {token:self.index[token]}
                            print("\n The result index for dump is: \n", res)
                    except:
                        print("There is no corresponding result for <", token,">")
                
       
    def proximity_search(self, term1: str, term2: str, window_size: int) -> dict:
        """
        This is Task 2"""
        """
        1) check whether given two terms appear within a window
        2) calculate the number of their co-existance in a document
        3) add the document id and the number of matches into a dict
        return the dict"""
        result = {}
        t1 = self.process_string(term1)
        t2 = self.process_string(term2)
        try:
            if t1 in self.index.keys():
                # Store the term position in a list
                pos1 = self.index[t1][1]
            else:
                print("There is no result for ->",term1)

            if t2 in self.index.keys():
                # Store the term position in a list
                pos2 = self.index[t2][1]
            else:
                print("There is no result for ->",term2)

            for key1 in pos1.keys():
                for key2 in pos2.keys():
                    # Check the co-exsistance
                    if key1==key2:
                        coF = 0
                        l1 = pos1[key1]
                        l2 = pos2[key2]
                        for i in l1:
                            for j in l2:
                                # Check if two terms in a window
                                if abs(i-j) <= window_size:
                                    coF += 1
                                    result[key1] = coF
            if result:
                print("\n Result for proximity_search",term1,"and",term2,": ", result)
            else:
                print("\n Sorry, there's no matching for",term1,"and",term2)
        except:
            print("\n There is no result for <",term1,"> and <",term2,">.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 新段落

In [49]:
def main():
    "main call function"
    index = InvertedIndex() # initilaise the index
    data_path = "/content/Simpsons2022" # specify the directory path in which files are located
    corpus = index.read_data(data_path) 
    index.index_corpus(corpus, 'stemming') # index documents/corpus
    index.index_corpus(corpus,'lemmatisation') # index documents/corpus
    
    # Demo for dump
    dump_path = "/content/development-examples(1).txt"
    index.dump(dump_path)
    
    # Demo for proximity_search
    term1 = "Bart"  # Specify the term for proximity_search
    term2 = "montage"
    window_size = 15
    index.proximity_search(term1, term2, window_size)
    
    return index
    
index = main()


 Normalization Mode: stemming 
 Size of inverted index:  11928 
 Time Consuming:  28.870928049087524

 Starting time: 1666900451.6120903 
 Ending time: 1666900480.4830184

 Normalization Mode: lemmatisation 
 Size of inverted index:  19778 
 Time Consuming:  26.568009614944458

 Starting time: 1666900480.4851613 
 Ending time: 1666900507.053171

 The result index for dump is: 
 {'bart': [110, {'6.5.txt': [130, 245, 267, 306, 322, 352, 388, 416, 130, 245, 267, 306, 322, 352, 388, 416], '7.6.txt': [505, 543, 574, 583, 591, 600, 617, 676, 736, 744, 750, 1096, 1106, 1120, 505, 543, 574, 583, 591, 600, 617, 676, 736, 744, 750, 1096, 1106, 1120], '3.17.txt': [1037, 1037], '5.19.txt': [136, 241, 327, 380, 388, 399, 403, 465, 476, 486, 522, 578, 733, 848, 874, 136, 241, 327, 380, 388, 399, 403, 465, 476, 486, 522, 578, 733, 848, 874], '7.5.txt': [280, 354, 280, 354], '3.11.txt': [445, 729, 757, 445, 729, 757], '5.2.txt': [364, 484, 788, 364, 484, 788], '3.13.txt': [1, 10, 76, 96, 120, 131, 16