In [1]:
import os
import re
from bs4 import BeautifulSoup
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.metrics import edit_distance

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\telmu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\telmu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def extract_content(html_path):
    """Return the text of every ``<p>`` element in an HTML file.

    The file at ``html_path`` is opened as UTF-8 and parsed with
    BeautifulSoup's ``html.parser`` backend. The text of each paragraph
    is joined with a single space into one string.
    """
    with open(html_path, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle, 'html.parser')

    # The document is fully parsed above, so the file can already be closed.
    paragraph_texts = (node.get_text() for node in document.find_all('p'))
    return ' '.join(paragraph_texts)

In [4]:
def tokenize_sentences(content):
    """Split ``content`` into sentences and word-tokenize each one.

    Sentences are delimited by ``.``, ``!`` or ``?``; the delimiter itself
    is dropped by the split. Every resulting piece, including empty ones
    (for example the piece after a final period), is passed to
    ``word_tokenize``.

    Returns a list with one token list per piece, in document order.
    """
    sentence_boundary = r'[.!?]'
    pieces = re.split(sentence_boundary, content)
    return list(map(word_tokenize, pieces))

In [6]:
def create_inverted_index(tokenized_sentences):
    """Build a positional inverted index over tokenized sentences.

    Each token is lower-cased; tokens found in NLTK's English stop-word
    list are skipped, the rest are reduced to their Porter stem.

    Returns a ``defaultdict(list)`` mapping each stem to a list of
    ``(sentence_index, word_index)`` tuples, where ``word_index`` is the
    token's position inside its sentence (stop words still occupy a
    position, they are just not indexed).
    """
    stemmer = PorterStemmer()
    ignored = set(stopwords.words('english'))
    postings = defaultdict(list)

    for sentence_no, tokens in enumerate(tokenized_sentences):
        for token_no, token in enumerate(tokens):
            lowered = token.lower()
            if lowered in ignored:
                continue
            postings[stemmer.stem(lowered)].append((sentence_no, token_no))

    return postings

In [7]:
def country_search(keyword, inverted_index):
    """Return the document id of every posting recorded for ``keyword``.

    Parameters
    ----------
    keyword : str
        Search term; it is lower-cased and Porter-stemmed before lookup so
        that it matches the stems used as index keys.
    inverted_index : Mapping[str, Iterable[tuple]]
        Flat index mapping a stem to an iterable of 2-tuples whose first
        element identifies the document (or sentence) and whose second
        element is the position.

    Returns
    -------
    list
        The first element of each posting, in index order. One entry is
        returned per occurrence, so an id may appear more than once. The
        list is empty when the keyword is not indexed.
    """
    stem = PorterStemmer().stem(keyword.lower())
    # Use .get() rather than [] so that a missing term neither raises
    # KeyError on a plain dict nor inserts an empty entry into a defaultdict.
    postings = inverted_index.get(stem, ())
    return [doc for doc, _position in postings]

In [8]:
def compute_edit_distance(str1, str2):
    """Return the edit distance between ``str1`` and ``str2``.

    Delegates to NLTK's ``edit_distance`` with its default settings.
    """
    distance = edit_distance(str1, str2)
    return distance

In [9]:
def fuzzy_search(keyword, inverted_index, max_distance=2):
    """Return the document with the most postings for terms close to ``keyword``.

    Parameters
    ----------
    keyword : str
        Search term; lower-cased and Porter-stemmed before comparison.
    inverted_index : Mapping[str, Iterable[tuple]]
        Flat index mapping a stem to an iterable of 2-tuples whose first
        element identifies the document.
    max_distance : int, optional
        Largest edit distance (inclusive) between the stemmed keyword and an
        index term for that term to count as a match. Defaults to 2.

    Returns
    -------
    The document id with the highest number of matching postings, or
    ``None`` when no index term is within ``max_distance`` of the keyword.
    On a tie, the document that was counted first wins.
    """
    stem = PorterStemmer().stem(keyword.lower())
    match_counts = defaultdict(int)

    for term, postings in inverted_index.items():
        # The edit distance can never be smaller than the difference in
        # length, so clearly-too-far terms are skipped without computing it.
        if abs(len(term) - len(stem)) > max_distance:
            continue
        if compute_edit_distance(stem, term) <= max_distance:
            for doc, _position in postings:
                match_counts[doc] += 1

    # max() raises ValueError on an empty mapping; report "no match" as None.
    return max(match_counts, key=match_counts.get, default=None)

In [None]:
if __name__ == "__main__":
    # Index every HTML file in the folder, then run an exact and a fuzzy
    # keyword search across all of them.
    folder_path = 'countries'
    # os.listdir() returns entries in arbitrary order; sort so that indexing
    # order (and therefore tie-breaking in the searches) is reproducible.
    html_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.html'))

    # Per-file indexes: file name -> {stem: [(sentence_index, word_index), ...]}
    inverted_index_all = {}
    # Cross-file index in the shape the search functions expect:
    # stem -> [(file_name, (sentence_index, word_index)), ...]
    merged_index = defaultdict(list)

    for file_name in html_files:
        file_path = os.path.join(folder_path, file_name)
        content = extract_content(file_path)
        tokenized_sentences = tokenize_sentences(content)
        inverted_index = create_inverted_index(tokenized_sentences)
        inverted_index_all[file_name] = inverted_index

        for term, positions in inverted_index.items():
            merged_index[term].extend((file_name, position) for position in positions)

    keyword = 'Toronto'
    # country_search yields one entry per occurrence; keep each file once,
    # preserving first-seen order.
    search_result = list(dict.fromkeys(country_search(keyword, merged_index)))
    print(f"Search result for '{keyword}': {search_result}")

    fuzzy_keyword = 'Toronta'
    fuzzy_result = fuzzy_search(fuzzy_keyword, merged_index)
    print(f"Fuzzy search result for '{fuzzy_keyword}': {fuzzy_result}")