In [1]:
import numpy as np
import pandas as pd
import re
import os
import string
import inflect
import queue

In [2]:
def is_contents_section(line):
    """Check if the line is a 'contents' section."""
    return line == 'contents' or line == 'table of contents'

def is_chapter_or_numbered(line):
    """Check if the line starts with a number, Roman numeral, or the word 'chapter'."""
    return bool(re.match(r'^(\d+|i{1,3}|iv|v{1,3}|ix|x|chapter)\b', line))

In [3]:
def rewrite(file_path, output_path):
    try:
        skip_section = False

        with open(file_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
            for line in infile:
                processed_line = line.strip().lower() # Strip leading/trailing whitespace, lowercase the line
                if processed_line:
                    print(processed_line) 
                    if is_contents_section(processed_line):
                        skip_section = True
                        continue
        
                    if skip_section:
                        if is_chapter_or_numbered(processed_line):
                            continue
                        else:
                            skip_section = False
                    outfile.write(processed_line + '\n')

        print(f"Preprocessing complete. Processed text saved to {output_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


In [4]:
def preprocessing(text):
    processed = text.strip().lower()
    
    re_html = re.compile('<.*?>')
    processed = re_html.sub(r'', processed) #remove html

    re_url = re.compile(r'https?://[^\s/$.?#].[^\s]*|www\.[^\s/$.?#].[^\s]*')
    processed = re_url.sub('', processed)

    punctuations = set(string.punctuation) #remove selective punctuations
    punctuations = (punctuations - {'.','!','?'})
    punctuations = ''.join(punctuations)

    pun_dict = {".": " . ", "!" : " ! ", "?":" ? "} # to treat punctuations as a seprate word
    processed = processed.translate(str.maketrans(pun_dict))

    p = inflect.engine()
    words = processed.split()
    processed = " ".join([p.number_to_words(w) if w.isnumeric() else w for w in words]) #numbers to words
    
    processed = processed.translate(str.maketrans("","",punctuations))

    return processed.split()
    

In [5]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
text = "My name is Ankush. Whats yours? this is a (test case), adding 5412 number check"
text = preprocessing(text)

In [7]:
def update_tf(text, tf):
    q = queue.Queue()
    q.queue = queue.deque(text)
    previous = q.get()
    current = q.get()
    print(previous)
    print(current)
    for next_word in iter(q.get, None):
        print(next_word)
        if previous in tf:
            if current in tf[previous]:
                if next_word in tf[previous][current]:
                    tf[previous][current][next_word] += 1
                else: 
                    tf[previous][current][next_word] = 1
            else:
                tf[previous][current] = {next_word : 1}
        else:
            tf[previous] = {current : {next_word : 1}}

        previous = current
        current = next_word

    return tf
    

In [None]:
tf = {}
update_tf(text,tf)

my
name
is
ankush
.
whats
yours
?
this
is
a
test
case
adding
five
thousand
four
hundred
and
twelve
number
check


In [None]:
def build_tf():