In [1]:
import numpy as np
import pandas as pd
import re
import os
import string
import inflect
import queue
import random
from collections import deque

In [2]:
def is_contents_section(line):
    """Check if the line is a 'contents' section."""
    return line == 'contents' or line == 'table of contents'

def is_chapter_or_numbered(line):
    """Check if the line starts with a number, Roman numeral, or the word 'chapter'."""
    return bool(re.match(r'^(\d+|i{1,3}|iv|v{1,3}|ix|x|chapter)\b', line))

In [3]:
def rewrite(file_path, output_path): #remove header
    try:
        skip_section = False

        with open(file_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
            for line in infile:
                processed_line = line.strip().lower() # Strip leading/trailing whitespace, lowercase the line
                if processed_line:
                    if is_contents_section(processed_line):
                        skip_section = True
                        continue
        
                    if skip_section:
                        if is_chapter_or_numbered(processed_line):
                            continue
                        else:
                            skip_section = False
                    outfile.write(processed_line + '\n')

    except Exception as e:
        print(f"An error occurred: {e}")


In [4]:
def preprocessing(text):
    processed = text.strip().lower()
    
    re_html = re.compile('<.*?>')
    processed = re_html.sub(r'', processed) #remove html

    re_url = re.compile(r'https?://[^\s/$.?#].[^\s]*|www\.[^\s/$.?#].[^\s]*')
    processed = re_url.sub('', processed)

    punctuations = set(string.punctuation) #remove selective punctuations
    punctuations = (punctuations - {'.','!','?'})
    punctuations = ''.join(punctuations)

    pun_dict = {".": " . ", "!" : " ! ", "?":" ? "} # to treat punctuations as a seprate word
    processed = processed.translate(str.maketrans(pun_dict))

    p = inflect.engine()
    words = processed.split()
    processed = " ".join([p.number_to_words(w) if w.isnumeric() else w for w in words]) #numbers to words
    
    processed = processed.translate(str.maketrans("","",punctuations))

    return processed.split()
    

In [5]:
text = "My name is Ankush. Whats yours? this is a (test case), adding 5412 number check, My name is Dhara. Dhara is my name,"
text = preprocessing(text)

In [6]:
def update_tf(text, tf):
    q = deque(text) 
    if len(q) < 2:  # Ensure there are enough elements to process
        return tf

    previous = q.popleft()
    current = q.popleft()

    while q:
        next_word = q.popleft()

        if previous in tf:
            if current in tf[previous]:
                if next_word in tf[previous][current]:
                    tf[previous][current][next_word] += 1
                else:
                    tf[previous][current][next_word] = 1
            else:
                tf[previous][current] = {next_word: 1}
        else:
            tf[previous] = {current: {next_word: 1}}

        previous = current
        current = next_word

    return tf


In [7]:
tf = {}
tf = update_tf(text,tf)
print(tf)

{'my': {'name': {'is': 2}}, 'name': {'is': {'ankush': 1, 'dhara': 1}}, 'is': {'ankush': {'.': 1}, 'a': {'test': 1}, 'dhara': {'.': 1}, 'my': {'name': 1}}, 'ankush': {'.': {'whats': 1}}, '.': {'whats': {'yours': 1}, 'dhara': {'is': 1}}, 'whats': {'yours': {'?': 1}}, 'yours': {'?': {'this': 1}}, '?': {'this': {'is': 1}}, 'this': {'is': {'a': 1}}, 'a': {'test': {'case': 1}}, 'test': {'case': {'adding': 1}}, 'case': {'adding': {'five': 1}}, 'adding': {'five': {'thousand': 1}}, 'five': {'thousand': {'four': 1}}, 'thousand': {'four': {'hundred': 1}}, 'four': {'hundred': {'and': 1}}, 'hundred': {'and': {'twelve': 1}}, 'and': {'twelve': {'number': 1}}, 'twelve': {'number': {'check': 1}}, 'number': {'check': {'my': 1}}, 'check': {'my': {'name': 1}}, 'dhara': {'.': {'dhara': 1}, 'is': {'my': 1}}}


In [8]:
def engine(tf,file_path,lines_per_chunk=100):
    output_path = "data\\out"
    rewrite(file_path,output_path)
    try:
        with open(output_path, 'r', encoding='utf-8') as infile:
            while True:
                lines = [infile.readline() for _ in range(lines_per_chunk)]
                lines = [line for line in lines if line]
                if not lines:  # End of file
                    break
                lines = " ".join(lines)
                processed = preprocessing(lines)
                tf = update_tf(processed,tf)
    except Exception as e:
        print(f"An error occurred: {e}")  
    return tf

In [9]:
tf = {}
tf = engine(tf,'data\\Alic.txt')
tf = engine(tf,'data\\gatsby.txt')
tf = engine(tf,'data\\prid.txt')
tf = engine(tf,'data\\The_Project_Gutenberg_eBook_of_Moby.txt')

In [11]:
for dirpath, dirnames, filenames in os.walk("D:\\Rutgers\\Sem7\\ML\\NorthEastern\\hw1\\20news-bydate\\20news-bydate-train"):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            engine(tf, file_path)

An error occurred: 'utf-8' codec can't decode byte 0xff in position 3697: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0xff in position 7478: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0xff in position 5470: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0xff in position 1142: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0xff in position 5909: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0xda in position 537: invalid continuation byte
An error occurred: 'utf-8' codec can't decode byte 0xfe in position 905: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0xe9 in position 886: invalid continuation byte
An error occurred: 'utf-8' codec can't decode byte 0xe9 in position 1028: invalid continuation byte
An error occurred: 'utf-8' codec can't decode byte 0xaa in position 680: invalid start byte
An error occurred: 'utf-8' codec can't decode byte 0x

In [10]:
def count_numbers(nested_dict, frequency=None):
    if frequency is None:
        frequency = {}

    for key, value in nested_dict.items():
        # Check if the key is a number and update frequency
        if isinstance(key, int):
            frequency[key] = frequency.get(key, 0) + 1

        # Check if the value is a dictionary and recurse
        if isinstance(value, dict):
            count_numbers(value, frequency)
        elif isinstance(value, int):
            # If value is a number, update frequency
            frequency[value] = frequency.get(value, 0) + 1

    return frequency

In [11]:
result = count_numbers(tf)
print(result)

{80: 3, 52: 9, 24: 16, 59: 2, 1: 327946, 10: 203, 4: 4485, 8: 614, 2: 20358, 6: 807, 16: 66, 26: 22, 3: 5624, 7: 589, 5: 1437, 54: 2, 17: 53, 11: 157, 12: 160, 15: 77, 23: 21, 13: 129, 21: 33, 9: 311, 25: 13, 71: 2, 18: 49, 19: 45, 20: 44, 41: 6, 50: 5, 35: 15, 22: 26, 49: 2, 40: 7, 14: 92, 29: 13, 30: 12, 51: 2, 28: 15, 86: 1, 72: 2, 32: 9, 42: 4, 27: 14, 57: 3, 78: 1, 43: 5, 56: 4, 70: 2, 37: 3, 31: 8, 36: 7, 48: 4, 47: 2, 65: 2, 45: 3, 64: 2, 166: 1, 68: 3, 44: 3, 63: 2, 55: 4, 34: 2, 212: 1, 122: 1, 39: 3, 81: 1, 87: 1, 92: 1, 123: 1, 79: 1, 69: 2, 38: 4, 33: 7, 60: 2, 73: 1, 91: 1, 61: 1, 102: 1, 76: 2, 99: 1, 126: 1, 88: 2, 98: 1, 46: 1, 135: 1, 232: 1, 143: 1}


In [12]:
def generate_sentence(markov_model, start_word1, start_word2, max_words=500):
    if start_word1 not in markov_model or start_word2 not in markov_model.get(start_word1, {}):
        return "Starting words not found in the model."

    sentence = [start_word1, start_word2]

    while len(sentence) < max_words:
        word1, word2 = sentence[-2], sentence[-1]

        next_words = markov_model.get(word1, {}).get(word2, None)
        if not next_words:
            break

        next_word = random.choices(
            population=list(next_words.keys()),
            weights=list(next_words.values()),
            k=1
        )[0]

        sentence.append(next_word)

    return ' '.join(sentence)

In [13]:
start_word1 = random.choice(list(tf.keys()))
start_word2 = random.choice(list(tf[start_word1].keys()))
print(generate_sentence(tf,start_word1,start_word2))

appalled me . but these marks do not know how mr . bennet for anything about it if he cuts off one of its narrowest parts . sir william and mr . gatsby his hands on her mind was visible in some small degree of importance which is thirty thousand pounds were in a bit if you only kept on good authority that sperm is of a sudden terrific downward jerking of the ponies is delightful . we were somebody she knew was deserved . the band below and were it not equal to it . hark ! chapter thirtythree . this observation would not catch . but still retained but his next return into hertfordshire with the crew striking up a peculiar form some two feet above the gunwale again and mr . darcy was delighted with their hands while his three whales in a few brisk phrases . “i’ll telephone for george ? maybe even if encountered should be used on or associated in any doubt of his character never occurred to me sir” said the caterpillar . “is she from new york through numerous populous cities and most thri