In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import time
from collections import Counter

2024-04-24 05:46:04.268868: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-24 05:46:04.268978: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-24 05:46:04.537164: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Markov Model

In [2]:
import os

# Loading Dataset
path = "/kaggle/input/harry-potter"
def read(path):
    txt = []
    for _, _, files in os.walk(path):
        for file in files:
            full_path = os.path.join(path, file)  # Complete path
            with open(full_path) as f:
                for line in f:
                    line = line.strip() # Remove any leading or trailing whitespace characters from line
                    if line == '----------': # Break condition
                        break
                    if line != '':
                        txt.append(line)
    return txt

dataset = read(path)
print("Total number of lines =", len(dataset))


Total number of lines = 207


In [3]:
# Cleaning text
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        # Removing any kind of special symbols or punctuations
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_dataset = clean_txt(dataset)
print("Total number of words = ", len(cleaned_dataset))

Total number of words =  1068736


In [4]:
# MARKOV MODEL
def markov_model(cleaned_dataset, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_dataset)-n_gram-1): # Completing n-gram transitions and ensures a valid next state
        curr_state, next_state = "", ""

        # Form current and next state consisting of n-gram words each
        for j in range(n_gram):
            curr_state += cleaned_dataset[i+j] + " "
            next_state += cleaned_dataset[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]

        # Check if current state is already present in dictionary
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1 # Mark the transition from current to next state to 1
        else:
            # Check if next state is already a transition
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1 # If next state exists, increment count of this transition
            else:
                markov_model[curr_state][next_state] = 1 # Else mark the transition to 1

    #calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            # Update the count with the probability of transition
            markov_model[curr_state][state] = count/total

    return markov_model

In [5]:
markov_model = markov_model(cleaned_dataset)

In [6]:
print("Total number of states = ", len(markov_model.keys()))

Total number of states =  339140


In [7]:
print("Transitions from 'the harry' state: \n")
print(markov_model['the harry'])

Transitions from 'the harry' state: 

{'nearest the': 0.125, 'and ron': 0.125, 'thought liberacorpus': 0.125, 'potter the': 0.125, 'and hermione': 0.125, 'said he': 0.125, 'began not': 0.125, 'noticed that': 0.125}


In [8]:
# Generating story using markov model
def generate_story(markov_model, limit=100, start='the harry'):
    n = 0
    curr_state = start
    next_state = None
    text = ""
    text+=curr_state+" "
    # Continue to generate text until word limit is reached
    while n<limit:
        # Randomly choose the next state based on the current state's transitions which is based on distribution of probabilites
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        # Update current state to be the next state
        curr_state = next_state[0]
        # Add current state to the text
        text+=curr_state+" "
        n+=1
    return text

In [9]:
# Generating story
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="the snake", limit=8))

0.  the snake glided away through the door and followed harry and dobby dobby is always having a go 
1.  the snake or the sounds of retching cheering and the whistling wind could be somewhere else because they 
2.  the snake but lets not lose sight now of what youd have sent me a howler let go 
3.  the snake hadnt done it the person whos ever bothered them but he had once overheard professor mcgonagall 
4.  the snake slithering at their heels as it was shes not wrong mumbled harry and a drowsy silence 
5.  the snake reared again but harry ron harry said the servant would help him find the giant its 
6.  the snake sent a furry brown wallet that had fangs which were squealing the names of our time 
7.  the snake with him a strong fit of gallantry and insisted that dudley was frightened enough to accept 
8.  the snake but thought better of me not to warn you about mine hagrid glared at hermione too 
9.  the snake and there reflected behind him were at least confident that the boy this lon