In [38]:
import re
from nltk.tokenize import word_tokenize
import random

In [39]:
import os

story_path = r"C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset"

def read_all_stories(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            if file.endswith('.txt'): 
                file_path = os.path.join(story_path, file)
                print(f"Reading file: {file_path}") 
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        for line in f:
                            line = line.strip()
                            if line == '----------':
                                break
                            if line != '':
                                txt.append(line)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")  
    return txt
        
stories = read_all_stories(story_path)
print("Number of lines =", len(stories))
if len(stories) == 0:
    print("No lines were found in any of the text files.")



Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\01 Harry Potter and the Sorcerers Stone.txt
Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\02 Harry Potter and the Chamber of Secrets.txt
Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\03 Harry Potter and the Prisoner of Azkaban.txt
Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\04 Harry Potter and the Goblet of Fire.txt
Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\05 Harry Potter and the Order of the Phoenix.txt
Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\06 Harry Potter and the Half-Blood Prince.txt
Reading file: C:\Users\vinit\PycharmProjects\Markov Chain\harry-potter-dataset\07 Harry Potter and the Deathly Hallows.txt
Number of lines = 38272


In [40]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  1095205


In [42]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
            
    return markov_model

In [43]:
markov_model = make_markov_model(cleaned_stories)

In [44]:
print("number of states = ", len(markov_model.keys()))

number of states =  327301


In [45]:
print("All possible transitions from 'is dead' state: \n")
print(markov_model['is dead'])

All possible transitions from 'is dead' state: 

{'my lord': 0.0625, 'said crouch': 0.0625, 'but harry': 0.0625, 'yes phineas': 0.0625, 'don t': 0.0625, 'dumbledore would': 0.0625, 'they are': 0.0625, 'you the': 0.0625, 'lucius it': 0.0625, 'harry did': 0.0625, 'narcissa malfoy': 0.0625, 'by my': 0.0625, 'he was': 0.0625, 'do you': 0.0625, 'voldemort hurled': 0.0625, 'said harry': 0.0625}


In [46]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [47]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="hermione and", limit=8))

0.  hermione and bill too you may lose the cup you earned fifty points you haven t handed you 
1.  hermione and neville if either of the others is bill here he stood back the faces of the 
2.  hermione and the two witches there passed a tent that had a horrible feeling that snape could read 
3.  hermione and ginny however took a step backwards yelped and released harry and fell gracefully into the pensieve 
4.  hermione and leanne had all halted gazing at the mound would be returning to grimmauld place use it 
5.  hermione and a thoroughly makes a better impression given what was coming several seats down from his shoulders 
6.  hermione and smiling widely as though it were nothing but merely looked rather stupid then both hat and 
7.  hermione and finally harry tried getting angry again in the garage without mum noticing we flew the car 
8.  hermione and fred fred grinned fine we wont get decent seats they hurried into the car had begun 
9.  hermione and i must tell you all this he to

In [48]:
print(generate_story(markov_model, start="lupin had", limit=100))

lupin had been right in thinking that he had succeeded in conjuring so much that his father who hid beneath the cloak light was coming from her but the subject of giant wars harry did not like people just because shes part giant who needs that sort of whistle was protruding from the ceiling when the offense had occurred so long ago to guard myself against mortal death they who had seen in his knees seemed to double up to the castle again but you can go and check footsteps crossed the room opened his wardrobe and peered inside and the gnarled stump sat there looking as though he thought he saw that both harry and riddle wand still raised his face it looked like it was built for two says winky and dobby s eyes lingered for a moment to report the truth i would inform them when our owl results will come cant be right can it aaaaah said ron in a high voice oh professor flitwick i m so sorry to hear of it dumbledore waited but voldemort did and so forth said professor mcgonagall shooed them 