# Import Libraries

In [None]:
import os
import re
import random
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

# Load Data

In [None]:
# Directory that is scanned for the story .txt files; adjust to the local dataset location.
files_path = "/mnt/d/Datasets/harry-potter-stories/"

In [None]:
def read_files(path, encoding="utf-8"):
    """Collect the non-trivial text lines of every ``.txt`` file in a directory.

    Files are visited in sorted name order so the result is reproducible
    across runs and platforms (``os.listdir`` returns entries in arbitrary
    order).

    Args:
        path: Directory to scan (not recursive).
        encoding: Text encoding used to decode the files.

    Returns:
        A list of whitespace-stripped lines, keeping only those longer than
        one character.
    """
    stories = []
    for name in sorted(os.listdir(path)):
        if not name.endswith(".txt"):
            continue
        # The context manager closes the file; no explicit close() is needed.
        with open(os.path.join(path, name), encoding=encoding) as handle:
            for raw_line in handle:
                line = raw_line.strip()
                # Skip blank lines and single-character residue.
                if len(line) > 1:
                    stories.append(line)

    return stories

In [None]:
# Load the text lines of every .txt file found under files_path.
stories = read_files(files_path)

In [None]:
# Number of text lines loaded (one entry per line, not per file).
len(stories)

# Preprocess

In [None]:
def clean_stories(texts):
    """Normalise raw text lines for n-gram counting and Markov modelling.

    Each text is lower-cased, then ASCII digits, ASCII punctuation and
    typographic (curly) quotes are deleted. Both opening and closing curly
    quotes are removed so that quote marks never stay glued to a word.
    Whitespace is left untouched, so removed characters may leave doubled
    or trailing spaces.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one cleaned string per input text, in the same order.
    """
    # One translation table, built once, deletes every unwanted character
    # in a single pass per text.
    unwanted = string.punctuation + string.digits + "\u2018\u2019\u201c\u201d"
    deletion_table = str.maketrans('', '', unwanted)

    return [text.lower().translate(deletion_table) for text in texts]

In [None]:
# Normalised copy of the corpus (lower-cased, digits and punctuation removed).
cleaned = clean_stories(stories)

In [None]:
# Read the stop-word list (one entry per line). The context manager makes sure
# the file handle is closed instead of being left to the garbage collector.
with open('/mnt/d/Datasets/SmartStoplist.txt', 'r') as stop_file:
    stop_words = [x.strip() for x in stop_file.read().split('\n')]

In [None]:
# Stop-word-free version of the corpus. Membership is tested against a set
# built once, so each lookup is O(1) instead of a scan of the whole list.
_stop_word_set = set(stop_words)
cleaned_ns = [
    ' '.join(word for word in text.split() if word.lower() not in _stop_word_set)
    for text in cleaned
]

# EDA

In [None]:
def print_wordcloud(words):
    """Render a word cloud for a text and display it.

    Args:
        words: A single string containing the text to visualise.
    """
    # 1200x800 pixel canvas on a white background.
    cloud = WordCloud(background_color="white", width=1200, height=800)
    cloud.generate(words)

    plt.figure(figsize=(12, 8))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Word Cloud")
    plt.show()

In [None]:
# Word cloud over the whole stop-word-filtered corpus, joined into one string.
print_wordcloud(' '.join(cleaned_ns))

In [None]:
def count_ngrams(corpus, ngram, n):
    """Return the most frequent n-grams of a corpus.

    Args:
        corpus: Iterable of text documents.
        ngram: Size of the word n-grams to count (1 = unigrams, 2 = bigrams, ...).
        n: Maximum number of entries to return.

    Returns:
        A list of at most ``n`` ``(ngram_text, count)`` tuples sorted by
        count in descending order.
    """
    vec = CountVectorizer(ngram_range=(ngram, ngram))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the corpus once to fit and again to
    # transform.
    totals = vec.fit_transform(corpus).sum(axis=0)
    words_freq = sorted(
        ((word, totals[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return words_freq[:n]

In [None]:
def plot_ngrams(ngram_df, ngram_name):
    """Show a bar chart of n-gram frequencies.

    Args:
        ngram_df: Table with a "Text" column (bar labels) and a "Count"
            column (bar heights).
        ngram_name: Label used both as the chart title and the x-axis label.
    """
    plt.figure(figsize=(12, 6))
    # Column names are resolved against ngram_df through the data argument.
    plt.bar("Text", "Count", data=ngram_df)
    plt.title(ngram_name)
    plt.xlabel(ngram_name)
    plt.ylabel("Count")
    # Vertical tick labels keep long n-grams readable.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# 30 most frequent single words in the stop-word-filtered corpus.
unigrams = count_ngrams(cleaned_ns, 1, 30)
top_unigram = pd.DataFrame(unigrams, columns=['Text', "Count"])
# Preview the first rows of the table.
top_unigram.head()

In [None]:
# Bar chart of the top unigram counts.
plot_ngrams(top_unigram, "Unigrams")

In [None]:
# 30 most frequent two-word sequences in the stop-word-filtered corpus.
bigrams = count_ngrams(cleaned_ns, 2, 30)
top_bigram = pd.DataFrame(bigrams, columns=['Text', "Count"])
# Preview the first rows of the table.
top_bigram.head()

In [None]:
# Bar chart of the top bigram counts.
plot_ngrams(top_bigram, "Bigrams")

In [None]:
# 30 most frequent three-word sequences in the stop-word-filtered corpus.
trigrams = count_ngrams(cleaned_ns, 3, 30)
top_trigram = pd.DataFrame(trigrams, columns=['Text', "Count"])
# Preview the first rows of the table.
top_trigram.head()

In [None]:
# Bar chart of the top trigram counts.
plot_ngrams(top_trigram, "Trigrams")

# Markov Chains

In [None]:
def build_markov_model(text, n=2):
    """Build an order-``n`` word-level Markov chain from a text.

    A state is a tuple of ``n`` consecutive words. Each state maps to the
    states that follow it (the window shifted by one word) together with the
    relative frequency of that transition.

    Args:
        text: Source text; it is split on whitespace.
        n: Number of words per state.

    Returns:
        A dict ``{state: {next_state: probability}}`` in which the
        probabilities of every state's transitions sum to 1.
    """
    tokens = text.split()
    counts = {}

    # Slide an n-word window over the tokens; a state and its successor
    # overlap in n - 1 words.
    for pos in range(len(tokens) - n):
        here = tuple(tokens[pos:pos + n])
        following = tuple(tokens[pos + 1:pos + 1 + n])
        successors = counts.setdefault(here, {})
        successors[following] = successors.get(following, 0) + 1

    # Turn raw transition counts into probabilities.
    model = {}
    for state, successors in counts.items():
        total = sum(successors.values())
        model[state] = {nxt: hits / total for nxt, hits in successors.items()}

    return model

In [None]:
# Train on the cleaned corpus with stop words kept; all lines are joined into
# one text, so transitions also span line boundaries.
markov_model = build_markov_model(' '.join(cleaned), n=2)

# Generate

In [None]:
def generate_story(markov_model, start_words, max_length=100):
    """Generate text by randomly walking a Markov model.

    Args:
        markov_model: Mapping ``{state: {next_state: weight}}`` where states
            are tuples of words.
        start_words: Sequence of words forming the initial state. It is not
            modified.
        max_length: Target maximum number of words in the result. The start
            words are always included in full.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the current state has no outgoing transitions.

    Raises:
        ValueError: If the starting state is not a key of ``markov_model``.
    """
    start = tuple(start_words)
    if start not in markov_model:
        raise ValueError(f"Starting state '{start}' not found in the Markov model.")

    # Work on a copy: appending to start_words itself would silently extend
    # the caller's list (and fail outright for tuples).
    story = list(start)
    curr_state = start

    for _ in range(max_length - len(story)):
        transitions = markov_model.get(curr_state)
        if not transitions:
            # Dead end: unknown state or no recorded successors.
            break

        next_state = random.choices(
            population=list(transitions.keys()),
            weights=list(transitions.values())
        )[0]

        # Consecutive states overlap, so only the last word is new.
        story.append(next_state[-1])
        curr_state = next_state

    return ' '.join(story)

# Results

In [None]:
# Seed with a two-word state, matching the n=2 model built earlier; the pair
# must occur in the training text or generate_story raises ValueError.
start_words = ["the", "day"]
generated = generate_story(markov_model, start_words, max_length=10)

print("Generated Text:")
print(generated)