In [100]:
from pathlib import Path
import pandas as pd
import re
import random

In [45]:
# Load the raw dad-jokes CSV and preview its first rows.
data_path = Path("../../dad_jokes.csv")
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()  # head() shows five rows by default

Unnamed: 0,#,Dad Jokes
0,0,A steak pun is a rare medium well done.
1,1,They say that breakfast is the most important ...
2,2,What do you get if you cross an angry sheep wi...
3,3,An apple a day keeps the doctor away. At least...
4,4,What sounds like a sneeze and is made of leath...


In [46]:
# The raw frame is a little messy: the '#' column appears to just repeat the
# row number, and the joke column has an awkward header. Drop the former and
# rename the latter to 'text'.
df = df.drop(columns='#').rename(columns={"Dad Jokes": "text"})
df.head()  # head() shows five rows by default

Unnamed: 0,text
0,A steak pun is a rare medium well done.
1,They say that breakfast is the most important ...
2,What do you get if you cross an angry sheep wi...
3,An apple a day keeps the doctor away. At least...
4,What sounds like a sneeze and is made of leath...


In [91]:
def get_next_token_frequency(df: pd.DataFrame, end_token = '<eos>') -> dict[str, dict[str, int]]:
    r"""Count, for every token in the corpus, how often each token follows it.

    Each value of ``df['text']`` is split into word tokens (``\w+``) and the
    punctuation marks ``. , ! ? ; " :``; every token is lower-cased. The last
    token of each text is recorded as being followed by ``end_token``.

    Args:
        df: Frame with a ``text`` column holding one text per row.
        end_token: Marker recorded as the successor of each text's final token.
            It is stored as given (not lower-cased).

    Returns:
        Mapping ``token -> {next_token: count}``. Texts that yield no tokens
        and cells that are not strings (e.g. NaN for missing values)
        contribute nothing.
    """
    vocabulary: dict[str, dict[str, int]] = {}
    for text in df['text']:
        # pandas represents missing cells as NaN (a float); re.findall would
        # raise TypeError on those, so skip anything that is not a string.
        if not isinstance(text, str):
            continue

        tokens = [token.lower() for token in re.findall(r'\w+|[.,!?;":]', text)]

        # Pair each token with its successor; the final token pairs with end_token.
        for word, next_word in zip(tokens, tokens[1:] + [end_token]):
            followers = vocabulary.setdefault(word, {})
            followers[next_word] = followers.get(next_word, 0) + 1

    return vocabulary

In [92]:
def convert_frequency_to_probability(vocabulary: dict[str, dict[str, int]]) -> dict[str, list[tuple[float, str]]]:
    """Turn successor counts into cumulative-probability tables for sampling.

    Args:
        vocabulary: Mapping ``token -> {next_token: count}``.

    Returns:
        Mapping ``token -> [(cumulative_probability, next_token), ...]`` with
        successors ordered from most to least frequent (ties keep their
        original order). The cumulative probability of the last entry is
        exactly 1.0.

    Raises:
        ZeroDivisionError: If a token has successors whose counts sum to zero.
    """
    probability_dictionary = {}
    for key, frequency_dict in vocabulary.items():
        total_frequency = sum(frequency_dict.values())

        # Most frequent successor first; sorted() is stable, so equal counts
        # keep their insertion order.
        ranked = sorted(frequency_dict.items(), key=lambda item: item[1], reverse=True)

        # Accumulate integer counts and divide once per entry. Summing float
        # probabilities instead can leave the final value slightly below 1.0
        # (e.g. ten 0.1 steps give 0.9999999999999999), which lets a random
        # draw fall past the last bucket.
        cumulative_count = 0
        cumulative_prob_list = []
        for word, freq in ranked:
            cumulative_count += freq
            cumulative_prob_list.append((cumulative_count / total_frequency, word))

        probability_dictionary[key] = cumulative_prob_list

    return probability_dictionary

In [121]:
class StateMachine():
    """First-order Markov chain over tokens, built from a jokes CSV.

    The chain samples the next token using only the current token, following
    the cumulative-probability tables stored in ``state_probabilities``.
    """

    def __init__(self, csv_path: "str | Path", end_token = '<eos>') -> None:
        """Read the CSV and precompute the transition tables.

        Args:
            csv_path: Location of the CSV, passed straight to ``pd.read_csv``.
                The file must contain the columns ``#`` and ``Dad Jokes``.
            end_token: Marker that terminates a generated sequence.
        """
        # Marker whose appearance ends generation.
        self.end_token = end_token
        df = pd.read_csv(csv_path)
        df = df.drop('#', axis=1)
        df = df.rename(columns={"Dad Jokes": "text"})
        vocabulary = get_next_token_frequency(df, end_token)
        # token -> [(cumulative probability, next token), ...]
        self.state_probabilities = convert_frequency_to_probability(vocabulary)

    def get_next_token(self, current_token: str) -> str:
        """Sample the token that follows ``current_token``.

        Args:
            current_token: A token present in ``state_probabilities``.

        Returns:
            The sampled successor; ``end_token`` if the token has no recorded
            successors.

        Raises:
            KeyError: If ``current_token`` has no transition table.
        """
        if current_token not in self.state_probabilities:
            raise KeyError(f"token {current_token!r} has no recorded transitions")

        token_probabilities = self.state_probabilities[current_token]
        if not token_probabilities:
            return self.end_token

        probability = random.uniform(0, 1)
        for cumulative_probability, token in token_probabilities:
            if probability <= cumulative_probability:
                return token

        # The draw exceeded the last cumulative value (possible when the table
        # tops out just below 1.0 because of float rounding): use the last
        # bucket instead of silently returning None.
        return token_probabilities[-1][1]

    def generate_sequence(self, starting_token: str, max_tokens: "int | None" = None) -> str:
        """Generate text by walking the chain from ``starting_token``.

        Args:
            starting_token: First token; it is lower-cased before lookup.
            max_tokens: Optional upper bound on the number of tokens in the
                result (including the starting token). ``None`` (the default)
                keeps generating until ``end_token`` is sampled.

        Returns:
            The generated tokens (without ``end_token``) joined and cleaned up
            by ``correct_punctuation_and_spacing``.

        Raises:
            KeyError: If a token without a transition table is reached, e.g.
                an unknown ``starting_token``.
        """
        output = [starting_token.lower()]
        while output[-1] != self.end_token:
            if max_tokens is not None and len(output) >= max_tokens:
                break
            output.append(self.get_next_token(output[-1]))

        # Drop the terminator only if generation actually reached it.
        if output[-1] == self.end_token:
            output.pop()

        return self.correct_punctuation_and_spacing(" ".join(output))

    def correct_punctuation_and_spacing(self, text):
        """Tidy spacing around punctuation and capitalize sentence starts.

        Handles the marks ``? . ! , ; :``. Double quotes are left as they are
        because an opening quote cannot be told apart from a closing one here.

        Args:
            text: Space-joined tokens (or any string).

        Returns:
            The cleaned-up string.
        """
        # Remove whitespace in front of punctuation marks.
        text = re.sub(r'\s+([?.!,;:])', r'\1', text)

        # Ensure a space after a punctuation mark, but keep runs of punctuation
        # such as "..." or "?!" together.
        text = re.sub(r'([?.!,;:])([^\s?.!,;:])', r'\1 \2', text)

        # Collapse repeated whitespace.
        text = re.sub(r'\s+', ' ', text).strip()

        # Upper-case the first character of each sentence. Only that character
        # is touched, so capitals elsewhere in the sentence are preserved
        # (str.capitalize() would lower-case them).
        sentences = re.split(r'(?<=[?.!])\s+', text)
        capitalized_sentences = [sentence[:1].upper() + sentence[1:] for sentence in sentences]

        return ' '.join(capitalized_sentences)

In [122]:
# Build the chain from the same CSV that was explored above.
my_markov_chain = StateMachine(csv_path=Path("../../dad_jokes.csv"))

In [124]:
# Sample one joke that starts with "I" (the start token is lower-cased
# internally). Sampling is unseeded, so the result differs between runs.
my_markov_chain.generate_sequence(starting_token="I")

'I gave birth.'