In [None]:
# some potentially useful imports
import random
import re
from pathlib import Path

import pandas as pd

In [None]:
# Let's take a look at the data: read the raw CSV and preview its first five rows.
data_path = Path("../../dad_jokes.csv")
df = pd.read_csv(data_path)
df.head(5)

In [None]:
# ok so our data is a little messy, lets fix it

""" ToDo:
- rename the second column named "Data Jokes" to "text"
- first remove the first column named "#" 
Here is some useful documentation:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
"""
# Re-read the CSV so this cell starts from the file contents every time it is run.
data_path = Path("../../dad_jokes.csv")
df = pd.read_csv(data_path)
#### Your Code ####


###################
# Preview the first five rows to compare with the expected table shown below.
df.head(5)

Your output should look like this: 

| |text|
|--|----|
|0	|A steak pun is a rare medium well done.|
|1	|They say that breakfast is the most important ...|
|2	|What do you get if you cross an angry sheep wi...|
|3	|An apple a day keeps the doctor away. At least...|
|4	|What sounds like a sneeze and is made of leath...|

In [None]:
# Next we need to find the conditional probability of each token given that we know the previous token
# practically to do this we need to know the number of times token 1 was followed by token 2
# if the sequence ends there is no subsequent token, in which case we insert a special token,
# which is called end_token here, that denotes the sequence has ended
""" ToDo:
- Here we have split_text which holds a list of strings (tokens) 
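- vocabulary is a dictionary which maps each token to a dictionary of next tokens to frequency
- the next token is the token after the current word, or end_token if the current word is the last one
- use conditionals to check if the current word is already in the dictionary
- and if so check if the next token is in the dictionary the word maps to and increment it
- (or set it to 1 if the next token is not in that dictionary yet)
- if not make it map to a dictionary with the next token mapping to 1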

note:
Make sure that all words are lower case and have no surrounding whitespace.
You may also find that in some datasets there is an empty-string word; ignore it.
"""


def get_next_token_frequency(
    df: pd.DataFrame, end_token: str = "<eos>"
) -> dict[str, dict[str, int]]:
    """Count, for every token, how often each token directly follows it.

    Each row's ``text`` is split into word tokens and the punctuation marks
    ``. , ! ? ; " :``; every token is stripped and lower-cased before use.

    Args:
        df: Frame with a ``text`` column holding one string per row.
        end_token: Marker meant to stand in for the "next token" of the last
            token in a row.

    Returns:
        Mapping ``token -> {next_token: count}``. The counting logic is the
        exercise and belongs in the marked section below; until that section
        is filled in, the mapping is returned empty.
    """
    vocabulary: dict[str, dict[str, int]] = {}
    # The row label is not needed; discarding it keeps `index` unambiguous
    # as the token position used by the inner loop.
    for _, row in df.iterrows():
        split_text = re.findall(r'\w+|[.,!?;":]', row["text"])
        for index in range(len(split_text)):
            word = split_text[index].strip().lower()  # noqa

            #### Your Code ####

            ###################

    return vocabulary

In [None]:
# Three short sentences with overlapping tokens, so a completed implementation
# reports some next-token counts above 1 (see the expected output below).
sample = [
    "hi, I like deep learning",
    "I actually prefer classical machine learning.",
    "meh, I like classical machines",
]
# Wrap the sentences in a one-column frame named "text", the column the function reads.
get_next_token_frequency(pd.DataFrame(sample, columns=["text"]))

The above test should return

{'hi': {',': 1}, \
 ',': {'i': 2}, \
 'i': {'like': 2, 'actually': 1}, \
 'like': {'deep': 1, 'classical': 1}, \
 'deep': {'learning': 1}, \
 'learning': {'<eos>': 1, '.': 1}, \
 'actually': {'prefer': 1}, \
 'prefer': {'classical': 1}, \
 'classical': {'machine': 1, 'machines': 1},\
 'machine': {'learning': 1}, \
 '.': {'<eos>': 1}, \
 'meh': {',': 1}, \
 'machines': {'<eos>': 1}} 

In [None]:
# Now that we have the frequency with which each token follows the other tokens
# we need to calculate the *probability* each token follows each other token
# this is easiest to implement as a dictionary that maps each token to
# a list of tuples in descending order of frequency where the first element represents
# the cumulative probability of that token and the second is the token's value
""" ToDo:
- Iterate through the items in vocabulary and calculate the total of all the frequencies in the dictionaries
- Then calculate the probability that each frequency has, sort these in descending order,
- then use them to calculate the cumulative probability, which the probability_dictonary will map
- each token to
"""


def convert_frequency_to_probability(
    vocabulary: dict[str, dict[str, int]],
) -> dict[str, list[tuple[float, str]]]:
    """Turn next-token counts into cumulative next-token probabilities.

    Args:
        vocabulary: Mapping ``token -> {next_token: count}``, as produced by
            ``get_next_token_frequency``.

    Returns:
        Mapping ``token -> [(cumulative_probability, next_token), ...]``,
        intended to list the most frequent next token first. The conversion
        is the exercise and belongs in the marked section below; until that
        section is filled in, the mapping is returned empty.
    """
    probability_dictonary: dict[str, list[tuple[float, str]]] = {}

    #### Your Code ####

    ###################

    return probability_dictonary

In [None]:
# Same three sentences as the previous check, redefined here so that `sample`
# does not depend on the earlier test cell having been run.
sample = [
    "hi, I like deep learning",
    "I actually prefer classical machine learning.",
    "meh, I like classical machines",
]
# First count the next-token frequencies, then convert them to cumulative probabilities.
vocabulary = get_next_token_frequency(pd.DataFrame(sample, columns=["text"]))
convert_frequency_to_probability(vocabulary)

The above test code should return:

{'hi': [(1.0, ',')], \
 ',': [(1.0, 'i')], \
 'i': [(0.6666666666666666, 'like'), (1.0, 'actually')], \
 'like': [(0.5, 'deep'), (1.0, 'classical')], \
 'deep': [(1.0, 'learning')], \
 'learning': [(0.5, '<eos>'), (1.0, '.')], \
 'actually': [(1.0, 'prefer')], \
 'prefer': [(1.0, 'classical')], \
 'classical': [(0.5, 'machine'), (1.0, 'machines')], \
 'machine': [(1.0, 'learning')], \
 '.': [(1.0, '<eos>')], \
 'meh': [(1.0, ',')], \
 'machines': [(1.0, '<eos>')]}

In [None]:
# lets now assemble a class that we can use to generate our sentences
class StateMachine:
    """Markov-chain sentence generator driven by next-token probabilities.

    The parts marked ``Your Code`` are the exercise; the surrounding scaffold
    (attribute declarations, sequence post-processing) is provided.
    """

    # note that these class-level annotations denote that we will be storing some
    # variable of the below names and types in self
    state_probabilities: dict[str, list[tuple[float, str]]]
    end_token: str

    def __init__(self, csv_path: "str | Path", end_token: str = "<eos>") -> None:
        """ToDo:
        - Use pandas to read in the csv and give it the same clean-up as earlier in the
          notebook (remove the "#" column, rename the joke column to "text"), because
          get_next_token_frequency reads the "text" column
        - Then use get_next_token_frequency and convert_frequency_to_probability
          to get the probabilities of the next state and store that in state_probabilities
        - Also store end_token in self.end_token; generate_sequence compares against it
        """

        #### Your Code ####

        ###################

        pass

    def get_next_token(self, current_token: str) -> str:
        """ToDo:
        - Given the current_token find the cumulative probabilities of the subsequent
          tokens in state_probabilities
        - get a random number between 0 and 1 using random.uniform(0, 1)
          (random is Python's standard-library module)
        - iterate through the (cumulative probability, token) pairs and return the token
          of the first pair whose cumulative probability is >= the random number
        """

        #### Your Code ####

        ###################

        pass

    def generate_sequence(self, starting_token: str) -> str:
        """ToDo:
        - while the last token in the sequence is not the end_token (<eos> by default)
        - pass the last token in output to get_next_token and append the result to output

        Returns:
            The generated tokens without the final one, joined with spaces and
            tidied by correct_punctuation_and_spacing.
        """
        output = [starting_token.lower()]

        #### Your Code ####

        ###################

        # output[:-1] drops the last element, which is the end_token once the
        # loop above has been written.
        return self.correct_punctuation_and_spacing(" ".join(output[:-1]))

    @staticmethod
    def correct_punctuation_and_spacing(text: str) -> str:
        """Tidy a space-joined token string into readable text.

        - Removes the whitespace between a word and a directly following
          ``. , ! ? ; :`` so the mark attaches to the word.
        - Upper-cases the first character of the text and the first word
          character that follows ``.``, ``!`` or ``?`` plus whitespace.

        Quotation marks and punctuation that does not follow a word are left
        as they are.

        Args:
            text: Tokens joined with single spaces, e.g. ``"i gave birth ."``.

        Returns:
            The tidied string, e.g. ``"I gave birth."``; an empty string is
            returned unchanged.
        """
        # The look-behind restricts the fix to punctuation that follows a word,
        # so a run such as "! !" keeps its inner space.
        attached = re.sub(r"(?<=\w)\s+([.,!?;:])", r"\1", text)
        return re.sub(
            r"(^|[.!?]\s+)(\w)",
            lambda found: found.group(1) + found.group(2).upper(),
            attached,
        )

In [None]:
# Build the chain from the same CSV path that was loaded at the top of the notebook.
my_markov_chain = StateMachine(Path("../../dad_jokes.csv"))

In [None]:
# Ask ten times for the token that follows "2019"; get_next_token is meant to
# sample randomly, so the printed mix can differ between runs.
for i in range(10):
    print(my_markov_chain.get_next_token("2019"))

The above test should give some combination of "," and "afraid"
\
I got: \
, \
, \
afraid \
, \
afraid \
, \
afraid \
, \
afraid \
, 

In [None]:
# Generate one sequence that starts from the token "I".
my_markov_chain.generate_sequence("I")

The above is our final product put together, some of the dad jokes it generated for me were: \
'I gave birth.' \
'I don t work out of cheese. " " my class dairy! That blue ship for being selfless, thank god took too long have you heard buzzing, as he had a stick a seafood diet!' \
'I made of boundaries. I can february march? 3k.' \
'I m finally knocks again! !' \
'I ve only fans.'