In [None]:
import numpy as np
 
class MarkovChain(object):
    """
    Discrete-time Markov chain over a finite set of named states.

    The chain is described by a nested dict of transition probabilities;
    the set of states is taken from the top-level keys of that dict.
    """

    def __init__(self, transition_prob):
        """
        Initialize the MarkovChain instance.

        Parameters
        ----------
        transition_prob: dict
            A dict object representing the transition
            probabilities in Markov Chain.
            Should be of the form:
                {'state1': {'state1': 0.6, 'state2': 0.4},
                 'state2': {...}}
            Each row may be sparse: a transition that is not listed
            is treated as having probability 0. The probabilities of
            every row must sum to 1.
        """
        self.transition_prob = transition_prob
        self.states = list(transition_prob.keys())

    def next_state(self, current_state):
        """
        Returns the state of the random variable at the next time
        instance.

        Parameters
        ----------
        current_state: str
            The current state of the system.

        Returns
        -------
        One element of ``self.states``, drawn with ``np.random.choice``
        according to the transition row of ``current_state``.

        Raises
        ------
        KeyError
            If ``current_state`` is not a key of ``transition_prob``.
        ValueError
            Raised by ``np.random.choice`` when the row's probabilities
            do not sum to 1.
        """
        outgoing = self.transition_prob[current_state]
        # Rows are allowed to be sparse, so a missing entry means the
        # transition cannot happen rather than being an error.
        weights = [outgoing.get(state, 0.0) for state in self.states]
        return np.random.choice(self.states, p=weights)

    def generate_states(self, current_state, no=10):
        """
        Generates the next states of the system.

        Parameters
        ----------
        current_state: str
            The state of the current random variable.

        no: int
            The number of future states to generate.

        Returns
        -------
        list
            The ``no`` successive states visited after ``current_state``
            (the starting state itself is not included).
        """
        future_states = []
        for _ in range(no):
            # Each draw becomes the starting point of the next one.
            current_state = self.next_state(current_state)
            future_states.append(current_state)
        return future_states

In [None]:
import pandas as pd

import nltk
import string

from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import RegexpTokenizer

# Porter stemmer instance; it is not referenced anywhere else in the visible code.
porter = PorterStemmer()

# WordNet lemmatizer used by get_word_dictionary to reduce each token to its lemma.
wordnet_lemmatizer = WordNetLemmatizer()


# Load the dataset from spam.csv, decoding the file as latin-1.
df = pd.read_csv("spam.csv", encoding='latin-1')

# Keep only the two columns read later on: 'v1' (the class label, compared
# against 'ham'/'spam') and 'v2' (the message text that gets tokenized).
data = df[['v1', 'v2']]

In [None]:
""" 
Iterate through each message, tokenize and normalize it. Each processed token is then added to the 
word_dict which is a nested dictionary holding all the tokens that follow the given token along with 
their count corresponding to this token.

E.g.,
{
    "token_1": {"token_2": 10, "token_3": 5}
}

This data will be used to calculate the probability of each following token given the primary token.
The calculated probability is the transition probability from token_1 to token_2 and token_3.
"""
def get_word_dictionary(class_tag):
    """
    Build the follower-count table for all messages of one class.

    Every message in the module-level ``data`` frame whose ``v1`` label
    equals ``class_tag`` is tokenized with ``word_tokenize``, lower-cased,
    stripped of punctuation tokens and lemmatized. For each pair of
    consecutive tokens within a message, the count of the second token
    following the first one is incremented.

    Parameters
    ----------
    class_tag: str
        The label to select in column ``v1`` (this file calls the
        function with 'ham' and 'spam').

    Returns
    -------
    dict
        Nested dict of the form ``{token: {following_token: count}}``.
        The first token of a message is always present as a key, even
        when nothing follows it (its value is then an empty dict).
        Pairs never span two messages.
    """
    word_dict = {}

    # Only the messages of the requested class contribute to the table.
    messages = data.loc[data['v1'] == class_tag, 'v2']

    for message in messages:
        # Reset per message so the last word of one message is never
        # paired with the first word of the next one.
        last_token = None

        for token in word_tokenize(message):
            normalized_token = token.lower()

            # Drop tokens made up solely of punctuation. A plain substring
            # test against string.punctuation would let multi-character
            # tokens such as "..." or "''" through.
            if all(char in string.punctuation for char in normalized_token):
                continue

            # Lemmatize the word before adding it to the markov chain.
            lemmatized_token = wordnet_lemmatizer.lemmatize(normalized_token)

            if last_token is None:
                # First word of the message: register it as a state, but
                # keep any follower counts already collected for it from
                # earlier messages.
                word_dict.setdefault(lemmatized_token, {})
            else:
                followers = word_dict.setdefault(last_token, {})
                followers[lemmatized_token] = followers.get(lemmatized_token, 0) + 1

            last_token = lemmatized_token

    return word_dict

In [None]:
# Build one follower-count table per class label found in column 'v1':
# {token: {following_token: count}} for the 'ham' messages ...
ham_word_dict = get_word_dictionary('ham')
# ... and the same structure for the 'spam' messages.
spam_word_dict = get_word_dictionary('spam')