In [None]:
# A few imports useful to any data scientist.
import logging
import os
import pickle
import random
import re
import time
from collections import Counter

import numpy as np
import pandas as pd
# Handle on the root logger (getLogger is called without a name).
logger = logging.getLogger()
# Let pandas show up to 5000 columns before it truncates a displayed frame.
pd.set_option('display.max_columns', 5000)

![Shakespeare](shakespeare.jpg "William Shakespeare")

<h3>William Shakespeare is a little-known playwright from the 16th and 17th centuries.
In the following few exercises, we have a look at lines from Shakespeare's plays
and try to implement a simple text generator to write just like Shakespeare.</h3>

Note: You are free to solve exercises in any manner you wish. There is no specific requirement to use the code already provided in the notebook cells, but the code is there in case you choose to use it.

A dataset has been kindly provided to us by the fine folks at Kaggle. This dataset contains every line from every major Shakespeare play, along with information about the current actor speaking the line, the play this line originated from, etc.

In [None]:
# Load the Shakespeare lines into a pandas DataFrame. Any other container
# (for example a list of tuples) works just as well if you prefer it.
dataset = pd.read_csv(filepath_or_buffer="Shakespeare_data.csv")
# Summary statistics of the numeric columns, shown as the cell output.
dataset.describe()

Let's view the first ten elements of the dataset to get an idea about our data.

In [None]:
# Peek at the first ten rows.
dataset.head(10)

<h4>EXERCISE 1: Drop from our dataset all lines which are not dialogue.</h4>

Example input dataset (first six rows):\
(1, 'Henry IV', nan, nan, nan, 'ACT I')\
(2, 'Henry IV', nan, nan, nan, 'SCENE I. London. The palace.')\
(3, 'Henry IV', nan, nan, nan, 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others')\
(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')\
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')\
(6, 'Henry IV', 1.0, '1.1.3', 'KING HENRY IV', 'And breathe short-winded accents of new broils')

Example output dataset (first six rows):\
(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')\
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')\
(6, 'Henry IV', 1.0, '1.1.3', 'KING HENRY IV', 'And breathe short-winded accents of new broils')\
(7, 'Henry IV', 1.0, '1.1.4', 'KING HENRY IV', 'To be commenced in strands afar remote.')\
(8, 'Henry IV', 1.0, '1.1.5', 'KING HENRY IV', 'No more the thirsty entrance of this soil')\
(9, 'Henry IV', 1.0, '1.1.6', 'KING HENRY IV', "Shall daub her lips with her own children's blood,")

In [None]:
# Keep only the rows that have a speaker: act/scene headings and the opening
# stage directions in the example above carry no Player value.
# NOTE(review): this treats every row with a Player as dialogue - confirm that
# no stage-direction rows in the CSV carry a Player value.
dialogues = dataset[dataset.Player.notna()]
# Distinct labels so the two counts can be told apart in the output.
print("Number of lines in the full dataset: ", len(dataset))
print("Number of dialogue lines: ", len(dialogues))

<h4>EXERCISE 2: Group all dialogue lines for the entire dataset together into a single large string (join lines by a single whitespace).</h4>	

Example input dataset (first six rows):\
(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')\
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')\
(6, 'Henry IV', 1.0, '1.1.3', 'KING HENRY IV', 'And breathe short-winded accents of new broils')\
(7, 'Henry IV', 1.0, '1.1.4', 'KING HENRY IV', 'To be commenced in strands afar remote.')\
(8, 'Henry IV', 1.0, '1.1.5', 'KING HENRY IV', 'No more the thirsty entrance of this soil')\
(9, 'Henry IV', 1.0, '1.1.6', 'KING HENRY IV', "Shall daub her lips with her own children's blood,")

Example output string:\
So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote. No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, 

In [None]:
# Concatenate every dialogue line into one corpus string, separated by single spaces.
one_string_dialogue = " ".join(dialogues["PlayerLine"])

<h4>EXERCISE 3: Implement a simple tokenization: disregard any non-alpha characters except
.!?:', which you should treat as single tokens. All other words should be regarded as
single tokens. Convert all word tokens to lowercase.</h4>

Example input: "So shaken as we are, so wan with care, Find we a time"

Example output: ['so', 'shaken', 'as', 'we', 'are', ',', 'so', 'wan', 'with', 'care', ',', 'find', 'we', 'a', 'time']

In [None]:
# Repeated tokens are kept on purpose: later cells count how often each token occurs.
# Punctuation characters that are emitted as tokens of their own.
punctuation = ".!?:',"


def tokenize(text, order_matters=True):
    """Split ``text`` into lowercase word tokens and single punctuation tokens.

    The text is lowercased, then every run of ``a-z`` / ``0-9`` characters
    becomes one token and every character listed in ``punctuation`` becomes a
    token of its own. All other characters (whitespace, other symbols) are
    dropped. Tokens are returned in order of appearance, duplicates included.

    Args:
        text: The string to tokenize.
        order_matters: Accepted for backward compatibility and ignored. The
            de-duplication code that used to read it sat after an
            unconditional ``return`` and therefore never ran.

    Returns:
        list[str]: The tokens in the order they occur in ``text``.
    """
    # Escape the punctuation so that characters with a special meaning inside
    # a character class (``]``, ``^``, ``-``, ``\``) stay literal if they are
    # ever added to ``punctuation``.
    punctuation_class = re.escape(punctuation)
    word_pattern = rf"[a-z0-9]+|[{punctuation_class}]"
    return re.findall(word_pattern, text.lower())

# Sanity-check the tokenizer on the example sentence from the exercise.
# The sentence is kept in `sample_text` so the builtin `input` is not shadowed.
sample_text = "So shaken as we are, so wan with care, Find we a time"
start_time = time.perf_counter()
output = tokenize(sample_text)
end_time = time.perf_counter()
print(output)
# Report the timing that is measured above.
print(f"tokenize took {end_time - start_time:.6f} s")

assert output == ['so', 'shaken', 'as', 'we', 'are', ',', 'so', 'wan', 'with', 'care', ',', 'find', 'we', 'a', 'time']

<h4>EXERCISE 4: List the 50 most common tokens and their occurrence amount.</h4>

Example output:\
{',': 95042, '.': 33787, 'the': 26027, "'": 24099, 'and': 23443, 'i': 21772, 'to': 18800, 'of': 15446, 'you': 13579, ':': 13507, 'a': 13481, 'my': 11875, 'that': 10843, 'in': 10365, '?': 10039, 'is': 8997, '!': 8855, 'not': 8234, 'it': 7492, 'me': 7489, 'for': 7433, 's': 7121, 'with': 6957, 'be': 6697, 'he': 6521, 'your': 6507, 'this': 6446, 'his': 6347, 'but': 5985, 'have': 5754, 'as': 5500, 'thou': 5273, 'd': 5062, 'him': 4960, 'will': 4864 ... }

In [None]:
# Tokenize the whole corpus; `output` is the token list the following cells work on.
output = tokenize(one_string_dialogue)
def count_tokens(text):
    """Count how many times each token occurs in ``text``.

    The counts are computed from the ``text`` argument in a single pass
    (the previous version ignored its argument, read the global ``output``
    and rescanned the whole token list once per distinct token).

    Args:
        text: An iterable of hashable tokens, e.g. the list returned by
            ``tokenize``.

    Returns:
        dict: Maps each distinct token to its number of occurrences. An empty
        input yields an empty dict.
    """
    return dict(Counter(text))

top_50_words = None

# The ranking is cached in a pickle file. Only the file's existence is checked,
# so delete "top_50_words.pickle" to force a recount.
if os.path.exists("top_50_words.pickle"):
    # Reuse the cached ranking. pickle can run arbitrary code while loading,
    # so only load a cache file that this notebook wrote itself.
    with open("top_50_words.pickle", "rb") as infile:
        top_50_words = pickle.load(infile)
else:
    word_count = count_tokens(output)
    # Rank (token, count) pairs by count, highest first, and keep the first 50.
    top_50_words = sorted(
        word_count.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )[:50]
    # Persist the ranking for the next run ("wb" = binary write mode).
    with open("top_50_words.pickle", "wb") as outfile:
        pickle.dump(top_50_words, outfile)

In [None]:
import plotly.express as px

# Split the (token, count) pairs into the two axes of the chart.
top_tokens = [pair[0] for pair in top_50_words]
top_counts = [pair[1] for pair in top_50_words]
fig = px.histogram(x=top_tokens, y=top_counts)
fig.show()

In [None]:
## Optional experiment: split the token list into chunks of size n and
## tokenize the chunks in parallel.
from multiprocessing import Pool


def chunks(l, n):
    """Yield consecutive slices of ``l`` that hold at most ``n`` items each.

    Args:
        l: A sliceable sequence (list, string, ...).
        n: Maximum number of items per slice; used as the ``range`` step.

    Yields:
        Slices ``l[i:i + n]`` for ``i = 0, n, 2n, ...``; the last one may be
        shorter. Nothing is yielded for an empty sequence.
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]


# A bare top-level `break` used to switch this cell off, but that is a
# SyntaxError and stops "Run All". The experiment is now opt-in through a flag.
# NOTE(review): where multiprocessing uses the "spawn" start method, worker
# processes may be unable to import functions defined in a notebook cell -
# confirm on your platform before enabling.
RUN_PARALLEL_TOKENIZE = False

results = []
if RUN_PARALLEL_TOKENIZE:
    # Keep `chunks` bound to the function so the cell can be re-run.
    token_chunks = list(chunks(output, 520000))
    # tokenize() lowercases a string, so each token chunk is joined back into
    # text before it is handed to a worker.
    text_chunks = [" ".join(token_chunk) for token_chunk in token_chunks]
    with Pool(2) as pool:
        results = pool.map(tokenize, text_chunks)
    # Print only the chunk sizes; the full token lists are far too long to read.
    print([len(chunk_tokens) for chunk_tokens in results])

<h4>EXERCISE 5: For each word, instantiate a dictionary of words that follow it in the corpus,
as well as the number of occurrences of follow words.</h4>

Input: ['we', 'did', 'not', 'think', 'we', 'did', 'bad', '.']

Output: {
    'we': {'did': 2},
    'did': {'not': 1, 'bad': 1},
    'not': {'think': 1},
    'think': {'we': 1},
    'bad': {'.': 1},
}

In [None]:
from collections import defaultdict


class ZeroDict(dict):
    """A dict whose lookups of absent keys give 0 without storing the key."""

    def __missing__(self, key):
        return 0


def build_ngram_dict(tokens, n=1):
    """Tabulate which token follows each context of ``n`` consecutive tokens.

    Args:
        tokens: The token sequence to scan.
        n: Context length. With ``n == 1`` the context key is the token
            itself; otherwise the key is ``str()`` of the list slice holding
            the ``n`` context tokens.

    Returns:
        defaultdict: Maps each context key to a ``ZeroDict`` of
        ``{following token: number of occurrences}``.
    """
    follower_counts = defaultdict(ZeroDict)

    # Each start position that still has a token n places further on
    # contributes one (context, follower) observation.
    for start in range(len(tokens) - n):
        follower = tokens[start + n]
        context = tokens[start] if n == 1 else str(tokens[start:start + n])
        follower_counts[context][follower] += 1

    return follower_counts

# Build the follower-count table once and cache it on disk. Only the file's
# existence is checked, so delete "unigram_dict.pickle" to force a rebuild
# after the tokens change.
if not os.path.exists("unigram_dict.pickle"):
    ngram_dict = build_ngram_dict(output, n=1)
    with open("unigram_dict.pickle", "wb") as outfile:
        pickle.dump(ngram_dict, outfile)
else:
    # pickle can run arbitrary code while loading, so only load a cache file
    # that this notebook wrote itself.
    with open("unigram_dict.pickle", "rb") as infile:
        ngram_dict = pickle.load(infile)

# Show a small preview instead of the whole table: a handful of words, each
# with its most frequent followers.
ngram_preview = {}
for word in list(ngram_dict)[:5]:
    followers = ngram_dict[word]
    top_followers = sorted(followers.items(), key=lambda item: item[1], reverse=True)[:5]
    ngram_preview[word] = dict(top_followers)

print(f"{len(ngram_dict)} distinct words; previewing the first {len(ngram_preview)}:")
display(ngram_preview)

<h4>EXERCISE 6: Starting from a single word, "i", generate text by sampling possible subsequent words given the word statistics you previously built, up to 1000 tokens.</h4>

In [None]:
class UnigramModel:
    """Next-token text generator driven by follower counts of the last token.

    ``fit`` builds the follower table with ``build_ngram_dict``; ``generate``
    extends a seed sequence one token at a time, looking up the followers of
    the sequence's last token.

    ``generate`` always looks up a single token, which matches the keys
    ``build_ngram_dict`` produces for ``n == 1``. For other ``n`` the keys are
    stringified token lists, so those lookups will not find followers.
    """

    def __init__(self, n=1):
        """Store the context length ``n`` that ``fit`` passes to ``build_ngram_dict``."""
        self.n = n

    def fit(self, tokens):
        """Remember ``tokens`` and build the follower-count table from them."""
        self.tokens = tokens
        self.ngram_dict = build_ngram_dict(tokens, n=self.n)

    def generate(self, seq_length=20, seq=None, sample=False):
        """Extend a seed sequence and return it as one space-joined string.

        Args:
            seq_length: Maximum number of tokens to append to the seed.
            seq: Seed tokens. Defaults to ``["i"]``. The list is copied, so
                neither the caller's list nor a shared default is modified.
            sample: If False (default), always append the most frequent
                follower. If True, draw the follower at random with
                probability proportional to its count.

        Returns:
            str: Seed plus generated tokens joined by single spaces.
            Generation stops early when the last token has no known follower.

        Raises:
            ValueError: If ``seq`` is empty.
        """
        # A fresh list per call: a mutable default (or the caller's list)
        # would otherwise keep growing across calls.
        seq = ["i"] if seq is None else list(seq)
        if not seq:
            raise ValueError("seq must contain at least one seed token")

        for _ in range(seq_length):
            # .get() avoids inserting an empty entry into a defaultdict table
            # when the last token was never seen.
            next_words = self.ngram_dict.get(seq[-1])
            if not next_words:
                # Nothing ever followed this token: stop instead of failing.
                break

            if sample:
                candidates = list(next_words)
                weights = [next_words[candidate] for candidate in candidates]
                next_word = random.choices(candidates, weights=weights)[0]
            else:
                next_word = max(next_words, key=next_words.get)

            seq.append(next_word)

        return " ".join(seq)

# Fit on the full token list and print a short continuation of the seed "i".
unigram_model = UnigramModel()  # n defaults to 1
unigram_model.fit(output)
print(unigram_model.generate(seq=["i"]))

### Remove stop words and punctuation

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch NLTK's English stop-word list.
nltk.download('stopwords')
stopWordsEng = stopwords.words("english")

# Only the punctuation tokens and the word "and" are removed here; the full
# stop-word list is loaded but not applied. To strip every stop word as well,
# union `stopWordsEng` into `tokens_to_drop`.
# The set is built once, so the corpus is filtered in a single pass.
tokens_to_drop = set(punctuation) | {"and"}
output_without_sw = [word for word in output if word not in tokens_to_drop]

unigram_model = UnigramModel(n=1)
unigram_model.fit(tokens=output_without_sw)
print(unigram_model.generate(seq=["i"]))

<h4>EXERCISE 7: What kind of avenues would you think of to improve this text generation?</h4>

In [None]:
class TwoGramModel:
    """Greedy text generator using two tokens of context, backing off to one.

    ``fit`` builds two follower tables with ``build_ngram_dict``: one keyed by
    a single token and one keyed by ``str()`` of a two-token list. Prediction
    tries the two-token table first and falls back to the single-token table.
    """

    def __init__(self):
        pass

    def fit(self, tokens):
        """Remember ``tokens`` and build the one- and two-token follower tables."""
        self.tokens = tokens
        self.single_gram_dict = build_ngram_dict(tokens, n=1)
        self.double_gram_dict = build_ngram_dict(tokens, n=2)

    @staticmethod
    def _most_common(followers):
        """Return the key with the highest count, or None if there are no followers."""
        if not followers:
            return None
        return max(followers, key=followers.get)

    def predict_token(self, words):
        """Predict the token that most often follows ``words``.

        Args:
            words: The context tokens. Only the last two are used. A bare
                string is treated as one token, not as a sequence of
                characters.

        Returns:
            The most frequent follower of the last two tokens, or of the last
            token alone when the pair has no recorded follower. None when
            ``words`` is empty or no follower is known at all.
        """
        if isinstance(words, str):
            # Without this, a seed such as "the" would be measured and
            # indexed character by character.
            words = [words]
        context = list(words)[-2:]
        if not context:
            return None

        if len(context) == 2:
            # The key must match the str(list) keys of the two-token table.
            # .get() avoids inserting empty entries into a defaultdict table.
            prediction = self._most_common(self.double_gram_dict.get(str(context)))
            if prediction is not None:
                return prediction

        # Back off to the followers of the last token alone.
        return self._most_common(self.single_gram_dict.get(context[-1]))

    def generate(self, seq, seq_length=20):
        """Extend the seed ``seq`` and return it as one space-joined string.

        Args:
            seq: Seed tokens (a list, or a single token given as a string).
                The input is copied; the caller's list is not modified.
            seq_length: Maximum number of tokens to append.

        Returns:
            str: Seed plus generated tokens joined by single spaces.
            Generation stops early when no follower is known.
        """
        seq = [seq] if isinstance(seq, str) else list(seq)
        for _ in range(seq_length):
            # Always pass a list holding the latest (up to two) tokens.
            next_token = self.predict_token(seq[-2:])
            if next_token is None:
                break
            seq.append(next_token)

        return " ".join(seq)

In [None]:
# Two-token model with single-token fallback: short sample starting from "i".
ensaemble_model = TwoGramModel()
ensaemble_model.fit(output)
print(ensaemble_model.generate(["i"], 20))

In [None]:
# Same model, longer sample (200 generated tokens) starting from "i".
twoGramModel = TwoGramModel()
twoGramModel.fit(output)
print(twoGramModel.generate(["i"], 200))

In [None]:
# Build the punctuation-free token list in this cell so that it runs on a
# fresh kernel: no other cell defines `output_without_punct`.
punctuation_tokens = set(punctuation)
output_without_punct = [token for token in output if token not in punctuation_tokens]

twoGramModel = TwoGramModel()
twoGramModel.fit(tokens=output_without_punct)
print(twoGramModel.generate(seq=["i"], seq_length=200))