In [8]:
# A few imports useful to any data scientist.
import pandas as pd
import numpy as np
import random
# Allow up to 5000 columns to be shown before pandas truncates a DataFrame's display.
pd.set_option('display.max_columns', 5000)

![Shakespeare](shakespeare.jpg "William Shakespeare")

<h3>William Shakespeare is a little-known playwright from the 16th and 17th centuries.
In the following few exercises, we have a look at lines from Shakespeare's plays
and try to implement a simple text generator to write just like Shakespeare.</h3>

Note: You are free to solve exercises in any manner you wish. There is no specific requirement to use the code already provided in the notebook cells, but the code is there in case you choose to use it.

A dataset has been kindly provided to us by the fine folks at Kaggle. This dataset contains every line from every major Shakespeare play, along with information about the current actor speaking the line, the play this line originated from, etc.

In [11]:
# The dataset is provided both as a pandas DataFrame (`dataset`) and as a list of
# plain row tuples (`dataset_list`). If you prefer another format, you are free to use it.
dataset = pd.read_csv("Shakespeare_data.csv")
dataset_list = [*dataset.itertuples(index=False, name=None)]

Let's view the first five rows of the dataset to get an idea about our data.

In [13]:
def view_data(n=5):
    """Print a preview of the dataset in both of its provided formats.

    Reads the module-level `dataset` (DataFrame) and `dataset_list` (list of
    row tuples) and prints the first `n` entries of each, with a blank line
    before each preview.

    Args:
        n: Number of leading rows to show from each structure. Defaults to 5.
    """
    print()
    print(dataset.head(n))
    print()
    # Slice with the same `n` so both previews always cover the same rows.
    for d in dataset_list[:n]:
        print(d)

view_data()


   Dataline      Play  PlayerLinenumber ActSceneLine         Player  \
0         1  Henry IV               NaN          NaN            NaN   
1         2  Henry IV               NaN          NaN            NaN   
2         3  Henry IV               NaN          NaN            NaN   
3         4  Henry IV               1.0        1.1.1  KING HENRY IV   
4         5  Henry IV               1.0        1.1.2  KING HENRY IV   

                                          PlayerLine  
0                                              ACT I  
1                       SCENE I. London. The palace.  
2  Enter KING HENRY, LORD JOHN OF LANCASTER, the ...  
3             So shaken as we are, so wan with care,  
4         Find we a time for frighted peace to pant,  

(1, 'Henry IV', nan, nan, nan, 'ACT I')
(2, 'Henry IV', nan, nan, nan, 'SCENE I. London. The palace.')
(3, 'Henry IV', nan, nan, nan, 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others')
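(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')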

<h4>EXERCISE 1: Drop from our dataset all lines which are not dialogue.</h4>

Example input dataset (first six rows):\
(1, 'Henry IV', nan, nan, nan, 'ACT I')\
(2, 'Henry IV', nan, nan, nan, 'SCENE I. London. The palace.')\
(3, 'Henry IV', nan, nan, nan, 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others')\
(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')\
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')\
(6, 'Henry IV', 1.0, '1.1.3', 'KING HENRY IV', 'And breathe short-winded accents of new broils')

Example output dataset (first six rows):\
(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')\
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')\
(6, 'Henry IV', 1.0, '1.1.3', 'KING HENRY IV', 'And breathe short-winded accents of new broils')\
(7, 'Henry IV', 1.0, '1.1.4', 'KING HENRY IV', 'To be commenced in strands afar remote.')\
(8, 'Henry IV', 1.0, '1.1.5', 'KING HENRY IV', 'No more the thirsty entrance of this soil')\
(9, 'Henry IV', 1.0, '1.1.6', 'KING HENRY IV', "Shall daub her lips with her own children's blood,")

<h4>EXERCISE 2: Concatenate all dialogue lines of the entire dataset into a single large string (join consecutive lines with a single space).</h4>

Example input dataset (first six rows):\
(4, 'Henry IV', 1.0, '1.1.1', 'KING HENRY IV', 'So shaken as we are, so wan with care,')\
(5, 'Henry IV', 1.0, '1.1.2', 'KING HENRY IV', 'Find we a time for frighted peace to pant,')\
(6, 'Henry IV', 1.0, '1.1.3', 'KING HENRY IV', 'And breathe short-winded accents of new broils')\
(7, 'Henry IV', 1.0, '1.1.4', 'KING HENRY IV', 'To be commenced in strands afar remote.')\
(8, 'Henry IV', 1.0, '1.1.5', 'KING HENRY IV', 'No more the thirsty entrance of this soil')\
(9, 'Henry IV', 1.0, '1.1.6', 'KING HENRY IV', "Shall daub her lips with her own children's blood,")

Example output string:\
So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote. No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, 

<h4>EXERCISE 3: Implement a simple tokenizer: discard every non-alphabetic character except the six
punctuation marks <code>. ! ? : ' ,</code> (comma included), each of which should be treated as a token of its own.
Every word should also be a single token. Convert all word tokens to lowercase.</h4>

Example input: "So shaken as we are, so wan with care, Find we a time"

Example output: ['so', 'shaken', 'as', 'we', 'are', ',', 'so', 'wan', 'with', 'care', ',', 'find', 'we', 'a', 'time']

In [None]:
# Punctuation characters the tokenizer keeps, each as its own token (Exercise 3):
# period, exclamation mark, question mark, colon, apostrophe and comma.
punctuation = ".!?:',"

<h4>EXERCISE 4: List the 50 most common tokens together with their number of occurrences.</h4>

Example output:\
{',': 95042, '.': 33787, 'the': 26027, "'": 24099, 'and': 23443, 'i': 21772, 'to': 18800, 'of': 15446, 'you': 13579, ':': 13507, 'a': 13481, 'my': 11875, 'that': 10843, 'in': 10365, '?': 10039, 'is': 8997, '!': 8855, 'not': 8234, 'it': 7492, 'me': 7489, 'for': 7433, 's': 7121, 'with': 6957, 'be': 6697, 'he': 6521, 'your': 6507, 'this': 6446, 'his': 6347, 'but': 5985, 'have': 5754, 'as': 5500, 'thou': 5273, 'd': 5062, 'him': 4960, 'will': 4864 ... }

<h4>EXERCISE 5: For each word, build a dictionary of the words that directly follow it in the corpus,
together with the number of times each following word occurs.</h4>

Input: ['we', 'did', 'not', 'think', 'we', 'did', 'bad', '.']

Output: {
    'we': {'did': 2},
    'did': {'not': 1, 'bad': 1},
    'not': {'think': 1},
    'think': {'we': 1},
    'bad': {'.': 1},
}

In [None]:
from collections import defaultdict


class ZeroDict(dict):
    """A dict whose subscript lookups (``d[key]``) of missing keys evaluate to 0.

    The lookup itself does not insert the missing key; the key only appears in
    the dictionary once a value is assigned to it (e.g. via ``d[key] += 1``).
    """

    def __missing__(self, key) -> int:
        """Return 0 as the value for any `key` not present in the dictionary."""
        return 0

<h4>EXERCISE 6: Starting from a single word, "i", generate text by sampling possible subsequent words given the word statistics you previously built, up to 1000 tokens.</h4>

In [None]:
# Generation starts from a sequence holding the single token "i".
seq = ['i']

# Do text generation...
# (Append each sampled token to `seq`. Tip: seed `random` first if you want
# reproducible output.)



# End text generation.

# Show the generated tokens as one space-separated string.
print(" ".join(seq))

<h4>EXERCISE 7: What avenues would you explore to improve this text generation?</h4>