# Purpose of this notebook:

- ETL for our initial Shakespeare dataset
- Creation of source (`src`) and target (`trg`) example pairs for the train, validation and test sets

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast
import json
import random
import re

In [2]:
# Seed Python's `random` module so that the `random.shuffle(processed_data)` call
# further down produces the same order on a fresh top-to-bottom run.
random.seed(102)

# Creating tokenizer

In [3]:
# Read the whole tokenizer-training corpus into a single string.
# NOTE(review): no `encoding=` is passed, so the platform default encoding is
# used -- confirm the file's actual encoding and pass it explicitly.
with open("./data/source/shakespeare_tokenize.txt", 'r') as file:
    ss = file.read()

In [4]:
# Normalise the corpus: lower-case it, collapse blank lines and double spaces,
# delete the characters ; : [ ] and collapse double spaces once more.
ss = (
    str(ss)
    .lower()
    .replace('\n\n', '\n')
    .replace('  ', ' ')
    .translate(str.maketrans('', '', ';:[]'))
    .replace('  ', ' ')
)

In [5]:
# One entry per corpus line, with the surrounding whitespace trimmed.
ss_list = [raw_line.strip() for raw_line in ss.split('\n')]
ss_list[:5]

["a midsummer-night's dream",
 'now , fair hippolyta , our nuptial hour',
 'draws on apace four happy days bring in',
 'another moon but o ! methinks how slow',
 'this old moon wanes she lingers my desires ,',
 'like to a step dame , or a dowager',
 "long withering out a young man's revenue .",
 'four days will quickly steep themselves in night',
 'four nights will quickly dream away the time',
 'and then the moon , like to a silver bow',
 'new-bent in heaven , shall behold the night',
 'of our solemnities .',
 'go , philostrate ,',
 'stir up the athenian youth to merriments',
 'awake the pert and nimble spirit of mirth',
 'turn melancholy forth to funerals',
 'the pale companion is not for our pomp .',
 "hippolyta , i woo'd thee with my sword ,",
 'and won thy love doing thee injuries',
 'but i will wed thee in another key ,',
 'with pomp , with triumph , and with revelling .',
 '',
 'happy be theseus , our renowned duke !',
 "thanks , good egeus what's the news with thee ?",
 'full o

In [6]:
# Load the pretrained 'bert-base-uncased' tokenizer as the starting point.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Train a new tokenizer of the same kind on the Shakespeare lines.
# NOTE(review): 32000 is passed positionally -- presumably the requested
# vocabulary size; confirm against the transformers documentation.
tokenizer = tokenizer.train_new_from_iterator(iter(ss_list),32000)
# Display the vocabulary size of the newly trained tokenizer.
tokenizer.vocab_size

30267

In [7]:
tokenizer.tokenize("my tongue should catch your tongue's sweet melody ."), tokenizer.tokenize("demetrius , i'll avouch it to his head ,")

(['my',
  'tongue',
  'should',
  'catch',
  'your',
  'tongue',
  "'",
  's',
  'sweet',
  'melody',
  '.'],
 ['demetrius', ',', 'i', "'", 'll', 'avouch', 'it', 'to', 'his', 'head', ','])

In [8]:
# Directory (relative to the working directory) that receives the tokenizer files.
tokenizer_save_path = 'shakespeare-tokenizer-bert/plays'
# Persist the trained tokenizer; the cell output lists the files written.
tokenizer.save_pretrained(tokenizer_save_path)

('shakespeare-tokenizer-bert/plays\\tokenizer_config.json',
 'shakespeare-tokenizer-bert/plays\\special_tokens_map.json',
 'shakespeare-tokenizer-bert/plays\\vocab.txt',
 'shakespeare-tokenizer-bert/plays\\added_tokens.json',
 'shakespeare-tokenizer-bert/plays\\tokenizer.json')

# Exploratory

In [9]:
df = pd.read_csv('data/source/shakespeare.csv')

In [10]:
df.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [11]:
# df['Play'].str.lower().unique()

In [12]:
# df['Player'].str.lower().unique()

In [13]:
# french characters: alencon, alice, king of france, katharine, dauphin
# french speaking characters

In [14]:
# df[df['Player'].str.lower() == 'katharine']
# maybe could remove the whole of Henry V since that is where most french lines are found

In [15]:
# Drop most contextual rows (act/scene headers, stage directions): keep rows
# that have a speaker and whose line is not the bare word 'Exeunt'.
remove = df.loc[df['Player'].notna() & df["PlayerLine"].ne('Exeunt')]
data = remove[['Play', 'PlayerLinenumber', "Player", "PlayerLine"]]
data.head(5)

Unnamed: 0,Play,PlayerLinenumber,Player,PlayerLine
3,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils
6,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.
7,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil


# Preprocessing Shakespeare

In [16]:
class ShakespeareData:
    """Creates a cleaned up version of the shakespeare dataset.

    The CSV at ``filepath`` is read with pandas and passed through
    :meth:`clean`; the cleaned frame is stored on ``self.data``.
    """

    def __init__(self, filepath='data/source/shakespeare.csv'):
        data = pd.read_csv(filepath)
        self.data = self.clean(data)

    def clean(self, data):
        """Keep only spoken lines and add normalised ``ActScene`` / ``text`` columns.

        A row is kept when it has a ``Player`` and an ``ActSceneLine``, its
        ``PlayerLine`` is not exactly ``'Exeunt'`` and its ``Play`` is not
        ``'Henry V'``.

        Args:
            data: DataFrame with the columns ``ActSceneLine``, ``Player``,
                ``PlayerLine``, ``Play`` and ``PlayerLinenumber``.

        Returns:
            A new DataFrame (the input is not modified) with the columns
            ``ActSceneLine``, ``Player`` (lower-cased), ``PlayerLine``,
            ``Play``, ``PlayerLinenumber``, ``ActScene`` (the part of
            ``ActSceneLine`` before its second full stop) and ``text``
            (stripped, lower-cased ``PlayerLine`` after the replacements
            in ``repl``). The original row index is preserved.
        """
        # Regex pattern -> replacement, applied with regex=True below.
        # Raw strings keep the backslashes literal for the regex engine and
        # avoid Python's invalid-escape-sequence warnings.
        repl = {
            r'@\w*': ' ',
            '&amp;': 'and',
            r'&#\w*;': ' ',
            r'\n\.': ' ',
            r'\n': ' ',
            r"\.{2,}": '.',
            "!{2,}": '!',
            r"\?{2,}": '?',
            '_': " ",
            ' +': ' ',
            r'\-{2,}': ' ',
            ':': '',
            ';': '',
            r'\[': '',
            r'\]': '',
            '  ': ' '
            }

        keep_row = (
            data['Player'].notna()
            & (data["PlayerLine"] != 'Exeunt')
            & (data["Play"] != 'Henry V')
            & data['ActSceneLine'].notna()
        )
        columns = ['ActSceneLine', "Player", "PlayerLine", "Play", "PlayerLinenumber"]
        # Explicit copy: the column assignments below then act on an
        # independent frame instead of a slice of the caller's data.
        data = data.loc[keep_row, columns].copy()

        data['Player'] = data['Player'].str.lower()
        # "1.1.10" -> "1.1": keep the first two dot-separated components.
        data['ActScene'] = data['ActSceneLine'].str.split('.').str[:2].str.join('.')

        cleaned = data['PlayerLine'].str.strip().str.lower()
        cleaned = cleaned.replace(repl, regex=True)
        return data.assign(text=cleaned)

def extract_before_second_full_stop(string):
    """Return the part of ``string`` that precedes its second full stop.

    For an act.scene.line reference such as ``"1.1.10"`` this gives the
    act.scene prefix ``"1.1"``. A string with fewer than two full stops is
    returned unchanged.
    """
    # Splitting at most twice puts everything after the second full stop in
    # a third piece, which is dropped before re-joining.
    leading_parts = string.split('.', 2)[:2]
    return '.'.join(leading_parts)

In [47]:
def generate_src_trg_dataset(text_col):
    """Pair every entry of ``text_col`` with the entry that follows it.

    Args:
        text_col: Ordered texts. A pandas Series is read by position (its
            index labels are ignored); any other iterable, such as a list,
            is accepted as well.

    Returns:
        A list of ``{"src": <entry i>, "trg": <entry i+1>}`` dicts with one
        item fewer than the input, or an empty list when the input has
        fewer than two entries.
    """
    # Materialise once: a plain list avoids a per-element `.iloc` lookup on
    # every loop iteration and lets non-pandas callers pass ordinary sequences.
    texts = list(text_col)
    return [{"src": src, "trg": trg} for src, trg in zip(texts, texts[1:])]

def save_src_trg_dataset(data, filename = 'data', folder_dir = ''):
    """Write records to ``<folder_dir>/<filename>.jsonl``, one JSON object per line.

    Args:
        data: Iterable of JSON-serialisable records (e.g. src/trg dicts).
        filename: File name without the ``.jsonl`` extension.
        folder_dir: Target directory, with or without a trailing separator.
            It is created (including missing parents) when absent. An empty
            string writes to the current working directory.
    """
    # Only create a directory when one was actually requested: the default ''
    # means "current directory", and os.mkdir('') would raise.
    if folder_dir:
        os.makedirs(folder_dir, exist_ok=True)
    # os.path.join inserts the separator only when it is missing, so both
    # 'out/' and 'out' resolve to 'out/<filename>.jsonl'.
    fn = os.path.join(folder_dir, filename + '.jsonl')
    with open(fn, 'w', encoding='utf-8') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def generate_and_save_dataset(data_text_column, filename= 'data', folder_dir= ''):
    """Build src/trg pairs from a text column and hand them to the saver.

    ``filename`` and ``folder_dir`` are forwarded unchanged to
    ``save_src_trg_dataset``.
    """
    save_src_trg_dataset(
        generate_src_trg_dataset(data_text_column),
        filename=filename,
        folder_dir=folder_dir,
    )

In [18]:
# with open('vocab_list.pickle', 'wb') as handle:
#     pickle.dump(vocab.idx_word, handle)

In [19]:
shakespeare_data = ShakespeareData()

In [20]:
shakespeare_data.data.head() # 107459 -> # 101919 after removing na in actsceneline

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,PlayerLinenumber,ActScene,text
3,1.1.1,king henry iv,"So shaken as we are, so wan with care,",Henry IV,1.0,1.1,"so shaken as we are, so wan with care,"
4,1.1.2,king henry iv,"Find we a time for frighted peace to pant,",Henry IV,1.0,1.1,"find we a time for frighted peace to pant,"
5,1.1.3,king henry iv,And breathe short-winded accents of new broils,Henry IV,1.0,1.1,and breathe short-winded accents of new broils
6,1.1.4,king henry iv,To be commenced in strands afar remote.,Henry IV,1.0,1.1,to be commenced in strands afar remote.
7,1.1.5,king henry iv,No more the thirsty entrance of this soil,Henry IV,1.0,1.1,no more the thirsty entrance of this soil


In [21]:
shakespeare_data.data.head(60)

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,PlayerLinenumber,ActScene,text
3,1.1.1,king henry iv,"So shaken as we are, so wan with care,",Henry IV,1.0,1.1,"so shaken as we are, so wan with care,"
4,1.1.2,king henry iv,"Find we a time for frighted peace to pant,",Henry IV,1.0,1.1,"find we a time for frighted peace to pant,"
5,1.1.3,king henry iv,And breathe short-winded accents of new broils,Henry IV,1.0,1.1,and breathe short-winded accents of new broils
6,1.1.4,king henry iv,To be commenced in strands afar remote.,Henry IV,1.0,1.1,to be commenced in strands afar remote.
7,1.1.5,king henry iv,No more the thirsty entrance of this soil,Henry IV,1.0,1.1,no more the thirsty entrance of this soil
8,1.1.6,king henry iv,Shall daub her lips with her own children's bl...,Henry IV,1.0,1.1,shall daub her lips with her own children's bl...
9,1.1.7,king henry iv,"Nor more shall trenching war channel her fields,",Henry IV,1.0,1.1,"nor more shall trenching war channel her fields,"
10,1.1.8,king henry iv,Nor bruise her flowerets with the armed hoofs,Henry IV,1.0,1.1,nor bruise her flowerets with the armed hoofs
11,1.1.9,king henry iv,"Of hostile paces: those opposed eyes,",Henry IV,1.0,1.1,"of hostile paces those opposed eyes,"
12,1.1.10,king henry iv,"Which, like the meteors of a troubled heaven,",Henry IV,1.0,1.1,"which, like the meteors of a troubled heaven,"


In [22]:
def split_long_lines(line, max_length=100):
    """Split ``line`` into chunks of at most ``max_length`` characters.

    Each chunk ends just after the last break character (one of
    ``. , ; : ! ? >``) found within the first ``max_length`` characters of
    the remaining text. When that window has no break character the text is
    cut at exactly ``max_length`` characters. Chunks are whitespace-stripped.

    Args:
        line: Text to split.
        max_length: Maximum chunk length in characters; must be at least 1.

    Returns:
        ``[line]`` unchanged when it already fits, otherwise the list of
        non-empty chunks in order.

    Raises:
        ValueError: If ``max_length`` is smaller than 1 (no progress could
            be made on a non-empty line).
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if len(line) <= max_length:
        return [line]

    chunks = []
    remaining = line
    # Iterative rather than recursive, so very long inputs cannot exhaust
    # the interpreter's recursion limit.
    while len(remaining) > max_length:
        # Searching the reversed window makes the first hit the LAST break
        # character inside the window.
        match_punctuation = re.search(r'[.,;:!?>]', remaining[:max_length][::-1])
        if match_punctuation:
            # Cut right after that break character.
            split_point = max_length - match_punctuation.start()
        else:
            split_point = max_length

        chunk = remaining[:split_point].strip()
        if chunk:  # skip fragments that were only whitespace
            chunks.append(chunk)
        remaining = remaining[split_point:].strip()

    if remaining:
        chunks.append(remaining)
    return chunks
    
def remove_separator(lines):
    """Delete every '>' from each string in ``lines`` and tidy the spacing.

    For each string: remove all '>' characters, strip the surrounding
    whitespace, then replace double spaces with single spaces in one pass.
    Returns a new list in the same order.
    """
    cleaned = []
    for chunk in lines:
        without_marker = chunk.replace(">", "")
        trimmed = without_marker.strip()
        cleaned.append(trimmed.replace("  ", " "))
    return cleaned

In [24]:
# Aggregate each speech (same play, player, act.scene and speech number) into
# one string, marking the original line boundaries with ' > '.
data_per_convo = (
    shakespeare_data.data
    .groupby(['Play', 'Player', 'ActScene', 'PlayerLinenumber'], sort=False)['text']
    .agg(lambda speech_lines: ' > '.join(speech_lines))
    .reset_index()
)

# Split each speech into sequence-length-ish chunks, then remove the ' > ' markers.
data_per_convo['text_processed'] = (
    data_per_convo['text'].apply(split_long_lines).apply(remove_separator)
)

# One row per chunk; the speech's other columns (and its index label) are
# repeated for every chunk.
data_per_convo = data_per_convo.explode('text_processed')

In [25]:
data_per_convo['text_processed_with_player'] = data_per_convo['Player'] + " : " + data_per_convo['text_processed']
data_per_convo['text_processed_with_player'].head()

0    king henry iv : so shaken as we are, so wan wi...
0    king henry iv : and breathe short-winded accen...
0    king henry iv : no more the thirsty entrance o...
0    king henry iv : nor more shall trenching war c...
0    king henry iv : of hostile paces those opposed...
Name: text_processed_with_player, dtype: object

In [26]:
all_plays = data_per_convo.Play.unique()

In [27]:
all_plays

array(['Henry IV', 'Henry VI Part 1', 'Henry VI Part 2',
       'Henry VI Part 3', 'Alls well that ends well', 'As you like it',
       'Antony and Cleopatra', 'A Comedy of Errors', 'Coriolanus',
       'Cymbeline', 'Hamlet', 'Henry VIII', 'King John', 'Julius Caesar',
       'King Lear', 'Loves Labours Lost', 'macbeth',
       'Measure for measure', 'Merchant of Venice',
       'Merry Wives of Windsor', 'A Midsummer nights dream',
       'Much Ado about nothing', 'Othello', 'Pericles', 'Richard II',
       'Richard III', 'Romeo and Juliet', 'Taming of the Shrew',
       'The Tempest', 'Timon of Athens', 'Titus Andronicus',
       'Troilus and Cressida', 'Twelfth Night', 'Two Gentlemen of Verona',
       'A Winters Tale'], dtype=object)

### Preparing train/test Shakespeare data

In [28]:
# Build the src/trg pairs one play at a time, so a pair never joins the end
# of one play to the start of the next.
processed_data = []
for play in all_plays:
    in_this_play = data_per_convo['Play'] == play
    play_lines = data_per_convo.loc[in_this_play, 'text_processed_with_player']
    processed_data.extend(generate_src_trg_dataset(play_lines))

In [29]:
play_lines

28614    archidamus : if you shall chance, camillo, to ...
28614    archidamus : the like occasion whereon my serv...
28614    archidamus : difference betwixt our bohemia an...
28615    camillo : i think, this coming summer, the kin...
28615    camillo : means to pay bohemia the visitation ...
                               ...                        
29360    leontes : look upon my brother both your pardo...
29360    leontes : my ill suspicion. this is your son-i...
29360    leontes : is troth-plight to your daughter. go...
29360    leontes : each one demand an answer to his par...
29360      leontes : we were dissever'd hastily lead away.
Name: text_processed_with_player, Length: 1869, dtype: object

In [30]:
processed_data[:5]

[{'src': 'king henry iv : so shaken as we are, so wan with care, find we a time for frighted peace to pant,',
  'trg': 'king henry iv : and breathe short-winded accents of new broils to be commenced in strands afar remote.'},
 {'src': 'king henry iv : and breathe short-winded accents of new broils to be commenced in strands afar remote.',
  'trg': "king henry iv : no more the thirsty entrance of this soil shall daub her lips with her own children's blood,"},
 {'src': "king henry iv : no more the thirsty entrance of this soil shall daub her lips with her own children's blood,",
  'trg': 'king henry iv : nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs'},
 {'src': 'king henry iv : nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs',
  'trg': 'king henry iv : of hostile paces those opposed eyes, which, like the meteors of a troubled heaven,'},
 {'src': 'king henry iv : of hostile paces those opposed 

In [31]:
random.shuffle(processed_data) # in-place shuffle

In [32]:
n = len(processed_data)
n

60784

In [33]:
train_cutoff = int(n*0.8)

In [34]:
# NOTE: processed_data was shuffled in place in an earlier cell, so these
# slices are random partitions of the pairs, not contiguous runs of the plays.
# NOTE(review): pairs are built from overlapping neighbours (one pair's trg is
# the next pair's src), so the same text can appear in more than one split --
# confirm this is acceptable.
# Everything before train_cutoff (int(n*0.8)) -> training set.
ss_train_data = processed_data[:train_cutoff]
# The last 10 pairs -> small held-out test set.
ss_test_data = processed_data[-10:]
# Everything in between -> validation set.
ss_val_data = processed_data[train_cutoff:-10]

In [35]:
len(ss_test_data), len(ss_train_data), len(ss_val_data)

(10, 48627, 12147)

In [36]:
ss_train_data[:5]

[{'src': 'morocco : o hell! what have we here? a carrion death, within whose empty eye there is a written scroll!',
  'trg': "morocco : i'll read the writing. all that glitters is not gold, often have you heard that told"},
 {'src': 'cromwell : and his disciples only envy at, ye blew the fire that burns ye now have at ye! enter king,',
  'trg': 'cromwell : frowning on them, takes his seat'},
 {'src': 'mistress quickly : mistress page would desire you to send her your little page, of all loves her husband has a',
  'trg': 'mistress quickly : marvellous infection to the little page, and truly master page is an honest man. never a wife in'},
 {'src': "plantagenet : he bears him on the place's privilege, or durst not, for his craven heart, say thus.",
  'trg': "somerset : by him that made me, i'll maintain my words on any plot of ground in christendom."},
 {'src': 'queen margaret : loather a hundred times to part than die. yet now farewell, and farewell life with thee!',
  'trg': 'suffolk 

In [48]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/with_player/')

In [49]:
save_src_trg_dataset(ss_val_data, filename= 'valid', folder_dir= 'data/with_player/')

In [50]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/with_player/')