# Purpose of this notebook:

- ETL for our initial Shakespeare dataset
- Creation of src and target examples for the train, validation and test sets

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast
import json
import random
import re

In [2]:
random.seed(102)

# Creating tokenizer

In [3]:
with open("./data/source/shakespeare_tokenize.txt", 'r') as file:
    ss = file.read()

In [4]:
ss = str(ss).lower().replace('\n\n','\n').replace('  ', ' ').replace(';', '').replace(':', '').replace('[', '').replace(']', '').replace('  ', ' ')

In [5]:
ss_list = ss.split('\n')
ss_list = [i.strip() for i in ss_list]
ss_list[:5]

["a midsummer-night's dream",
 'now , fair hippolyta , our nuptial hour',
 'draws on apace four happy days bring in',
 'another moon but o ! methinks how slow',
 'this old moon wanes she lingers my desires ,']

In [6]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = tokenizer.train_new_from_iterator(iter(ss_list),32000)
tokenizer.vocab_size






30266

In [7]:
tokenizer.tokenize("my tongue should catch your tongue's sweet melody ."), tokenizer.tokenize("demetrius , i'll avouch it to his head ,")

(['my',
  'tongue',
  'should',
  'catch',
  'your',
  'tongue',
  "'",
  's',
  'sweet',
  'melody',
  '.'],
 ['demetrius', ',', 'i', "'", 'll', 'avouch', 'it', 'to', 'his', 'head', ','])

In [8]:
tokenizer_save_path = 'shakespeare-tokenizer-bert/plays'
tokenizer.save_pretrained(tokenizer_save_path)

('shakespeare-tokenizer-bert/plays\\tokenizer_config.json',
 'shakespeare-tokenizer-bert/plays\\special_tokens_map.json',
 'shakespeare-tokenizer-bert/plays\\vocab.txt',
 'shakespeare-tokenizer-bert/plays\\added_tokens.json',
 'shakespeare-tokenizer-bert/plays\\tokenizer.json')

# Exploratory

In [7]:
df = pd.read_csv('data/source/shakespeare.csv')

In [8]:
df.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [9]:
# df['Play'].str.lower().unique()

In [10]:
# df['Player'].str.lower().unique()

In [11]:
# french characters: alencon, alice, king of france, katharine, dauphin
# french speaking characters

In [12]:
# df[df['Player'].str.lower() == 'katharine']
# maybe could remove the whole of Henry V since that is where most french lines are found

In [13]:
# Remove most contextual lines (act numbers, leaving, end, etc)
remove = df[(df['Player'].notna()) & (df["PlayerLine"]!='Exeunt')]
data = remove[['Play', 'PlayerLinenumber', "Player", "PlayerLine"]]
data.head(5)

Unnamed: 0,Play,PlayerLinenumber,Player,PlayerLine
3,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils
6,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.
7,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil


# Preprocessing Shakespeare

In [14]:
class ShakespeareData:
    """Creates a cleaned up version of the shakespeare dataset.

    The CSV is read once in ``__init__`` and the cleaned frame is stored in
    ``self.data``.
    """

    def __init__(self, filepath='data/source/shakespeare.csv'):
        """Read ``filepath`` with ``pd.read_csv`` and clean it immediately."""
        data = pd.read_csv(filepath)
        self.data = self.clean(data)

    def clean(self, data):
        """Keep only spoken lines and add normalised helper columns.

        A row is kept when ``Player`` and ``ActSceneLine`` are both present,
        ``PlayerLine`` is not the bare stage direction 'Exeunt' and the play is
        not 'Henry V' (the notebook's exploratory notes single that play out
        as the one holding most of the French lines).

        Args:
            data: DataFrame with at least the columns ``ActSceneLine``,
                ``Player``, ``PlayerLine``, ``Play`` and ``PlayerLinenumber``.

        Returns:
            A new DataFrame (``data`` itself is not modified) with those five
            columns, ``Player`` lower-cased, plus:
              * ``ActScene`` - ``ActSceneLine`` cut before its second full stop;
              * ``text`` - ``PlayerLine`` stripped, lower-cased and passed
                through the regex replacements below.
        """
        # Regex -> replacement pairs handed to Series.replace(regex=True).
        # Raw strings hold exactly the same characters as the former plain
        # literals but no longer rely on invalid escape sequences such as
        # '\w' or '\.', which newer Python versions warn about.
        repl = {
            r'@\w*': ' ',       # '@' plus any following word characters
            '&amp;': 'and',     # HTML-escaped ampersand
            r'&#\w*;': ' ',     # numeric HTML entities
            r'\n\.': ' ',       # newline immediately followed by a full stop
            r'\n': ' ',         # any remaining newline
            r'\.{2,}': '.',     # runs of full stops
            '!{2,}': '!',       # runs of exclamation marks
            r'\?{2,}': '?',     # runs of question marks
            '_': ' ',
            ' +': ' ',          # runs of spaces
            r'\-{2,}': ' ',     # runs of hyphens
            ':': '',
            ';': '',
            r'\[': '',
            r'\]': '',
            '  ': ' ',
        }

        keep = (
            data['Player'].notna()
            & (data['PlayerLine'] != 'Exeunt')
            & (data['Play'] != 'Henry V')
            & data['ActSceneLine'].notna()
        )
        columns = ['ActSceneLine', 'Player', 'PlayerLine', 'Play', 'PlayerLinenumber']
        spoken = data.loc[keep, columns]

        cleaned = (
            spoken['PlayerLine']
            .str.strip()
            .str.lower()
            .replace(repl, regex=True)
        )

        # assign() builds a new frame instead of writing into a filtered slice,
        # which is what used to trigger pandas' SettingWithCopyWarning.
        return spoken.assign(
            Player=spoken['Player'].str.lower(),
            ActScene=spoken['ActSceneLine'].apply(extract_before_second_full_stop),
            text=cleaned,
        )

def extract_before_second_full_stop(string):
    """Return the part of ``string`` that precedes its second full stop.

    For an ``ActSceneLine`` value such as '1.1.5' this gives the 'act.scene'
    prefix '1.1'. A string containing fewer than two full stops is returned
    unchanged.
    """
    first_stop = string.find('.')
    # Starting at first_stop + 1 also covers the "no full stop" case, because
    # -1 + 1 simply restarts the search at index 0.
    second_stop = string.find('.', first_stop + 1)
    return string if second_stop == -1 else string[:second_stop]

In [15]:
def generate_src_trg_dataset(text_col):
    """Pair every entry of ``text_col`` with the entry that follows it.

    Args:
        text_col: object with positional ``.iloc`` access and a length, e.g.
            the pandas Series of processed lines of one play.

    Returns:
        List of ``{"src": ..., "trg": ...}`` dicts in the original order, one
        per adjacent pair: n entries give n - 1 dicts, and fewer than two
        entries give an empty list.
    """
    pair_count = len(text_col) - 1
    return [
        {"src": text_col.iloc[pos], "trg": text_col.iloc[pos + 1]}
        for pos in range(pair_count)
    ]

def save_src_trg_dataset(data, filename = 'data', folder_dir = ''):
    """Write ``data`` as JSON Lines to ``<folder_dir>/<filename>.jsonl``.

    Args:
        data: iterable of JSON-serialisable entries (the src/trg dicts).
        filename: file name without the ``.jsonl`` extension.
        folder_dir: target folder, with or without a trailing separator.
            Missing folders (including nested ones) are created; the default
            ``''`` writes into the current working directory.

    Each entry is dumped with ``json.dump`` on its own line. An existing file
    of the same name is overwritten.
    """
    # os.path.join gives the same path as before when folder_dir ends with a
    # separator ('data/' -> 'data/train.jsonl') and still works when it doesn't.
    fn = os.path.join(folder_dir, filename + '.jsonl')
    if folder_dir:
        # The old os.mkdir('') raised for the default folder_dir and could not
        # create nested folders; makedirs handles both and tolerates reruns.
        os.makedirs(folder_dir, exist_ok=True)
    with open(fn, 'w', encoding='utf-8') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def generate_and_save_dataset(data_text_column, filename= 'data', folder_dir= ''):
    """Build src/trg pairs from ``data_text_column`` and write them to disk.

    Chains ``generate_src_trg_dataset`` and ``save_src_trg_dataset``;
    ``filename`` and ``folder_dir`` are forwarded to the latter unchanged.
    """
    save_src_trg_dataset(
        generate_src_trg_dataset(data_text_column),
        filename=filename,
        folder_dir=folder_dir,
    )

In [16]:
# with open('vocab_list.pickle', 'wb') as handle:
#     pickle.dump(vocab.idx_word, handle)

In [37]:
shakespeare_data = ShakespeareData()

In [38]:
shakespeare_data.data.head() # 107459 -> # 101919 after removing na in actsceneline

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,PlayerLinenumber,ActScene,text
3,1.1.1,king henry iv,"So shaken as we are, so wan with care,",Henry IV,1.0,1.1,"so shaken as we are, so wan with care,"
4,1.1.2,king henry iv,"Find we a time for frighted peace to pant,",Henry IV,1.0,1.1,"find we a time for frighted peace to pant,"
5,1.1.3,king henry iv,And breathe short-winded accents of new broils,Henry IV,1.0,1.1,and breathe short-winded accents of new broils
6,1.1.4,king henry iv,To be commenced in strands afar remote.,Henry IV,1.0,1.1,to be commenced in strands afar remote.
7,1.1.5,king henry iv,No more the thirsty entrance of this soil,Henry IV,1.0,1.1,no more the thirsty entrance of this soil


In [39]:
shakespeare_data.data.head(60)

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,PlayerLinenumber,ActScene,text
3,1.1.1,king henry iv,"So shaken as we are, so wan with care,",Henry IV,1.0,1.1,"so shaken as we are, so wan with care,"
4,1.1.2,king henry iv,"Find we a time for frighted peace to pant,",Henry IV,1.0,1.1,"find we a time for frighted peace to pant,"
5,1.1.3,king henry iv,And breathe short-winded accents of new broils,Henry IV,1.0,1.1,and breathe short-winded accents of new broils
6,1.1.4,king henry iv,To be commenced in strands afar remote.,Henry IV,1.0,1.1,to be commenced in strands afar remote.
7,1.1.5,king henry iv,No more the thirsty entrance of this soil,Henry IV,1.0,1.1,no more the thirsty entrance of this soil
8,1.1.6,king henry iv,Shall daub her lips with her own children's bl...,Henry IV,1.0,1.1,shall daub her lips with her own children's bl...
9,1.1.7,king henry iv,"Nor more shall trenching war channel her fields,",Henry IV,1.0,1.1,"nor more shall trenching war channel her fields,"
10,1.1.8,king henry iv,Nor bruise her flowerets with the armed hoofs,Henry IV,1.0,1.1,nor bruise her flowerets with the armed hoofs
11,1.1.9,king henry iv,"Of hostile paces: those opposed eyes,",Henry IV,1.0,1.1,"of hostile paces those opposed eyes,"
12,1.1.10,king henry iv,"Which, like the meteors of a troubled heaven,",Henry IV,1.0,1.1,"which, like the meteors of a troubled heaven,"


In [40]:
def split_long_lines(line, max_length=200):
    """Split ``line`` into chunks of at most ``max_length`` characters.

    Used to fit long speeches into the sequence length. Each cut is placed
    right after the last punctuation mark (one of ``. , ; : ! ? >``) found in
    the first ``max_length`` characters; when that window has no punctuation
    the cut is made at exactly ``max_length``. '>' is in the set because
    aggregated speeches are joined with ' > '.

    Args:
        line: text to split.
        max_length: maximum number of characters per chunk.

    Returns:
        List of chunks in order. A line that already fits is returned
        unchanged as ``[line]``; chunks produced by splitting are stripped,
        and chunks that would be empty after stripping are dropped.

    Raises:
        ValueError: if ``line`` needs splitting but ``max_length`` is < 1
            (this used to recurse until RecursionError).
    """
    if len(line) <= max_length:
        return [line]
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    remaining = line
    # Iterative on purpose: one recursive call per chunk could exhaust the
    # interpreter's recursion limit on very long inputs.
    while len(remaining) > max_length:
        # Searching the reversed window finds the LAST punctuation mark; its
        # offset from the window's end converts to "cut just after the mark".
        match_punctuation = re.search(r'[\.,;:!?>]\s*', remaining[:max_length][::-1])
        if match_punctuation:
            split_point = max_length - match_punctuation.start()
        else:
            # No punctuation in the window: fall back to a hard cut.
            split_point = max_length

        head = remaining[:split_point].strip()
        if head:
            chunks.append(head)
        remaining = remaining[split_point:].strip()

    # Skip a whitespace-only tail, but never return an empty list.
    if remaining or not chunks:
        chunks.append(remaining)
    return chunks
    
def remove_separator(lines):
    """Strip the '>' markers from every string in ``lines``.

    For each string: delete all '>' characters, trim surrounding whitespace,
    then replace double spaces with single ones (a single pass). Returns a new
    list in the same order.
    """
    cleaned = []
    for chunk in lines:
        without_marker = chunk.replace(">", "").strip()
        cleaned.append(without_marker.replace("  ", " "))
    return cleaned

In [41]:
# Aggregate character lines into a single line: rows sharing play, speaker,
# act.scene and PlayerLinenumber are joined with ' > ', which keeps the original
# line breaks visible to split_long_lines as candidate split points.
speeches = (
    shakespeare_data.data
    .groupby(['Play', 'Player', 'ActScene', 'PlayerLinenumber'], sort=False)['text']
    .agg(' > '.join)
    .reset_index()
)

# Split long single actor lines into sequence length-ish chunks, strip the ' > '
# markers from every chunk, then explode so that each chunk gets its own row.
# explode() repeats the parent row's index label for each of its chunks.
# (The former mid-cell `.iloc[0][:5]` peek was removed: only the last expression
# of a cell is displayed, so it had no effect.)
data_per_convo = (
    speeches
    .assign(text_processed=speeches['text'].apply(split_long_lines).apply(remove_separator))
    .explode('text_processed')
)

In [42]:
# Prefix every chunk with its speaker ("<player> : <text>"). The assignment must
# stay active: the line below reads this column and raises KeyError on a fresh
# kernel if it was never created.
data_per_convo['text_processed_with_player'] = data_per_convo['Player'] + " : " + data_per_convo['text_processed']
data_per_convo['text_processed_with_player'].head()

0    king henry iv : so shaken as we are, so wan wi...
0    king henry iv : no more the thirsty entrance o...
0    king henry iv : of hostile paces those opposed...
0    king henry iv : and furious close of civil but...
0    king henry iv : like an ill-sheathed knife, no...
Name: text_processed_with_player, dtype: object

In [43]:
all_plays = data_per_convo.Play.unique()

In [44]:
all_plays

array(['Henry IV', 'Henry VI Part 1', 'Henry VI Part 2',
       'Henry VI Part 3', 'Alls well that ends well', 'As you like it',
       'Antony and Cleopatra', 'A Comedy of Errors', 'Coriolanus',
       'Cymbeline', 'Hamlet', 'Henry VIII', 'King John', 'Julius Caesar',
       'King Lear', 'Loves Labours Lost', 'macbeth',
       'Measure for measure', 'Merchant of Venice',
       'Merry Wives of Windsor', 'A Midsummer nights dream',
       'Much Ado about nothing', 'Othello', 'Pericles', 'Richard II',
       'Richard III', 'Romeo and Juliet', 'Taming of the Shrew',
       'The Tempest', 'Timon of Athens', 'Titus Andronicus',
       'Troilus and Cressida', 'Twelfth Night', 'Two Gentlemen of Verona',
       'A Winters Tale'], dtype=object)

### Preparing train/valid/test Shakespeare data

In [54]:
# Build the src/trg pairs one play at a time so that no pair joins the last
# line of one play to the first line of the next.
# To feed speaker-prefixed text instead, select 'text_processed_with_player'.
processed_data = []
for play in all_plays:
    in_this_play = data_per_convo['Play'] == play
    text_col = data_per_convo.loc[in_this_play, 'text_processed']
    processed_data.extend(generate_src_trg_dataset(text_col))

In [56]:
text_col

28614    if you shall chance, camillo, to visit bohemia...
28615    i think, this coming summer, the king of sicil...
28616    wherein our entertainment shall shame us we wi...
28617                                         beseech you,
28618    verily, i speak it in the freedom of my knowle...
                               ...                        
29360    o, peace, paulina! thou shouldst a husband tak...
29360    for i saw her, as i thought, dead, and have in...
29360    and take her by the hand, whose worth and hone...
29360    that e'er i put between your holy looks my ill...
29360    lead us from hence, where we may leisurely eac...
Name: text_processed, Length: 1154, dtype: object

In [57]:
processed_data[:5]

[{'src': 'so shaken as we are, so wan with care, find we a time for frighted peace to pant, and breathe short-winded accents of new broils to be commenced in strands afar remote.',
  'trg': "no more the thirsty entrance of this soil shall daub her lips with her own children's blood, nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs"},
 {'src': "no more the thirsty entrance of this soil shall daub her lips with her own children's blood, nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs",
  'trg': 'of hostile paces those opposed eyes, which, like the meteors of a troubled heaven, all of one nature, of one substance bred, did lately meet in the intestine shock'},
 {'src': 'of hostile paces those opposed eyes, which, like the meteors of a troubled heaven, all of one nature, of one substance bred, did lately meet in the intestine shock',
  'trg': 'and furious close of civil butchery shall now, in mutua

In [58]:
random.shuffle(processed_data) # in-place shuffle

In [59]:
n = len(processed_data)
n

40664

In [60]:
train_cutoff = int(n*0.8)

In [61]:
# processed_data was shuffled in place above, so every slice below is a random
# subset: the first 80% (train_cutoff) is train, the final 10 pairs are the test
# set and everything in between is validation.
# NOTE(review): neighbouring pairs share a line (one pair's trg is the next
# pair's src in generate_src_trg_dataset), so after shuffling the same text can
# end up in more than one split - confirm this overlap is acceptable.
ss_train_data = processed_data[:train_cutoff]
ss_test_data = processed_data[-10:]
ss_val_data = processed_data[train_cutoff:-10]

In [62]:
len(ss_test_data), len(ss_train_data), len(ss_val_data)

(10, 32531, 8123)

In [63]:
ss_train_data[:5]

[{'src': 'this is the very false gallop of verses why do you infect yourself with them?',
  'trg': 'peace, you dull fool! i found them on a tree.'},
 {'src': 'and fair men call for grace. aaron will have his soul black like his face.',
  'trg': 'o, here i lift this one hand up to heaven, and bow this feeble ruin to the earth if any power pities wretched tears, to that i call! what, wilt thou kneel with me? do, then, dear heart,'},
 {'src': 'miranda. o my father, i have broke your hest to say so!',
  'trg': "admired miranda! indeed the top of admiration! worth what's dearest to the world! full many a lady i have eyed with best regard and many a time the harmony of their tongues hath into bondage"},
 {'src': 'i have, thou gallant trojan, seen thee oft labouring for destiny make cruel way through ranks of greekish youth, and i have seen thee, as hot as perseus, spur thy phrygian steed,',
  'trg': "despising many forfeits and subduements, when thou hast hung thy advanced sword i' the air, 

In [64]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [65]:
save_src_trg_dataset(ss_val_data, filename= 'valid', folder_dir= 'data/')

In [66]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')