In [3]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast
import json
import random
import re

In [4]:
random.seed(102)

# Creating tokenizer

In [38]:
# Read the raw corpus used to train the plays tokenizer.
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding (assumes the file is UTF-8 -- TODO confirm).
with open("./data/source/shakespeare_tokenize.txt", 'r', encoding='utf-8') as file:
    ss = file.read()

In [39]:
# Normalise the corpus: lowercase it, then apply the substitutions below in this
# exact order. Double spaces are collapsed both before and after ';', ':', '['
# and ']' are removed.
ss = str(ss).lower()
for old, new in (
    ('\n\n', '\n'),
    ('  ', ' '),
    (';', ''),
    (':', ''),
    ('[', ''),
    (']', ''),
    ('  ', ' '),
):
    ss = ss.replace(old, new)

In [42]:
# One entry per line of the normalised corpus, with surrounding whitespace removed.
ss_list = [line.strip() for line in ss.split('\n')]
ss_list

["a midsummer-night's dream now , fair hippolyta , our nuptial hour",
 'draws on apace four happy days bring in',
 'another moon but o ! methinks how slow',
 'this old moon wanes she lingers my desires ,',
 'like to a step dame , or a dowager',
 "long withering out a young man's revenue . four days will quickly steep themselves in night",
 'four nights will quickly dream away the time',
 'and then the moon , like to a silver bow',
 'new-bent in heaven , shall behold the night',
 'of our solemnities . go , philostrate ,',
 'stir up the athenian youth to merriments',
 'awake the pert and nimble spirit of mirth',
 'turn melancholy forth to funerals',
 "the pale companion is not for our pomp . hippolyta , i woo'd thee with my sword ,",
 'and won thy love doing thee injuries',
 'but i will wed thee in another key ,',
 'with pomp , with triumph , and with revelling .',
 "happy be theseus , our renowned duke ! thanks , good egeus what's the news with thee ? full of vexation come i , with comp

In [47]:
# Load the pretrained 'bert-base-uncased' tokenizer, then train a new tokenizer
# from it on the Shakespeare lines with a requested vocabulary size of 32000.
# The name `tokenizer` is rebound: after the second line it refers to the newly
# trained tokenizer, not the pretrained one.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = tokenizer.train_new_from_iterator(iter(ss_list),32000)
# Display the size of the learned vocabulary.
tokenizer.vocab_size






30267

In [51]:
tokenizer.tokenize("my tongue should catch your tongue's sweet melody ."), tokenizer.tokenize("demetrius , i'll avouch it to his head ,")

(['my',
  'tongue',
  'should',
  'catch',
  'your',
  'tongue',
  "'",
  's',
  'sweet',
  'melody',
  '.'],
 ['demetrius', ',', 'i', "'", 'll', 'avouch', 'it', 'to', 'his', 'head', ','])

In [48]:
tokenizer_save_path = 'shakespeare-tokenizer-bert/plays'
tokenizer.save_pretrained(tokenizer_save_path)

('shakespeare-tokenizer-bert/plays/tokenizer_config.json',
 'shakespeare-tokenizer-bert/plays/special_tokens_map.json',
 'shakespeare-tokenizer-bert/plays/vocab.txt',
 'shakespeare-tokenizer-bert/plays/added_tokens.json',
 'shakespeare-tokenizer-bert/plays/tokenizer.json')

# Exploratory

In [7]:
df = pd.read_csv('data/source/shakespeare.csv')

In [8]:
df.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [9]:
# df['Play'].str.lower().unique()

In [10]:
# df['Player'].str.lower().unique()

In [11]:
# French-speaking characters: Alencon, Alice, King of France, Katharine, Dauphin

In [12]:
# df[df['Player'].str.lower() == 'katharine']
# Option: drop Henry V entirely, since that is where most of the French lines are found.

In [13]:
# Keep only rows that have a speaker (non-null Player) and whose line is not
# exactly 'Exeunt'. Despite its name, `remove` holds the rows that are KEPT.
remove = df[(df['Player'].notna()) & (df["PlayerLine"]!='Exeunt')]
# Narrow the kept rows to the four columns inspected below.
data = remove[['Play', 'PlayerLinenumber', "Player", "PlayerLine"]]
data.head(10)

Unnamed: 0,Play,PlayerLinenumber,Player,PlayerLine
3,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils
6,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.
7,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil
8,Henry IV,1.0,KING HENRY IV,Shall daub her lips with her own children's bl...
9,Henry IV,1.0,KING HENRY IV,"Nor more shall trenching war channel her fields,"
10,Henry IV,1.0,KING HENRY IV,Nor bruise her flowerets with the armed hoofs
11,Henry IV,1.0,KING HENRY IV,"Of hostile paces: those opposed eyes,"
12,Henry IV,1.0,KING HENRY IV,"Which, like the meteors of a troubled heaven,"


# Preprocessing Shakespeare

In [91]:
class ShakespeareData:
    """Load the Shakespeare lines CSV and expose a cleaned frame as ``self.data``.

    The cleaned frame keeps spoken lines only (see ``clean``) and carries two
    derived columns: ``ActScene`` (act and scene taken from ``ActSceneLine``)
    and ``text`` (the stripped, lowercased, regex-normalised ``PlayerLine``).
    """

    # Ordered regex -> replacement table handed to ``Series.replace(regex=True)``.
    # Patterns containing a backslash are raw strings so the backslash reaches
    # the regex engine untouched and Python emits no invalid-escape warning.
    _REPLACEMENTS = {
        r'@\w*': ' ',
        '&amp;': 'and',
        r'&#\w*;': ' ',
        r'\n\.': ' ',
        r'\n': ' ',
        r'\.{2,}': '.',
        '!{2,}': '!',
        r'\?{2,}': '?',
        '_': ' ',
        ' +': ' ',
        r'\-{2,}': ' ',
        ':': '',
        ';': '',
        r'\[': '',
        r'\]': '',
        '  ': ' ',
    }

    def __init__(self, filepath='data/source/shakespeare.csv'):
        """Read the CSV at ``filepath`` and store its cleaned form in ``self.data``."""
        data = pd.read_csv(filepath)
        self.data = self.clean(data)

    def clean(self, data):
        """Return a filtered copy of ``data`` with ``ActScene`` and ``text`` columns added.

        Rows are kept only when ``Player`` and ``ActSceneLine`` are present,
        ``PlayerLine`` is not exactly ``'Exeunt'`` and ``Play`` is not
        ``'Henry V'``. The input frame is never modified.

        Args:
            data: DataFrame with at least the columns ``ActSceneLine``,
                ``Player``, ``PlayerLine``, ``Play`` and ``PlayerLinenumber``.

        Returns:
            A new DataFrame with those five columns plus ``ActScene`` and ``text``.
            A missing ``PlayerLine`` yields a missing ``text`` instead of an error.
        """
        keep = (
            data['Player'].notna()
            & (data["PlayerLine"] != 'Exeunt')
            & (data["Play"] != 'Henry V')
            & data['ActSceneLine'].notna()
        )
        columns = ['ActSceneLine', "Player", "PlayerLine", "Play", "PlayerLinenumber"]
        # Explicit copy: the new column is added to an independent frame rather
        # than to a slice of the caller's frame (no SettingWithCopyWarning).
        selected = data.loc[keep, columns].copy()
        selected['ActScene'] = selected['ActSceneLine'].apply(extract_before_second_full_stop)
        # Vectorised string ops skip missing values instead of raising on them.
        cleaned = (
            selected['PlayerLine']
            .str.strip()
            .str.lower()
            .replace(self._REPLACEMENTS, regex=True)
        )
        return selected.assign(text=cleaned)


def extract_before_second_full_stop(string):
    """Return ``string`` up to, but not including, its second full stop.

    A string with fewer than two full stops is returned unchanged,
    e.g. ``'1.1.12' -> '1.1'`` and ``'1.1' -> '1.1'``.
    """
    # Splitting at most twice leaves everything after the second '.' in a third
    # part; re-joining the first two parts restores the text in front of it.
    return '.'.join(string.split('.', 2)[:2])

In [92]:
def generate_src_trg_dataset(text_col):
    """Pair every entry of ``text_col`` with the entry that follows it.

    Args:
        text_col: Ordered texts. A pandas Series is consumed positionally (its
            index labels are ignored); plain lists, tuples and other iterables
            are accepted as well.

    Returns:
        A list of ``{"src": text[i], "trg": text[i + 1]}`` dicts, one per
        adjacent pair. Inputs with fewer than two entries give an empty list.
    """
    # Materialise once: avoids a positional pandas lookup per element and lets
    # non-pandas iterables work too.
    texts = list(text_col)
    return [{"src": src, "trg": trg} for src, trg in zip(texts, texts[1:])]

def save_src_trg_dataset(data, filename = 'data', folder_dir = ''):
    """Write ``data`` as JSON Lines to ``<folder_dir>/<filename>.jsonl``.

    Args:
        data: Iterable of JSON-serialisable entries; each becomes one line.
        filename: File name without the ``.jsonl`` extension.
        folder_dir: Target directory, with or without a trailing separator.
            It is created if missing. An empty string means the current
            working directory.

    Raises:
        OSError: If the directory cannot be created or the file cannot be opened.
        TypeError: If an entry is not JSON-serialisable.
    """
    if folder_dir:
        # Saving into a directory that does not exist yet should not fail.
        os.makedirs(folder_dir, exist_ok=True)
    # os.path.join inserts the separator only when folder_dir lacks one, so
    # 'data' and 'data/' both resolve to 'data/<filename>.jsonl'.
    fn = os.path.join(folder_dir, filename + '.jsonl')
    with open(fn, 'w', encoding='utf-8') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def generate_and_save_dataset(data_text_column, filename= 'data', folder_dir= ''):
    """Build src/trg pairs from ``data_text_column`` and write them as JSON Lines.

    Thin convenience wrapper: the pairs come from ``generate_src_trg_dataset``
    and are passed straight to ``save_src_trg_dataset`` with the given
    ``filename`` and ``folder_dir``.
    """
    save_src_trg_dataset(
        generate_src_trg_dataset(data_text_column),
        filename=filename,
        folder_dir=folder_dir,
    )

In [93]:
# with open('vocab_list.pickle', 'wb') as handle:
#     pickle.dump(vocab.idx_word, handle)

In [94]:
shakespeare_data = ShakespeareData()

In [95]:
shakespeare_data.data.head() # 107459 -> # 101919 after removing na in actsceneline

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,PlayerLinenumber,ActScene,text
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",Henry IV,1.0,1.1,"so shaken as we are, so wan with care,"
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",Henry IV,1.0,1.1,"find we a time for frighted peace to pant,"
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,Henry IV,1.0,1.1,and breathe short-winded accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,Henry IV,1.0,1.1,to be commenced in strands afar remote.
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,Henry IV,1.0,1.1,no more the thirsty entrance of this soil


In [61]:
def split_long_lines(line, max_length=100):
    """Split ``line`` into pieces of at most ``max_length`` characters.

    Each cut is made just after the last punctuation mark (one of ``.,;:!?>``)
    found in the first ``max_length`` characters of the remaining text; when
    that window has no punctuation the cut falls at exactly ``max_length``.
    Every piece produced by a cut is whitespace-stripped. A line that already
    fits is returned unchanged as a single-element list.

    The loop is iterative, so very long inputs cannot exceed Python's
    recursion limit.

    Args:
        line: Text to split.
        max_length: Maximum piece length; must be at least 1 whenever a split
            is actually needed.

    Returns:
        List of pieces in their original order.

    Raises:
        ValueError: If ``line`` needs splitting but ``max_length`` is below 1
            (no progress could ever be made).
    """
    if len(line) <= max_length:
        return [line]
    if max_length < 1:
        raise ValueError("max_length must be at least 1 to split a line")

    pieces = []
    remaining = line
    while len(remaining) > max_length:
        window = remaining[:max_length]
        # Position of the right-most punctuation mark in the window (-1 if none).
        last_punct = max(window.rfind(mark) for mark in '.,;:!?>')
        # Cut right after that mark so it stays with the text it terminates.
        split_point = last_punct + 1 if last_punct != -1 else max_length
        pieces.append(remaining[:split_point].strip())
        remaining = remaining[split_point:].strip()
    pieces.append(remaining)
    return pieces
    
def remove_separator(lines):
    """Return ``lines`` with every '>' marker removed.

    For each entry: delete all '>' characters, strip surrounding whitespace,
    then replace each non-overlapping double space with a single space (one pass).
    """
    cleaned = []
    for piece in lines:
        without_marker = piece.replace(">", "").strip()
        cleaned.append(without_marker.replace("  ", " "))
    return cleaned

In [62]:
# Merge the consecutive lines of one speech (same play, speaker, act/scene and
# speech number) into a single string, with ' > ' marking the original line breaks.
# sort=False keeps the groups in order of first appearance.
data_per_convo = (
    shakespeare_data.data
    .groupby(['Play', 'Player', 'ActScene', 'PlayerLinenumber'], sort=False)['text']
    .agg(' > '.join)
    .reset_index()
)

In [63]:
data_per_convo.head()

Unnamed: 0,Play,Player,ActScene,PlayerLinenumber,text
0,Henry IV,KING HENRY IV,1.1,1.0,"so shaken as we are, so wan with care, > find ..."
1,Henry IV,WESTMORELAND,1.1,2.0,"my liege, this haste was hot in question, > an..."
2,Henry IV,KING HENRY IV,1.1,3.0,it seems then that the tidings of this broil >...
3,Henry IV,WESTMORELAND,1.1,4.0,"this match'd with other did, my gracious lord,..."
4,Henry IV,KING HENRY IV,1.1,5.0,"here is a dear, a true industrious friend, > s..."


In [64]:
data_per_convo['text_processed']=data_per_convo['text'].apply(split_long_lines)

In [65]:
data_per_convo.text_processed.iloc[0]

['so shaken as we are, so wan with care, > find we a time for frighted peace to pant, >',
 'and breathe short-winded accents of new broils > to be commenced in strands afar remote. >',
 "no more the thirsty entrance of this soil > shall daub her lips with her own children's blood, >",
 'nor more shall trenching war channel her fields, > nor bruise her flowerets with the armed hoofs >',
 'of hostile paces those opposed eyes, > which, like the meteors of a troubled heaven, >',
 'all of one nature, of one substance bred, > did lately meet in the intestine shock >',
 'and furious close of civil butchery > shall now, in mutual well-beseeming ranks, >',
 'march all one way and be no more opposed > against acquaintance, kindred and allies >',
 'the edge of war, like an ill-sheathed knife, > no more shall cut his master. therefore, friends, >',
 'as far as to the sepulchre of christ, > whose soldier now, under whose blessed cross >',
 'we are impressed and engaged to fight, > forthwith a power

In [66]:
data_per_convo['text_processed']=data_per_convo['text_processed'].apply(remove_separator)

In [67]:
data_per_convo.text_processed.iloc[0]

['so shaken as we are, so wan with care, find we a time for frighted peace to pant,',
 'and breathe short-winded accents of new broils to be commenced in strands afar remote.',
 "no more the thirsty entrance of this soil shall daub her lips with her own children's blood,",
 'nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs',
 'of hostile paces those opposed eyes, which, like the meteors of a troubled heaven,',
 'all of one nature, of one substance bred, did lately meet in the intestine shock',
 'and furious close of civil butchery shall now, in mutual well-beseeming ranks,',
 'march all one way and be no more opposed against acquaintance, kindred and allies',
 'the edge of war, like an ill-sheathed knife, no more shall cut his master. therefore, friends,',
 'as far as to the sepulchre of christ, whose soldier now, under whose blessed cross',
 'we are impressed and engaged to fight, forthwith a power of english shall we levy,',
 "whose arms 

In [68]:
data_per_convo['text'].iloc[21712], data_per_convo['text_processed'].iloc[21712]

('so many miseries have crazed my voice, > that my woe-wearied tongue is mute and dumb, > edward plantagenet, why art thou dead?',
 ['so many miseries have crazed my voice, that my woe-wearied tongue is mute and dumb,',
  'edward plantagenet, why art thou dead?'])

In [69]:
data_per_convo[(data_per_convo['Play'] == 'A Comedy of Errors') & (data_per_convo['Player'] == 'ADRIANA') & (data_per_convo['PlayerLinenumber'] == 1.0)].head()

Unnamed: 0,Play,Player,ActScene,PlayerLinenumber,text,text_processed
5992,A Comedy of Errors,ADRIANA,2.1,1.0,"neither my husband nor the slave return'd, > t...","[neither my husband nor the slave return'd, th..."
6278,A Comedy of Errors,ADRIANA,4.2,1.0,"ah, luciana, did he tempt thee so? > mightst t...","[ah, luciana, did he tempt thee so? mightst th..."


In [70]:
data_per_convo = data_per_convo.explode('text_processed')

In [71]:
data_per_convo

Unnamed: 0,Play,Player,ActScene,PlayerLinenumber,text,text_processed
0,Henry IV,KING HENRY IV,1.1,1.0,"so shaken as we are, so wan with care, > find ...","so shaken as we are, so wan with care, find we..."
0,Henry IV,KING HENRY IV,1.1,1.0,"so shaken as we are, so wan with care, > find ...",and breathe short-winded accents of new broils...
0,Henry IV,KING HENRY IV,1.1,1.0,"so shaken as we are, so wan with care, > find ...",no more the thirsty entrance of this soil shal...
0,Henry IV,KING HENRY IV,1.1,1.0,"so shaken as we are, so wan with care, > find ...",nor more shall trenching war channel her field...
0,Henry IV,KING HENRY IV,1.1,1.0,"so shaken as we are, so wan with care, > find ...","of hostile paces those opposed eyes, which, li..."
...,...,...,...,...,...,...
29360,A Winters Tale,LEONTES,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...","look upon my brother both your pardons, that e..."
29360,A Winters Tale,LEONTES,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...","my ill suspicion. this is your son-in-law, and..."
29360,A Winters Tale,LEONTES,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...",is troth-plight to your daughter. good paulina...
29360,A Winters Tale,LEONTES,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...",each one demand an answer to his part perform'...


In [72]:
all_plays = data_per_convo.Play.unique()

In [73]:
all_plays

array(['Henry IV', 'Henry VI Part 1', 'Henry VI Part 2',
       'Henry VI Part 3', 'Alls well that ends well', 'As you like it',
       'Antony and Cleopatra', 'A Comedy of Errors', 'Coriolanus',
       'Cymbeline', 'Hamlet', 'Henry VIII', 'King John', 'Julius Caesar',
       'King Lear', 'Loves Labours Lost', 'macbeth',
       'Measure for measure', 'Merchant of Venice',
       'Merry Wives of Windsor', 'A Midsummer nights dream',
       'Much Ado about nothing', 'Othello', 'Pericles', 'Richard II',
       'Richard III', 'Romeo and Juliet', 'Taming of the Shrew',
       'The Tempest', 'Timon of Athens', 'Titus Andronicus',
       'Troilus and Cressida', 'Twelfth Night', 'Two Gentlemen of Verona',
       'A Winters Tale'], dtype=object)

### Preparing train/test Shakespeare data

In [74]:
# Build the src/trg pairs one play at a time, so a pair never joins the last
# line of one play to the first line of the next.
processed_data = []
for play in all_plays:
    play_mask = data_per_convo['Play'] == play
    text_col = data_per_convo.loc[play_mask, 'text_processed']
    processed_data.extend(generate_src_trg_dataset(text_col))

In [75]:
processed_data[:5]

[{'src': 'so shaken as we are, so wan with care, find we a time for frighted peace to pant,',
  'trg': 'and breathe short-winded accents of new broils to be commenced in strands afar remote.'},
 {'src': 'and breathe short-winded accents of new broils to be commenced in strands afar remote.',
  'trg': "no more the thirsty entrance of this soil shall daub her lips with her own children's blood,"},
 {'src': "no more the thirsty entrance of this soil shall daub her lips with her own children's blood,",
  'trg': 'nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs'},
 {'src': 'nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs',
  'trg': 'of hostile paces those opposed eyes, which, like the meteors of a troubled heaven,'},
 {'src': 'of hostile paces those opposed eyes, which, like the meteors of a troubled heaven,',
  'trg': 'all of one nature, of one substance bred, did lately meet in the intestine shock

In [76]:
random.shuffle(processed_data) # in-place shuffle

In [77]:
n = len(processed_data)
n

60784

In [79]:
train_cutoff = int(n*0.8)

In [80]:
# Cut the list into three disjoint, contiguous slices: everything before
# train_cutoff for training, the final 10 pairs for test, and the pairs in
# between for validation.
# NOTE(review): processed_data is shuffled in place before this cell, so the
# slices follow the shuffled order, not the order of the plays -- the earlier
# "want to preserve order" remark looks stale; confirm the intent.
# NOTE(review): consecutive pairs share a line (the trg of one pair is the src
# of the next), so a shuffled split presumably lets test/validation lines also
# appear in training -- confirm whether that leakage is acceptable.
ss_train_data = processed_data[:train_cutoff]
ss_test_data = processed_data[-10:]
ss_val_data = processed_data[train_cutoff:-10]

In [86]:
len(ss_test_data), len(ss_train_data), len(ss_val_data)

(10, 48627, 12147)

In [82]:
ss_train_data[:5]

[{'src': 'o hell! what have we here? a carrion death, within whose empty eye there is a written scroll!',
  'trg': "i'll read the writing. all that glitters is not gold, often have you heard that told"},
 {'src': 'and his disciples only envy at, ye blew the fire that burns ye now have at ye! enter king,',
  'trg': 'frowning on them, takes his seat'},
 {'src': 'mistress page would desire you to send her your little page, of all loves her husband has a',
  'trg': 'marvellous infection to the little page, and truly master page is an honest man. never a wife in'},
 {'src': "he bears him on the place's privilege, or durst not, for his craven heart, say thus.",
  'trg': "by him that made me, i'll maintain my words on any plot of ground in christendom."},
 {'src': 'loather a hundred times to part than die. yet now farewell, and farewell life with thee!',
  'trg': 'thus is poor suffolk ten times banished, once by the king, and three times thrice by thee.'}]

In [85]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [83]:
save_src_trg_dataset(ss_val_data, filename= 'valid', folder_dir= 'data/')

In [84]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

## Creating CC tokenizer

In [23]:
comb_train_data_dir='data/commonsense/train.jsonl'
comb_test_data_dir='data/commonsense/test.jsonl'

# Read both JSON Lines files into one list: all train records first, then all
# test records, one parsed object per line.
comb_data = []
for jsonl_path in (comb_train_data_dir, comb_test_data_dir):
    with open(jsonl_path, 'r') as f_reader:
        comb_data.extend(json.loads(row) for row in f_reader)

In [25]:
comb_data[0], len(comb_data)

({'src': 'jesus , what kind of concerts do you go to where people sucker punch you for being born tall ?',
  'trg': 'the kind that allow bitter short people in . so basically all of them .'},
 3392137)

In [32]:
# One string per record: its source and target joined by a single space.
merged_lines = [record['src'] + " " + record['trg'] for record in comb_data]

In [33]:
# Load the pretrained 'bert-base-uncased' tokenizer as the template to retrain.
# This rebinds the global name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Bert will be the base tokenizer
# merged_lines is already a list; iter() only wraps it in a one-shot iterator.
lines_iter = iter(merged_lines) # Most likely not needed, already in list

# Creates new tokenizer with our vocabulary set
# (trained on the merged src+trg lines; requested vocabulary size is 50000).
comb_tokenizer = tokenizer.train_new_from_iterator(lines_iter, 50000)






In [34]:
comb_tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=49959, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [35]:
comb_tokenizer.save_pretrained("commonsense-tokenizer-bert")

('commonsense-tokenizer-bert/tokenizer_config.json',
 'commonsense-tokenizer-bert/special_tokens_map.json',
 'commonsense-tokenizer-bert/vocab.txt',
 'commonsense-tokenizer-bert/added_tokens.json',
 'commonsense-tokenizer-bert/tokenizer.json')

# Preparing combined dataset (Commonsense Dialogue & Shakespeare)

In [54]:
cc_train_data_dir='data/commonsense/train.jsonl'
ss_train_data_dir='data/shakespeare/train.jsonl'
cc_test_data_dir='data/commonsense/test.jsonl'
ss_test_data_dir='data/shakespeare/test.jsonl'

In [55]:
# Commonsense training pairs: one parsed JSON object per line of the file.
cc_train_data = []
with open(cc_train_data_dir, 'r') as f_reader:
    cc_train_data.extend(map(json.loads, f_reader))

In [56]:
# Commonsense test pairs: one parsed JSON object per line of the file.
cc_test_data = []
with open(cc_test_data_dir, 'r') as f_reader:
    cc_test_data.extend(map(json.loads, f_reader))

In [48]:
# Shakespeare training pairs: one parsed JSON object per line of the file.
# This rebinds ss_train_data with whatever is stored at ss_train_data_dir.
ss_train_data = []
with open(ss_train_data_dir, 'r') as f_reader:
    ss_train_data.extend(map(json.loads, f_reader))

In [62]:
# Shakespeare test pairs: one parsed JSON object per line of the file.
# This rebinds ss_test_data with whatever is stored at ss_test_data_dir.
ss_test_data = []
with open(ss_test_data_dir, 'r') as f_reader:
    ss_test_data.extend(map(json.loads, f_reader))

In [63]:
len(cc_train_data), len(ss_train_data), len(cc_test_data), len(ss_test_data)

(3382137, 99754, 10000, 11084)

In [50]:
random.shuffle(cc_train_data)

In [61]:
# Training mix: the first 1,000,000 Commonsense pairs (cc_train_data is shuffled
# in place before this cell, so this is a random subset) followed by every
# Shakespeare training pair.
combined_train_data = cc_train_data[:1_000_000] + ss_train_data
# Test mix: the last 10 Commonsense test pairs followed by every Shakespeare test pair.
combined_test_data = cc_test_data[-10:] + ss_test_data
# NOTE(review): the sizes recorded in this notebook's outputs (1110818 train, 30
# test) do not match these expressions applied to the list lengths printed
# earlier (99754 / 11084 Shakespeare pairs), and the execution counts are out
# of order -- the outputs are probably stale; re-run from a fresh kernel.

In [62]:
random.shuffle(combined_train_data)
random.shuffle(combined_test_data)
len(combined_train_data), len(combined_test_data)

(1110818, 30)

In [63]:
save_src_trg_dataset(combined_test_data, filename= 'test', folder_dir= 'data/combined/')

In [64]:
save_src_trg_dataset(combined_train_data, filename= 'train', folder_dir= 'data/combined/')

# Preparing small combined

In [12]:
comb_train_data_dir='data/combined/train.jsonl'
comb_test_data_dir='data/combined/test.jsonl'

# Reload the saved combined splits, one parsed JSON object per line.
comb_train_data = []
comb_test_data = []

with open(comb_train_data_dir, 'r') as f_reader:
    comb_train_data.extend(map(json.loads, f_reader))

with open(comb_test_data_dir, 'r') as f_reader:
    comb_test_data.extend(map(json.loads, f_reader))

In [13]:
combined_train_data=comb_train_data[:100]
combined_test_data=comb_test_data[:10]

In [14]:
save_src_trg_dataset(combined_test_data, filename= 'test', folder_dir= 'data/combined/small/')
save_src_trg_dataset(combined_train_data, filename= 'train', folder_dir= 'data/combined/small/')

# Introducing Sonnets

In [57]:
# Read the whole sonnets file and lowercase it.
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding (assumes the file is UTF-8 -- TODO confirm).
# Note: this rebinds `data`, which the exploratory section used for a DataFrame.
with open('data/source/sonnets.txt', 'r', encoding='utf-8') as f:
    data = f.read().lower()

In [60]:
sonnets = data.split('\n\n')
sonnets[:4]

['i',
 "from fairest creatures we desire increase,\nthat thereby beauty's rose might never die,\nbut as the riper should by time decease,\nhis tender heir might bear his memory:\nbut thou contracted to thine own bright eyes,\nfeed'st thy light's flame with self-substantial fuel,\nmaking a famine where abundance lies,\nthy self thy foe, to thy sweet self too cruel:\nthou that art now the world's fresh ornament,\nand only herald to the gaudy spring,\nwithin thine own bud buriest thy content,\nand tender churl mak'st waste in niggarding:\n  pity the world, or else this glutton be,\n  to eat the world's due, by the grave and thee.",
 'ii',
 "when forty winters shall besiege thy brow,\nand dig deep trenches in thy beauty's field,\nthy youth's proud livery so gazed on now,\nwill be a tatter'd weed of small worth held: \nthen being asked, where all thy beauty lies,\nwhere all the treasure of thy lusty days; \nto say, within thine own deep sunken eyes,\nwere an all-eating shame, and thriftless

In [79]:
# Sort the blank-line-separated chunks by length: chunks of at most 10
# characters go to sonnets_num (the numeral headings such as 'i', 'ii'),
# longer chunks go to sonnets_text (the sonnet bodies).
sonnets_num = []
sonnets_text = []
for chunk in sonnets:
    bucket = sonnets_num if len(chunk) <= 10 else sonnets_text
    bucket.append(chunk)

In [85]:
sonnets_cleaned = []
for s in sonnets_text:
    # A Shakespearean sonnet is three quatrains (4 lines each) plus a closing
    # couplet, so the lines are grouped as 4 / 4 / 4 / everything that is left.
    # Every "'d" is expanded to "ed" (tatter'd -> tattered); note this also
    # rewrites contractions such as "i'd" into "ied".
    curr = [i.strip().replace("'d", "ed") for i in s.split('\n')]
    stanzas = (curr[:4], curr[4:8], curr[8:12], curr[12:])
    sonnets_cleaned.append([' '.join(stanza) for stanza in stanzas])

In [106]:
sonnets_cleaned[:2]

[["from fairest creatures we desire increase, that thereby beauty's rose might never die, but as the riper should by time decease, his tender heir might bear his memory:",
  "but thou contracted to thine own bright eyes, feed'st thy light's flame with self-substantial fuel, making a famine where abundance lies, thy self thy foe, to thy sweet self too cruel:",
  "thou that art now the world's fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and tender churl mak'st waste in niggarding:",
  "pity the world, or else this glutton be, to eat the world's due, by the grave and thee."],
 ["when forty winters shall besiege thy brow, and dig deep trenches in thy beauty's field, thy youth's proud livery so gazed on now, will be a tattered weed of small worth held:",
  'then being asked, where all thy beauty lies, where all the treasure of thy lusty days; to say, within thine own deep sunken eyes, were an all-eating shame, and thriftless praise.',
  "ho

In [120]:
# Three pairs per sonnet: each of the first three stanzas is the source for the
# stanza that follows it.
sonnets_src_trg_data = [
    {"src": s[i].strip(), "trg": s[i+1].strip()}
    for s in sonnets_cleaned
    for i in range(3)
]

In [122]:
sonnets_src_trg_data[:3]

[{'src': "from fairest creatures we desire increase, that thereby beauty's rose might never die, but as the riper should by time decease, his tender heir might bear his memory:",
  'trg': "but thou contracted to thine own bright eyes, feed'st thy light's flame with self-substantial fuel, making a famine where abundance lies, thy self thy foe, to thy sweet self too cruel:"},
 {'src': "but thou contracted to thine own bright eyes, feed'st thy light's flame with self-substantial fuel, making a famine where abundance lies, thy self thy foe, to thy sweet self too cruel:",
  'trg': "thou that art now the world's fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and tender churl mak'st waste in niggarding:"},
 {'src': "thou that art now the world's fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and tender churl mak'st waste in niggarding:",
  'trg': "pity the world, or else this glutton be, to eat the

In [123]:
len(sonnets_src_trg_data)

462

## Combining Sonnets with Plays

In [138]:
# Rebuild the play pairs, this time from the cleaned single lines
# (shakespeare_data.data['text']) rather than from the merged and re-split
# speeches. Pairs are still generated play by play.
processed_data = []
for play in all_plays:
    play_mask = shakespeare_data.data['Play'] == play
    text_col = shakespeare_data.data.loc[play_mask, 'text']
    processed_data.extend(generate_src_trg_dataset(text_col))

In [139]:
processed_data += sonnets_src_trg_data

In [143]:
random.shuffle(processed_data) # in-place shuffle

In [144]:
n = len(processed_data)
n

107746

In [145]:
# Alternative kept for reference: a proportional 90/10 split.
# cutoff = int(n*0.9)
# A negative cutoff counts from the end: the last 20 pairs of the shuffled list
# become the test set and everything before them is training data.
cutoff = -20
ss_train_data, ss_test_data = processed_data[:cutoff], processed_data[cutoff:]

In [146]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [147]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

## Creating Sonnets tokenizer

In [136]:
# Train a tokenizer on the flat array of sonnet stanzas.
# NOTE(review): `create_tokenizer` is not defined or imported in any visible cell
# of this notebook, so this line raises NameError on a fresh kernel -- presumably
# it lived in a since-deleted cell; restore its definition before re-running.
# The second .flatten() is a no-op: the first call already returns a 1-D array.
sonnets_tokenizer = create_tokenizer(np.array(sonnets_cleaned).flatten().flatten())






In [137]:
tokenizer_save_path = 'shakespeare-tokenizer-bert/sonnets'
sonnets_tokenizer.save_pretrained(tokenizer_save_path)

('shakespeare-tokenizer-bert/sonnets/tokenizer_config.json',
 'shakespeare-tokenizer-bert/sonnets/special_tokens_map.json',
 'shakespeare-tokenizer-bert/sonnets/vocab.txt',
 'shakespeare-tokenizer-bert/sonnets/added_tokens.json',
 'shakespeare-tokenizer-bert/sonnets/tokenizer.json')