In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast
import json
import random

In [2]:
random.seed(102)

# Exploratory

In [3]:
df = pd.read_csv('data/source/shakespeare.csv')

In [4]:
df.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [5]:
# df['Play'].str.lower().unique()

In [6]:
# df['Player'].str.lower().unique()

In [7]:
# french characters: alencon, alice, king of france, katharine, dauphin
# french speaking characters

In [8]:
# df[df['Player'].str.lower() == 'katharine']
# maybe could remove the whole of Henry V since that is where most french lines are found

In [9]:
df[df['Play'] == 'Henry IV'].groupby(['Play', 'PlayerLinenumber']).agg(PlayerLine=('PlayerLine', np.sum)).reset_index()

Unnamed: 0,Play,PlayerLinenumber,PlayerLine
0,Henry IV,1.0,"So shaken as we are, so wan with care,Find we ..."
1,Henry IV,2.0,"My liege, this haste was hot in question,And m..."
2,Henry IV,3.0,It seems then that the tidings of this broilBr...
3,Henry IV,4.0,"This match'd with other did, my gracious lord,..."
4,Henry IV,5.0,"Here is a dear, a true industrious friend,Sir ..."
...,...,...,...
193,Henry IV,194.0,"Nothing but papers, my lord."
194,Henry IV,195.0,Let's see what they be: read them.
195,Henry IV,196.0,"[Reads] Item, A capon,. . 2s. 2d.Item, Sauce,..."
196,Henry IV,197.0,O monstrous! but one half-penny-worth of bread...


In [10]:
remove = df[(df['Player'].notna()) & (df["PlayerLine"]!='Exeunt')]
data = remove[['ActSceneLine', "Player", "PlayerLine"]]
data.head()

Unnamed: 0,ActSceneLine,Player,PlayerLine
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil


# Preprocessing Shakespeare

In [11]:
class ShakespeareData:
    def __init__(self, filepath='data/source/shakespeare.csv'):        
        data = pd.read_csv(filepath)
        self.data = self.clean(data) 
    
    def clean(self, data):
        repl = {
            '@\w*': ' ', 
            '&amp;' : 'and',
            '\su\s':' you ', 
            '&#\w*;': ' ', 
            '#':' ', 
            '\s2\s': 'two', 
            "ð[^ ]*": ' ' ,
            "â[^ ]*": ' ',
            "(dont)|(don't)": 'do not', 
            "(cant)|(can't)": "can not",
            "(yous)|(you's)": "you is", 
            "(yous)|(you's)": "you is", 
            "(youve)|(you've)": "you have", 
            "(doesnt)|(doesn't)": 'does not', 
            "(wont)|(won't)": 'will not', 
            "honour": 'honor',
            "durst": 'dare',
            "wast": 'was',
            "curst": 'cursed',
            "blest": 'blessed',
            "crost": 'crossed',
            "accurst": 'accursed',
            "o'ver": 'over',
            "\'tis'": 'this',
            "[0-9]+\.*[0-9%]+\w*" : "NUMBER",
            '\\n\.':' ' ,
            '\\n':' ',
            "\.{2,}": '.', 
            "!{2,}":'!', 
            "\?{2,}":'?', 
#             'ing[^a-z]':' ', 
#             'ed[^a-z]': ' ', 
            '_':" ",
            ' +': ' ', 
            '\-{2,}': ' ', 
            '\-': '',
            '\:': '',
            "\'d": "ed"}

        data = data[(data['Player'].notna()) & (data["PlayerLine"]!='Exeunt') & (data["Play"]!='Henry V')]
        data = data[['ActSceneLine', "Player", "PlayerLine", "Play"]]
        cleaned = data['PlayerLine'].apply(lambda x: x.strip().lower())
        cleaned = cleaned.replace(repl, regex=True)
        cleaned_data = data.assign(text = cleaned)
        return cleaned_data

In [12]:
def create_tokenizer(data_col, tokenizer_config='bert-base-uncased', max_tokens=50000):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_config) # Bert will be the base tokenizer
    lines_iter = iter(data_col) # Most likely not needed, already in list

    # Creates new tokenizer with our vocabulary set
    return tokenizer.train_new_from_iterator(lines_iter, max_tokens)

In [13]:
def generate_src_trg_dataset(text_col):
    data = []
    
#     for i in range(len(text_col)-1):
#         dictionary = {}
#         dictionary["src"] = text_col.iloc[i]
#         dictionary["trg"] = text_col.iloc[i+1]
#         data.append(dictionary)

    for i in range(len(text_col)-5):
        dictionary = {}
        dictionary["src"] = (text_col.iloc[i] + " " + text_col.iloc[i+1] + " " + text_col.iloc[i+2]).strip()
        dictionary["trg"] = (text_col.iloc[i+3] + " " + text_col.iloc[i+4] + " " + text_col.iloc[i+5]).strip()
        data.append(dictionary)
    
    return data

def save_src_trg_dataset(data, filename = 'data', folder_dir = ''):
    fn = folder_dir + filename + '.jsonl'
    with open(fn, 'w') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def generate_and_save_dataset(data_text_column, filename= 'data', folder_dir= ''):
    data = generate_src_trg_dataset(data_text_column)
    save_src_trg_dataset(data, filename=filename, folder_dir=folder_dir)

In [14]:
# with open('vocab_list.pickle', 'wb') as handle:
#     pickle.dump(vocab.idx_word, handle)

In [15]:
shakespeare_data = ShakespeareData()

In [16]:
shakespeare_data.data.head()

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,text
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",Henry IV,"so shaken as we are, so wan with care,"
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",Henry IV,"find we a time for frighted peace to pant,"
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,Henry IV,and breathe shortwinded accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,Henry IV,to be commenced in strands afar remote.
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,Henry IV,no more the thirsty entrance of this soil


In [19]:
all_plays = shakespeare_data.data.Play.unique()

### Preparing train/test Shakespeare data

In [20]:
processed_data = []
for play in all_plays:
    text_col = shakespeare_data.data[shakespeare_data.data['Play'] == play]['text']
    processed_data += generate_src_trg_dataset(text_col)

In [21]:
random.shuffle(processed_data) # in-place shuffle

In [22]:
n = len(processed_data)
n

107284

In [23]:
# cutoff = int(n*0.9)
cutoff = -20
ss_train_data, ss_test_data = processed_data[:cutoff], processed_data[cutoff:]

In [24]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [25]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

In [124]:
# Just for sanity check
tokenizer = create_tokenizer(shakespeare_data.data.text)
print(f'Tokenizer contains vocab size {tokenizer.vocab_size}')
tokenizer.tokenize(shakespeare_data.data.text.iloc[0])




Tokenizer contains vocab size 33747


['so', 'shaken', 'as', 'we', 'are', ',', 'so', 'wan', 'with', 'care', ',']

In [125]:
isinstance(tokenizer, PreTrainedTokenizerFast)

True

In [126]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=33747, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [129]:
# Save tokenizer
tokenizer_save_path = 'shakespeare-tokenizer-bert/plays'
tokenizer.save_pretrained(tokenizer_save_path)

('shakespeare-tokenizer-bert/plays/tokenizer_config.json',
 'shakespeare-tokenizer-bert/plays/special_tokens_map.json',
 'shakespeare-tokenizer-bert/plays/vocab.txt',
 'shakespeare-tokenizer-bert/plays/added_tokens.json',
 'shakespeare-tokenizer-bert/plays/tokenizer.json')

## Creating CC tokenizer

In [23]:
comb_train_data_dir='data/commonsense/train.jsonl'
comb_test_data_dir='data/commonsense/test.jsonl'

comb_data = []

with open(comb_train_data_dir, 'r') as f_reader:
    for row in f_reader:
        comb_data.append(json.loads(row))
        
with open(comb_test_data_dir, 'r') as f_reader:
    for row in f_reader:
        comb_data.append(json.loads(row))

In [25]:
comb_data[0], len(comb_data)

({'src': 'jesus , what kind of concerts do you go to where people sucker punch you for being born tall ?',
  'trg': 'the kind that allow bitter short people in . so basically all of them .'},
 3392137)

In [32]:
merged_lines = []
for line in comb_data:
    l = line['src'] + " " + line['trg']
    merged_lines.append(l)

In [33]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Bert will be the base tokenizer
lines_iter = iter(merged_lines) # Most likely not needed, already in list

# Creates new tokenizer with our vocabulary set
comb_tokenizer = tokenizer.train_new_from_iterator(lines_iter, 50000)






In [34]:
comb_tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=49959, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [35]:
comb_tokenizer.save_pretrained("commonsense-tokenizer-bert")

('commonsense-tokenizer-bert/tokenizer_config.json',
 'commonsense-tokenizer-bert/special_tokens_map.json',
 'commonsense-tokenizer-bert/vocab.txt',
 'commonsense-tokenizer-bert/added_tokens.json',
 'commonsense-tokenizer-bert/tokenizer.json')

# Preparing Shakespeare train/test dataset

In [21]:
ss_all_data_dir='data/all.jsonl'

In [22]:
ss_all_data = []
with open(ss_all_data_dir, 'r') as f_reader:
    for row in f_reader:
        ss_all_data.append(json.loads(row))

In [23]:
random.shuffle(ss_all_data)

In [27]:
ss_train_data = ss_all_data[:-20]
ss_test_data = ss_all_data[-20:]

In [28]:
len(ss_test_data), len(ss_train_data)

(20, 110814)

In [29]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [31]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

# Preparing combined dataset (Commonsense Dialogue & Shakespeare)

In [54]:
cc_train_data_dir='data/commonsense/train.jsonl'
ss_train_data_dir='data/shakespeare/train.jsonl'
cc_test_data_dir='data/commonsense/test.jsonl'
ss_test_data_dir='data/shakespeare/test.jsonl'

In [55]:
cc_train_data = []
with open(cc_train_data_dir, 'r') as f_reader:
    for row in f_reader:
        cc_train_data.append(json.loads(row))

In [56]:
cc_test_data = []
with open(cc_test_data_dir, 'r') as f_reader:
    for row in f_reader:
        cc_test_data.append(json.loads(row))

In [48]:
ss_train_data = []
with open(ss_train_data_dir, 'r') as f_reader:
    for row in f_reader:
        ss_train_data.append(json.loads(row))

In [62]:
ss_test_data = []
with open(ss_test_data_dir, 'r') as f_reader:
    for row in f_reader:
        ss_test_data.append(json.loads(row))

In [63]:
len(cc_train_data), len(ss_train_data), len(cc_test_data), len(ss_test_data)

(3382137, 99754, 10000, 11084)

In [50]:
random.shuffle(cc_train_data)

In [61]:
combined_train_data = cc_train_data[:1_000_000] + ss_train_data
combined_test_data = cc_test_data[-10:] + ss_test_data

In [62]:
random.shuffle(combined_train_data)
random.shuffle(combined_test_data)
len(combined_train_data), len(combined_test_data)

(1110818, 30)

In [63]:
save_src_trg_dataset(combined_test_data, filename= 'test', folder_dir= 'data/combined/')

In [64]:
save_src_trg_dataset(combined_train_data, filename= 'train', folder_dir= 'data/combined/')

# Preparing small combined

In [12]:
comb_train_data_dir='data/combined/train.jsonl'
comb_test_data_dir='data/combined/test.jsonl'

comb_train_data = []
comb_test_data = []

with open(comb_train_data_dir, 'r') as f_reader:
    for row in f_reader:
        comb_train_data.append(json.loads(row))
        
with open(comb_test_data_dir, 'r') as f_reader:
    for row in f_reader:
        comb_test_data.append(json.loads(row))

In [13]:
combined_train_data=comb_train_data[:100]
combined_test_data=comb_test_data[:10]

In [14]:
save_src_trg_dataset(combined_test_data, filename= 'test', folder_dir= 'data/combined/small/')
save_src_trg_dataset(combined_train_data, filename= 'train', folder_dir= 'data/combined/small/')

# Preparing tokenizer

In [70]:
tokenizer_config = 'bert-base-uncased'

In [71]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_config)

In [75]:
ss_tokenizer = BertTokenizerFast('shakespeare-tokenizer-bert/vocab.txt')

In [81]:
new_tokens = set(ss_tokenizer.vocab.keys())-set(tokenizer.vocab.keys())

In [82]:
len(new_tokens) # these tokens will be added to bert tokenizer

19044

# Introducing Sonnets

In [57]:
with open('data/source/sonnets.txt', 'r') as f:
    data = f.read().lower()

In [60]:
sonnets = data.split('\n\n')
sonnets[:4]

['i',
 "from fairest creatures we desire increase,\nthat thereby beauty's rose might never die,\nbut as the riper should by time decease,\nhis tender heir might bear his memory:\nbut thou contracted to thine own bright eyes,\nfeed'st thy light's flame with self-substantial fuel,\nmaking a famine where abundance lies,\nthy self thy foe, to thy sweet self too cruel:\nthou that art now the world's fresh ornament,\nand only herald to the gaudy spring,\nwithin thine own bud buriest thy content,\nand tender churl mak'st waste in niggarding:\n  pity the world, or else this glutton be,\n  to eat the world's due, by the grave and thee.",
 'ii',
 "when forty winters shall besiege thy brow,\nand dig deep trenches in thy beauty's field,\nthy youth's proud livery so gazed on now,\nwill be a tatter'd weed of small worth held: \nthen being asked, where all thy beauty lies,\nwhere all the treasure of thy lusty days; \nto say, within thine own deep sunken eyes,\nwere an all-eating shame, and thriftless

In [79]:
sonnets_num = []
sonnets_text = []
for i in range(len(sonnets)):
    if len(sonnets[i]) <= 10:
        sonnets_num.append(sonnets[i])
    else:
        sonnets_text.append(sonnets[i])

In [85]:
sonnets_cleaned = []
for s in sonnets_text:
    # Shakespeare's sonnets are made of 3 quatrains (4-line stanza) and last 2 lines belong together
    curr = s.split('\n')
    curr = [i.strip().replace("'d", "ed") for i in curr]
    q1 = curr[:4]
    q2 = curr[4:8]
    q3 = curr[8:12]
    c1 = curr[12:]
    sonnets_cleaned.append([' '.join(q1), ' '.join(q2), ' '.join(q3), ' '.join(c1)])

In [106]:
sonnets_cleaned[:2]

[["from fairest creatures we desire increase, that thereby beauty's rose might never die, but as the riper should by time decease, his tender heir might bear his memory:",
  "but thou contracted to thine own bright eyes, feed'st thy light's flame with self-substantial fuel, making a famine where abundance lies, thy self thy foe, to thy sweet self too cruel:",
  "thou that art now the world's fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and tender churl mak'st waste in niggarding:",
  "pity the world, or else this glutton be, to eat the world's due, by the grave and thee."],
 ["when forty winters shall besiege thy brow, and dig deep trenches in thy beauty's field, thy youth's proud livery so gazed on now, will be a tattered weed of small worth held:",
  'then being asked, where all thy beauty lies, where all the treasure of thy lusty days; to say, within thine own deep sunken eyes, were an all-eating shame, and thriftless praise.',
  "ho

In [120]:
sonnets_src_trg_data = []
for s in sonnets_cleaned:
    for i in range(3):
        dictionary = {}
        dictionary["src"] = s[i].strip()
        dictionary["trg"] = s[i+1].strip()
        sonnets_src_trg_data.append(dictionary)

In [122]:
sonnets_src_trg_data[:3]

[{'src': "from fairest creatures we desire increase, that thereby beauty's rose might never die, but as the riper should by time decease, his tender heir might bear his memory:",
  'trg': "but thou contracted to thine own bright eyes, feed'st thy light's flame with self-substantial fuel, making a famine where abundance lies, thy self thy foe, to thy sweet self too cruel:"},
 {'src': "but thou contracted to thine own bright eyes, feed'st thy light's flame with self-substantial fuel, making a famine where abundance lies, thy self thy foe, to thy sweet self too cruel:",
  'trg': "thou that art now the world's fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and tender churl mak'st waste in niggarding:"},
 {'src': "thou that art now the world's fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and tender churl mak'st waste in niggarding:",
  'trg': "pity the world, or else this glutton be, to eat the

In [123]:
len(sonnets_src_trg_data)

462

## Combining Sonnets with Plays

In [138]:
processed_data = []
for play in all_plays:
    text_col = shakespeare_data.data[shakespeare_data.data['Play'] == play]['text']
    processed_data += generate_src_trg_dataset(text_col)

In [139]:
processed_data += sonnets_src_trg_data

In [143]:
random.shuffle(processed_data) # in-place shuffle

In [144]:
n = len(processed_data)
n

107746

In [145]:
# cutoff = int(n*0.9)
cutoff = -20
ss_train_data, ss_test_data = processed_data[:cutoff], processed_data[cutoff:]

In [146]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [147]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

## Creating Sonnets tokenizer

In [136]:
sonnets_tokenizer = create_tokenizer(np.array(sonnets_cleaned).flatten().flatten())






In [137]:
tokenizer_save_path = 'shakespeare-tokenizer-bert/sonnets'
sonnets_tokenizer.save_pretrained(tokenizer_save_path)

('shakespeare-tokenizer-bert/sonnets/tokenizer_config.json',
 'shakespeare-tokenizer-bert/sonnets/special_tokens_map.json',
 'shakespeare-tokenizer-bert/sonnets/vocab.txt',
 'shakespeare-tokenizer-bert/sonnets/added_tokens.json',
 'shakespeare-tokenizer-bert/sonnets/tokenizer.json')