In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast
import json
import random
import re

In [2]:
random.seed(102)

# Creating tokenizer

In [None]:
# Read the whole tokenizer-training corpus into memory as one string.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale (assumes the file is UTF-8 -- TODO confirm).
with open("./data/source/shakespeare_tokenize.txt", 'r', encoding='utf-8') as file:
    ss = file.read()

In [None]:
# Normalise the corpus text: lowercase it, then apply the literal replacements
# below one after another (the order matters: the final '  ' -> ' ' pass cleans
# up double spaces created by removing punctuation).
ss = str(ss).lower()
for old, new in (
    ('\n\n', '\n'),
    ('  ', ' '),
    (';', ''),
    (':', ''),
    ('[', ''),
    (']', ''),
    ('  ', ' '),
):
    ss = ss.replace(old, new)

In [None]:
# One entry per newline-separated line of the corpus, with surrounding whitespace removed.
ss_list = [raw_line.strip() for raw_line in ss.split('\n')]
ss_list

In [None]:
# Load the 'bert-base-uncased' tokenizer and train a new tokenizer from it on
# the corpus lines, requesting a vocabulary size of 32000.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased').train_new_from_iterator(
    iter(ss_list), 32000
)
tokenizer.vocab_size

In [None]:
tokenizer.tokenize("my tongue should catch your tongue's sweet melody ."), tokenizer.tokenize("demetrius , i'll avouch it to his head ,")

In [None]:
tokenizer_save_path = 'shakespeare-tokenizer-bert/plays'
tokenizer.save_pretrained(tokenizer_save_path)

# Exploratory

In [3]:
df = pd.read_csv('data/source/shakespeare.csv')

In [4]:
df.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [5]:
# df['Play'].str.lower().unique()

In [6]:
# df['Player'].str.lower().unique()

In [7]:
# French characters: Alencon, Alice, King of France, Katharine, Dauphin
# (i.e. the French-speaking characters)

In [8]:
# df[df['Player'].str.lower() == 'katharine']
# Maybe remove the whole of Henry V, since that is where most of the French lines are found.

In [9]:
# Keep rows that have a Player value and whose PlayerLine is not exactly 'Exeunt',
# then narrow the frame to the four columns of interest.
has_player = df['Player'].notna()
not_exeunt = df['PlayerLine'].ne('Exeunt')
remove = df[has_player & not_exeunt]
data = remove[['Play', 'PlayerLinenumber', 'Player', 'PlayerLine']]
data.head(10)

Unnamed: 0,Play,PlayerLinenumber,Player,PlayerLine
3,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils
6,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.
7,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil
8,Henry IV,1.0,KING HENRY IV,Shall daub her lips with her own children's bl...
9,Henry IV,1.0,KING HENRY IV,"Nor more shall trenching war channel her fields,"
10,Henry IV,1.0,KING HENRY IV,Nor bruise her flowerets with the armed hoofs
11,Henry IV,1.0,KING HENRY IV,"Of hostile paces: those opposed eyes,"
12,Henry IV,1.0,KING HENRY IV,"Which, like the meteors of a troubled heaven,"


# Preprocessing Shakespeare

In [26]:
class ShakespeareData:
    """Load the Shakespeare plays CSV and expose a cleaned copy as ``self.data``.

    Parameters
    ----------
    filepath : str
        Path of the CSV file passed to ``pandas.read_csv``.
    """

    def __init__(self, filepath='data/source/shakespeare.csv'):
        data = pd.read_csv(filepath)
        self.data = self.clean(data)

    def clean(self, data):
        """Filter the raw frame and add normalised columns.

        Rows are kept only when ``Player``, ``PlayerLine`` and ``ActSceneLine``
        are present, ``PlayerLine`` is not exactly ``'Exeunt'`` and ``Play`` is
        not ``'Henry V'``.

        Returns a new DataFrame (the input is not modified) with the columns
        ``ActSceneLine, Player, PlayerLine, Play, PlayerLinenumber`` plus:

        * ``Player``   -- lowercased,
        * ``ActScene`` -- ``ActSceneLine`` truncated before its second full stop,
        * ``text``     -- ``PlayerLine`` stripped, lowercased and passed through
          the regex replacements in ``repl``.

        The original row index is preserved.
        """
        # Regex pattern -> replacement, handed to Series.replace(regex=True).
        # Raw strings are used so the backslashes reach the regex engine
        # without relying on Python's invalid-escape fallback.
        repl = {
            r'@\w*': ' ',
            r'&amp;': 'and',
            r'&#\w*;': ' ',
            r'\n\.': ' ',
            r'\n': ' ',
            r'\.{2,}': '.',
            r'!{2,}': '!',
            r'\?{2,}': '?',
            r'_': ' ',
            r' +': ' ',
            r'\-{2,}': ' ',
            r':': '',
            r';': '',
            r'\[': '',
            r'\]': '',
            r'  ': ' ',
        }

        keep = (
            data['Player'].notna()
            # A missing line has no text to clean; previously it crashed .strip().
            & data['PlayerLine'].notna()
            & (data['PlayerLine'] != 'Exeunt')
            & (data['Play'] != 'Henry V')
            & data['ActSceneLine'].notna()
        )
        columns = ['ActSceneLine', 'Player', 'PlayerLine', 'Play', 'PlayerLinenumber']
        # .copy() so the column assignments below write to an independent frame
        # instead of a view of the caller's data (avoids SettingWithCopyWarning).
        data = data.loc[keep, columns].copy()

        data['Player'] = data['Player'].str.lower()
        data['ActScene'] = data['ActSceneLine'].apply(extract_before_second_full_stop)

        cleaned = data['PlayerLine'].str.strip().str.lower()
        cleaned = cleaned.replace(repl, regex=True)
        return data.assign(text=cleaned)


def extract_before_second_full_stop(string):
    """Return the part of ``string`` before its second '.'.

    If ``string`` contains fewer than two full stops it is returned unchanged,
    e.g. ``'1.2.34' -> '1.2'`` and ``'1.2' -> '1.2'``.
    """
    # Start the second search just after the first '.'. When there is no first
    # '.', find() returns -1, so the search restarts at 0 and still yields -1.
    second_stop = string.find('.', string.find('.') + 1)
    return string if second_stop == -1 else string[:second_stop]

In [27]:
def generate_src_trg_dataset(text_col):
    """Pair every entry with the entry that follows it.

    Parameters
    ----------
    text_col : iterable
        Ordered texts (a pandas Series, list, generator, ...). Only the values
        are used, in iteration order; a Series index is ignored.

    Returns
    -------
    list of dict
        ``[{"src": text[i], "trg": text[i + 1]}, ...]`` with ``len - 1``
        entries; an empty list when fewer than two texts are given.
    """
    # Materialise once so the input may be any iterable and so we avoid one
    # (comparatively slow) positional .iloc lookup per element.
    entries = list(text_col)
    # Note: pairs are single adjacent entries. A variant that joined three
    # consecutive entries per side existed as commented-out code and is not used.
    return [{"src": src, "trg": trg} for src, trg in zip(entries, entries[1:])]

def save_src_trg_dataset(data, filename = 'data', folder_dir = ''):
    """Write ``data`` as JSON Lines to ``folder_dir + filename + '.jsonl'``.

    Parameters
    ----------
    data : iterable
        JSON-serialisable entries; each one becomes one line of the file.
    filename : str
        File name without the ``.jsonl`` extension.
    folder_dir : str
        Prefix concatenated in front of ``filename`` as-is, so a directory must
        end with a path separator (e.g. ``'data/combined/'``).

    The parent directory is created when it does not exist yet. An existing
    file is overwritten.
    """
    fn = folder_dir + filename + '.jsonl'
    parent = os.path.dirname(fn)
    if parent:
        # Previously a missing output folder raised FileNotFoundError.
        os.makedirs(parent, exist_ok=True)
    with open(fn, 'w', encoding='utf-8') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def generate_and_save_dataset(data_text_column, filename= 'data', folder_dir= ''):
    """Build src/trg pairs from ``data_text_column`` and write them to disk.

    Convenience wrapper: the pairs come from ``generate_src_trg_dataset`` and
    are written by ``save_src_trg_dataset`` using the given ``filename`` and
    ``folder_dir``.
    """
    save_src_trg_dataset(
        generate_src_trg_dataset(data_text_column),
        filename=filename,
        folder_dir=folder_dir,
    )

In [28]:
# with open('vocab_list.pickle', 'wb') as handle:
#     pickle.dump(vocab.idx_word, handle)

In [29]:
shakespeare_data = ShakespeareData()

In [30]:
shakespeare_data.data.head() # 107459 -> # 101919 after removing na in actsceneline

Unnamed: 0,ActSceneLine,Player,PlayerLine,Play,PlayerLinenumber,ActScene,text
3,1.1.1,king henry iv,"So shaken as we are, so wan with care,",Henry IV,1.0,1.1,"so shaken as we are, so wan with care,"
4,1.1.2,king henry iv,"Find we a time for frighted peace to pant,",Henry IV,1.0,1.1,"find we a time for frighted peace to pant,"
5,1.1.3,king henry iv,And breathe short-winded accents of new broils,Henry IV,1.0,1.1,and breathe short-winded accents of new broils
6,1.1.4,king henry iv,To be commenced in strands afar remote.,Henry IV,1.0,1.1,to be commenced in strands afar remote.
7,1.1.5,king henry iv,No more the thirsty entrance of this soil,Henry IV,1.0,1.1,no more the thirsty entrance of this soil


In [31]:
def split_long_lines(line, max_length=100):
    """Split ``line`` into chunks of at most ``max_length`` characters.

    Each chunk is cut just after the last punctuation mark (one of
    ``. , ; : ! ? >``) found within the first ``max_length`` characters of the
    remaining text. When that window holds no such character the text is cut at
    exactly ``max_length`` characters (possibly mid-word; there is no
    whitespace fallback). Chunks are stripped of surrounding whitespace.

    Parameters
    ----------
    line : str
        Text to split.
    max_length : int
        Maximum chunk length; must be at least 1 when ``line`` is longer.

    Returns
    -------
    list of str
        ``[line]`` unchanged when it already fits, otherwise the chunks in order.

    Raises
    ------
    ValueError
        If ``line`` needs splitting but ``max_length`` is smaller than 1
        (this used to recurse without end).
    """
    if len(line) <= max_length:
        return [line]
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # Iterative on purpose: one recursion level per chunk hit Python's
    # recursion limit on long inputs, and `[chunk] + rest` was quadratic.
    chunks = []
    remaining = line
    while len(remaining) > max_length:
        # Search the reversed window so the FIRST hit is the LAST punctuation
        # mark before the length limit.
        reversed_window = remaining[:max_length][::-1]
        match_punctuation = re.search(r'[\.,;:!?>]\s*', reversed_window)
        if match_punctuation:
            # start() counts from the end of the window; cut after the mark.
            split_point = max_length - match_punctuation.start()
        else:
            split_point = max_length
        chunks.append(remaining[:split_point].strip())
        remaining = remaining[split_point:].strip()
    chunks.append(remaining)
    return chunks
    
def remove_separator(lines):
    """Remove the ``>`` line-separator marks from every string in ``lines``.

    For each entry all ``>`` characters are deleted, surrounding whitespace is
    stripped and every run of two or more spaces is collapsed to one space.
    (A single ``replace("  ", " ")`` pass left a double space behind whenever
    three or more spaces ended up adjacent, e.g. for ``'a  > b'``.)

    Returns a new list; the input is not modified.
    """
    return [re.sub(r' {2,}', ' ', chunk.replace('>', '').strip()) for chunk in lines]

In [32]:
# Merge all rows that share (Play, Player, ActScene, PlayerLinenumber) into one
# row whose text is the individual lines joined with ' > '. sort=False keeps
# the groups in order of first appearance.
speech_keys = ['Play', 'Player', 'ActScene', 'PlayerLinenumber']
data_per_convo = (
    shakespeare_data.data
    .groupby(speech_keys, sort=False)['text']
    .agg(lambda speech_lines: ' > '.join(speech_lines))
    .reset_index()
)

In [33]:
data_per_convo.head()

Unnamed: 0,Play,Player,ActScene,PlayerLinenumber,text
0,Henry IV,king henry iv,1.1,1.0,"so shaken as we are, so wan with care, > find ..."
1,Henry IV,westmoreland,1.1,2.0,"my liege, this haste was hot in question, > an..."
2,Henry IV,king henry iv,1.1,3.0,it seems then that the tidings of this broil >...
3,Henry IV,westmoreland,1.1,4.0,"this match'd with other did, my gracious lord,..."
4,Henry IV,king henry iv,1.1,5.0,"here is a dear, a true industrious friend, > s..."


In [34]:
data_per_convo['text_processed']=data_per_convo['text'].apply(split_long_lines)

In [35]:
data_per_convo.text_processed.iloc[0]

['so shaken as we are, so wan with care, > find we a time for frighted peace to pant, >',
 'and breathe short-winded accents of new broils > to be commenced in strands afar remote. >',
 "no more the thirsty entrance of this soil > shall daub her lips with her own children's blood, >",
 'nor more shall trenching war channel her fields, > nor bruise her flowerets with the armed hoofs >',
 'of hostile paces those opposed eyes, > which, like the meteors of a troubled heaven, >',
 'all of one nature, of one substance bred, > did lately meet in the intestine shock >',
 'and furious close of civil butchery > shall now, in mutual well-beseeming ranks, >',
 'march all one way and be no more opposed > against acquaintance, kindred and allies >',
 'the edge of war, like an ill-sheathed knife, > no more shall cut his master. therefore, friends, >',
 'as far as to the sepulchre of christ, > whose soldier now, under whose blessed cross >',
 'we are impressed and engaged to fight, > forthwith a power

In [36]:
data_per_convo['text_processed']=data_per_convo['text_processed'].apply(remove_separator)

In [37]:
data_per_convo.text_processed.iloc[0]

['so shaken as we are, so wan with care, find we a time for frighted peace to pant,',
 'and breathe short-winded accents of new broils to be commenced in strands afar remote.',
 "no more the thirsty entrance of this soil shall daub her lips with her own children's blood,",
 'nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs',
 'of hostile paces those opposed eyes, which, like the meteors of a troubled heaven,',
 'all of one nature, of one substance bred, did lately meet in the intestine shock',
 'and furious close of civil butchery shall now, in mutual well-beseeming ranks,',
 'march all one way and be no more opposed against acquaintance, kindred and allies',
 'the edge of war, like an ill-sheathed knife, no more shall cut his master. therefore, friends,',
 'as far as to the sepulchre of christ, whose soldier now, under whose blessed cross',
 'we are impressed and engaged to fight, forthwith a power of english shall we levy,',
 "whose arms 

In [38]:
data_per_convo = data_per_convo.explode('text_processed')

In [40]:
data_per_convo['text_processed_with_player'] = data_per_convo['Player'] + " : " + data_per_convo['text_processed']

In [41]:
data_per_convo

Unnamed: 0,Play,Player,ActScene,PlayerLinenumber,text,text_processed,text_processed_with_player
0,Henry IV,king henry iv,1.1,1.0,"so shaken as we are, so wan with care, > find ...","so shaken as we are, so wan with care, find we...","king henry iv : so shaken as we are, so wan wi..."
0,Henry IV,king henry iv,1.1,1.0,"so shaken as we are, so wan with care, > find ...",and breathe short-winded accents of new broils...,king henry iv : and breathe short-winded accen...
0,Henry IV,king henry iv,1.1,1.0,"so shaken as we are, so wan with care, > find ...",no more the thirsty entrance of this soil shal...,king henry iv : no more the thirsty entrance o...
0,Henry IV,king henry iv,1.1,1.0,"so shaken as we are, so wan with care, > find ...",nor more shall trenching war channel her field...,king henry iv : nor more shall trenching war c...
0,Henry IV,king henry iv,1.1,1.0,"so shaken as we are, so wan with care, > find ...","of hostile paces those opposed eyes, which, li...",king henry iv : of hostile paces those opposed...
...,...,...,...,...,...,...,...
29360,A Winters Tale,leontes,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...","look upon my brother both your pardons, that e...",leontes : look upon my brother both your pardo...
29360,A Winters Tale,leontes,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...","my ill suspicion. this is your son-in-law, and...",leontes : my ill suspicion. this is your son-i...
29360,A Winters Tale,leontes,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...",is troth-plight to your daughter. good paulina...,leontes : is troth-plight to your daughter. go...
29360,A Winters Tale,leontes,5.3,38.0,"o, peace, paulina! > thou shouldst a husband t...",each one demand an answer to his part perform'...,leontes : each one demand an answer to his par...


In [42]:
all_plays = data_per_convo.Play.unique()

In [43]:
all_plays

array(['Henry IV', 'Henry VI Part 1', 'Henry VI Part 2',
       'Henry VI Part 3', 'Alls well that ends well', 'As you like it',
       'Antony and Cleopatra', 'A Comedy of Errors', 'Coriolanus',
       'Cymbeline', 'Hamlet', 'Henry VIII', 'King John', 'Julius Caesar',
       'King Lear', 'Loves Labours Lost', 'macbeth',
       'Measure for measure', 'Merchant of Venice',
       'Merry Wives of Windsor', 'A Midsummer nights dream',
       'Much Ado about nothing', 'Othello', 'Pericles', 'Richard II',
       'Richard III', 'Romeo and Juliet', 'Taming of the Shrew',
       'The Tempest', 'Timon of Athens', 'Titus Andronicus',
       'Troilus and Cressida', 'Twelfth Night', 'Two Gentlemen of Verona',
       'A Winters Tale'], dtype=object)

### Preparing train/test Shakespeare data

In [44]:
# Build the src/trg pairs one play at a time, so no pair spans two plays.
# (Select 'text_processed' instead to leave out the "player : " prefix.)
processed_data = []
for play in all_plays:
    in_play = data_per_convo['Play'] == play
    text_col = data_per_convo[in_play]['text_processed_with_player']
    processed_data.extend(generate_src_trg_dataset(text_col))

In [45]:
processed_data[:5]

[{'src': 'king henry iv : so shaken as we are, so wan with care, find we a time for frighted peace to pant,',
  'trg': 'king henry iv : and breathe short-winded accents of new broils to be commenced in strands afar remote.'},
 {'src': 'king henry iv : and breathe short-winded accents of new broils to be commenced in strands afar remote.',
  'trg': "king henry iv : no more the thirsty entrance of this soil shall daub her lips with her own children's blood,"},
 {'src': "king henry iv : no more the thirsty entrance of this soil shall daub her lips with her own children's blood,",
  'trg': 'king henry iv : nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs'},
 {'src': 'king henry iv : nor more shall trenching war channel her fields, nor bruise her flowerets with the armed hoofs',
  'trg': 'king henry iv : of hostile paces those opposed eyes, which, like the meteors of a troubled heaven,'},
 {'src': 'king henry iv : of hostile paces those opposed 

In [46]:
random.shuffle(processed_data) # in-place shuffle

In [47]:
n = len(processed_data)
n

60784

In [48]:
train_cutoff = int(n*0.8)

In [49]:
# processed_data was shuffled in place above, so these are slices of the
# shuffled list: everything before train_cutoff -> train, the last 10
# examples -> test, and whatever lies in between -> validation.
ss_train_data, ss_val_data, ss_test_data = (
    processed_data[:train_cutoff],
    processed_data[train_cutoff:-10],
    processed_data[-10:],
)

In [50]:
len(ss_test_data), len(ss_train_data), len(ss_val_data)

(10, 48627, 12147)

In [51]:
ss_train_data[:5]

[{'src': 'morocco : o hell! what have we here? a carrion death, within whose empty eye there is a written scroll!',
  'trg': "morocco : i'll read the writing. all that glitters is not gold, often have you heard that told"},
 {'src': 'cromwell : and his disciples only envy at, ye blew the fire that burns ye now have at ye! enter king,',
  'trg': 'cromwell : frowning on them, takes his seat'},
 {'src': 'mistress quickly : mistress page would desire you to send her your little page, of all loves her husband has a',
  'trg': 'mistress quickly : marvellous infection to the little page, and truly master page is an honest man. never a wife in'},
 {'src': "plantagenet : he bears him on the place's privilege, or durst not, for his craven heart, say thus.",
  'trg': "somerset : by him that made me, i'll maintain my words on any plot of ground in christendom."},
 {'src': 'queen margaret : loather a hundred times to part than die. yet now farewell, and farewell life with thee!',
  'trg': 'suffolk 

In [55]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/with_player/')

In [56]:
save_src_trg_dataset(ss_val_data, filename= 'valid', folder_dir= 'data/with_player/')

In [57]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/with_player/')

## Creating the Commonsense Dialogue (CC) tokenizer

In [None]:
comb_train_data_dir='data/commonsense/train.jsonl'
comb_test_data_dir='data/commonsense/test.jsonl'

# Collect every JSON line of the train file, followed by every JSON line of
# the test file, into one list.
comb_data = []
for jsonl_path in (comb_train_data_dir, comb_test_data_dir):
    with open(jsonl_path, 'r') as f_reader:
        comb_data.extend(json.loads(row) for row in f_reader)

In [None]:
comb_data[0], len(comb_data)

In [None]:
# One training string per example: the source and target text separated by a space.
merged_lines = [pair['src'] + " " + pair['trg'] for pair in comb_data]

In [None]:
# 'bert-base-uncased' serves as the base tokenizer; a new tokenizer is trained
# from it on merged_lines with a requested vocabulary size of 50000.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
comb_tokenizer = tokenizer.train_new_from_iterator(iter(merged_lines), 50000)

In [None]:
comb_tokenizer

In [None]:
comb_tokenizer.save_pretrained("commonsense-tokenizer-bert")

# Preparing combined dataset (Commonsense Dialogue & Shakespeare)

In [None]:
cc_train_data_dir='data/commonsense/train.jsonl'
ss_train_data_dir='data/shakespeare/train.jsonl'
cc_test_data_dir='data/commonsense/test.jsonl'
ss_test_data_dir='data/shakespeare/test.jsonl'

In [None]:
# Parse every line of the file at cc_train_data_dir as one JSON record.
with open(cc_train_data_dir, 'r') as f_reader:
    cc_train_data = [json.loads(row) for row in f_reader]

In [None]:
# Parse every line of the file at cc_test_data_dir as one JSON record.
with open(cc_test_data_dir, 'r') as f_reader:
    cc_test_data = [json.loads(row) for row in f_reader]

In [None]:
# Parse every line of the file at ss_train_data_dir as one JSON record.
with open(ss_train_data_dir, 'r') as f_reader:
    ss_train_data = [json.loads(row) for row in f_reader]

In [None]:
# Parse every line of the file at ss_test_data_dir as one JSON record.
with open(ss_test_data_dir, 'r') as f_reader:
    ss_test_data = [json.loads(row) for row in f_reader]

In [None]:
len(cc_train_data), len(ss_train_data), len(cc_test_data), len(ss_test_data)

In [None]:
random.shuffle(cc_train_data)

In [None]:
# Train: at most the first 1,000,000 (shuffled) commonsense examples followed
# by all Shakespeare training examples.
combined_train_data = [*cc_train_data[:1_000_000], *ss_train_data]
# Test: the last 10 commonsense test examples followed by all Shakespeare test examples.
combined_test_data = [*cc_test_data[-10:], *ss_test_data]

In [None]:
# Shuffle both combined splits in place (train first, then test), then show their sizes.
for split in (combined_train_data, combined_test_data):
    random.shuffle(split)
len(combined_train_data), len(combined_test_data)

In [None]:
save_src_trg_dataset(combined_test_data, filename= 'test', folder_dir= 'data/combined/')

In [None]:
save_src_trg_dataset(combined_train_data, filename= 'train', folder_dir= 'data/combined/')

# Preparing a small combined dataset

In [None]:
comb_train_data_dir='data/combined/train.jsonl'
comb_test_data_dir='data/combined/test.jsonl'

# Read the combined train and test files back, one JSON record per line.
with open(comb_train_data_dir, 'r') as f_reader:
    comb_train_data = [json.loads(row) for row in f_reader]

with open(comb_test_data_dir, 'r') as f_reader:
    comb_test_data = [json.loads(row) for row in f_reader]

In [None]:
# Small subset: the first 100 training records and the first 10 test records.
combined_train_data, combined_test_data = comb_train_data[:100], comb_test_data[:10]

In [None]:
# Write the small test and train subsets (in that order) to data/combined/small/.
for split_name, split_rows in (('test', combined_test_data), ('train', combined_train_data)):
    save_src_trg_dataset(split_rows, filename=split_name, folder_dir='data/combined/small/')

# Introducing Sonnets

In [None]:
# Read the sonnets file and lowercase it in one go.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale (assumes the file is UTF-8 -- TODO confirm).
with open('data/source/sonnets.txt', 'r', encoding='utf-8') as f:
    data = f.read().lower()

In [None]:
sonnets = data.split('\n\n')
sonnets[:4]

In [None]:
# Split the blank-line-separated chunks by length: chunks of at most 10
# characters go to sonnets_num (presumably the sonnet numbers/headings --
# verify against the source file), longer chunks go to sonnets_text.
sonnets_num = [chunk for chunk in sonnets if len(chunk) <= 10]
sonnets_text = [chunk for chunk in sonnets if len(chunk) > 10]

In [None]:
# Shakespeare's sonnets consist of 3 quatrains (4-line stanzas) followed by a
# closing couplet. Each sonnet becomes a list of four strings: lines 1-4,
# lines 5-8, lines 9-12 and everything from line 13 onwards.
sonnets_cleaned = []
for sonnet in sonnets_text:
    # NOTE(review): replacing "'d" with "ed" also rewrites contractions such as
    # "i'd" -> "ied"; confirm that this is acceptable.
    verse = [verse_line.strip().replace("'d", "ed") for verse_line in sonnet.split('\n')]
    stanzas = (verse[:4], verse[4:8], verse[8:12], verse[12:])
    sonnets_cleaned.append([' '.join(stanza) for stanza in stanzas])

In [None]:
sonnets_cleaned[:2]

In [None]:
# Within each sonnet, pair every stanza with the one that follows it
# (1 -> 2, 2 -> 3, 3 -> 4), giving three src/trg pairs per sonnet.
sonnets_src_trg_data = [
    {"src": stanzas[k].strip(), "trg": stanzas[k + 1].strip()}
    for stanzas in sonnets_cleaned
    for k in range(3)
]

In [None]:
sonnets_src_trg_data[:3]

In [None]:
len(sonnets_src_trg_data)

## Combining Sonnets with Plays

In [None]:
# Rebuild the play pairs from the per-line 'text' column (no speaker prefix,
# no chunking), one play at a time so that no pair spans two plays.
processed_data = []
for play in all_plays:
    play_rows = shakespeare_data.data['Play'] == play
    text_col = shakespeare_data.data[play_rows]['text']
    processed_data.extend(generate_src_trg_dataset(text_col))

In [None]:
processed_data += sonnets_src_trg_data

In [None]:
random.shuffle(processed_data) # in-place shuffle

In [None]:
n = len(processed_data)
n

In [None]:
# Hold out the last 20 examples of the shuffled list as the test set.
# Alternative (disabled): a 90/10 split via cutoff = int(n*0.9)
cutoff = -20
ss_train_data = processed_data[:cutoff]
ss_test_data = processed_data[cutoff:]

In [None]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [None]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

## Creating Sonnets tokenizer

In [None]:
# `create_tokenizer` is not defined anywhere in this notebook, so the original
# call raised NameError on a fresh kernel. Train the tokenizer the same way as
# the plays tokenizer instead.
# Flatten the per-sonnet stanza lists into one flat list of strings.
sonnet_stanzas = [stanza for sonnet in sonnets_cleaned for stanza in sonnet]
# NOTE(review): the vocabulary size of 32000 is copied from the plays tokenizer
# cell; adjust it if the missing helper used a different value.
sonnets_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased').train_new_from_iterator(
    iter(sonnet_stanzas), 32000
)

In [None]:
tokenizer_save_path = 'shakespeare-tokenizer-bert/sonnets'
sonnets_tokenizer.save_pretrained(tokenizer_save_path)