In [2]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast
import json
import random

In [3]:
random.seed(102)

# Exploratory

In [4]:
df = pd.read_csv('data/source/shakespeare.csv')

In [5]:
df.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [6]:
df[df['Play'] == 'Henry IV'].groupby(['Play', 'PlayerLinenumber']).agg(PlayerLine=('PlayerLine', np.sum)).reset_index()

Unnamed: 0,Play,PlayerLinenumber,PlayerLine
0,Henry IV,1.0,"So shaken as we are, so wan with care,Find we ..."
1,Henry IV,2.0,"My liege, this haste was hot in question,And m..."
2,Henry IV,3.0,It seems then that the tidings of this broilBr...
3,Henry IV,4.0,"This match'd with other did, my gracious lord,..."
4,Henry IV,5.0,"Here is a dear, a true industrious friend,Sir ..."
...,...,...,...
193,Henry IV,194.0,"Nothing but papers, my lord."
194,Henry IV,195.0,Let's see what they be: read them.
195,Henry IV,196.0,"[Reads] Item, A capon,. . 2s. 2d.Item, Sauce,..."
196,Henry IV,197.0,O monstrous! but one half-penny-worth of bread...


In [7]:
remove = df[(df['Player'].notna()) & (df["PlayerLine"]!='Exeunt')]
data = remove[['ActSceneLine', "Player", "PlayerLine"]]
data.head()

Unnamed: 0,ActSceneLine,Player,PlayerLine
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil


# Preprocessing Shakespeare

In [8]:
class ShakespeareData:
    def __init__(self, filepath='data/source/shakespeare.csv', tokenizer_config='bert-base-uncased', max_tokens=32000):
        self.tokenizer_config = tokenizer_config
        self.max_tokens = max_tokens
        
        data = pd.read_csv(filepath)
        self.data = self.clean(data) 
        self.tokenizer = self.create_tokenizer()
        
    
    def create_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_config) # Bert will be the base tokenizer
        lines_iter = iter(self.data['text']) # Most likely not needed, already in list

        # Creates new tokenizer with our vocabulary set
        return tokenizer.train_new_from_iterator(lines_iter, self.max_tokens)
    
    def clean(self, data):
        repl = {
            '@\w*': ' ', 
            '&amp;' : 'and',
            '\su\s':' you ', 
            '&#\w*;': ' ', 
            '#':' ', 
            '\s2\s': 'two', 
            "ð[^ ]*": ' ' ,
            "â[^ ]*": ' ',
            "(dont)|(don't)": 'do not', 
            "(cant)|(can't)": "can not",
            "(yous)|(you's)": "you is", 
            "(yous)|(you's)": "you is", 
            "(youve)|(you've)": "you have", 
            "(doesnt)|(doesn't)": 'does not', 
            "(wont)|(won't)": 'will not', 
            "\'tis'": 'this',
            "[0-9]+\.*[0-9%]+\w*" : "NUMBER",
            '\\n\.':' ' ,
            '\\n':' ',
            "\.{2,}": '.', 
            "!{2,}":'!', 
            "\?{2,}":'?', 
#             'ing[^a-z]':' ', 
#             'ed[^a-z]': ' ', 
            '_':" ",
            ' +': ' ', 
            '\-{2,}': ' ', 
            '\:': '',
            ',$':'', 
            "\'d": "ed",
            '\\.':' <EOS>'}

        data = data[(data['Player'].notna()) & (data["PlayerLine"]!='Exeunt')]
        data = data[['ActSceneLine', "Player", "PlayerLine"]]
        cleaned = data['PlayerLine'].apply(lambda x: x.strip().lower())
        cleaned = cleaned.replace(repl, regex=True)
        cleaned_data = data.assign(text = cleaned)
        return cleaned_data

In [9]:
def generate_src_trg_dataset(text_col):
    data = []
    
    for i in range(len(text_col)-1):
        dictionary = {}
        dictionary["src"] = text_col.iloc[i]
        dictionary["trg"] = text_col.iloc[i+1]
        data.append(dictionary)
    
    return data

def save_src_trg_dataset(data, filename = 'data', folder_dir = ''):
    fn = folder_dir + filename + '.jsonl'
    with open(fn, 'w') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def generate_and_save_dataset(data_text_column, filename= 'data', folder_dir= ''):
    data = generate_src_trg_dataset(data_text_column)
    save_src_trg_dataset(data, filename=filename, folder_dir=folder_dir)

In [10]:
# with open('vocab_list.pickle', 'wb') as handle:
#     pickle.dump(vocab.idx_word, handle)

In [11]:
tokenizer_save_path = 'shakespeare-tokenizer-bert'
shakespeare_data = ShakespeareData()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]






In [15]:
shakespeare_data.data.head()

Unnamed: 0,ActSceneLine,Player,PlayerLine,text
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,","so shaken as we are, so wan with care"
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",find we a time for frighted peace to pant
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,and breathe short-winded accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,to be commenced in strands afar remote <EOS>
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,no more the thirsty entrance of this soil


### Preparing train/test Shakespeare data

In [31]:
processed_data = generate_src_trg_dataset(shakespeare_data.data['text'])
random.shuffle(processed_data) # in-place shuffle

In [32]:
n = len(processed_data)

In [33]:
cutoff = int(n*0.9)
train, test = processed_data[:cutoff], processed_data[cutoff:]

In [34]:
save_src_trg_dataset(train, filename= 'train', folder_dir= 'data/shakespeare/')

In [35]:
save_src_trg_dataset(test, filename= 'test', folder_dir= 'data/shakespeare/')

In [16]:
# Just for sanity check
print(f'Tokenizer contains vocab size {shakespeare_data.tokenizer.vocab_size}')
shakespeare_data.tokenizer.tokenize(shakespeare_data.data.text.iloc[0])

Tokenizer contains vocab size 30474


['so', 'shaken', 'as', 'we', 'are', ',', 'so', 'wan', 'with', 'care']

In [17]:
isinstance(shakespeare_data.tokenizer, PreTrainedTokenizerFast)

True

In [18]:
shakespeare_data.tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30474, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [19]:
# Save tokenizer
shakespeare_data.tokenizer.save_pretrained("shakespeare-tokenizer-bert")

('shakespeare-tokenizer-bert/tokenizer_config.json',
 'shakespeare-tokenizer-bert/special_tokens_map.json',
 'shakespeare-tokenizer-bert/vocab.txt',
 'shakespeare-tokenizer-bert/added_tokens.json',
 'shakespeare-tokenizer-bert/tokenizer.json')

# Preparing Shakespeare train/test dataset

In [21]:
ss_all_data_dir='data/all.jsonl'

In [22]:
ss_all_data = []
with open(ss_all_data_dir, 'r') as f_reader:
    for row in f_reader:
        ss_all_data.append(json.loads(row))

In [23]:
random.shuffle(ss_all_data)

In [27]:
ss_train_data = ss_all_data[:-20]
ss_test_data = ss_all_data[-20:]

In [28]:
len(ss_test_data), len(ss_train_data)

(20, 110814)

In [29]:
save_src_trg_dataset(ss_train_data, filename= 'train', folder_dir= 'data/')

In [31]:
save_src_trg_dataset(ss_test_data, filename= 'test', folder_dir= 'data/')

# Preparing combined dataset (Commonsense Dialogue & Shakespeare)

In [60]:
cc_train_data_dir='data/commonsense/train.jsonl'
ss_train_data_dir='data/shakespeare/train.jsonl'
cc_test_data_dir='data/commonsense/test.jsonl'
ss_test_data_dir='data/shakespeare/test.jsonl'

In [47]:
cc_train_data = []
with open(cc_train_data_dir, 'r') as f_reader:
    for row in f_reader:
        cc_train_data.append(json.loads(row))

In [61]:
cc_test_data = []
with open(cc_test_data_dir, 'r') as f_reader:
    for row in f_reader:
        cc_test_data.append(json.loads(row))

In [48]:
ss_train_data = []
with open(ss_train_data_dir, 'r') as f_reader:
    for row in f_reader:
        ss_train_data.append(json.loads(row))

In [62]:
ss_test_data = []
with open(ss_test_data_dir, 'r') as f_reader:
    for row in f_reader:
        ss_test_data.append(json.loads(row))

In [63]:
len(cc_train_data), len(ss_train_data), len(cc_test_data), len(ss_test_data)

(3382137, 99754, 10000, 11084)

In [50]:
random.shuffle(cc_train_data)

In [66]:
combined_train_data = cc_train_data[:1_000_000] + ss_train_data
combined_test_data = cc_test_data + ss_test_data

In [67]:
random.shuffle(combined_train_data)
random.shuffle(combined_test_data)
len(combined_train_data), len(combined_test_data)

(1099754, 21084)

In [68]:
save_src_trg_dataset(combined_test_data, filename= 'test', folder_dir= 'data/combined/')

In [69]:
save_src_trg_dataset(combined_train_data, filename= 'train', folder_dir= 'data/combined/')

# Preparing tokenizer

In [70]:
tokenizer_config = 'bert-base-uncased'

In [71]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_config)

In [75]:
ss_tokenizer = BertTokenizerFast('shakespeare-tokenizer-bert/vocab.txt')

In [81]:
new_tokens = set(ss_tokenizer.vocab.keys())-set(tokenizer.vocab.keys())

In [82]:
len(new_tokens) # these tokens will be added to bert tokenizer

19044