In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast

In [11]:
# Load the raw Shakespeare lines table from the project's data directory.
df = pd.read_csv(filepath_or_buffer='data/source/shakespeare.csv')

In [37]:
# Preview the first 50 rows of the raw table.
df.iloc[:50]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"


In [35]:
# Merge the individual lines of each speech in "Henry IV" into one string per
# (Play, PlayerLinenumber) group. The reducer is spelled "sum" instead of
# np.sum: pandas deprecated passing NumPy reducers to groupby aggregations and
# asks for the string alias to keep the current behaviour.
# NOTE: summing strings concatenates them with no separator, so consecutive
# lines are fused together ("...so wan with care,Find we ...").
(
    df[df['Play'] == 'Henry IV']
    .groupby(['Play', 'PlayerLinenumber'])
    .agg(PlayerLine=('PlayerLine', 'sum'))
    .reset_index()
)

Unnamed: 0,Play,PlayerLinenumber,PlayerLine
0,Henry IV,1.0,"So shaken as we are, so wan with care,Find we ..."
1,Henry IV,2.0,"My liege, this haste was hot in question,And m..."
2,Henry IV,3.0,It seems then that the tidings of this broilBr...
3,Henry IV,4.0,"This match'd with other did, my gracious lord,..."
4,Henry IV,5.0,"Here is a dear, a true industrious friend,Sir ..."
...,...,...,...
193,Henry IV,194.0,"Nothing but papers, my lord."
194,Henry IV,195.0,Let's see what they be: read them.
195,Henry IV,196.0,"[Reads] Item, A capon,. . 2s. 2d.Item, Sauce,..."
196,Henry IV,197.0,O monstrous! but one half-penny-worth of bread...


In [12]:
# Keep rows whose Player is not null and whose text is not exactly 'Exeunt'.
remove = df.loc[
    df['Player'].notna() & df['PlayerLine'].ne('Exeunt')
]

In [13]:
# Keep only the scene reference, speaker and text columns, then display them.
data = remove.loc[:, ['ActSceneLine', 'Player', 'PlayerLine']]
data

Unnamed: 0,ActSceneLine,Player,PlayerLine
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
...,...,...,...
111390,5.3.179,LEONTES,"Is troth-plight to your daughter. Good Paulina,"
111391,5.3.180,LEONTES,"Lead us from hence, where we may leisurely"
111392,5.3.181,LEONTES,Each one demand an answer to his part
111393,5.3.182,LEONTES,Perform'd in this wide gap of time since first


In [16]:
class Data_Creator:
    def __init__(self, data):
        #data = pd.read_csv(filepath)
        self.data = self.clean(data)
#         self.tokenized_df = self.tokenize(data)
        

    def tokenize(self, df):
        data = df['text'].apply(lambda x: x.strip().lower().split())
        tokenized = df.assign(PlayerLine = data)
        return tokenized
    
    def clean(self, data):
        
        repl = {'@\w*': ' ', '&amp;' : 'and','\su\s':' you ', '&#\w*;': ' ', 
        '#':' ', '\s2\s': 'two', "ð[^ ]*": ' ' ,
        "â[^ ]*": ' ',"(dont)|(don't)": 'do not', "(cant)|(can't)": "can not",
        "(yous)|(you's)": "you is", "(yous)|(you's)": "you is", 
        "(youve)|(you've)": "you have", "(doesnt)|(doesn't)": 'does not', 
        "(wont)|(won't)": 'will not', "[0-9]+\.*[0-9%]+\w*" : "NUMBER",'\\n\.':' ' ,'\\n':' ',
        "\.{2,}": '.', "!{2,}":'!', "\?{2,}":'?', 'ing[^a-z]':' ', 'ed[^a-z]': ' ', '_':" ",
        ' +': ' ', ',':'', '\\.':' <EOS>'}

        cleaned_tweet = data['PlayerLine'].str.lower()
        cleaned_tweet = cleaned_tweet.replace(repl, regex=True)
        cleaned = data.assign(text = cleaned_tweet)
        return cleaned
    
class vocab_builder:
    """Build word <-> index lookup tables from tokenised lines.

    Attributes:
        longest: number of tokens in the longest sentence seen.
        word_idx: mapping token -> integer index.
        idx_word: mapping integer index -> token (inverse of ``word_idx``).
        tracker: mapping token -> number of occurrences in the corpus.
    """

    # Tokens guaranteed to be present in the vocabulary after construction.
    SPECIAL_TOKENS = ('<PAD>', '<UNK>', '<EOS>')

    def __init__(self, tokenized_df):
        """Index every token found in ``tokenized_df['PlayerLine']``.

        Args:
            tokenized_df: DataFrame whose ``PlayerLine`` column holds one
                iterable of tokens per row.
        """
        self.longest = 0
        self.idx_word = {}
        self.word_idx = {}
        self.tracker = {}
        for sentence in tokenized_df['PlayerLine']:
            vocab_builder.vocab_gen(sentence, self)
        # Append the special tokens only when the corpus did not already
        # contain them. Re-adding an existing token (the cleaned text can carry
        # '<EOS>') would give it a second index in idx_word and break the
        # one-to-one correspondence between the two tables.
        for token in self.SPECIAL_TOKENS:
            if token not in self.word_idx:
                index = len(self.word_idx)
                self.word_idx[token] = index
                self.idx_word[index] = token

    @staticmethod
    def vocab_gen(sentence, self):
        """Register the tokens of one sentence in the vocabulary ``self``.

        The argument order (sentence first, vocabulary second) is kept so the
        function can still be used as ``Series.apply(vocab_builder.vocab_gen,
        args=(vocab,))``.

        Args:
            sentence: iterable of tokens.
            self: the ``vocab_builder`` instance to update.
        """
        count = 0
        for count, word in enumerate(sentence, start=1):
            if word not in self.word_idx:
                index = len(self.word_idx)
                self.word_idx[word] = index
                self.idx_word[index] = word
            # .get() keeps counting safe for tokens that were indexed without
            # ever being counted (the special tokens added by __init__).
            self.tracker[word] = self.tracker.get(word, 0) + 1
        if self.longest < count:
            self.longest = count
        return

In [18]:
# Clean the selected columns, then build the word <-> index vocabulary from the
# whitespace-tokenised cleaned text. `vocab` is read by the vocabulary
# inspection and pickling cells further down, so it has to be defined here for
# the notebook to run top to bottom.
created = Data_Creator(data)
vocab = vocab_builder(created.tokenize(created.data))

In [None]:
# Preview the tokenised view of the cleaned lines (PlayerLine becomes a list of
# tokens). Only the head is shown instead of the full frame.
created.tokenize(created.data).head()

In [27]:
# Cleaned text of every line, read from the frame that Data_Creator stores on
# `created.data`; the first entry doubles as a quick sanity check.
info = created.data['text']
info.iloc[0]

'so shaken as we are so wan with care'

In [28]:
# Number of cleaned lines held in `info`.
info.shape[0]

110839

In [29]:
# Build next-line prediction pairs: each cleaned line is the source ("src") and
# the line that follows it is the target ("trg"), giving len(info) - 1 pairs.
# The Series is converted to a list once so the pairing is a plain zip rather
# than two positional lookups per row.
# NOTE(review): pairs are formed over the whole corpus, so they also span scene
# and play boundaries — confirm that is intended.
info = created.data['text']
lines = info.tolist()
train_data = [{"src": src, "trg": trg} for src, trg in zip(lines, lines[1:])]

In [None]:
import json

# Serialise the training pairs as JSON Lines: one JSON object per line.
with open('output.jsonl', 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in train_data)

In [None]:
# Inspect the index -> word lookup table held by the vocabulary object.
vocab.idx_word

In [None]:
# The vocabulary words alone (the values of the index -> word table).
vocab.idx_word.values()

In [None]:
# Write the index -> word table to disk as a pickle file.
with open('vocab_list.pickle', 'wb') as vocab_file:
    pickle.dump(vocab.idx_word, vocab_file)

In [20]:
# Output directory for the trained tokenizer and location of the raw corpus.
tokenizer_save_path = 'shakespeare-tokenizer-bert'
data_path = os.path.join('data/source', 'shakespeare.csv')

# Reload the corpus, drop rows without a Player and bare 'Exeunt' lines, keep
# the scene/speaker/text columns and run them through the cleaning step.
df = pd.read_csv(data_path)
remove = df.loc[df['Player'].notna() & df['PlayerLine'].ne('Exeunt')]
data = remove.loc[:, ['ActSceneLine', 'Player', 'PlayerLine']]
created = Data_Creator(data)


In [21]:
# Display the cleaned frame: the selected columns plus the normalised `text` column.
created.data

Unnamed: 0,ActSceneLine,Player,PlayerLine,text
3,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",so shaken as we are so wan with care
4,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",find we a time for fright peace to pant
5,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,and breathe short-wind accents of new broils
6,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,to be commenc in strands afar remote <EOS>
7,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,no more the thirsty entrance of this soil
...,...,...,...,...
111390,5.3.179,LEONTES,"Is troth-plight to your daughter. Good Paulina,",is troth-plight to your daughter <EOS> good pa...
111391,5.3.180,LEONTES,"Lead us from hence, where we may leisurely",lead us from hence where we may leisurely
111392,5.3.181,LEONTES,Each one demand an answer to his part,each one demand an answer to his part
111393,5.3.182,LEONTES,Perform'd in this wide gap of time since first,perform'd in this wide gap of time since first


In [26]:

# The pretrained BERT tokenizer serves as the template for the new tokenizer.
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# The cleaned lines are handed over through an explicit iterator; iterating the
# Series directly would most likely work as well.
lines_iter = iter(created.data['text'])

# Train a new tokenizer on the cleaned corpus with a target vocabulary size of 32000.
# NOTE(review): the cleaned text contains '<EOS>' markers — confirm whether they
# should be registered as a special token when training.
new_tokenizer = bert_tokenizer.train_new_from_iterator(
    text_iterator=lines_iter,
    vocab_size=32000,
)

# Sanity check: report the learned vocabulary size and tokenize the first line.
print(f'Tokenizer contains vocab size {new_tokenizer.vocab_size}')
test_tokenize = new_tokenizer.tokenize(created.data['text'].iloc[0])
test_tokenize




Tokenizer contains vocab size 27829


['so', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care']

In [27]:
# Check that the base tokenizer is an instance of PreTrainedTokenizerFast.
isinstance(bert_tokenizer, PreTrainedTokenizerFast)

True

In [28]:
# Save the trained tokenizer into the configured output directory and build the
# path where its vocabulary file is expected. Both statements use
# tokenizer_save_path so the save location and vocab_path cannot drift apart.
new_tokenizer.save_pretrained(tokenizer_save_path)

vocab_path = os.path.join(tokenizer_save_path, 'vocab.txt')