In [1]:
from math import sqrt
import regex as re
import os
from glob import glob
import numpy as np
import pandas as pd
import torch
import torch.nn
from torch.utils.data import DataLoader, Dataset
import transformers as ppb
from tokenizers import ByteLevelBPETokenizer
import warnings
import xml.etree.ElementTree as ET
import json

import spacy

warnings.filterwarnings('ignore')

### Fine-tune using BNC Corpus

In [4]:
# prep-clean up documents
# Feed into BERT / roBERTa to fine-tune language model

In [5]:
bnc_corpus_loc = '../data/bnc2014spoken-xml/spoken/untagged/*'

In [17]:
len(glob(bnc_corpus_loc))

1251

In [7]:
texts = {}
text_speakers = {}
child_els = {}
speakers = {}
for file in glob(bnc_corpus_loc):
    name = file.split('/')[-1].split('.')[0]
    tree = ET.parse(file)
    root = tree.getroot()
    file_id = root.attrib['id']
    speakers[name] = root.find('header').find('list_speakers').text.split(' ')
    text_s = []
    text = []
    c_els = []
    utterances = tree.getroot().find('body').findall('u')
    for u in utterances:
        c_els += u.getchildren()
        text_s.append(u.attrib['who'])
        s = ET.tostring(u, encoding='unicode')
        s = re.sub('<pause dur="short"\s*/>', '...', s)
        s = re.sub('<pause dur="long"\s*/>', '... ...', s)
        s = re.sub('<anon .*\/>', '', s)
        s = re.sub('<shift.*\/>', '', s)
        s = re.sub('<trunc>(\w*)</trunc>', '\\1', s)
        s = re.sub('<unclear>(.*)</unclear>', '\\1', s)
        s = re.sub('<\/?unclear\s*\/?>', '', s)
        s = re.sub('<vocal desc="laugh"\s*\/>', '&amp;=laughs', s)
        s = re.sub('<\/?event\s*\/?>', '', s)
        s = re.sub('<foreign.*>(.*)</foreign>', '\\1', s)
        try:
            text.append(ET.fromstring(s).text)
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(f'{name} --- {s}')
            text.append(s)
    child_els[name] = c_els
    texts[name] = text
    text_speakers[name] = text_s

Traceback (most recent call last):
  File "<ipython-input-7-d27e133ea087>", line 30, in <module>
    text.append(ET.fromstring(s).text)
  File "/Users/tom/anaconda3/envs/distil_bert/lib/python3.7/xml/etree/ElementTree.py", line 1315, in XML
    parser.feed(text)
  File "<string>", line None
xml.etree.ElementTree.ParseError: mismatched tag: line 1, column 107


S99Z --- <u n="343" who="UNKMALE" whoConfidence="low">good good ... ... <foreign lang="fre">quelle heure est que ?</u>



- pause tags become '...'
- trunc tags get the text included
- shift is ignored.
- anon removed - should be replaced with a random place?? 
- event is ignored
- anon is ignored
- vocal (laugh), &=laughs

In [24]:
text_speakers_clean = {k: [s for s, t in zip(v, texts[k]) if t is None or len(t) > 0] for k,v in text_speakers.items()}
texts_clean = {k: [t for t in v if t is not None and len(t) > 0] for k,v in texts.items()}

In [9]:
df = pd.DataFrame([{'file': f, 'speakers': speakers[f], 'text_speaker': text_speakers[f], 'texts': texts[f]} for f in texts.keys()])

In [34]:
df['texts_joined'] = df.file.apply(lambda f: ', '.join(texts_clean[f]))

In [36]:
df.to_pickle('bnc_corpus_df.pickle')

In [None]:
# shuffle and split corpus text 80/20, train / test,
# join each text into a single doc 

In [2]:
df = pd.read_pickle('bnc_corpus_df.pickle')

In [3]:
nlp = spacy.load("en_core_web_md")
nlp.pipeline = []
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)

In [4]:
%%time
sliding_window_sents = []
for doc in df.texts_joined:
    output_doc = []
    sent_idx = 0
    doc_sents = nlp(doc).sents
    split_sents = [re.sub('\s+', ' ', sent.text).split(' ') for sent in doc_sents]
    trunc_split_sents = []
    # spoken speech has potentially very long sentences. Split arbitrarily...
    for sent in split_sents:
        if len(sent) < 250:
            trunc_split_sents.append(' '.join(sent))
        else:
            for idx in range(0, len(sent), 250):
                new_sent = sent[idx:idx+250 if idx+250 <= len(sent) else len(sent)]
                trunc_split_sents.append(' '.join(new_sent))
    # sliding window of 4 'sentences' per line
    for i, sent in enumerate(trunc_split_sents):
        end_idx = i+6 if i+6 <= len(trunc_split_sents) else len(trunc_split_sents)
        sliding_window_sents.append(' '.join(trunc_split_sents[i:end_idx]))

CPU times: user 24.2 s, sys: 343 ms, total: 24.6 s
Wall time: 24.6 s


In [70]:
f = open('../data/pre-train/all_text_raw', 'w')
f.write('\n'.join(sliding_window_sents))
f.close()

### Train BPE / BERTWordPiece Tokenizers

In [7]:
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=False)

# For BERT
# from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer
# # Initialize a tokenizer
# tokenizer = BertWordPieceTokenizer(lowercase=False, handle_chinese_chars=False)

# vocab size must be the same to fine-tune BERT / RoBERTa??

# Customize training
tokenizer.train(files='../data/pre-train/all_text_raw', vocab_size=50265, min_frequency=3, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [8]:
os.makedirs('bncRoBERTaConfig', exist_ok=True)

### Split text into LM train / test sets

In [9]:
from sklearn.model_selection import train_test_split 
random_state = 42
with open('../data/pre-train/all_text_raw') as f:
    all_text = f.readlines()
train, test = train_test_split(all_text, test_size=0.2, random_state=random_state)
f = open('train', 'w')
f.write('\n'.join(train))
f.close()
f = open('test', 'w')
f.write('\n'.join(test))
f.close()

In [10]:
tokenizer.save('bncRoBERTaConfig')

['bncRoBERTaConfig/vocab.json', 'bncRoBERTaConfig/merges.txt']

In [None]:
!bash run_training.sh