In [29]:
from math import sqrt
import regex as re
import os
from glob import glob
import numpy as np
import pandas as pd
import torch
import torch.nn
from torch.utils.data import DataLoader, Dataset
import transformers as ppb
import warnings
import xml.etree.ElementTree as ET
import json

warnings.filterwarnings('ignore')

### Fine-tune using BNC Corpus

In [None]:
# prep-clean up documents
# Feed into BERT / roBERTa to fine-tune language model

In [233]:
bnc_corpus_loc = '../data/bnc2014spoken-xml/spoken/untagged/*'

In [None]:
[[set(e.tag) for e in els] for els in child_els.values()]

In [454]:
glob(bnc_corpus_loc)[0]

'../data/bnc2014spoken-xml/spoken/untagged/SQ2W.xml'

In [468]:
texts = {}
text_speakers = {}
child_els = {}
speakers = {}
for file in glob(bnc_corpus_loc):
    name = file.split('/')[-1].split('.')[0]
    tree = ET.parse(file)
    root = tree.getroot()
    file_id = root.attrib['id']
    speakers[name] = root.find('header').find('list_speakers').text.split(' ')
    text_s = []
    text = []
    c_els = []
    utterances = tree.getroot().find('body').findall('u')
    for u in utterances:
        c_els += u.getchildren()
        text_s.append(u.attrib['who'])
        s = ET.tostring(u, encoding='unicode')
        s = re.sub('<pause dur="short"\s*/>', '...', s)
        s = re.sub('<pause dur="long"\s*/>', '... ...', s)
        s = re.sub('<anon .*\/>', '', s)
        s = re.sub('<shift.*\/>', '', s)
        s = re.sub('<trunc>(\w*)</trunc>', '\\1', s)
        s = re.sub('<unclear>(.*)</unclear>', '\\1', s)
        s = re.sub('<\/?unclear\s*\/?>', '', s)
        s = re.sub('<vocal desc="laugh"\s*\/>', '&amp;=laughs', s)
        s = re.sub('<\/?event\s*\/?>', '', s)
        s = re.sub('<foreign.*>(.*)</foreign>', '\\1', s)
        try:
            text.append(ET.fromstring(s).text)
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(f'{name} --- {s}')
            text.append(s)
    child_els[name] = c_els
    texts[name] = text
    text_speakers[name] = text_s

Traceback (most recent call last):
  File "<ipython-input-468-d27e133ea087>", line 30, in <module>
    text.append(ET.fromstring(s).text)
  File "/Users/tom/anaconda3/envs/distil_bert/lib/python3.7/xml/etree/ElementTree.py", line 1315, in XML
    parser.feed(text)
  File "<string>", line None
xml.etree.ElementTree.ParseError: mismatched tag: line 1, column 107


S99Z --- <u n="343" who="UNKMALE" whoConfidence="low">good good ... ... <foreign lang="fre">quelle heure est que ?</u>



- pause tags become '...'
- trunc tags get the text included
- shift is ignored.
- anon removed - should be replaced with a random place?? 
- event is ignored
- anon is ignored
- vocal (laugh), &=laughs

In [None]:
text_speakers_clean = {k: [s for s, t in zip(v, texts[k]) if t is None or len(t) > 0] for k,v in text_speakers.items()}
texts_clean = {k: [t for t in v if t is None or len(t) > 0] for k,v in texts.items()}

In [503]:
df = pd.DataFrame([{'file': f, 'speakers': speakers[f], 'text_speaker': text_speakers[f], 'texts': texts[f]} for f in texts.keys()])

In [None]:
df['texts_joined'] = df.texts.apply(lambda s: '. '.join(s))

In [13]:
df.to_excel('bnc_corpus_df.csv', index=False)

In [None]:
# shuffle and split corpus text 80/20, train / test,
# join each text into a single doc 

In [42]:
df = pd.read_excel('bnc_corpus_df.xlsx')