In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
#| default_exp core

# BNCparse

> Parsing the BNC2014 Spoken with Python.

Quirin Würschinger, LMU Munich

[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

## Data overview

```{mermaid}
%%| fig-width: 7
classDiagram

class text {
    <<conversation>>
    text_id : "Text ID"
}

class u {
    <<utterance>>
    n : "Consecutive utterance number"
    who : "Speaker ID"
    trans : "Transition type"
    whoConfidence: "Attribution confidence"
    + u_toks : "Number of tokens in the utterance"
}

class w {
    <<token>>
    pos : "part-of-speech tag [CLAWS]"
    lemma : "lemmatised form"
    class : "“simple” POS tag or major word-class"
    usas : "semantic tag [USAS]"
    + w_idx : "token position in the given utterance"
    + w_idx_rel : "relative token position in the given utterance"
}

class meta_speaker {
    <<meta_speaker>>
    id : "Speaker ID"
    exactage : "Exact age"
    age1994 : "Age [BNC1994 groups]"
    agerange : "Age range"
    gender : "Gender"
    nat : "Nationality"
    birthplace : "Place of birth"
    birthcountry : "Country of birth"
    l1 : "First language"
    lingorig : "Linguistic origin"
    dialect_rep : "Accent/dialect as reported"
    hab_city : "City/town living"
    hab_country : "Country living"
    hab_dur : "Duration living [years]"
    dialect_l1 : "Dialect at Level 1"
    dialect_l2 : "Dialect at Level 2"
    dialect_l3 : "Dialect at Level 3"
    dialect_l4 : "Dialect at Level 4"
    edqual : "Highest qualification"
    occupation : "Occupation: title"
    socgrade : "Class: Social grade"
    nssec : "Class: NS-SEC"
    l2 : "L2 [if bilingual]"
    fls : "Foreign languages spoken"
    in_core : "Part of core set of speakers"
}

class meta_text {
    <<meta_text>>
    text_id : "Text ID"
    rec_length : "Recording length"
    rec_date : "Recording date"
    rec_year : "Year of recording"
    rec_period : "Recording period"
    n_speakers : "Number of speakers"
    list_speakers : "List of speaker IDs"
    rec_loc : "Recording location"
    relationships : "Inter-speaker relationship"
    topics : "Topics covered"
    activity : "Activity description"
    conv_type : "Selected characterisations of conversation type"
    conventions : "Transcription conventions used"
    in_sample : "Sample release inclusion"
    transcriber : "Transcriber"
}

text ..* u : contains
u ..* w : contains
text .. meta_text : text_id
u .. meta_speaker : who=id
```

# Load packages

Package requirements are stored in `requirements.yml`.

In [None]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd


# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to be updated to point to the local path of the corpus. In the current setup, the BNC2014 data are stored in `data/bnc-2014-spoken` inside the project folder.



For development, I use a small subset of the corpus, stored in `data/test`, that contains only the first 10 texts.

In [None]:
testing = True

# Choose the corpus location together with the expected text/token totals
# that the sanity-check asserts later in the notebook compare against.
if not testing:
    # full corpus
    path_bnc = Path('../data/bnc-2014-spoken')
    texts_n, tokens_n = 1251, 11_422_615
else:
    # small development subset
    path_bnc = Path('../data/test/bnc-2014-spoken')
    texts_n, tokens_n = 10, 94_659

In [None]:
# Sub-folders of the corpus directory: the tagged XML texts and the metadata.
path_corpus = path_bnc.joinpath('spoken', 'tagged')
path_metadata = path_bnc.joinpath('spoken', 'metadata')


In [None]:
# Fail fast if the corpus is not where the configured paths expect it.
assert path_bnc.exists()
assert path_corpus.exists()
assert path_metadata.exists()


# Load and parse XML

In [None]:
# Sort the paths: glob yields files in filesystem order, which is not
# guaranteed to be the same on every machine, and everything downstream
# (order of the texts, row order of the token table, the "first 50,000
# tokens" export) follows the order of this list.
path_texts = sorted(path_corpus.glob('*.xml'))


In [None]:
# The number of XML files found must match the expected number of texts.
assert len(path_texts) == texts_n


In [None]:
#| export
def get_xml(f_path):
    """Parse the XML file at `f_path` and return its root element.

    The file is read as raw bytes so that the XML parser decides how to decode
    it. Reading it in text mode would decode with the platform's default
    encoding (which can garble non-ASCII characters), and lxml refuses
    already-decoded strings that carry an XML encoding declaration.

    Parameters
    ----------
    f_path : str or path-like
        Location of the XML file.

    Returns
    -------
    The root element returned by `etree.fromstring`.
    """
    with open(f_path, 'rb') as f:
        raw = f.read()
    return etree.fromstring(raw)


In [None]:
# Parse every corpus file; each entry is the root element of one text.
texts = list(map(get_xml, path_texts))


# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [None]:
# The text ID is stored in the `id` attribute of each text's root element.
text_ids = list(map(lambda root: root.get('id'), texts))

print(f"number of documents in the corpus: {len(text_ids)}")


number of documents in the corpus: 10


In [None]:
# Every parsed text must have contributed one ID.
assert len(text_ids) == texts_n


## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [None]:
# Tally, per speaker ID (`who` attribute of an utterance), how many <w>
# tokens occur in all of that speaker's utterances across the corpus.
speakers_words = defaultdict(int)
for text in texts:
    for u in text.iter('u'):
        speakers_words[u.get('who')] += sum(1 for _ in u.iter('w'))


### Number of speakers

In [None]:
# Each key of `speakers_words` is one distinct speaker ID.
print(f"number of speakers: {len(speakers_words)}")


number of speakers: 24


### Words per speaker

In [None]:
# One row per speaker, ordered from the highest to the lowest token count.
df_speakers_tokens = (
    pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'tokens'])
    .sort_values(by='tokens', ascending=False)
)
df_speakers_tokens


Unnamed: 0,speaker,tokens
20,S0336,14332
21,S0362,11261
16,S0115,7522
17,S0037,6970
1,S0623,6621
11,S0024,5086
5,S0611,4698
10,S0144,4183
14,S0687,3810
22,S0261,3596


The table containing all speakers and their total token counts is written to `out/speakers_tokens.csv`. The export only runs when `testing` is `False`.

In [None]:
if not testing:
    # `to_csv` does not create missing directories, so make sure the output
    # folder exists before writing.
    out_dir = Path('../out')
    out_dir.mkdir(parents=True, exist_ok=True)
    df_speakers_tokens.to_csv(out_dir / 'speakers_tokens.csv', index=False)


## Vocabulary

In [None]:
# Surface form (element text) of every <w> token, in corpus order.
tokens = [w.text for text in texts for w in text.iter('w')]


In [None]:
# Corpus size as a one-row table: number of tokens and number of distinct
# token forms (types), both formatted with thousands separators.
n_toks_types = pd.DataFrame(
    dict(tokens=f'{len(tokens):,}', types=f'{len(set(tokens)):,}'),
    index=[0],
)

n_toks_types


Unnamed: 0,tokens,types
0,94659,5334


# Export corpus data in tabular format

In addition to the metadata present in the corpus, I’ve added three columns providing positional information about the tokens:

- `u_toks`: total number of tokens in the given utterance
- `w_idx`: token position (‘index’) in the given utterance, starting at 1
- `w_idx_rel`: relative token position in the given utterance: `w_idx / u_toks`, rounded to two decimal places

In [None]:
%%time

# Build one dict per token; each dict becomes one row of the token table.
tokens = []

for text in texts:
    text_id = text.get('id')  # identical for every token of this text

    for u in text.findall('u'):
        # Collect the utterance's tokens once: the count is needed on every
        # row, and walking the utterance again for each token would make the
        # work quadratic in utterance length.
        u_words = list(u.iter('w'))
        u_toks = len(u_words)

        # Text- and utterance-level values shared by all tokens of `u`.
        u_fields = {
            'text_id': text_id,
            'u_n': u.get('n'),
            'u_who': u.get('who'),
            'u_trans': u.get('trans'),
            'u_whoConfidence': u.get('whoConfidence'),
            'u_toks': u_toks,
        }

        # `w_idx` is the 1-based position of the token in its utterance.
        for w_idx, w in enumerate(u_words, start=1):
            tokens.append({
                **u_fields,
                'w_pos': w.get('pos'),
                'w_lemma': w.get('lemma'),
                'w_class': w.get('class'),
                'w_usas': w.get('usas'),
                'w_text': w.text,
                'w_idx': w_idx,
                'w_idx_rel': round(w_idx / u_toks, 2),
            })


CPU times: user 700 ms, sys: 79.4 ms, total: 780 ms
Wall time: 794 ms


In [None]:
%%time
# Turn the list of per-token dicts into a table with one row per token.
# Note that this rebinds `tokens` from a list to a DataFrame.
tokens = pd.DataFrame(tokens)


CPU times: user 147 ms, sys: 18.8 ms, total: 166 ms
Wall time: 166 ms


In [None]:
# Preview of the token table: text-, utterance- and token-level columns per row.
tokens.head(50)


Unnamed: 0,text_id,u_n,u_who,u_trans,u_whoConfidence,u_toks,w_pos,w_lemma,w_class,w_usas,w_text,w_idx,w_idx_rel
0,S2EF,1,S0567,nonoverlap,high,12,VM,shall,VERB,T1:1:3,shall,1,0.08
1,S2EF,1,S0567,nonoverlap,high,12,PPIS1,i,PRON,Z8,I,2,0.17
2,S2EF,1,S0567,nonoverlap,high,12,VVI,move,VERB,M2,move,3,0.25
3,S2EF,1,S0567,nonoverlap,high,12,AT,the,ART,Z5,the,4,0.33
4,S2EF,1,S0567,nonoverlap,high,12,NN2,laptop,SUBST,Y2,laptops,5,0.42
5,S2EF,1,S0567,nonoverlap,high,12,RT,then,ADV,N4,then,6,0.5
6,S2EF,1,S0567,nonoverlap,high,12,VV0,stick,VERB,M2,stick,7,0.58
7,S2EF,1,S0567,nonoverlap,high,12,PPH1,it,PRON,Z8,it,8,0.67
8,S2EF,1,S0567,nonoverlap,high,12,II,on,PREP,N6,on,9,0.75
9,S2EF,1,S0567,nonoverlap,high,12,AT,the,ART,N6,the,10,0.83


In [None]:
# The table must contain exactly the expected number of tokens (one row each).
assert len(tokens) == tokens_n


I export the full token table to `out/tokens.csv`. The export only runs when `testing` is `False`.

In [None]:
if not testing:
    # `to_csv` does not create missing directories, so make sure the output
    # folder exists before writing.
    out_dir = Path('../out')
    out_dir.mkdir(parents=True, exist_ok=True)
    tokens.to_csv(out_dir / 'tokens.csv', index=False)


I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in `out/tokens_50k.csv`. The export only runs when `testing` is `False`.

In [None]:
if not testing:
    # `to_csv` does not create missing directories, so make sure the output
    # folder exists before writing.
    out_dir = Path('../out')
    out_dir.mkdir(parents=True, exist_ok=True)
    # Only the first 50,000 rows, to keep the file usable in spreadsheet software.
    (tokens
     .head(50_000)
     .to_csv(out_dir / 'tokens_50k.csv', index=False))


In [None]:
#| hide
import nbdev
nbdev.nbdev_export()
