In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
#| default_exp core


# BNCparse

> Parsing the BNC2014 Spoken with Python.

Quirin Würschinger, LMU Munich

[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

## Data overview

```{mermaid}
classDiagram

class text {
    <<conversation>>
    text_id("Text ID")
}

class u {
    <<utterance>>
    n("Consecutive utterance number")
    who("Speaker ID")
    trans("Transition type")
    whoConfidence("Attribution confidence")
}

class w {
    <<token>>
    pos("part-of-speech tag [CLAWS]")
    lemma("lemmatised form")
    class("“simple” POS tag or major word-class")
    usas("semantic tag [USAS]")
}

class meta_speaker {
    <<meta_speaker>>
    id("Speaker ID")
    exactage("Exact age")
    age1994("Age (BNC1994 groups)")
    agerange("Age range")
    gender("Gender")
    nat("Nationality")
    birthplace("Place of birth")
    birthcountry("Country of birth")
    l1("First language")
    lingorig("Linguistic origin")
    dialect_rep("Accent/dialect as reported")
    hab_city("City/town living")
    hab_country("Country living")
    hab_dur("Duration living (years)")
    dialect_l1("Dialect at Level 1")
    dialect_l2("Dialect at Level 2")
    dialect_l3("Dialect at Level 3")
    dialect_l4("Dialect at Level 4")
    edqual("Highest qualification")
    occupation("Occupation: title")
    socgrade("Class: Social grade")
    nssec("Class: NS-SEC")
    l2("L2 (if bilingual)")
    fls("Foreign languages spoken")
    in_core("Part of core set of speakers")
}

class meta_text {
    <<meta_text>>
    text_id("Text ID")
    rec_length("Recording length")
    rec_date("Recording date")
    rec_year("Year of recording")
    rec_period("Recording period")
    n_speakers("Number of speakers")
    list_speakers("List of speaker IDs")
    rec_loc("Recording location")
    relationships("Inter-speaker relationship")
    topics("Topics covered")
    activity("Activity description")
    conv_type("Selected characterisations of conversation type")
    conventions("Transcription conventions used")
    in_sample("Sample release inclusion")
    transcriber("Transcriber")
}

text ..* u : contains
u ..* w : contains
text .. meta_text : text_id
u .. meta_speaker : who=id
```

# Load packages

Package requirements are stored in `requirements.yml`.

In [None]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd


# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to point to the local path of the corpus. In the current setup, the BNC2014 data are stored in the project folder under `data/bnc-2014-spoken`.



For development, I use a small subset of the corpus stored in `data/test`, which contains only the first 10 texts. Set `testing = False` in the cell below to process the full corpus.

In [None]:
testing = True

# Corpus location and the expected corpus size for the selected data set
# (small test subset vs. full corpus).
path_bnc = Path('../data/test/bnc-2014-spoken' if testing else '../data/bnc-2014-spoken')
texts_n = 10 if testing else 1251
tokens_n = 94_659 if testing else 11_422_615

In [None]:
# Directories with the tagged texts and the metadata, both below `spoken/`
# in the corpus root.
path_corpus = path_bnc.joinpath('spoken', 'tagged')
path_metadata = path_bnc.joinpath('spoken', 'metadata')


In [None]:
# Fail early if any of the configured corpus directories is missing.
assert all(
    directory.exists()
    for directory in (path_bnc, path_corpus, path_metadata)
)


# Load and parse XML

In [None]:
# `glob` yields files in an arbitrary, file-system-dependent order. Sorting makes
# the order of `path_texts` -- and of everything derived from it, such as the row
# order of the token table -- reproducible across machines and runs.
path_texts = sorted(path_corpus.glob('*.xml'))


In [None]:
# Every expected corpus file must have been found on disk.
assert texts_n == len(path_texts)


In [None]:
#| export
def get_xml(f_path):
    """Parse the XML file at `f_path` and return its root element.

    The file is read as raw bytes so that the XML parser does the decoding
    itself (using the document's encoding declaration, or the XML default).
    Reading in text mode would decode with the platform's default encoding
    instead, and lxml refuses `str` input that carries an XML encoding
    declaration.
    """
    with open(f_path, 'rb') as xml_file:
        raw_xml = xml_file.read()
    return etree.fromstring(raw_xml)


In [None]:
# One parsed XML root element per corpus file, in the order of `path_texts`.
texts = list(map(get_xml, path_texts))


# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [None]:
# Collect the `id` attribute of every parsed root element.
text_ids = [root.get('id') for root in texts]

print(f"number of documents in the corpus: {len(text_ids)}")


number of documents in the corpus: 1251


In [None]:
# The number of collected IDs must match the expected number of texts.
assert texts_n == len(text_ids)


## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [None]:
# Running total of `w` elements per speaker ID (the `who` attribute of `u`).
speakers_words = defaultdict(int)
for text in texts:
    for u in text.iter('u'):
        speakers_words[u.get('who')] += sum(1 for _ in u.iter('w'))


### Number of speakers

In [None]:
# `speakers_words` has one key per distinct `who` value, so its length is the
# number of speakers.
print(f"number of speakers: {len(speakers_words)}")


number of speakers: 671


### Words per speaker

In [None]:
# One row per speaker, ordered from the most to the least tokens.
df_speakers_tokens = (
    pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'tokens'])
    .sort_values('tokens', ascending=False)
)
df_speakers_tokens


Unnamed: 0,speaker,tokens
179,S0192,362107
6,S0012,277953
17,S0084,276558
18,S0041,208025
59,S0439,205049
...,...,...
537,S0121,61
654,S0414,43
388,S0413,36
670,S0066,28


The table containing all speakers and their total token counts is written to `out/speakers_tokens.csv` (only when `testing` is `False`).

In [None]:
# Skip the export when running on the test subset; the CSV is only written
# for full-corpus runs.
if not testing:
    df_speakers_tokens.to_csv('../out/speakers_tokens.csv', index=False)


## Vocabulary

In [None]:
# Text content of every `w` element in the corpus, in document order.
tokens = [w.text for text in texts for w in text.iter('w')]


In [None]:
# Token and type counts, formatted with thousands separators.
pd.DataFrame(
    [
        ('tokens', format(len(tokens), ',')),
        ('types', format(len(set(tokens)), ',')),
    ]
)


Unnamed: 0,0,1
0,tokens,11422615
1,types,69190


# Export corpus data in tabular format

In addition to the metadata present in the corpus, I’ve added three columns providing positional information about the tokens:

- `u_toks`: total number of tokens in the given utterance
- `w_idx`: token position (‘index’) in the given utterance, starting at 1
- `w_idx_rel`: relative token position in the given utterance: `w_idx / u_toks`, rounded to two decimal places

In [None]:
%%time

# Build one record (dict) per token; the list is turned into a DataFrame afterwards.
tokens = []

for text in texts:
    text_id = text.get('id')

    for u in text.findall('u'):
        # Collect the utterance's tokens once. Counting them again for every
        # single token made the work quadratic in the utterance length.
        u_words = list(u.iter('w'))
        u_toks = len(u_words)

        # Text- and utterance-level values are identical for all tokens of `u`,
        # so they are looked up once and copied into each token record.
        u_attrs = {
            'text_id': text_id,
            'u_n': u.get('n'),
            'u_who': u.get('who'),
            'u_trans': u.get('trans'),
            'u_whoConfidence': u.get('whoConfidence'),
            'u_toks': u_toks,
        }

        # `w_idx` is the 1-based position of the token within its utterance.
        for w_idx, w in enumerate(u_words, start=1):
            tokens.append({
                **u_attrs,
                'w_pos': w.get('pos'),
                'w_lemma': w.get('lemma'),
                'w_class': w.get('class'),
                'w_usas': w.get('usas'),
                'w_text': w.text,
                'w_idx': w_idx,
                'w_idx_rel': round(w_idx / u_toks, 2),
            })


CPU times: user 1min 43s, sys: 1min 42s, total: 3min 26s
Wall time: 4min 19s


In [None]:
%%time
# Rebind `tokens` from the list of per-token dicts to a DataFrame
# (one row per token, one column per dict key).
tokens = pd.DataFrame(tokens)


CPU times: user 54.8 s, sys: 3min 36s, total: 4min 31s
Wall time: 6min 1s


In [None]:
# Preview the first 50 rows of the token table.
tokens.head(50)


Unnamed: 0,text_id,u_n,u_who,u_trans,u_whoConfidence,u_toks,w_pos,w_lemma,w_class,w_usas,w_text,w_idx,w_idx_rel
0,SN64,1,S0590,nonoverlap,high,18,PPIS1,i,PRON,Z8,I,1,0.06
1,SN64,1,S0590,nonoverlap,high,18,VBM,be,VERB,A3,'m,2,0.11
2,SN64,1,S0590,nonoverlap,high,18,JJ,glad,ADJ,E4:2,glad,3,0.17
3,SN64,1,S0590,nonoverlap,high,18,EX,there,PRON,Z5,there,4,0.22
4,SN64,1,S0590,nonoverlap,high,18,VBZ,be,VERB,A3,'s,5,0.28
5,SN64,1,S0590,nonoverlap,high,18,AT,the,ART,Z5,the,6,0.33
6,SN64,1,S0590,nonoverlap,high,18,MC,two,ADJ,N1,two,7,0.39
7,SN64,1,S0590,nonoverlap,high,18,PNQS,who,PRON,Z8,who,8,0.44
8,SN64,1,S0590,nonoverlap,high,18,PPIS1,i,PRON,Z8,I,9,0.5
9,SN64,1,S0590,nonoverlap,high,18,RR,almost,ADV,A13:4,almost,10,0.56


In [None]:
# The token table must contain exactly the expected number of rows.
assert tokens_n == len(tokens)


I export the full token table to `tokens.csv`.

In [None]:
# Skip the export when running on the test subset; the CSV is only written
# for full-corpus runs.
if not testing:
    tokens.to_csv('../out/tokens.csv', index=False)


I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in `tokens_50k.csv`.

In [None]:
if not testing:
    # Only the first 50,000 rows are written, giving a smaller file.
    tokens.head(50_000).to_csv('../out/tokens_50k.csv', index=False)


In [None]:
#| hide
import nbdev
# Write the cells marked with `#| export` to the package's Python modules.
nbdev.nbdev_export()
