In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp core

# BNCparse

> Parsing the BNC2014 Spoken with Python.

Quirin Würschinger, LMU Munich

[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

# Load packages

Package requirements are stored in `requirements.yml`.

In [None]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd

# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to be updated to point to the corpus' local path. In the current setup, the BNC2014 data are stored in `data/bnc-2014-spoken` inside the project folder.



For development, I use a small subset of the corpus contained in `data/test` that only contains the first 10 texts.

In [None]:
# test version: development subset with the first 10 texts
path_bnc = Path('../data/test/bnc-2014-spoken')
texts_n = 10
tokens_n = 94659

# full version
# NOTE(review): the two counts below are copied from the test subset; replace
# them with the full corpus' actual text and token counts before enabling this
# block, otherwise the `texts_n` / `tokens_n` assertions further down fail.
# path_bnc = Path('../data/bnc-2014-spoken')
# texts_n = 10
# tokens_n = 94659

In [None]:
# Sub-directories of the corpus root: the tagged XML texts and the metadata files.
path_corpus = path_bnc.joinpath('spoken', 'tagged')
path_metadata = path_bnc.joinpath('spoken', 'metadata')

In [None]:
# Fail early and name the missing directory if the corpus is not where the
# configuration cell says it is.
for required_dir in (path_bnc, path_corpus, path_metadata):
    assert required_dir.exists(), f"directory not found: {required_dir}"

# Load and parse XML

In [None]:
# `glob` yields files in an order that depends on the file system. Sorting makes
# the order of texts (and therefore of the exported token rows) reproducible.
path_texts = sorted(path_corpus.glob('*.xml'))

In [None]:
# The number of XML files found must match the expected number of texts.
assert len(path_texts) == texts_n, (
    f"expected {texts_n} XML files in {path_corpus}, found {len(path_texts)}"
)

In [None]:
#| export
def get_xml(f_path):
    """Parse the XML file at `f_path` and return its root element.

    The file is read as raw bytes so that lxml determines the character
    encoding from the document itself. Reading it in text mode would decode it
    with the platform's default encoding, and lxml rejects `str` input that
    carries an XML encoding declaration.

    Parameters
    ----------
    f_path : str or path-like
        Location of the XML file to parse.

    Returns
    -------
    lxml.etree._Element
        Root element of the parsed document.
    """
    with open(f_path, 'rb') as f:
        raw = f.read()
    return etree.fromstring(raw)

In [None]:
# Parse every corpus file into its root element, in the order of `path_texts`.
texts = list(map(get_xml, path_texts))

# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [None]:
# Read the `id` attribute from the root element of each parsed text.
text_ids = [root.get('id') for root in texts]

print(f"number of documents in the corpus: {len(text_ids)}")

In [None]:
# Every expected text must have been parsed.
assert len(text_ids) == texts_n, (
    f"expected {texts_n} parsed texts, got {len(text_ids)}"
)

## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [None]:
# Tally, per speaker (`who` attribute), the number of <w> elements found inside
# their <u> elements.
speakers_words = defaultdict(int)
for text in texts:
    for utterance in text.iter('u'):
        speakers_words[utterance.get('who')] += sum(1 for _ in utterance.iter('w'))

### Number of speakers

In [None]:
# Each key of `speakers_words` is one distinct speaker.
n_speakers = len(speakers_words)
print(f"number of speakers: {n_speakers}")

### Words per speaker

In [None]:
# One row per speaker, largest contributors first.
df_speakers_tokens = (
    pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'tokens'])
    .sort_values('tokens', ascending=False)
)
df_speakers_tokens

The table containing all speakers and their total token counts is written to `../out/speakers_tokens.csv`.

In [None]:
# Create the output folder on first run; `to_csv` does not create missing
# directories itself.
Path('../out').mkdir(parents=True, exist_ok=True)
df_speakers_tokens.to_csv('../out/speakers_tokens.csv', index=False)

## Vocabulary

In [None]:
# Collect the text content of every <w> element across all texts.
tokens = [word.text for text in texts for word in text.iter('w')]

In [None]:
# Corpus size as the number of tokens and the number of distinct token strings
# (types), formatted with thousands separators.
pd.DataFrame([
    ['tokens', format(len(tokens), ',')],
    ['types', format(len(set(tokens)), ',')],
])

# Export corpus data in tabular format

In [None]:
%%time

# Build one record per <w> element, combining attributes of the text, of the
# enclosing <u> element and of the word itself.
tokens = []

for text in texts:
    text_id = text.get('id')
    for u in text.findall('u'):
        # These attributes are the same for every word of the utterance.
        u_n = u.get('n')
        u_who = u.get('who')
        u_trans = u.get('trans')
        u_who_confidence = u.get('whoConfidence')

        # `w_idx` is the 1-based position of the word within its utterance.
        for w_idx, w in enumerate(u.iter('w'), start=1):
            tokens.append({
                'text_id': text_id,
                'u_n': u_n,
                'u_who': u_who,
                'u_trans': u_trans,
                'u_whoConfidence': u_who_confidence,
                'w_pos': w.get('pos'),
                'w_lemma': w.get('lemma'),
                'w_class': w.get('class'),
                'w_usas': w.get('usas'),
                'w_text': w.text,
                'w_idx': w_idx,
            })


In [None]:
%%time
# Turn the list of per-token records into a table with one row per token.
tokens = pd.DataFrame(data=tokens)

In [None]:
# Preview the first 50 rows of the token table.
tokens.iloc[:50]

In [None]:
# Number of rows, i.e. tokens, in the table.
tokens.shape[0]

In [None]:
# The token table must contain exactly the expected number of tokens.
assert len(tokens) == tokens_n, (
    f"expected {tokens_n} tokens, got {len(tokens)}"
)

I export the full token table to `tokens.csv`.

In [None]:
# Create the output folder on first run; `to_csv` does not create missing
# directories itself.
Path('../out').mkdir(parents=True, exist_ok=True)
tokens.to_csv('../out/tokens.csv', index=False)

I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in `tokens_50k.csv`.

In [None]:
# Write out the first 50,000 rows of `tokens`. The output folder is created
# first because `to_csv` does not create missing directories itself.
Path('../out').mkdir(parents=True, exist_ok=True)
(tokens
 .head(50_000)
 .to_csv('../out/tokens_50k.csv', index=False))


In [None]:
#| hide
# Run nbdev's notebook export; kept as the last cell so it sees the whole notebook.
import nbdev; nbdev.nbdev_export()