In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp core

# BNCparse

> Parsing the Spoken BNC2014 corpus with Python.

Quirin Würschinger, LMU Munich
[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

# Load packages

Package requirements are stored in `requirements.yml`.

In [None]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd

# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables must point to the local copy of the corpus. In the current setup, the BNC2014 data are stored in the `data` folder inside the project folder, so relative paths are used.

In [None]:
# Root of the local BNC2014 download; adjust this line if the corpus lives elsewhere.
path_bnc = Path('../data/bnc-2014-spoken')
# Both sub-folders sit under `spoken/`; `/` on a Path already yields a Path.
path_corpus = path_bnc / 'spoken' / 'tagged'
path_metadata = path_bnc / 'spoken' / 'metadata'

In [None]:
# Fail early, naming the offending path, if the corpus is not where the
# configuration cell expects it.
assert path_bnc.exists(), f"BNC2014 root folder not found: {path_bnc}"
assert path_corpus.exists(), f"tagged corpus folder not found: {path_corpus}"
assert path_metadata.exists(), f"metadata folder not found: {path_metadata}"

# Load and parse XML

In [None]:
# Directory listings come back in filesystem-dependent order; sorting makes the
# text order (and therefore every "first N rows" preview/export below) reproducible.
path_texts = sorted(path_corpus.glob('*.xml'))

In [None]:
# 1251 is the number of XML files expected in the tagged corpus folder.
assert len(path_texts) == 1251, (
    f"expected 1251 XML files in {path_corpus}, found {len(path_texts)}"
)

In [None]:
#| export
def get_xml(f_path):
    """Parse the XML file at `f_path` and return its root element.

    The file is read as raw bytes so that the XML parser determines the
    encoding itself (from the XML declaration, defaulting to UTF-8) rather
    than relying on the platform's default text encoding. Passing bytes also
    avoids lxml's refusal to parse `str` input that carries an encoding
    declaration.
    """
    raw = Path(f_path).read_bytes()
    return etree.fromstring(raw)

In [None]:
# Parse every corpus file up front; `texts` holds one XML root element per file.
texts = list(map(get_xml, path_texts))

# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [None]:
# Collect the `id` attribute of each parsed root element.
text_ids = [root.get('id') for root in texts]

print(f"number of documents in the corpus: {len(text_ids)}")

number of documents in the corpus: 1251


In [None]:
# One ID per parsed file, so this must match the expected file count.
assert len(text_ids) == 1251, f"expected 1251 texts, found {len(text_ids)}"

## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [None]:
# Tally, per `who` value, how many <w> elements occur inside that speaker's <u> elements.
speakers_words = defaultdict(int)
for text in texts:
    for utterance in text.iter('u'):
        # Count the <w> descendants lazily instead of materialising a list.
        speakers_words[utterance.get('who')] += sum(1 for _ in utterance.iter('w'))

### Number of speakers

In [None]:
# `speakers_words` has one key per distinct `who` value seen on a <u> element.
print(f"number of speakers: {len(speakers_words)}")

number of speakers: 671


### Words per speaker

In [None]:
# One row per speaker, largest contributors first; built in a single chain so
# the name is only ever bound to the sorted frame.
df_speakers_tokens = (
    pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'tokens'])
    .sort_values('tokens', ascending=False)
)
df_speakers_tokens

Unnamed: 0,speaker,tokens
179,S0192,362107
6,S0012,277953
17,S0084,276558
18,S0041,208025
59,S0439,205049
...,...,...
537,S0121,61
654,S0414,43
388,S0413,36
670,S0066,28


In [None]:
# `to_csv` does not create missing parent folders, so make sure the output
# folder exists before the first export.
path_out = Path('../out')
path_out.mkdir(parents=True, exist_ok=True)
df_speakers_tokens.to_csv(path_out / 'speakers_tokens.csv', index=False)

## Vocabulary

In [None]:
# Flat list with the text content of every <w> element, in document order per text.
tokens = [word.text for text in texts for word in text.iter('w')]

In [None]:
# Corpus size: running words (tokens) versus distinct `w.text` values (types).
vocab_rows = [
    ['tokens', f'{len(tokens):,}'],
    ['types', f'{len(set(tokens)):,}'],
]
pd.DataFrame(vocab_rows)

        0           1
0  tokens  11,422,615
1   types      69,190


# Export corpus data in tabular format

In [None]:
%%time

tokens = []

for text in texts:
    for u in text.findall('u'):
        for i, w in enumerate(u.iter('w')):
            tok_d = {}

            tok_d['text_id'] = text.get('id')

            tok_d['u_n'] = u.get('n')
            tok_d['u_who'] = u.get('who')
            tok_d['u_trans'] = u.get('trans')
            tok_d['u_whoConfidence'] = u.get('whoConfidence')

            tok_d['w_pos'] = w.get('pos')
            tok_d['w_lemma'] = w.get('lemma')
            tok_d['w_class'] = w.get('class')
            tok_d['w_usas'] = w.get('usas')
            tok_d['w_text'] = w.text
            tok_d['w_idx'] = i + 1

            tokens.append(tok_d)


CPU times: user 58 s, sys: 1min 46s, total: 2min 44s
Wall time: 3min 57s


In [None]:
%%time
# Convert the list of per-token dicts into a DataFrame. Note that `tokens` is
# rebound here, so every later cell sees the DataFrame, not the list.
tokens = pd.DataFrame(tokens)

CPU times: user 25.5 s, sys: 1min 36s, total: 2min 1s
Wall time: 2min 58s


In [None]:
# Preview the first rows to sanity-check the column layout.
tokens.head(50)

Unnamed: 0,text_id,u_n,u_who,u_trans,u_whoConfidence,w_pos,w_lemma,w_class,w_usas,w_text,w_idx
0,SN64,1,S0590,nonoverlap,high,PPIS1,i,PRON,Z8,I,1
1,SN64,1,S0590,nonoverlap,high,VBM,be,VERB,A3,'m,2
2,SN64,1,S0590,nonoverlap,high,JJ,glad,ADJ,E4:2,glad,3
3,SN64,1,S0590,nonoverlap,high,EX,there,PRON,Z5,there,4
4,SN64,1,S0590,nonoverlap,high,VBZ,be,VERB,A3,'s,5
5,SN64,1,S0590,nonoverlap,high,AT,the,ART,Z5,the,6
6,SN64,1,S0590,nonoverlap,high,MC,two,ADJ,N1,two,7
7,SN64,1,S0590,nonoverlap,high,PNQS,who,PRON,Z8,who,8
8,SN64,1,S0590,nonoverlap,high,PPIS1,i,PRON,Z8,I,9
9,SN64,1,S0590,nonoverlap,high,RR,almost,ADV,A13:4,almost,10


In [None]:
# The token table must have one row per <w> element; 11,422,615 is the token
# total reported in the Vocabulary section.
assert len(tokens) == 11_422_615, (
    f"expected 11,422,615 token rows, found {len(tokens):,}"
)

I export the full token table to `tokens.csv`.

In [None]:
# Write the full token table; `index=False` leaves the row index out of the file.
tokens.to_csv('../out/tokens.csv', index=False)

I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored as `tokens_50k.csv`.

In [None]:
# Spreadsheet-sized sample: only the first 50,000 rows of `tokens`.
tokens.iloc[:50_000].to_csv('../out/tokens_50k.csv', index=False)


In [None]:
#| hide
# Run nbdev's export step for this notebook (cells marked `#| export`).
import nbdev; nbdev.nbdev_export()