In [1]:
# Load IPython's autoreload extension and use mode 2, so that edited modules
# are re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
#| default_exp core

# BNCparse

> Parsing the BNC2014 Spoken with Python.

Quirin Würschinger, LMU Munich

[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

# Load packages

Package requirements are stored in `requirements.yml`.

In [3]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd

# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to point to the local copy of the corpus. In the current setup, the BNC2014 data are stored inside the project directory under `data/bnc-2014-spoken`.



For development, I use a small subset of the corpus, stored in `data/test`, which contains only the first 10 texts.

In [32]:
# Single switch between the development subset and the full corpus.
# The corpus path and the expected counts are selected together so that the
# sanity-check asserts further down always describe the data actually loaded.
# (Previously the full-corpus path silently overwrote the test path while the
# expected counts stayed at the test-subset values.)
USE_TEST_SUBSET = True  # set to False to process the full BNC2014 Spoken

if USE_TEST_SUBSET:
    # test version: first 10 texts only
    path_bnc = Path('../data/test/bnc-2014-spoken')
    texts_n = 10
    tokens_n = 94659
else:
    # full version
    path_bnc = Path('../data/bnc-2014-spoken')
    # TODO: record the expected counts for the full corpus here; until then
    # the count asserts below will fail for the full version.
    texts_n = None
    tokens_n = None

In [None]:
# Both folders sit below <corpus root>/spoken.
path_corpus = path_bnc.joinpath('spoken', 'tagged')      # folder globbed for *.xml texts below
path_metadata = path_bnc.joinpath('spoken', 'metadata')  # presumably the corpus metadata files

In [11]:
# Fail early and name the directory that is missing, instead of raising a
# bare AssertionError that has to be traced back to a line number.
for required_dir in (path_bnc, path_corpus, path_metadata):
    assert required_dir.exists(), f"directory not found: {required_dir}"

# Load and parse XML

In [12]:
# Sort the matches: glob yields files in filesystem-dependent order, which
# would make the row order of every table below (and the "first 50,000
# tokens" export) differ between machines and runs.
path_texts = sorted(path_corpus.glob('*.xml'))

In [13]:
# The number of XML files on disk must match the expected number of texts.
assert len(path_texts) == texts_n, (
    f"expected {texts_n} XML files in {path_corpus}, found {len(path_texts)}"
)

In [14]:
#| export
def get_xml(f_path):
    """Parse a single XML file and return its root element.

    The file is read as raw bytes and handed to lxml undecoded, so the
    parser determines the encoding itself (from the XML declaration, or the
    XML default). Reading in text mode would decode with the platform's
    default encoding, and lxml refuses `str` input that carries an XML
    encoding declaration.

    Args:
        f_path: Path (str or `pathlib.Path`) of the XML file to parse.

    Returns:
        The root `lxml.etree` element of the parsed document.

    Raises:
        OSError: If the file cannot be opened.
        lxml.etree.XMLSyntaxError: If the content is not well-formed XML.
    """
    with open(f_path, 'rb') as xml_file:
        raw = xml_file.read()
    return etree.fromstring(raw)

In [15]:
# Parse every corpus file; `texts` holds one root element per file,
# in the same order as `path_texts`.
texts = list(map(get_xml, path_texts))

# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [16]:
# Read the `id` attribute of each root element (None where it is absent).
text_ids = [root.get('id') for root in texts]

print(f"number of documents in the corpus: {len(text_ids)}")

number of documents in the corpus: 10


In [17]:
# One id must have been collected per expected text.
assert len(text_ids) == texts_n, (
    f"expected {texts_n} text ids, found {len(text_ids)}"
)

## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [18]:
# Map each speaker id (the `who` attribute of an utterance) to the total
# number of <w> elements found inside that speaker's utterances.
speakers_words = defaultdict(int)
for doc in texts:
    for utterance in doc.iter('u'):
        # Count the <w> descendants lazily; no intermediate list is needed.
        speakers_words[utterance.get('who')] += sum(1 for _ in utterance.iter('w'))

### Number of speakers

In [19]:
# Each key of `speakers_words` is one distinct value of the `who` attribute.
print(f"number of speakers: {len(speakers_words)}")

number of speakers: 24


### Words per speaker

In [20]:
# One row per speaker, ordered from the most to the least productive speaker.
df_speakers_tokens = (
    pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'tokens'])
    .sort_values('tokens', ascending=False)
)
df_speakers_tokens

Unnamed: 0,speaker,tokens
20,S0336,14332
21,S0362,11261
16,S0115,7522
17,S0037,6970
1,S0623,6621
11,S0024,5086
5,S0611,4698
10,S0144,4183
14,S0687,3810
22,S0261,3596


The table containing all speakers and their total token counts is written to `out/speakers_tokens.csv` in the project directory.

In [21]:
# Create the output folder on first use so the export also works on a fresh
# checkout where `../out` does not exist yet.
out_path = Path('../out/speakers_tokens.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df_speakers_tokens.to_csv(out_path, index=False)

## Vocabulary

In [22]:
# Flat list with the text content of every <w> element, across all texts.
tokens = [word.text for doc in texts for word in doc.iter('w')]

In [23]:
# Tokens = all word forms; types = distinct word forms (case-sensitive).
vocabulary_rows = [
    ['tokens', f'{len(tokens):,}'],
    ['types', f'{len(set(tokens)):,}'],
]
pd.DataFrame(vocabulary_rows)

Unnamed: 0,0,1
0,tokens,94659
1,types,5334


# Export corpus data in tabular format

In [24]:
%%time

# Build one record (dict) per <w> element. Each record combines the id of the
# text, the attributes of the enclosing utterance, and the word's own
# attributes. Key order defines the column order of the exported table.
tokens = []

for doc in texts:
    doc_id = doc.get('id')

    # Only <u> elements that are direct children of the root are visited.
    for utterance in doc.findall('u'):
        # Text- and utterance-level fields are identical for all words of an
        # utterance, so they are collected once per utterance.
        utterance_fields = {
            'text_id': doc_id,
            'u_n': utterance.get('n'),
            'u_who': utterance.get('who'),
            'u_trans': utterance.get('trans'),
            'u_whoConfidence': utterance.get('whoConfidence'),
        }

        # `w_idx` is the 1-based position of the word within its utterance.
        for position, word in enumerate(utterance.iter('w'), start=1):
            tokens.append({
                **utterance_fields,
                'w_pos': word.get('pos'),
                'w_lemma': word.get('lemma'),
                'w_class': word.get('class'),
                'w_usas': word.get('usas'),
                'w_text': word.text,
                'w_idx': position,
            })


CPU times: user 350 ms, sys: 22.7 ms, total: 373 ms
Wall time: 375 ms


In [25]:
%%time
# Turn the list of per-word dicts into a table: one row per word, one column
# per dict key. Note that this rebinds `tokens` from a list to a DataFrame.
tokens = pd.DataFrame(tokens)

CPU times: user 92.4 ms, sys: 9.43 ms, total: 102 ms
Wall time: 101 ms


In [26]:
# Preview the first 50 rows of the token table.
tokens.head(50)

Unnamed: 0,text_id,u_n,u_who,u_trans,u_whoConfidence,w_pos,w_lemma,w_class,w_usas,w_text,w_idx
0,S2EF,1,S0567,nonoverlap,high,VM,shall,VERB,T1:1:3,shall,1
1,S2EF,1,S0567,nonoverlap,high,PPIS1,i,PRON,Z8,I,2
2,S2EF,1,S0567,nonoverlap,high,VVI,move,VERB,M2,move,3
3,S2EF,1,S0567,nonoverlap,high,AT,the,ART,Z5,the,4
4,S2EF,1,S0567,nonoverlap,high,NN2,laptop,SUBST,Y2,laptops,5
5,S2EF,1,S0567,nonoverlap,high,RT,then,ADV,N4,then,6
6,S2EF,1,S0567,nonoverlap,high,VV0,stick,VERB,M2,stick,7
7,S2EF,1,S0567,nonoverlap,high,PPH1,it,PRON,Z8,it,8
8,S2EF,1,S0567,nonoverlap,high,II,on,PREP,N6,on,9
9,S2EF,1,S0567,nonoverlap,high,AT,the,ART,N6,the,10


In [28]:
# Number of rows in the token table (one row per <w> element).
len(tokens)

94659

In [30]:
# The token table must contain exactly the expected number of rows.
assert len(tokens) == tokens_n, (
    f"expected {tokens_n} tokens, found {len(tokens)}"
)

I export the full token table to `tokens.csv`.

In [None]:
# Write the complete token table as CSV, without the DataFrame index column.
tokens.to_csv('../out/tokens.csv', index=False)

I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in `tokens_50k.csv`.

In [None]:
# Export only the first 50,000 rows of `tokens` as a smaller CSV file.
tokens_50k = tokens.iloc[:50_000]
tokens_50k.to_csv('../out/tokens_50k.csv', index=False)


In [None]:
#| hide
# Run nbdev's export step for this project (presumably writes the cells
# flagged `#| export` to the module named by `#| default_exp` — confirm
# against the nbdev documentation).
import nbdev; nbdev.nbdev_export()