In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

# core

> Load the BNC2014 Spoken corpus XML files and compute basic corpus statistics (texts, speakers, vocabulary).

# Load packages

Package requirements are stored in `requirements.yml`.

In [None]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd

# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to be updated to point to your local copy of the corpus. In the current setup, the BNC2014 data are stored in the `data` folder inside the project folder, so relative paths are used.

In [None]:
# Root of the local BNC2014 Spoken download, relative to this notebook.
path_bnc = Path('../data/bnc-2014-spoken')
# Folder holding the tagged XML texts.
path_corpus = path_bnc.joinpath('spoken', 'tagged')
# Folder holding the corpus metadata files.
path_metadata = path_bnc.joinpath('spoken', 'metadata')

In [None]:
# Fail early, naming the missing folder, when the corpus is not at the configured location.
assert path_bnc.exists(), f"BNC2014 root folder not found: {path_bnc.resolve()}"
assert path_corpus.exists(), f"tagged corpus folder not found: {path_corpus.resolve()}"
assert path_metadata.exists(), f"metadata folder not found: {path_metadata.resolve()}"

# Load and parse XML

In [None]:
# `glob` yields files in no guaranteed order; sorting makes every later pass over the
# corpus (token order, first-seen speaker order) reproducible across runs and machines.
path_texts = sorted(path_corpus.glob('*.xml'))

In [None]:
# The checks in this notebook expect 1251 XML files in the tagged corpus folder.
assert len(path_texts) == 1251, f"expected 1251 XML files, found {len(path_texts)}"

In [None]:
#| export
def get_xml(f_path):
    """Parse the XML file at `f_path` and return its root element.

    The file is read as bytes so that lxml detects the encoding from the
    document itself instead of relying on the platform's default text
    encoding; lxml also rejects `str` input that carries an XML encoding
    declaration.

    Raises `OSError` if the file cannot be opened and
    `lxml.etree.XMLSyntaxError` if the content is not well-formed XML.
    """
    with open(f_path, 'rb') as xml_file:
        raw = xml_file.read()
    return etree.fromstring(raw)

# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [None]:
%%time
text_ids = []
for path in path_texts:
    xml = get_xml(path)
    id = xml.get('id')
    text_ids.append(id)

print(f"number of documents in the corpus: {len(text_ids)}")

number of documents in the corpus: 1251
CPU times: user 23.7 s, sys: 2.31 s, total: 26 s
Wall time: 26.6 s


In [None]:
# One id is collected per corpus file, so the count must match the expected number of texts.
assert len(text_ids) == 1251, f"expected 1251 text ids, found {len(text_ids)}"

## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [None]:
%%time
speakers_words = defaultdict(int)
for path in path_texts:
    xml = get_xml(path)
    for u in xml.iter('u'):
        speaker = u.get('who')
        n_words = len([w for w in u.iter('w')])
        speakers_words[speaker] += n_words

CPU times: user 27.8 s, sys: 2.11 s, total: 29.9 s
Wall time: 29.9 s


### Number of speakers

In [None]:
# Each distinct `who` value seen on a <u> element is one key of `speakers_words`.
print(f"number of speakers: {len(speakers_words)}")

number of speakers: 671


### Words per speaker

In [None]:
# Turn the tally dict into a two-column table: one row per speaker ID with its word count.
df_speakers_words = pd.DataFrame({
    'speaker': list(speakers_words.keys()),
    'n_words': list(speakers_words.values()),
})
# Display the speakers ordered from most to fewest words (the stored frame stays unsorted).
df_speakers_words.sort_values(by='n_words', ascending=False)

Unnamed: 0,speaker,n_words
179,S0192,362107
6,S0012,277953
17,S0084,276558
18,S0041,208025
59,S0439,205049
...,...,...
537,S0121,61
654,S0414,43
388,S0413,36
670,S0066,28


## Vocabulary

In [None]:
%%time
tokens = []
for path in path_texts:
    xml = get_xml(path)
    for w in xml.iter('w'):
        tokens.append(w.text)

CPU times: user 26 s, sys: 2.54 s, total: 28.6 s
Wall time: 28.9 s


In [None]:
# `tokens` holds one entry per <w> element found in the corpus files.
print(f"total number of tokens in the corpus: {len(tokens)}")

total number of tokens in the corpus: 11422615


In [None]:
# Types are the distinct values in `tokens`; `set` compares them exactly, so
# differently-cased forms of a word count as separate types.
print(f"total number of types in the corpus: {len(set(tokens))}")

total number of types in the corpus: 69190


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()