In [None]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp core

# BNCparse

> Parsing the BNC2014 Spoken with Python.

Quirin Würschinger, LMU Munich

[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

## Data overview

```{mermaid}
%%| fig-width: 7
classDiagram

class text {
    <<conversation>>
    text_id : "Text ID"
}

class u {
    <<utterance>>
    n : "Consecutive utterance number"
    who : "Speaker ID"
    trans : "Transition type"
    whoConfidence: "Attribution confidence"
    + u_toks : "Number of tokens in the utterance"
}

class w {
    <<token>>
    pos : "part-of-speech tag [CLAWS]"
    lemma : "lemmatised form"
    class : "“simple” POS tag or major word-class"
    usas : "semantic tag [USAS]"
    + w_idx : "token position in the given utterance"
    + w_idx_rel : "relative token position in the given utterance"
}

class meta_speaker {
    <<meta_speaker>>
    id : "Speaker ID"
    exactage : "Exact age"
    age1994 : "Age [BNC1994 groups]"
    agerange : "Age range"
    gender : "Gender"
    nat : "Nationality"
    birthplace : "Place of birth"
    birthcountry : "Country of birth"
    l1 : "First language"
    lingorig : "Linguistic origin"
    dialect_rep : "Accent/dialect as reported"
    hab_city : "City/town living"
    hab_country : "Country living"
    hab_dur : "Duration living [years]"
    dialect_l1 : "Dialect at Level 1"
    dialect_l2 : "Dialect at Level 2"
    dialect_l3 : "Dialect at Level 3"
    dialect_l4 : "Dialect at Level 4"
    edqual : "Highest qualification"
    occupation : "Occupation: title"
    socgrade : "Class: Social grade"
    nssec : "Class: NS-SEC"
    l2 : "L2 [if bilingual]"
    fls : "Foreign languages spoken"
    in_core : "Part of core set of speakers"
}

class meta_text {
    <<meta_text>>
    text_id : "Text ID"
    rec_length : "Recording length"
    rec_date : "Recording date"
    rec_year : "Year of recording"
    rec_period : "Recording period"
    n_speakers : "Number of speakers"
    list_speakers : "List of speaker IDs"
    rec_loc : "Recording location"
    relationships : "Inter-speaker relationship"
    topics : "Topics covered"
    activity : "Activity description"
    conv_type : "Selected characterisations of conversation type"
    conventions : "Transcription conventions used"
    in_sample : "Sample release inclusion"
    transcriber : "Transcriber"
}

text ..* u : contains
u ..* w : contains
text .. meta_text : text_id
u .. meta_speaker : who=id
```

# Load packages

Package requirements are stored in `requirements.yml`.

In [None]:
#| export
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd


# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to be updated to point to the corpus’ local path. In the current setup, the BNC2014 data are stored in `data/bnc-2014-spoken` inside the project folder.



For development, I use a small subset of the corpus stored in `data/test`, which contains only the first 10 texts.

In [None]:
# Switch between the small development subset (True) and the full corpus (False).
testing = True

# Root folder of the selected corpus copy; fail early if it is missing.
path_bnc = Path('../data/test/bnc-2014-spoken' if testing else '../data/bnc-2014-spoken')
assert path_bnc.exists()

# Expected number of texts and tokens for the selected copy; the sanity-check
# assertions further down the notebook compare against these values.
texts_n, tokens_n = (10, 94_659) if testing else (1251, 11_422_615)

In [None]:
# Locate the two sub-folders used below: the tagged XML transcripts and the
# metadata tables. Both must exist for the rest of the notebook to run.
path_corpus = path_bnc.joinpath('spoken', 'tagged')
assert path_corpus.exists()
path_metadata = path_bnc.joinpath('spoken', 'metadata')
assert path_metadata.exists()


In [None]:
# Metadata files are resolved relative to `path_metadata`, so they follow the
# corpus copy selected via `testing` / `path_bnc` instead of always pointing
# at the full-corpus folder.

# Speaker metadata table and the list of its field names.
fp_meta_speakers = path_metadata / 'bnc2014spoken-speakerdata.tsv'
assert fp_meta_speakers.exists()
fp_meta_speakers_fields = path_metadata / 'metadata-fields-speaker.txt'
assert fp_meta_speakers_fields.exists()

# Text metadata table and the list of its field names.
fp_meta_texts = path_metadata / 'bnc2014spoken-textdata.tsv'
assert fp_meta_texts.exists()
fp_meta_texts_fields = path_metadata / 'metadata-fields-text.txt'
assert fp_meta_texts_fields.exists()

# Load and parse XML

In [None]:
# Sort the paths: the order in which glob yields directory entries is not
# guaranteed, and the token table built below (including its 50k-token
# excerpt) follows this order.
path_texts = sorted(path_corpus.glob('*.xml'))


In [None]:
# Sanity check: the number of XML files found matches the expected text count.
assert len(path_texts) == texts_n


In [None]:
#| export
def get_xml(f_path):
    """Parse the XML file at `f_path` and return its root element.

    The file is read as bytes rather than text so that the XML parser
    determines the encoding itself. This avoids decoding with the platform's
    default encoding, and it avoids lxml's `ValueError` for `str` input that
    carries an XML encoding declaration.
    """
    with open(f_path, 'rb') as f:
        raw = f.read()
    return etree.fromstring(raw)


In [None]:
# Parse every transcript up front; each entry is the root element returned by get_xml.
texts = [get_xml(path) for path in path_texts]


# Corpus statistics

## Texts

Calculate the total number of texts in the corpus.

In [None]:
# The text identifier is read from the `id` attribute of each root element.
text_ids = [xml.get('id') for xml in texts]

print(f"number of documents in the corpus: {len(text_ids)}")


In [None]:
# Sanity check: one ID per expected text.
assert len(text_ids) == texts_n


## Speakers

1. Determine all speakers in the corpus.
2. Calculate the total number of words each speaker has contributed to the corpus.

In [None]:
# Tally, per speaker ID (`who` attribute), the number of <w> elements found
# inside that speaker's <u> elements across all texts.
speakers_words = defaultdict(int)
for text in texts:
    for utterance in text.iter('u'):
        who = utterance.get('who')
        speakers_words[who] += sum(1 for _ in utterance.iter('w'))


### Number of speakers

In [None]:
# One dictionary key per distinct `who` value seen in the corpus.
print(f"number of speakers: {len(speakers_words)}")


### Words per speaker

In [None]:
# One row per speaker, ordered from the most to the least productive speaker.
df_speakers_tokens = (
    pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'tokens'])
    .sort_values('tokens', ascending=False)
)
df_speakers_tokens


The table containing all speakers and their total token counts can be found in `speakers_tokens.csv`.

In [None]:
# Only written when running on the full corpus (`testing` is False).
# The target folder `../out` must already exist.
if not testing:
    df_speakers_tokens.to_csv('../out/speakers_tokens.csv', index=False)


## Vocabulary

In [None]:
# Flat list with the text content of every <w> element, in corpus order.
tokens = [word.text for doc in texts for word in doc.iter('w')]


In [None]:
# Tokens = all word forms; types = distinct word forms. No normalisation is
# applied here, so forms that differ only in case count as different types.
n_tokens = len(tokens)
n_types = len(set(tokens))

# Single-row summary table with thousands separators for readability.
n_toks_types = pd.DataFrame(
    {'tokens': f'{n_tokens:,}', 'types': f'{n_types:,}'},
    index=[0],
)

n_toks_types


# Export corpus data in tabular format

In addition to the metadata present in the corpus, I’ve added three columns providing positional information about the tokens:

- `u_toks`: total number of tokens in the given utterance
- `w_idx`: token position (‘index’) in the given utterance, starting at 1
- `w_idx_rel`: relative token position in the given utterance: `w_idx / u_toks`

In [None]:
%%time

# Build one record (dict) per token, combining text-level, utterance-level and
# token-level attributes with the positional columns u_toks, w_idx and w_idx_rel.
tokens = []

for text in texts:
    # Loop-invariant per text: look the ID up once instead of once per token.
    text_id = text.get('id')

    # NOTE(review): findall('u') only matches <u> elements that are direct
    # children of the root, whereas the speaker statistics above use
    # iter('u') (all descendants) -- confirm both select the same utterances.
    for u in text.findall('u'):
        # Collect the utterance's tokens once. Counting them inside the token
        # loop re-walked the utterance for every single token.
        u_words = list(u.iter('w'))
        u_toks = len(u_words)

        # Values shared by every token of this utterance.
        u_d = {
            'text_id': text_id,
            'u_n': u.get('n'),
            'u_who': u.get('who'),
            'u_trans': u.get('trans'),
            'u_whoConfidence': u.get('whoConfidence'),
            'u_toks': u_toks,
        }

        # Token positions are 1-based.
        for w_idx, w in enumerate(u_words, start=1):
            tok_d = dict(u_d)

            tok_d['w_pos'] = w.get('pos')
            tok_d['w_lemma'] = w.get('lemma')
            tok_d['w_class'] = w.get('class')
            tok_d['w_usas'] = w.get('usas')
            tok_d['w_text'] = w.text
            tok_d['w_idx'] = w_idx
            # Relative position in the utterance, rounded to two decimals.
            tok_d['w_idx_rel'] = round(w_idx / u_toks, 2)

            tokens.append(tok_d)


In [None]:
%%time
# Rebinds `tokens` from the list of per-token dicts to a DataFrame (one row per token).
tokens = pd.DataFrame(tokens)


In [None]:
tokens.head(50)


In [None]:
# Sanity check: the table has exactly the expected number of token rows.
assert len(tokens) == tokens_n


I export the full token table to `tokens.csv`.

In [None]:
# Only written when running on the full corpus (`testing` is False).
# The target folder `../out` must already exist.
if not testing:
    tokens.to_csv('../out/tokens.csv', index=False)


I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in `tokens_50k.csv`.

In [None]:
# Spreadsheet-sized excerpt: the first 50,000 rows of the token table.
# Like the full export, it is only written when running on the full corpus.
if not testing:
    tokens.iloc[:50_000].to_csv('../out/tokens_50k.csv', index=False)


# Add metadata

## Speakers

In [None]:
# Field list for the speaker metadata: one row per field, giving its XML tag
# and its human-readable title. `skiprows=1` drops the file's first line
# (presumably a heading above the column names -- TODO confirm), and the
# first column is used as the index.
meta_speakers_head = pd.read_csv(
    fp_meta_speakers_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)

In [None]:
meta_speakers_head

Unnamed: 0_level_0,XML tag,Metadata Title
#,Unnamed: 1_level_1,Unnamed: 2_level_1
1,exactage,Exact age
2,age1994,Age (BNC1994 groups)
3,agerange,Age range
4,gender,Gender
5,nat,Nationality
6,birthplace,Place of birth
7,birthcountry,Country of birth
8,l1,First language
9,lingorig,Linguistic origin
10,dialect_rep,Accent/dialect as reported


In [None]:
# Speaker metadata table. Column names are taken from the 'XML tag' column of
# the field list loaded above; the first column of the file becomes the index
# (the speaker IDs, as the displayed output shows).
meta_speakers = pd.read_csv(
    fp_meta_speakers, 
    delimiter='\t', 
    names=meta_speakers_head['XML tag'],
    index_col=0
)

In [None]:
meta_speakers

Unnamed: 0,exactage,age1994,agerange,gender,nat,birthplace,birthcountry,l1,lingorig,dialect_rep,...,dialect_l2,dialect_l3,dialect_l4,edqual,occupation,socgrade,nssec,l2,fls,in_core
S0001,32,25_34,30_39,F,British,"Wordsley, West Midlands",England,English,England,None indicated,...,unspecified,unspecified,unspecified,5_postgrad,University researcher,A,1_2,,,n
S0002,,Unknown,19_29,F,British,Birmingham,England,English,England,Midlands,...,england,midlands,unspecified,5_postgrad,Teacher,B,2,,Japanese -- Intermediate,n
S0003,,Unknown,19_29,F,British,"Royal Leamington Spa, Warwickshire",England,English,England,Northern,...,england,north,unspecified,4_graduate,Student,E,uncat,,,n
S0004,,Unknown,30_39,M,British,,Germany,English,England,Northern,...,england,north,unspecified,5_postgrad,Engineer,C2,5,,Spanish -- Beginner,n
S0005,,60plus,80_89,F,British,Birmingham,England,English,England,Midlands,...,england,midlands,unspecified,2_secondary,Insurance Broker (retired),E,8,,French -- Beginner,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S0691,45,45_59,40_49,F,British,Barrow-In-Furness,UK,English,England,Northern/ Cumbrian,...,england,north,unspecified,3_sixthform,dental nurse (trainee),D,6,,,y
S0692,22,15_24,19_29,M,British,Barrow-in-Furness,England,English,England,Northern,...,england,north,unspecified,3_sixthform,Sales Assistant (Part time),D,6,,,n
UNKFEMALE,,Unknown,Unknown,F,,,,,,None indicated,...,unspecified,unspecified,unspecified,9_unknown,,unknown,unknown,,,n
UNKMALE,,Unknown,Unknown,M,,,,,,None indicated,...,unspecified,unspecified,unspecified,9_unknown,,unknown,unknown,,,n


## Texts

In [None]:
# Field list for the text metadata: one row per field, giving its XML tag and
# its human-readable title. `skiprows=1` drops the file's first line
# (presumably a heading above the column names -- TODO confirm), and the
# first column is used as the index.
meta_texts_head = pd.read_csv(
    fp_meta_texts_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)

In [None]:
meta_texts_head

Unnamed: 0_level_0,XML tag,Metadata Title
#,Unnamed: 1_level_1,Unnamed: 2_level_1
1,rec_length,Recording length
2,rec_date,Recording date
3,rec_year,Year of recording
4,rec_period,Recording period
5,n_speakers,Number of speakers
6,list_speakers,List of speaker IDs
7,rec_loc,Recording location
8,relationships,Inter-speaker relationship
9,topics,Topics covered
10,activity,Activity description


In [None]:
# Text metadata table. Column names are taken from the 'XML tag' column of the
# field list loaded above; the first column of the file becomes the index
# (the text IDs, as the displayed output shows).
meta_texts = pd.read_csv(
    fp_meta_texts, 
    delimiter='\t', 
    names=meta_texts_head['XML tag'],
    index_col=0
)

In [None]:
meta_texts

Unnamed: 0,rec_length,rec_date,rec_year,rec_period,n_speakers,list_speakers,rec_loc,relationships,topics,activity,conv_type,conventions,in_sample,transcriber
S23A,1:50:43,2014-12-27,2014,2014_Q4,4,S0021 S0032 S0094 S0095,Speakers' home,"Close family, partners, very close friends","Computer programming, food, wine, temperature,...",Catching up with family over food and presents,"Discussing, explaining, anecdote telling",Revised,n,T15
S24A,0:17:24,2014-09-12,2014,2014_Q3,2,S0261 S0262,"Modern Art Museum, London","Close family, partners, very close friends",The art,A couple discussing modern art at a museum,"Discussing, explaining, inquiring",Revised,y,T09
S24D,0:20:00,2016-01-14,2016,2016_Q1,3,S0653 S0654 S0655,"Home kitchen, Comberton","Close family, partners, very close friends","Lego Ninjago, Minecraft worlds",Spending time on electronic toys instead of re...,"Discusing, explaining",Revised,n,T18
S24E,0:45:53,2015-09-15,2015,2015_Q3,3,S0519 S0520 S0521,"Hunsonby, Cumbria","Close family, partners, very close friends","food, exercise, choir, family plans, family me...",Midweek family dinner,"Discussing, explaining, Inquiring, advising, a...",Revised,n,T09
S263,2:00:00,2016-02-07,2016,2016_Q1,4,S0588 S0589 S0590 S0616,ANON’s home,"Close family, partners, very close friends",,,"Discussing, explaining",Revised,n,T10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SZVB,1:00:31,2015-11-02,2015,2015_Q4,2,S0517 S0525,"(ANON’s home, Fradley, Staffs)","Close family, partners, very close friends","Babies, family, friends",Sisters talking about their family (new baby d...,"Discussing, explaining, inquiring, anecdote te...",Revised,n,T15
SZVC,0:32:00,2015-09-14,2015,2015_Q3,2,S0324 S0325,"ANON's home, Linton","Close family, partners, very close friends","school orchestra (windband), Playing the Clari...",Friends talking about school,"discussing, explaining, inquiring, complaining...",Revised,n,T10
SZW4,0:21:09,2015-10-19,2015,2015_Q4,2,S0509 S0510,"ANON & ANON's home, Hastings","Close family, partners, very close friends","Poetry, Morning Routine, Food, Social Events, ...",Mother and Daughter,"Discussing, inquiring, anecdote telling",Revised,n,T18
SZXQ,0:40:44,2012-03-21,2012,2012_Q1,2,S0058 S0120,"Botanic Gardens, Cambridge","Friends, wider family circle","TV, languages, friends, holidays, offices, comedy",,"Discussing, explaining, inquiring, complaining...",Original,y,T11


## Merge tokens with speaker & text metadata

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev
nbdev.nbdev_export()
