In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
#| default_exp core

# BNCparse

> Parsing the BNC2014 Spoken with Python.

Quirin Würschinger, LMU Munich

[q.wuerschinger@lmu.de](mailto:q.wuerschinger@lmu.de)

## Data overview

The diagram below illustrates all of the data that is currently available. Variables that have been added to what was available from the downloadable version of the BNC are marked with a `+` prefix.

```{mermaid}
%%| fig-width: 7
classDiagram

class text {
    <<conversation>>
    text_id : "Text ID"
}

class u {
    <<utterances.csv>>
    n : "Consecutive utterance number"
    who : "Speaker ID"
    trans : "Transition type"
    whoConfidence: "Attribution confidence"

    + u_toks_n : "Number of tokens in the utterance"
}

class w {
    <<tokens.csv>>
    pos : "part-of-speech tag [CLAWS]"
    lemma : "lemmatised form"
    class : "“simple” POS tag or major word-class"
    usas : "semantic tag [USAS]"

    + w_idx : "token position in the given utterance"
    + w_idx_rel : "relative token position in the given utterance"
    + w_L1 : "preceding token"
    + w_R1 : = "subsequent token
}

class meta_speaker {
    <<speakers.csv>>
    id : "Speaker ID"
    exactage : "Exact age"
    age1994 : "Age [BNC1994 groups]"
    agerange : "Age range"
    gender : "Gender"
    nat : "Nationality"
    birthplace : "Place of birth"
    birthcountry : "Country of birth"
    l1 : "First language"
    lingorig : "Linguistic origin"
    dialect_rep : "Accent/dialect as reported"
    hab_city : "City/town living"
    hab_country : "Country living"
    hab_dur : "Duration living [years]"
    dialect_l1 : "Dialect at Level 1"
    dialect_l2 : "Dialect at Level 2"
    dialect_l3 : "Dialect at Level 3"
    dialect_l4 : "Dialect at Level 4"
    edqual : "Highest qualification"
    occupation : "Occupation: title"
    socgrade : "Class: Social grade"
    nssec : "Class: NS-SEC"
    l2 : "L2 [if bilingual]"
    fls : "Foreign languages spoken"
    in_core : "Part of core set of speakers"
    + speaker_toks_n : "Total number of tokens"
}

class meta_text {
    <<texts.csv>>
    text_id : "Text ID"
    rec_length : "Recording length"
    rec_date : "Recording date"
    rec_year : "Year of recording"
    rec_period : "Recording period"
    n_speakers : "Number of speakers"
    list_speakers : "List of speaker IDs"
    rec_loc : "Recording location"
    relationships : "Inter-speaker relationship"
    topics : "Topics covered"
    activity : "Activity description"
    conv_type : "Selected characterisations of conversation type"
    conventions : "Transcription conventions used"
    in_sample : "Sample release inclusion"
    transcriber : "Transcriber"
}

text ..* u : contains
u ..* w : contains
text .. meta_text : text_id
u .. meta_speaker : who
```

# Load packages

Package requirements are stored in `requirements.yml`.

In [78]:
#| export
import os
from pathlib import Path
from collections import defaultdict

from lxml import etree
import pandas as pd


# Variables


For development, I use a small subset of the corpus contained in `data/test` that only contains the first 10 texts.

In [None]:
testing = False

if testing:
    path_bnc = Path('../data/test/bnc-2014-spoken')
    assert path_bnc.exists()
    texts_n = 10
    tokens_n = 94_659
else:
    path_bnc = Path('../data/bnc-2014-spoken')
    assert path_bnc.exists()
    texts_n = 1251
    tokens_n = 11_422_615

In [None]:
path_corpus = Path(path_bnc / 'spoken' / 'tagged')
path_metadata = Path(path_bnc / 'spoken' / 'metadata')
fp_meta_speakers = Path('../data/bnc-2014-spoken/spoken/metadata/bnc2014spoken-speakerdata.tsv')
fp_meta_speakers_fields = Path('../data/bnc-2014-spoken/spoken/metadata/metadata-fields-speaker.txt')
fp_meta_texts = Path('../data/bnc-2014-spoken/spoken/metadata/bnc2014spoken-textdata.tsv')
fp_meta_texts_fields = Path('../data/bnc-2014-spoken/spoken/metadata/metadata-fields-text.txt')

In [None]:
assert path_corpus.exists()
assert path_metadata.exists()
assert fp_meta_speakers.exists()
assert fp_meta_speakers_fields.exists()
assert fp_meta_texts.exists()
assert fp_meta_texts_fields.exists()

# Load and parse XML

In [None]:
path_texts = list(path_corpus.glob('*.xml'))


In [None]:
assert len(path_texts) == texts_n


In [None]:
#| export
def get_xml(f_path):
    with open(f_path, 'r') as f:
        f = f.read()
    xml = etree.fromstring(f)
    return xml


In [None]:
texts = [get_xml(path) for path in path_texts]


# Texts

In [None]:
meta_texts_head = pd.read_csv(
    fp_meta_texts_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)

In [None]:
meta_texts = pd.read_csv(
    fp_meta_texts, 
    delimiter='\t', 
    names=meta_texts_head['XML tag'],
    index_col=0
)

## Add number of tokens per text

In [None]:
texts_tokens = []

for text in texts:
    text_d = {}
    text_d['text_id'] = text.get('id')
    text_d['text_toks_n'] = 0
    for tok in text.iter('w'):
        text_d['text_toks_n'] += 1
    texts_tokens.append(text_d)


In [None]:
texts_tokens = pd.DataFrame(texts_tokens)
texts_tokens

Unnamed: 0,text_id,text_toks_n
0,S2EF,16644
1,S2CY,2706
2,S2AJ,4161
3,S2B5,7372
4,S2DD,11452
5,S2A5,1897
6,S2AX,14492
7,S2E2,4883
8,S2C9,25593
9,S2FQ,5459


In [None]:
# reset index and call it text_id
meta_texts_merge = meta_texts.reset_index().rename(columns={'index': 'text_id'})

In [None]:
meta_texts = pd.merge(
    left=meta_texts_merge,
    right=texts_tokens,
    on='text_id'
)


In [None]:
meta_texts

Unnamed: 0,text_id,rec_length,rec_date,rec_year,rec_period,n_speakers,list_speakers,rec_loc,relationships,topics,activity,conv_type,conventions,in_sample,transcriber,text_toks_n
0,S2A5,0:12:20,2014-08-28,2014,2014_Q3,2,S0024 S0144,Speakers' home,"Close family, partners, very close friends",meeting; making arrangements for going to loca...,Partners have a chat about jetlag and babies.,Discussing,Revised,y,T15,1897
1,S2AJ,0:19:24,2015-08-04,2015,2015_Q3,2,S0439 S0441,Home - kitchen,"Close family, partners, very close friends","Food, old school friends, complaining about th...",Catch-up with housemate.,"Discussing, explaining, inquiring, complaining...",Revised,y,T19,4161
2,S2AX,1:03:29,2012-04-01,2012,2012_Q2,2,S0037 S0115,"ANON and ANON’s home, Cambridge","Close family, partners, very close friends","Music, elitism, magazines, dreams, Christmas d...",ANON and ANON talking while listening to the r...,"Discussing, explaining, anecdote telling",Original,y,T15,14492
3,S2B5,0:36:03,2012-03-06,2012,2012_Q1,2,S0024 S0144,"The Swan pub, Norfolk","Close family, partners, very close friends","Dogs, property, economics, health",Husband and wife discuss some issues over a dr...,"Discussing, explaining",Original,y,T20,7372
4,S2C9,2:12:08,2015-02-24,2015,2015_Q1,2,S0336 S0362,Speaker's home,"Friends, wider family circle","Friends, family, work, holidays, festivals, ho...",Friends catching up,"Discussing, explaining, inquiring, complaining...",Revised,n,T10,25593
5,S2CY,0:14:25,2015-11-27,2015,2015_Q4,2,S0679 S0680,"ANON’s living room, Leeds","Close family, partners, very close friends","Computers, Work colleagues in computing, Furni...",Late evening chat,"Discussing, anecdote telling, making arrangements",Revised,n,T04,2706
6,S2DD,1:04:04,2016-06-21,2016,2016_Q2,4,S0687 S0688 S0689 S0690,"A restaurant, Istria, Croatia","Close family, partners, very close friends","Food, Drink, Weather, Cities and towns in Istr...",,"Discussing, explaining, inquiring",Revised,n,T10,11452
7,S2E2,0:23:55,2012-04-16,2012,2012_Q2,2,S0030 S0096,"The university, Salford",Colleagues,CVs,Colleagues Talking about Writing a CV,"Discussing, explaining, advising",Original,y,T11,4883
8,S2EF,1:25:40,2016-01-10,2016,2016_Q1,4,S0567 S0611 S0620 S0623,"All speakers’ rented uni home, Lancaster","Close family, partners, very close friends","Discussing/eating food, bedrooms, maths, Sugar...",Talking while eating pizza with housemates,"Discussing, explaining, inquiring, anecdote te...",Revised,n,T10,16644
9,S2FQ,0:37:35,2014-09-05,2014,2014_Q3,2,S0261 S0262,House-sitting in Australia,"Close family, partners, very close friends","social story-telling, problem-solving techniqu...",A couple discussing their ideas for innovative...,"Discussing, explaining, inquiring, advising, r...",Revised,y,T19,5459


In [None]:
if not testing:
    meta_texts.to_csv('../out/texts.csv', index=False)

# Utterances

In [None]:
utterances = []

for text in texts:
    for u in text.findall('u'):
        u_d = {}
        u_d['text_id'] = text.get('id')
        u_d['u_n'] = u.get('n')
        u_d['u_who'] = u.get('who')
        u_d['u_trans'] = u.get('trans')
        u_d['u_whoConfidence'] = u.get('whoConfidence')
        u_d['u_toks_n'] = len(list(u.iter('w')))
        utterances.append(u_d)

In [None]:
utterances = pd.DataFrame(utterances)

In [None]:
utterances

Unnamed: 0,text_id,u_n,u_who,u_trans,u_whoConfidence,u_toks_n
0,S2EF,1,S0567,nonoverlap,high,12
1,S2EF,2,S0623,nonoverlap,high,8
2,S2EF,3,S0620,nonoverlap,high,1
3,S2EF,4,S0623,nonoverlap,high,0
4,S2EF,5,S0620,nonoverlap,high,4
...,...,...,...,...,...,...
8803,S2FQ,239,S0261,nonoverlap,high,52
8804,S2FQ,240,S0262,nonoverlap,high,12
8805,S2FQ,241,S0261,nonoverlap,high,9
8806,S2FQ,242,S0262,nonoverlap,high,7


In [None]:
if not testing:
    utterances.to_csv('../out/utterances.csv', index=False)

# Speakers

In [None]:
meta_speakers_head = pd.read_csv(
    fp_meta_speakers_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)

In [None]:
meta_speakers = pd.read_csv(
    fp_meta_speakers, 
    delimiter='\t', 
    names=meta_speakers_head['XML tag'],
    index_col=0
)

In [None]:
meta_speakers

Unnamed: 0,exactage,age1994,agerange,gender,nat,birthplace,birthcountry,l1,lingorig,dialect_rep,...,dialect_l2,dialect_l3,dialect_l4,edqual,occupation,socgrade,nssec,l2,fls,in_core
S0001,32,25_34,30_39,F,British,"Wordsley, West Midlands",England,English,England,None indicated,...,unspecified,unspecified,unspecified,5_postgrad,University researcher,A,1_2,,,n
S0002,,Unknown,19_29,F,British,Birmingham,England,English,England,Midlands,...,england,midlands,unspecified,5_postgrad,Teacher,B,2,,Japanese -- Intermediate,n
S0003,,Unknown,19_29,F,British,"Royal Leamington Spa, Warwickshire",England,English,England,Northern,...,england,north,unspecified,4_graduate,Student,E,uncat,,,n
S0004,,Unknown,30_39,M,British,,Germany,English,England,Northern,...,england,north,unspecified,5_postgrad,Engineer,C2,5,,Spanish -- Beginner,n
S0005,,60plus,80_89,F,British,Birmingham,England,English,England,Midlands,...,england,midlands,unspecified,2_secondary,Insurance Broker (retired),E,8,,French -- Beginner,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S0691,45,45_59,40_49,F,British,Barrow-In-Furness,UK,English,England,Northern/ Cumbrian,...,england,north,unspecified,3_sixthform,dental nurse (trainee),D,6,,,y
S0692,22,15_24,19_29,M,British,Barrow-in-Furness,England,English,England,Northern,...,england,north,unspecified,3_sixthform,Sales Assistant (Part time),D,6,,,n
UNKFEMALE,,Unknown,Unknown,F,,,,,,None indicated,...,unspecified,unspecified,unspecified,9_unknown,,unknown,unknown,,,n
UNKMALE,,Unknown,Unknown,M,,,,,,None indicated,...,unspecified,unspecified,unspecified,9_unknown,,unknown,unknown,,,n


## Add number of tokens per speaker

In [None]:
speakers_toks = defaultdict(int)

for text in texts:
    for u in text.iter('u'):
        who = u.get('who')
        n_words = len([w for w in u.iter('w')])
        speakers_toks[who] += n_words

In [None]:
speaker_toks = pd.DataFrame(list(speakers_toks.items()), columns=['who', 'speaker_toks_n'])

In [None]:
speaker_toks.sort_values(by='speaker_toks_n', ascending=False).head(10)

Unnamed: 0,who,speaker_toks_n
20,S0336,14332
21,S0362,11261
16,S0115,7522
17,S0037,6970
1,S0623,6621
11,S0024,5086
5,S0611,4698
10,S0144,4183
14,S0687,3810
22,S0261,3596


In [None]:
meta_speakers_merge = meta_speakers.reset_index().rename(columns={'index': 'who'})

In [None]:
meta_speakers = pd.merge(
    left=meta_speakers_merge,
    right=speaker_toks,
    on='who'
)


In [None]:
meta_speakers

Unnamed: 0,who,exactage,age1994,agerange,gender,nat,birthplace,birthcountry,l1,lingorig,...,dialect_l3,dialect_l4,edqual,occupation,socgrade,nssec,l2,fls,in_core,speaker_toks_n
0,S0024,36.0,35_44,30_39,F,British,Norwich,England,English,England,...,south,unspecified,5_postgrad,lecturer,A,1_2,,,n,5086
1,S0030,,Unknown,40_49,F,British,London,England,English,England,...,south,unspecified,5_postgrad,Careers Consultant,B,2,,,n,2622
2,S0037,,Unknown,19_29,F,British,"Sunderland, Tyne and Wear",England,English,England,...,north,northeast,5_postgrad,Research Manager,A,1_2,,,n,6970
3,S0096,,Unknown,30_39,F,British,York,England,English,England,...,north,unspecified,5_postgrad,Careers Consultant,B,2,Kutchi,,n,2261
4,S0115,,Unknown,30_39,M,British,Birmingham,England,English,England,...,midlands,unspecified,5_postgrad,PhD student,A,1_2,,French -- Advanced; German -- Advanced,n,7522
5,S0144,36.0,35_44,30_39,M,British,London,England,English,England,...,south,unspecified,5_postgrad,Lecturer,A,1_2,,,y,4183
6,S0261,41.0,35_44,40_49,M,British/New Zealand,Wellington,New Zealand,English,England/NZ,...,non_uk,non_uk,4_graduate,Entrepreneur,A,1_2,,,n,3596
7,S0262,41.0,35_44,40_49,F,British,Dorchester,England,English,England,...,south,unspecified,5_postgrad,teacher,B,2,,French -- level unspecified; German -- level u...,y,1863
8,S0336,24.0,15_24,19_29,F,British,Wegberg,Germany,English,England,...,unspecified,unspecified,5_postgrad,Administrator,C1,3,,German -- Beginner,n,14332
9,S0362,25.0,25_34,19_29,M,British,Leicester,England,English,England,...,unspecified,unspecified,5_postgrad,Coach/Franchise owner,B,2,,,y,11261


## Write out

In [None]:
if not testing:
    meta_speakers.to_csv('../out/speakers.csv', index=False)

# Tokens

In addition to the metadata present in the corpus, I’ve added the following columns:

- `w_idx`: token position (‘index’) in the given utterance, starting at 1
- `w_L1`: preceding token
- `w_R1`: subsequent token

In [None]:
tokens = []

for text in texts:
    tok_d = {}
    tok_d['text_id'] = text.get('id')

    for u in text.findall('u'):
        tok_d['u_n'] = u.get('n')

        u_toks = list(u.iter('w'))
        for i, w in enumerate(u_toks):
            tok_d['w_pos'] = w.get('pos')
            tok_d['w_lemma'] = w.get('lemma')
            tok_d['w_class'] = w.get('class')
            tok_d['w_usas'] = w.get('usas')
            tok_d['w_text'] = w.text
            tok_d['w_idx'] = i + 1
            tok_d['w_L1'] = u_toks[i-1].text if i > 0 else '<s>'
            tok_d['w_R1'] = u_toks[i+1].text if i < len(u_toks) - 1 else '</s>'

            tokens.append(tok_d.copy())


In [None]:
tokens = pd.DataFrame(tokens)


In [None]:
tokens.head(20)

Unnamed: 0,text_id,u_n,w_pos,w_lemma,w_class,w_usas,w_text,w_idx,w_L1,w_R1
0,S2EF,1,VM,shall,VERB,T1:1:3,shall,1,<s>,I
1,S2EF,1,PPIS1,i,PRON,Z8,I,2,shall,move
2,S2EF,1,VVI,move,VERB,M2,move,3,I,the
3,S2EF,1,AT,the,ART,Z5,the,4,move,laptops
4,S2EF,1,NN2,laptop,SUBST,Y2,laptops,5,the,then
5,S2EF,1,RT,then,ADV,N4,then,6,laptops,stick
6,S2EF,1,VV0,stick,VERB,M2,stick,7,then,it
7,S2EF,1,PPH1,it,PRON,Z8,it,8,stick,on
8,S2EF,1,II,on,PREP,N6,on,9,it,the
9,S2EF,1,AT,the,ART,N6,the,10,on,table


In [None]:
assert len(tokens) == tokens_n


I export the full token table to `tokens.csv`.

In [None]:
if not testing:
    tokens.to_csv('../out/tokens.csv', index=False)

I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in `tokens_50k.csv`.

In [None]:
if not testing:
    (tokens
     .head(50_000)
     .to_csv('../out/tokens_50k.csv', index=False))

# Merge tokens with metadata

In [None]:
tokens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94659 entries, 0 to 94658
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text_id  94659 non-null  object
 1   u_n      94659 non-null  object
 2   w_pos    94659 non-null  object
 3   w_lemma  94659 non-null  object
 4   w_class  94659 non-null  object
 5   w_usas   94659 non-null  object
 6   w_text   94659 non-null  object
 7   w_idx    94659 non-null  int64 
dtypes: int64(1), object(7)
memory usage: 5.8+ MB


## + utterance information

In [None]:
toks_utt = pd.merge(
    tokens,
    utterances,
    on = ['text_id', 'u_n']
)


In [None]:
toks_utt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94659 entries, 0 to 94658
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_id          94659 non-null  object
 1   u_n              94659 non-null  object
 2   w_pos            94659 non-null  object
 3   w_lemma          94659 non-null  object
 4   w_class          94659 non-null  object
 5   w_usas           94659 non-null  object
 6   w_text           94659 non-null  object
 7   w_idx            94659 non-null  int64 
 8   u_who            94659 non-null  object
 9   u_trans          94659 non-null  object
 10  u_whoConfidence  94659 non-null  object
 11  u_toks_n         94659 non-null  int64 
dtypes: int64(2), object(10)
memory usage: 9.4+ MB


## + text information

In [None]:
toks_utt_text = pd.merge(
    toks_utt,
    meta_texts,
    on = 'text_id'
)


In [None]:
toks_utt_text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94659 entries, 0 to 94658
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_id          94659 non-null  object
 1   u_n              94659 non-null  object
 2   w_pos            94659 non-null  object
 3   w_lemma          94659 non-null  object
 4   w_class          94659 non-null  object
 5   w_usas           94659 non-null  object
 6   w_text           94659 non-null  object
 7   w_idx            94659 non-null  int64 
 8   u_who            94659 non-null  object
 9   u_trans          94659 non-null  object
 10  u_whoConfidence  94659 non-null  object
 11  u_toks_n         94659 non-null  int64 
 12  rec_length       94659 non-null  object
 13  rec_date         94659 non-null  object
 14  rec_year         94659 non-null  int64 
 15  rec_period       94659 non-null  object
 16  n_speakers       94659 non-null  int64 
 17  list_speakers    94659 non-null

## + speaker information

In [None]:
toks_utt_text_speakers = pd.merge(
    toks_utt_text,
    meta_speakers,
    left_on = 'u_who',
    right_on = 'who'
)


In [None]:
toks_utt_text_speakers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94659 entries, 0 to 94658
Data columns (total 53 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_id          94659 non-null  object
 1   u_n              94659 non-null  object
 2   w_pos            94659 non-null  object
 3   w_lemma          94659 non-null  object
 4   w_class          94659 non-null  object
 5   w_usas           94659 non-null  object
 6   w_text           94659 non-null  object
 7   w_idx            94659 non-null  int64 
 8   u_who            94659 non-null  object
 9   u_trans          94659 non-null  object
 10  u_whoConfidence  94659 non-null  object
 11  u_toks_n         94659 non-null  int64 
 12  rec_length       94659 non-null  object
 13  rec_date         94659 non-null  object
 14  rec_year         94659 non-null  int64 
 15  rec_period       94659 non-null  object
 16  n_speakers       94659 non-null  int64 
 17  list_speakers    94659 non-null

## Write out

In [None]:
if not testing:
    toks_utt_text_speakers.to_csv('../out/tokens-plus-meta.csv', index=False)

In [None]:
print(f'number of rows: {len(toks_utt_text_speakers)}')
print(f'file size: {os.path.getsize("../out/tokens-plus-meta.csv") / 1_000_000:.2f} MB')

number of rows: 94659
file size: 64.40 MB


I also write out a small version containing the first 50,000 rows for use in spreadsheet software:

In [None]:
if not testing:
    toks_utt_text_speakers.iloc[:50_000].to_csv(
        '../out/tokens-plus-meta_small.csv', index=False)

In [None]:
print(f'number of rows: {len(toks_utt_text_speakers.iloc[:50_000])}')
print(f'file size: {os.path.getsize("../out/tokens-plus-meta_small.csv") / 1_000_000:.2f} MB')

number of rows: 50000
file size: 40.04 MB


In [None]:
#| hide
import nbdev
nbdev.nbdev_export()
