# Load packages

Package requirements are stored in `requirements.yml`.

In [1]:
import os
import random
from collections import defaultdict
from pprint import pprint

import numpy as np
import pandas as pd

from lxml import etree

import altair as alt
#alt.data_transformers.disable_max_rows()

# Variables

BNC2014 needs to be downloaded for this script to work. It can be obtained from the official [BNC website](http://corpora.lancs.ac.uk/bnc2014/). 

The following variables need to be updated to the corpus' local path. In the current setting the BNC2014 data were stored in the project folder in the folder `data`, so relative paths were used.

In [2]:
dir_corpus = 'data/spoken/tagged/'
dir_meta = 'data/spoken/metadata/'

# Load and parse XML

In [3]:
f_names = os.listdir(dir_corpus)
f_paths = [f"{dir_corpus}{f_name}" for f_name in f_names]

In [4]:
def get_xml(f_path):
    with open(f_path, 'r') as f:
        f = f.read()
    xml = etree.fromstring(f)
    return xml

# Corpus size

## Texts

Calculate the total number of texts in the corpus.

In [5]:
%%time
texts = []
for f_path in f_paths:
    xml = get_xml(f_path)
    id = xml.get('id')
    texts.append(id)

CPU times: user 40.6 s, sys: 6.72 s, total: 47.4 s
Wall time: 49.5 s


In [6]:
print(f"number of documents in the corpus: {len(texts)}")

number of documents in the corpus: 1251


## Speakers

1. Store all speakers in the corpus.
2. Store the total number of words each speaker has contributed to the corpus.

In [7]:
%%time
speakers_words = defaultdict(int)
for f_path in f_paths:
    xml = get_xml(f_path)
    for u in xml.iter('u'):
        speaker = u.get('who')
        n_words = len([w for w in u.iter('w')])
        speakers_words[speaker] += n_words

CPU times: user 47 s, sys: 6.69 s, total: 53.7 s
Wall time: 56.6 s


### Number of speakers

In [8]:
print(f"number of speakers: {len(speakers_words)}")

number of speakers: 671


### Words per speaker

In [9]:
df_speakers_words = pd.DataFrame(list(speakers_words.items()), columns=['speaker', 'n_words'])

In [10]:
df_speakers_words.sort_values('speaker', ascending=True, inplace=True)

In [44]:
df_speakers_words

Unnamed: 0,speaker,n_words
468,S0001,3000
459,S0002,8535
440,S0003,1893
312,S0004,3634
143,S0005,1449
...,...,...
219,S0691,2135
214,S0692,1105
26,UNKFEMALE,28108
32,UNKMALE,30316


### Write out

In [12]:
with open('out/metadata/speakers_words.csv', 'w') as f_out:
    df_speakers_words.to_csv(f_out, index=False)

## Words

In [13]:
%%time
n_words = 0
for f_path in f_paths:
    xml = get_xml(f_path)
    for w in xml.iter('w'):
        n_words += 1

CPU times: user 44.5 s, sys: 6.42 s, total: 50.9 s
Wall time: 53.8 s


In [14]:
print(f"total number of words in the corpus: {n_words}")

total number of words in the corpus: 11422615


# Query for `that's ADJ`

In [15]:
blacklist = ['to', 'timing', 'news',  'bullshit', 'awesome', 'enough']

In [16]:
def get_thats_adj_start_disc(u):
    query = "that 's ADJ - start_disc"
    n_slots = 3
    hits = []
    words = [w for w in u.iter('w')]
    for w in words:
        # check first token
        if w.text.lower() == 'that' and w.get('pos') == 'DD1':
            # check sequence length to avoid crossing utterance boundaries
            if words.index(w) + n_slots - 1 < len(words):
                w2 = words[words.index(w) + 1]
                w3 = words[words.index(w) + 2]
                # check second token
                if w2.text.lower() == "'s":
                    # check third token
                    if w3.get('pos') == "JJ":
                        # store information about hits
                        hit = {}
                        hit['doc'] = xml.get('id')
                        hit['utterance'] = u.get('n')
                        hit['speaker'] = u.get('who')
                        hit['query'] = query
                        hit['result'] = ' '.join(
                            [
                                w.text,
                                w2.text,
                                w3.text,
                            ]
                        )
                        hit['adj_text'] = w3.text
                        hit['adj_semtag'] = w3.get('usas')
                        # check whether pattern occurs at start of utterances or is only preceded by an interjection
                        if words.index(w) == 0 or (words[0].get('pos') == 'UH' and words.index(w) == 1):
                            # exclude hits that are followed by blacklist items
                            if words.index(w) + n_slots < len(words):
                                w4 = words[words.index(w) + n_slots]
                                if w4.text.lower() not in blacklist:
                                    hits.append(hit)
                            else:
                                hits.append(hit)
    return hits    

In [17]:
%%time
hits = []
for f_path in f_paths:
    xml = get_xml(f_path)
    for u in xml.iter('u'):
        hits.extend(get_thats_adj_start_disc(u))

CPU times: user 49.6 s, sys: 6.22 s, total: 55.8 s
Wall time: 59.9 s


In [21]:
print(f"number of hits: {len(hits)}")

number of hits: 8027


In [22]:
print('random selection of 10 hits:\n')
for hit in random.choices(hits, k=10):
    print(hit['result'])

random selection of 10 hits:

that 's great
that 's true
that 's right
that 's nice
that 's right
that 's fatal
that 's good
that 's fine
that 's nice
that 's awful


In [23]:
df_hits = pd.DataFrame(hits)
df_hits

Unnamed: 0,doc,utterance,speaker,query,result,adj_text,adj_semtag
0,SN64,521,S0588,that 's ADJ - start_disc,that 's good,good,A5:1
1,SN64,577,S0588,that 's ADJ - start_disc,that 's strange,strange,A6:2
2,SN64,834,S0588,that 's ADJ - start_disc,that 's ridiculous,ridiculous,S1:2:6
3,SN64,854,S0588,that 's ADJ - start_disc,that 's ridiculous,ridiculous,S1:2:6
4,SN64,980,S0588,that 's ADJ - start_disc,that 's atrocious,atrocious,A5:1
...,...,...,...,...,...,...,...
8022,S37K,669,S0058,that 's ADJ - start_disc,that 's alright,alright,A5:1
8023,S37K,740,S0058,that 's ADJ - start_disc,that 's interesting,interesting,X5:2
8024,S37K,1555,S0058,that 's ADJ - start_disc,that 's true,true,A5:2
8025,SMHY,139,S0037,that 's ADJ - start_disc,that 's true,true,A5:2


# Semantic category descriptions

BNC2014 data are tagged using the USAS semantic tagger. Information for each hit were stored in `df_hits` and are now merged with semantic category descriptions from the USAS tagset which was download from <http://ucrel.lancs.ac.uk/usas/>.

In [24]:
semtags = pd.read_csv(
    'semtags.csv', 
    sep='\t',
    names=['tag', 'desc']
)

In [25]:
semtags.tag = semtags.tag.str.replace('.', ':')

In [26]:
df_hits = pd.merge(df_hits, semtags, left_on='adj_semtag', right_on='tag', how='left')

In [27]:
df_hits.drop('tag', axis=1, inplace=True)

In [28]:
df_hits.rename(columns={'desc': 'adj_semdesc'}, inplace=True)

In [29]:
df_hits

Unnamed: 0,doc,utterance,speaker,query,result,adj_text,adj_semtag,adj_semdesc
0,SN64,521,S0588,that 's ADJ - start_disc,that 's good,good,A5:1,Evaluation:- Good/bad
1,SN64,577,S0588,that 's ADJ - start_disc,that 's strange,strange,A6:2,Comparing:- Usual/unusual
2,SN64,834,S0588,that 's ADJ - start_disc,that 's ridiculous,ridiculous,S1:2:6,Sensible
3,SN64,854,S0588,that 's ADJ - start_disc,that 's ridiculous,ridiculous,S1:2:6,Sensible
4,SN64,980,S0588,that 's ADJ - start_disc,that 's atrocious,atrocious,A5:1,Evaluation:- Good/bad
...,...,...,...,...,...,...,...,...
8022,S37K,669,S0058,that 's ADJ - start_disc,that 's alright,alright,A5:1,Evaluation:- Good/bad
8023,S37K,740,S0058,that 's ADJ - start_disc,that 's interesting,interesting,X5:2,Interest/boredom/excited/energetic
8024,S37K,1555,S0058,that 's ADJ - start_disc,that 's true,true,A5:2,Evaluation:- True/false
8025,SMHY,139,S0037,that 's ADJ - start_disc,that 's true,true,A5:2,Evaluation:- True/false


In [30]:
query = df_hits.loc[0, 'query']
f_name_out = query.replace(" ", "-")
dir_hits_out = 'hits/'

In [31]:
with open(f"out/{dir_hits_out}{f_name_out}.csv", 'w') as f_out:
    df_hits.to_csv(f_out, index=False)

# Metadata

## Speakers

In [32]:
head_speakers = pd.read_csv(
    f"{dir_meta}metadata-fields-speaker.txt",
    delimiter='\t',
    skiprows=1,
    index_col=0
)

In [33]:
speakers = pd.read_csv(
    f"{dir_meta}bnc2014spoken-speakerdata.tsv", 
    delimiter='\t', 
    names=head_speakers['XML tag'],
    index_col=0
)

In [34]:
speakers

Unnamed: 0,exactage,age1994,agerange,gender,nat,birthplace,birthcountry,l1,lingorig,dialect_rep,...,dialect_l2,dialect_l3,dialect_l4,edqual,occupation,socgrade,nssec,l2,fls,in_core
S0001,32,25_34,30_39,F,British,"Wordsley, West Midlands",England,English,England,None indicated,...,unspecified,unspecified,unspecified,5_postgrad,University researcher,A,1_2,,,n
S0002,,Unknown,19_29,F,British,Birmingham,England,English,England,Midlands,...,england,midlands,unspecified,5_postgrad,Teacher,B,2,,Japanese -- Intermediate,n
S0003,,Unknown,19_29,F,British,"Royal Leamington Spa, Warwickshire",England,English,England,Northern,...,england,north,unspecified,4_graduate,Student,E,uncat,,,n
S0004,,Unknown,30_39,M,British,,Germany,English,England,Northern,...,england,north,unspecified,5_postgrad,Engineer,C2,5,,Spanish -- Beginner,n
S0005,,60plus,80_89,F,British,Birmingham,England,English,England,Midlands,...,england,midlands,unspecified,2_secondary,Insurance Broker (retired),E,8,,French -- Beginner,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S0691,45,45_59,40_49,F,British,Barrow-In-Furness,UK,English,England,Northern/ Cumbrian,...,england,north,unspecified,3_sixthform,dental nurse (trainee),D,6,,,y
S0692,22,15_24,19_29,M,British,Barrow-in-Furness,England,English,England,Northern,...,england,north,unspecified,3_sixthform,Sales Assistant (Part time),D,6,,,n
UNKFEMALE,,Unknown,Unknown,F,,,,,,None indicated,...,unspecified,unspecified,unspecified,9_unknown,,unknown,unknown,,,n
UNKMALE,,Unknown,Unknown,M,,,,,,None indicated,...,unspecified,unspecified,unspecified,9_unknown,,unknown,unknown,,,n


## Texts

In [35]:
head_texts = pd.read_csv(
    f"{dir_meta}metadata-fields-text.txt",
    delimiter='\t',
    skiprows=1,
    index_col=0
)

In [36]:
texts = pd.read_csv(
    f"{dir_meta}bnc2014spoken-textdata.tsv", 
    delimiter='\t', 
    names=head_texts['XML tag'],
    index_col=0
)

In [37]:
texts

Unnamed: 0,rec_length,rec_date,rec_year,rec_period,n_speakers,list_speakers,rec_loc,relationships,topics,activity,conv_type,conventions,in_sample,transcriber
S23A,1:50:43,2014-12-27,2014,2014_Q4,4,S0021 S0032 S0094 S0095,Speakers' home,"Close family, partners, very close friends","Computer programming, food, wine, temperature,...",Catching up with family over food and presents,"Discussing, explaining, anecdote telling",Revised,n,T15
S24A,0:17:24,2014-09-12,2014,2014_Q3,2,S0261 S0262,"Modern Art Museum, London","Close family, partners, very close friends",The art,A couple discussing modern art at a museum,"Discussing, explaining, inquiring",Revised,y,T09
S24D,0:20:00,2016-01-14,2016,2016_Q1,3,S0653 S0654 S0655,"Home kitchen, Comberton","Close family, partners, very close friends","Lego Ninjago, Minecraft worlds",Spending time on electronic toys instead of re...,"Discusing, explaining",Revised,n,T18
S24E,0:45:53,2015-09-15,2015,2015_Q3,3,S0519 S0520 S0521,"Hunsonby, Cumbria","Close family, partners, very close friends","food, exercise, choir, family plans, family me...",Midweek family dinner,"Discussing, explaining, Inquiring, advising, a...",Revised,n,T09
S263,2:00:00,2016-02-07,2016,2016_Q1,4,S0588 S0589 S0590 S0616,ANON’s home,"Close family, partners, very close friends",,,"Discussing, explaining",Revised,n,T10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SZVB,1:00:31,2015-11-02,2015,2015_Q4,2,S0517 S0525,"(ANON’s home, Fradley, Staffs)","Close family, partners, very close friends","Babies, family, friends",Sisters talking about their family (new baby d...,"Discussing, explaining, inquiring, anecdote te...",Revised,n,T15
SZVC,0:32:00,2015-09-14,2015,2015_Q3,2,S0324 S0325,"ANON's home, Linton","Close family, partners, very close friends","school orchestra (windband), Playing the Clari...",Friends talking about school,"discussing, explaining, inquiring, complaining...",Revised,n,T10
SZW4,0:21:09,2015-10-19,2015,2015_Q4,2,S0509 S0510,"ANON & ANON's home, Hastings","Close family, partners, very close friends","Poetry, Morning Routine, Food, Social Events, ...",Mother and Daughter,"Discussing, inquiring, anecdote telling",Revised,n,T18
SZXQ,0:40:44,2012-03-21,2012,2012_Q1,2,S0058 S0120,"Botanic Gardens, Cambridge","Friends, wider family circle","TV, languages, friends, holidays, offices, comedy",,"Discussing, explaining, inquiring, complaining...",Original,y,T11


## Merge hits and metadata

In [38]:
df_merged = pd.merge(df_hits, speakers, left_on='speaker', right_on=speakers.index)

In [39]:
df_merged = pd.merge(df_merged, texts, left_on='doc', right_on=texts.index)

In [46]:
df_merged.head()

Unnamed: 0,doc,utterance,speaker,query,result,adj_text,adj_semtag,adj_semdesc,exactage,age1994,...,n_speakers,list_speakers,rec_loc,relationships,topics,activity,conv_type,conventions,in_sample,transcriber
0,SN64,521,S0588,that 's ADJ - start_disc,that 's good,good,A5:1,Evaluation:- Good/bad,49,45_59,...,3,S0588 S0589 S0590,ANON’s home.,"Close family, partners, very close friends","Party, community market, maps, hospital appoin...",Chatting whilst supper is cooking.,"Discussing, Complaining",Revised,n,T10
1,SN64,577,S0588,that 's ADJ - start_disc,that 's strange,strange,A6:2,Comparing:- Usual/unusual,49,45_59,...,3,S0588 S0589 S0590,ANON’s home.,"Close family, partners, very close friends","Party, community market, maps, hospital appoin...",Chatting whilst supper is cooking.,"Discussing, Complaining",Revised,n,T10
2,SN64,834,S0588,that 's ADJ - start_disc,that 's ridiculous,ridiculous,S1:2:6,Sensible,49,45_59,...,3,S0588 S0589 S0590,ANON’s home.,"Close family, partners, very close friends","Party, community market, maps, hospital appoin...",Chatting whilst supper is cooking.,"Discussing, Complaining",Revised,n,T10
3,SN64,854,S0588,that 's ADJ - start_disc,that 's ridiculous,ridiculous,S1:2:6,Sensible,49,45_59,...,3,S0588 S0589 S0590,ANON’s home.,"Close family, partners, very close friends","Party, community market, maps, hospital appoin...",Chatting whilst supper is cooking.,"Discussing, Complaining",Revised,n,T10
4,SN64,980,S0588,that 's ADJ - start_disc,that 's atrocious,atrocious,A5:1,Evaluation:- Good/bad,49,45_59,...,3,S0588 S0589 S0590,ANON’s home.,"Close family, partners, very close friends","Party, community market, maps, hospital appoin...",Chatting whilst supper is cooking.,"Discussing, Complaining",Revised,n,T10


## Write out

In [41]:
# speaker metadata
with open(f"out/metadata/speakers.csv", 'w') as f_out:
    speakers.to_csv(f_out)

In [42]:
# merged hits
with open(f"out/merged/{f_name_out}.csv", 'w') as f_out:
    df_merged.to_csv(f_out, index=False)