In [636]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
import torch
import transformers as ppb
import warnings

warnings.filterwarnings('ignore')

## Interspeech 2020 Challenge
Deadline: May 8th submission of predictions and paper.

Main Webpage: http://www.interspeech2020.org/index.php?m=content&c=index&a=lists&catid=66

Challenge Webpage: http://www.homepages.ed.ac.uk/sluzfil/ADReSS/

In [440]:
import regex as re

In [381]:
import os
from glob import glob

# Initial Plan
Only using the transcripts:
- [x] Simple clean and join all sentences, classifiy using DistillBERT, (BERT), (RoBERTa)   ## Done

## Further Feature Engineering:
### Time dimension
- Embed time total time taken - parse time blocks, take first and last
- Embed total time taken per sentence
- Time before starting speech
- Time in between each sentence
- Average / min / max / median time of sentence
- use of special characters
- number of sentences spoken

### Linguistic Features
- Embed special character tokens in speech, pauses etc. (not sure if this needed, tokenzier / and
- classify on a sentence level??
- Also use the Interviewer INV, questions / speech / time...
- Use POS Tags: as OHE vector

## Demographics
- Gender
- Age

## Fine-tuning BERT(-esque) models on spontaneous speech datasets
- fine-tune and re-classify using other spontaneous speech datasets: 

### Further work on
- Analysis of what roBERTa has actually learned in the attention heads

In [36]:
prob_ad_dir = '../train/transcription/cd/*'
controls_dir = '../train/transcription/cc/*'

In [612]:
def extract_data(file_name):
    par = {}
    par['id'] = file_name.split('/')[-1].split('.cha')[0]
    f = iter(open(file_name))
    l = next(f)
    speech = []
    try:
        curr_speech = ''
        while (True):
            if l.startswith('@ID'):
                participant = [i.strip() for i in l.split('|')]
                if participant[2] == 'PAR':
                    par['mmse'] = participant[8]
                    par['sex'] = participant[4][0]
                    par['age'] = int(participant[3].replace(';', ''))
            if l.startswith('*PAR:'):
                curr_speech = l
            elif len(curr_speech) != 0 and not(l.startswith('%') or l.startswith('*')):
                curr_speech += l
            elif len(curr_speech) > 0:
                speech.append(curr_speech)
                curr_speech = ''
            l = next(f)
    except StopIteration:
        pass

    clean_speech = []
    speech_time_segments = []
    for s in speech:
        s = re.sub('\*PAR:\t', '', s) # remove prefix    
        speech_time_segments.append([*map(int, re.search('\x15(\d*_\d*)\x15', s).groups()[0].split('_'))])
        s = re.sub('\x15\d*_\d*\x15', '', s) # remove time block 
        s = re.sub('\[.*\]', '', s) # remove other speech artifacts [.*]
        s = s.strip()
        s = re.sub('\t|\n|<|>', '', s) # remove tab, new lines, inferred speech??, ampersand, &
        clean_speech.append(s)
    par['speech'] = speech
    par['clean_speech'] = clean_speech
    par['all_clean_speech'] = ' '.join(clean_speech)
    
    # sentence times
    par['per_sent_times'] = speech_time_segments
    par['total_time'] =  speech_time_segments[-1][1] - speech_time_segments[0][0]
    par['time_before_par_speech'] = speech_time_segments[0][0]
    par['time_between_sents'] = [0 if i == 0 else max(0, speech_time_segments[i][0] - speech_time_segments[i-1][1]) 
                                 for i in range(len(speech_time_segments))]
    return par

In [None]:
for l in open('train/transcription/cc/S030.cha'):
    print(l)

In [593]:
def parse_data(train=True):
    data_dir = 'train'
    if not train:
        data_dir = 'test'
    
    prob_ad_dir = f'{data_dir}/transcription/cd/*'
    controls_dir = f'{data_dir}/transcription/cc/*'
    
    controls = [extract_data(fn) for fn in glob(prob_ad_dir)]
    prob_ad = [extract_data(fn) for fn in glob(controls_dir)]
    controls_df = pd.DataFrame(controls)
    prob_ad_df = pd.DataFrame(prob_ad)
    controls_df['ad'] = 0
    prob_ad_df['ad'] = 1
    df = pd.concat([controls_df, prob_ad_df]).sample(frac=1).reset_index(drop=True)
    return df

In [617]:
train_df = parse_data()

In [618]:
train_df

Unnamed: 0,id,mmse,sex,age,speech,clean_speech,all_clean_speech,per_sent_times,total_time,time_before_par_speech,time_between_sents,ad
0,S009,30,m,67,[*PAR:\ta boy is taking &uh cookies from the c...,[a boy is taking &uh cookies from the cookie j...,a boy is taking &uh cookies from the cookie ja...,"[[2460, 8778], [8778, 11522], [11522, 14064], ...",51190,2460,"[0, 0, 0, 0, 0, 0, 0, 1609, 0, 0, 554]",1
1,S089,18,m,65,"[*PAR:\tso she will find her . 0_2741\n, *PA...","[so she will find her ., and xxx the mother wa...",so she will find her . and xxx the mother wash...,"[[0, 2741], [2741, 6461], [6461, 12031], [1203...",59237,0,"[0, 0, 0, 0, 0, 0, 2935, 1257, 0, 0, 0, 4857]",0
2,S017,28,f,65,[*PAR:\twell I see the mother doing the dishes...,"[well I see the mother doing the dishes ., the...",well I see the mother doing the dishes . the s...,"[[0, 1979], [1979, 4792], [4792, 15051], [1505...",26442,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",1
3,S154,20,f,65,[*PAR:\tyou want me to tell you ? [+ exc] 536...,"[you want me to tell you ?, okay &uh the boy's...",you want me to tell you ? okay &uh the boy's g...,"[[5361, 6358], [6358, 9618], [9618, 12111], [1...",25602,5361,"[0, 0, 0, 0, 0, 0, 0]",0
4,S103,27,m,64,[*PAR:\tthe boy's &uh fallin(g) off the stool ...,"[the boy's &uh fallin(g) off the stool ., the ...",the boy's &uh fallin(g) off the stool . the t...,"[[6035, 13351], [13351, 17373], [17373, 21502]...",52252,6035,"[0, 0, 0, 0, 0, 0, 0, 12702, 0]",0
...,...,...,...,...,...,...,...,...,...,...,...,...
103,S082,11,m,66,[*PAR:\t&=clears:throat well &=clears:throat &...,[&=clears:throat well &=clears:throat &uh the ...,&=clears:throat well &=clears:throat &uh the k...,"[[2913, 14570], [14570, 29556], [29556, 42535]...",186681,2913,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1177, 575, 0...",0
104,S003,29,f,69,"[*PAR:\tokay . [+ exc] 0_1074\n, *PAR:\tther...","[okay ., there's a little boy and he's getting...",okay . there's a little boy and he's getting ...,"[[0, 1074], [1074, 6133], [6133, 16452], [1645...",67484,0,"[0, 0, 0, 0, 0, 0, 2549, 0, 382, 0, 0, 0, 0, 0...",1
105,S068,29,m,50,[*PAR:\tthe &uh water's running on the floor ....,"[the &uh water's running on the floor ., boy's...",the &uh water's running on the floor . boy's t...,"[[0, 2291], [2291, 6296], [6296, 10256], [1025...",31269,0,"[0, 0, 0, 0, 0, 9533]",1
106,S156,13,f,71,"[*PAR:\tmhm . [+ exc] 1200_1800\n, *PAR:\twe...","[mhm ., well this one is in the cookie jar ., ...",mhm . well this one is in the cookie jar . and...,"[[1200, 1800], [4100, 9000], [10097, 14770], [...",67311,1200,"[0, 2300, 1097, 0, 0, 0, 0, 0, 2845, 550, 7661...",0


## BERT (type) model Experimentaton

## Load Distill_BERT

In [None]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

## Load BERT

In [None]:
# Want BERT instead of distilBERT? Uncomment the following line:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

## Load RoBERTa

In [383]:
# Want BERT instead of distilBERT? Uncomment the following line:
model_class, tokenizer_class, pretrained_weights = (ppb.RobertaModel, ppb.RobertaTokenizer, 'roberta-base')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=524.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




In [384]:
tokenized = df.all_clean_speech.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

In [385]:
tkn_inds = []
for sent in df.all_clean_speech:
    tkn_inds.append(tokenizer.encode(sent, add_special_tokens=True, max_length=512))
tokenized.apply(len).sort_values(ascending=False)

100    483
1      363
11     329
83     328
12     298
      ... 
61      60
41      56
94      43
37      40
48      39
Name: all_clean_speech, Length: 108, dtype: int64

In [386]:
# pad so can be treated as one batch
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

# attention mask - zero out attention scores where there is no input to be processed (i.e. is padding)
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

In [388]:
# Distil_BERT Embed / BERT Embed / roBERTa Embed

In [389]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# check if multiple GPUs are available
multi_gpu = torch.cuda.device_count() > 1:

if torch.cuda.is_available():
    model.to(device)
    input_ids.to(device)
    attention_mask.to(device)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [390]:
features = last_hidden_states[0][:,0,:].numpy()

In [391]:
features.shape

(108, 768)

In [392]:
last_hidden_states[0].shape

torch.Size([108, 483, 768])

In [622]:
# NLP Features
bert_features = features

In [693]:
# time features
# - Embed time total time taken 
# - parse time blocks, take first and last
# - Embed total time taken per sentence
# - Time before starting speech
# - Time in between each sentence
# - Average / min / max / median time of sentence
time_dims = train_df.loc[:, ['total_time', 'time_before_par_speech', 'time_between_sents']]

In [694]:
# time_dims['avg_betweeen_sents'] = time_dims.time_between_sents.apply(lambda t: round(sum(t) / len(t)))
# time_dims['max'] = time_dims.time_between_sents.apply(max)
# time_dims['min'] = time_dims.time_between_sents.apply(min)
time_dims_eng = time_dims.drop('time_between_sents', axis=1)

In [702]:
time_features = StandardScaler().fit_transform(time_dims_eng.to_numpy())

In [None]:
time_features

In [704]:
# Concat all features
features = np.hstack([bert_features, time_features])

In [705]:
train_features, test_features, train_labels, test_labels = train_test_split(features, df.ad)

In [708]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(penalty='L2'), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters:', grid_search.best_params_)
print('best scores: ', grid_search.best_score_)

best parameters:  {'C': 89.4736947368421}
best scores:  0.7661764705882353


In [709]:
lr_clf = LogisticRegression(**grid_search.best_params_)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=89.4736947368421, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [710]:
lr_clf.score(test_features, test_labels)

0.6296296296296297

In [711]:
preds = lr_clf.predict(test_features)

In [712]:
precision_recall_fscore_support(preds, test_labels, average='binary')

(0.75, 0.5625, 0.6428571428571429, None)

#### Out-Of-The-Box Distill BERT and BERT achieve scores:
Distill BERT: 0.74

BERT: 0.77

RoBERTA: 0.89

In [374]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.542 (+/- 0.23)
