In [36]:
from math import sqrt
import regex as re
import os
from glob import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support, mean_squared_error
import torch
import transformers as ppb
import warnings

warnings.filterwarnings('ignore')

## Interspeech 2020 Challenge
Deadline: May 8th submission of predictions and paper.

Main Webpage: http://www.interspeech2020.org/index.php?m=content&c=index&a=lists&catid=66

Challenge Webpage: http://www.homepages.ed.ac.uk/sluzfil/ADReSS/

# Initial Plan
Only using the transcripts:
- [x] Simple clean and join all sentences, classifiy using DistillBERT, (BERT), (RoBERTa)   ## Done

## Further Feature Engineering:
### Time dimension
- Embed time total time taken - parse time blocks, take first and last
- Embed total time taken per sentence
- Time before starting speech
- Time in between each sentence
- Average / min / max / median time of sentence
- use of special characters
- number of sentences spoken

### Linguistic Features
- Embed special character tokens in speech, pauses etc. (not sure if this needed, tokenzier / and
- classify on a sentence level??
- Also use the Interviewer INV, questions / speech / time...
- Use POS Tags: as OHE vector

## Demographics
- Gender
- Age

## Fine-tuning BERT(-esque) models on spontaneous speech datasets
- fine-tune and re-classify using other spontaneous speech datasets: 

### Further work on
- Analysis of what roBERTa has actually learned in the attention heads

In [2]:
prob_ad_dir = '../train/transcription/cd/*'
controls_dir = '../train/transcription/cc/*'

In [94]:
def extract_data(file_name):
    par = {}
    par['id'] = file_name.split('/')[-1].split('.cha')[0]
    f = iter(open(file_name))
    l = next(f)
    speech = []
    try:
        curr_speech = ''
        while (True):
            if l.startswith('@ID'):
                participant = [i.strip() for i in l.split('|')]
                if participant[2] == 'PAR':
                    par['mmse'] = '' if len(participant[8]) == 0 else float(participant[8])
                    par['sex'] = participant[4][0]
                    par['age'] = int(participant[3].replace(';', ''))
            if l.startswith('*PAR:') or l.startswith('*INV'):
                curr_speech = l
            elif len(curr_speech) != 0 and not(l.startswith('%') or l.startswith('*')):
                curr_speech += l
            elif len(curr_speech) > 0:
                speech.append(curr_speech)
                curr_speech = ''
            l = next(f)
    except StopIteration:
        pass

    clean_par_speech = []
    clean_all_speech = []
    speech_time_segments = []
    is_par = False
    for s in speech:
        def _clean(s):
            speech_time_segments.append([*map(int, re.search('\x15(\d*_\d*)\x15', s).groups()[0].split('_'))])
            s = re.sub('\x15\d*_\d*\x15', '', s) # remove time block 
            s = re.sub('\[.*\]', '', s) # remove other speech artifacts [.*]
            s = s.strip()
            s = re.sub('\t|\n|<|>', '', s) # remove tab, new lines, inferred speech??, ampersand, &
            return s
        
        if s.startswith('*PAR:'):
            is_par = True
        elif s.startswith('*INV:'):
            is_par = False
            s = re.sub('\*INV:\t', '', s) # remove prefix
        if is_par:
            s = re.sub('\*PAR:\t', '', s) # remove prefix    
            clean_par_speech.append(_clean(s))
        clean_all_speech.append(_clean(s))
    
    par['speech'] = speech
    par['clean_speech'] = clean_all_speech
    par['clean_par_speech'] = clean_par_speech
    par['joined_all_speech'] = ' '.join(clean_all_speech)
    par['joined_all_par_speech'] = ' '.join(clean_par_speech)
    
    # sentence times
    par['per_sent_times'] = [speech_time_segments[i][1] - speech_time_segments[i][0] for i in range(len(speech_time_segments))]
    par['total_time'] =  speech_time_segments[-1][1] - speech_time_segments[0][0]
    par['time_before_par_speech'] = speech_time_segments[0][0]
    par['time_between_sents'] = [0 if i == 0 else max(0, speech_time_segments[i][0] - speech_time_segments[i-1][1]) 
                                 for i in range(len(speech_time_segments))]
    return par

In [95]:
def parse_train_data():
    return _parse_data('../data/train')
    
def parse_test_data():
    return _parse_data('../data/test')

def parse_pre_train_data():
    return _parse_data('/data/train')

def _parse_data(data_dir):
    prob_ad_dir = f'{data_dir}/transcription/cd/*'
    controls_dir = f'{data_dir}/transcription/cc/*'
    
    prob_ad = [extract_data(fn) for fn in glob(prob_ad_dir)]
    controls = [extract_data(fn) for fn in glob(controls_dir)]
    controls_df = pd.DataFrame(controls)
    prob_ad_df = pd.DataFrame(prob_ad)
    controls_df['ad'] = 0
    prob_ad_df['ad'] = 1
    df = pd.concat([controls_df, prob_ad_df]).sample(frac=1).reset_index(drop=True)
    return df

In [96]:
train_df = parse_train_data()

In [12]:
segmented_speech = train_df.loc[:, ['clean_par_speech', 'ad', 'mmse']].apply(lambda r: pd.DataFrame({'speech_sent': r.clean_par_speech, 'ad': r.ad, 'mmse': r.mmse}), axis=1).tolist()

In [13]:
# explode out each segment into AD / control segments
pd.concat(segmented_speech)

Unnamed: 0,speech_sent,ad,mmse
0,can I start ?,0,30
1,there is a boy on the stool .,0,30
2,the stool is tipping over .,0,30
3,the boy is taking cookies out_of the jar .,0,30
4,&uh the cupboard is open .,0,30
...,...,...,...
7,the little boy is standing on a stool and he's...,1,16
8,the little girl's standing there .,1,16
9,the mother's &st at the sink doing dishes .,1,16
10,(.) her water's overflowing .,1,16


### Tran Base-line TfIdf / Random Forest Models / SVM Models

In [14]:
train_df.columns

Index(['id', 'mmse', 'sex', 'age', 'speech', 'clean_speech',
       'clean_par_speech', 'joined_all_speech', 'joined_all_par_speech',
       'per_sent_times', 'total_time', 'time_before_par_speech',
       'time_between_sents', 'ad'],
      dtype='object')

In [15]:
random_state = 42

In [25]:
def cv10_avg(score, model, features, labels):
    return round(cross_val_score(model, features, labels, cv=10, scoring=score).sum() / 10, 2)

In [16]:
base_lines_train_df = train_df.loc[:, ['joined_all_par_speech', 'ad', 'mmse']]

In [66]:
def baseline_models(input_col):
    ## AD Classification Pred
    # sklearn pipeline
    param_space = {
        'vec__max_features': [100, 500, 1000, 2000, 10000],
        'vec__stop_words': ['english', None],
        'vec__analyzer': ['word', 'char'],
        'vec__sublinear_tf': [True, False],
        'clf__n_estimators': [100, 200, 500, 700],   # gbdt params
        'clf__max_depth': [3, 5, 10],   # gbdt params
#         'clf__C': [0.1, 0.5, 1.],               # SVC params
#         'clf__kernel': ['rbf', 'sigmoid']       # SVC params
    }

    pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('clf', GradientBoostingClassifier())
#         ('clf', SVC())
    ])
    train_features, test_features, train_labels, test_labels = train_test_split(train_df[input_col], train_df.ad, random_state=random_state, test_size=0.2)
    search = GridSearchCV(pipe, param_space, cv=5, n_jobs=6)
    search.fit(train_features, train_labels)

    pipe.set_params(**search.best_params_)
    print(search.best_params_)
    pipe.fit(train_features, train_labels)
    preds = pipe.predict(test_features)
    print('prec, rec, f1 test', precision_recall_fscore_support(test_labels, preds))
    print(f'accu:{cv10_avg("accuracy", pipe, train_df[input_col], train_df.ad)}')
    print(f'prec:{cv10_avg("precision", pipe, train_df[input_col], train_df.ad)}')
    print(f'rec:{cv10_avg("recall", pipe, train_df[input_col], train_df.ad)}')
    print(f'f1:{cv10_avg("f1", pipe, train_df[input_col], train_df.ad)}')

    ## MMSE Regression Pred
    reg_features, reg_scores = train_df[input_col].iloc[0:36].tolist() + train_df[input_col][37:].tolist(), train_df[train_df.mmse != ''].mmse
    train_features, test_features, train_scores, test_scores = train_test_split(reg_features, reg_scores, random_state=random_state, test_size=0.2)

    # sklearn pipeline
    param_space = {
        'vec__max_features': [100, 500, 1000, 2000, 10000],
        'vec__stop_words': ['english', None],
        'vec__analyzer': ['word', 'char'],
        'vec__sublinear_tf': [True, False],
        'clf__n_estimators': [100, 200, 500],   # gbdt params
        'clf__max_depth': [3, 5, 10, 20, 50],   # gbdt params
#         'clf__C': [0.1, 0.5, 1.],               # SVC params
#         'clf__kernel': ['rbf', 'sigmoid']       # SVC params
    }

    pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('clf', GradientBoostingRegressor())
#         ('clf', SVR())
    ])

    search = GridSearchCV(pipe, param_space, cv=5, n_jobs=6)
    search.fit(train_features, train_scores)

    pipe.set_params(**search.best_params_)
    print(search.best_params_)
    pipe.fit(train_features, train_scores)
    preds = pipe.predict(test_features)
    print('rmse test:', sqrt(mean_squared_error(test_scores, preds)))
    print('rmse cv:', cross_val_score(pipe, reg_features, reg_scores, cv=10, scoring='neg_root_mean_squared_error').sum() / 10)

In [None]:
# par speech only
baseline_models('joined_all_par_speech')

In [67]:
# par + inv speech
baseline_models('joined_all_speech')

{'clf__max_depth': 3, 'clf__n_estimators': 100, 'vec__analyzer': 'word', 'vec__max_features': 10000, 'vec__stop_words': None, 'vec__sublinear_tf': True}
prec, rec, f1 test (array([0.69230769, 0.88888889]), array([0.9       , 0.66666667]), array([0.7826087 , 0.76190476]), array([10, 12]))
accu:0.78
prec:0.77
rec:0.79
f1:0.79
{'clf__max_depth': 5, 'clf__n_estimators': 500, 'vec__analyzer': 'word', 'vec__max_features': 2000, 'vec__stop_words': 'english', 'vec__sublinear_tf': False}
rmse test: 6.382709167789705
rmse cv: -5.898423756596404


## BERT (type) model Experimentaton

In [None]:
# For DistilBERT:
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# For DistilroBERTa:
# model_class, tokenizer_class, pretrained_weights = (ppb.RobertaModel, ppb.RobertaTokenizer, 'distilroberta-base')

# BERT Base
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# BERT Large
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-large-uncased')

# roBERTa base
# model_class, tokenizer_class, pretrained_weights = (ppb.RobertaModel, ppb.RobertaTokenizer, 'roberta-base')

# roBERTa large
model_class, tokenizer_class, pretrained_weights = (ppb.RobertaModel, ppb.RobertaTokenizer, 'roberta-large')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Only Participant Speech
tokenized = train_df.joined_all_par_speech.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))
# All (INV + PAR) speech
# tokenized = train_df.joined_all_speech.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

# pad so can be treated as one batch
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

# attention mask - zero out attention scores where there is no input to be processed (i.e. is padding)
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# check if multiple GPUs are available
multi_gpu = torch.cuda.device_count() > 1

if torch.cuda.is_available():
    model = model.to(device)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
last_hidden_states = last_hidden_states[0]
if device.type == 'cuda':
    last_hidden_states = last_hidden_states.cpu()
features = last_hidden_states[:,0,:].numpy()

In [None]:
def fit_transform(features, train_df):
    def cv10_avg_nn(score, model, features, labels):
        return cv10_avg(score, lr_clf, test_features, train_df.ad)

    # AD classification task
    train_features, test_features, train_labels, test_labels = train_test_split(features, train_df.ad, random_state=random_state)
    parameters = {'C': np.linspace(0.0001, 100, 20)}
    grid_search = GridSearchCV(LogisticRegression(), parameters)
    grid_search.fit(train_features, train_labels)
    print('best parameters:', grid_search.best_params_)
    print('best scores: ', grid_search.best_score_)
    lr_clf = LogisticRegression(**grid_search.best_params_)
    lr_clf.fit(train_features, train_labels)
    preds = lr_clf.predict(test_features)
    print('prec, rec, f1 test', precision_recall_fscore_support(test_labels, preds))
    print(f'accu:{cv10_avg_nn("accuracy", model)} prec:{cv10_avg_nn("precision")}, rec:{cv10_avg_nn("recall")}, f1:{cv10_avg_nn("f1")}')
          
    # MMSE regression task
    # remove missing row
    reg_features, reg_scores = np.vstack([features[0:36], features[37:]]), train_df[train_df.mmse != ''].mmse
    train_features, test_features, train_scores, test_scores = train_test_split(reg_features, reg_scores, random_state=random_state)
    parameters = {'alpha': np.linspace(0.001, 100, 20)}
    grid_search = GridSearchCV(Ridge(), parameters)
    grid_search.fit(train_features, train_scores)
    print('best parameters:', grid_search.best_params_)
    print('best scores: ', grid_search.best_score_)
    reg_model = Ridge(**grid_search.best_params_)
    reg_model.fit(train_features, train_scores)
    preds = reg_model.predict(test_features)
    print('rmse test:', sqrt(mean_squared_error(test_scores, preds)))
    print('rmse cv:', cross_val_score(reg_model, reg_features, reg_scores, cv=10, scoring='neg_root_mean_squared_error').sum() / 10)

In [None]:
# NLP Features
bert_features = features

In [97]:
# time features
# - Embed time total time taken 
# - parse time blocks, take first and last
# - Embed total time taken per sentence
# - Time before starting speech
# - Time in between each sentence
# - Average / min / max / median time of sentence
time_dims = train_df.loc[:, ['total_time', 'time_before_par_speech', 'time_between_sents', 'per_sent_times']]

In [98]:
time_dims['avg_betweeen_sents'] = time_dims.time_between_sents.apply(lambda t: round(sum(t) / len(t)))
time_dims['max'] = time_dims.time_between_sents.apply(max)
time_dims['min'] = time_dims.time_between_sents.apply(min)
time_dims_eng = time_dims.drop('time_between_sents', axis=1)

In [99]:
time_dims

Unnamed: 0,total_time,time_before_par_speech,time_between_sents,per_sent_times,avg_betweeen_sents,max,min
0,28249,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7580, 7580, 2750, 2750, 1670, 1670, 1277, 127...",0,0,0
1,39304,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3700, 2167, 2633, 2633, 3795, 3795, 2785, 278...",0,0,0
2,108061,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2475, 3757, 7084, 7084, 7681, 7681, 3657, 365...",0,0,0
3,70330,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[997, 831, 1014, 1014, 3958, 3958, 3102, 3102,...",0,0,0
4,84322,0,"[0, 0, 0, 0, 0, 0, 100, 0, 0, 0, 0, 0, 0, 0, 0...","[1829, 1064, 3271, 3806, 770, 770, 14216, 1421...",3,100,0
...,...,...,...,...,...,...,...
103,190561,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2913, 11657, 11657, 14986, 14986, 12979, 1297...",49,732,0
104,56600,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1829, 2013, 2013, 558, 558, 1919, 1919, 12302...",0,0,0
105,88250,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1573, 5913, 5913, 2614, 2614, 4376, 4376, 652...",439,10520,0
106,81475,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1853, 6097, 6097, 1512, 1512, 7161, 7161, 610...",34,897,0


In [105]:
len(train_df.clean_par_speech.iloc[0])

7

In [82]:
time_features = StandardScaler().fit_transform(time_dims_eng.to_numpy())

In [None]:
class BertForADClassification(torch.nn.Module):
    def __init__(self, bertModel, time_dims,):
        super().__init__()
        
        self.bertModel = bertModel
        self.dropout = torch.nn.Dropout(0.5)
        
        # separate Linear for OHE of time
        bert_hidden_dim = self.bertModel
    
        self.classifier = nn.Linear()
    
    def forward(input_ids, attention_mask):
        last_hidden_states = self.bertModel(input_ids, attention_mask=attention_mask)
        features = last_hidden_states[:,0,:]

In [None]:
# Concat all features
features = np.hstack([bert_features, time_features])

In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))