### Import Libraries

In [89]:
import numpy as np
import glob, os
import pandas as pd
import inflect
import re
import time
# next we can import some sklearn libraries to start working with stuff
## transformers and pipline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
## model selectors
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, GroupKFold, GroupShuffleSplit
## models
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
## feature extractors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from average_word_length_extractor import AverageWordLengthExtractor
from question_extractor import QuestionExtractor
from int_to_words_extractor import NumberStringExtractor
from greeting_extractor import GreetingExtractor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from text_preprocessor import TextPreprocessor
from sentiment_extractor import SentimentExtractor
from ner_extractor import NERExtractor
# save model
from sklearn.externals import joblib
# deep learning models
import keras
from keras.models import Sequential, Model
from keras.layers import LSTM, TimeDistributed, Dense, Bidirectional, Input, Flatten
import keras.backend as K
from keras.wrappers.scikit_learn import KerasClassifier

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

### Reading train data

In [3]:
# read data
train_files = glob.glob('../data/data_v1/TrainCSV_Updated/*.csv')
train = pd.concat([pd.read_csv(fp).assign(train_set=re.split('_|, ',os.path.basename(fp))[0]) for fp in train_files])

In [4]:
train.head(3)

Unnamed: 0,stringList,speakerID,semanticType,leading,Symptom,PMH,MEDS,ALLG,FAMHx,lifestyle,...,GS4089,GS4090,GS4091,GS4092,GS4093,GS4094,GS4095,supportProvision,stringedQuestion,train_set
0,so why don't you tell me what brings you here ...,doctor,openQuestion,no,no,no,no,no,no,no,...,0.070285,0.009144,-0.020626,0.031314,-0.003403,-0.006069,-0.008571,no,no,3
1,so I've been having this kind of random fast w...,patient,statement,no,yes,no,no,no,no,no,...,0.035766,-0.009635,0.036159,-0.019223,0.01171,0.054135,0.056419,no,no,3
2,okay that's pretty fast,doctor,statement,no,no,no,no,no,no,no,...,0.130121,-0.03389,0.023881,0.00209,0.000385,0.019991,0.093138,no,no,3


In [5]:
# convert yes to True(1) and no to False(0)
train = train.replace(to_replace={'yes': 1, 'no': 0})

In [6]:
# remove train_set columns
train.drop(['train_set'], axis = 1, inplace = True)

In [7]:
# features over which we have to predict
prediction_columns = ['stringedQuestion',
                   'leading', 'Symptom', 'PMH', 'MEDS', 'ALLG', 'FAMHx', 'lifestyle',
                   'pysch', 'SOCHx', 'sexualHistory', 'substanceUse', 'PE', 'FORM',
                   'supportProvision', 'transition']

In [8]:
# extract doctor interactions
train = train.loc[train.speakerID == 'doctor']

In [9]:
# get case values
cases = train['case_ID'].values

In [10]:
# drop nas for now
train.dropna(inplace=True, axis = 1)

### Feature Selection

In [11]:
# check the balance for each prediction feature
for n,i in enumerate(prediction_columns):
    balance = train.loc[:, train.columns == prediction_columns[n]][i].value_counts()[1] / train.shape[0]
    print('%s: %0.3f%%'%(i, balance*100))

stringedQuestion: 2.496%
leading: 0.263%
Symptom: 20.297%
PMH: 4.643%
MEDS: 1.239%
ALLG: 0.671%
FAMHx: 3.735%
lifestyle: 1.562%
pysch: 2.216%
SOCHx: 3.548%
sexualHistory: 0.475%
substanceUse: 2.496%
PE: 13.531%
FORM: 11.104%
supportProvision: 3.311%
transition: 8.124%


### Feature Engineering

In [12]:
# feature engineering
## count vectorizer
### word level
ngram_count_word = CountVectorizer(ngram_range=(1, 3), analyzer='word', token_pattern=r'\w{1,}', max_features= 2000)
### char level
ngram_count_char = CountVectorizer(ngram_range=(1, 2), analyzer='char')
## tf idf vectorizer
### word level
tf_idf_word = TfidfVectorizer(ngram_range=(1, 3), analyzer='word', token_pattern=r'\w{1,}', max_features= 2000)
### char level
tf_idf_char = TfidfVectorizer(ngram_range=(1, 2), analyzer='char')
text_preprocessor = TextPreprocessor()
avg_word = AverageWordLengthExtractor()
question_ex = QuestionExtractor()
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
numberstring= NumberStringExtractor()
std_scaler = StandardScaler()
greeting_ex = GreetingExtractor()
sent_extractor = SentimentExtractor()
ner_extractor = NERExtractor()

## Modelling

In [76]:
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [13]:
#convert numbers to text
num_to_str = ColumnTransformer([
    ('numberstring', numberstring, 3)
], remainder='passthrough')

# original feature encoders
original_feat = ColumnTransformer([
    ('avg_word', avg_word, 0),
    ('question', question_ex, 0),
    ('greeting', greeting_ex, 0),
    ('sentiment', sent_extractor, 0),
    ('ner', ner_extractor, 0)
], remainder = 'passthrough')

# text preprocessor
text_pre = ColumnTransformer([
    ('text_preprocessor', text_preprocessor, 0)
], remainder= 'passthrough')

# text encoders
encoders = ColumnTransformer([
    ('ngram_char', ngram_count_char, 0),
    ('ngram_word', ngram_count_word, 0),
    ('tdf_idf_char', tf_idf_char, 0),
    ('tdf_idf_word', tf_idf_char, 0)
], remainder = 'passthrough')

# one hot encoding
one_hot = ColumnTransformer([
    ('one_hot', one_hot_encoder, ['semanticType'])
], remainder = 'passthrough')

# text pipeline
text_features = Pipeline([
    ('num_to_str', num_to_str),
    ('original_feat', original_feat),
    ('text_pre', text_pre),
    ('encoders', encoders)
])

# preprocessing
preprocess = Pipeline([
    ('one_hot_encoder', one_hot),
    ('text_feat', text_features),
    ('std_scaler', std_scaler)
])

# ml pipeline
ml_pipeline = Pipeline([
    ('preprocessor', preprocess),
    ('model', LGBMClassifier(n_estimators = 100, n_jobs = -1))
])

### Modelling

In [14]:
# model dictionary
models = {}
# select columns by data
X_train = train.drop(prediction_columns, axis=1)
unusable_columns = ['speakerID', 'case_ID']
X_train = X_train.drop(unusable_columns, axis=1)
# shuffle split generator
group_shuffle_split = GroupShuffleSplit(n_splits=1, test_size = 0.2, random_state = 10)

In [15]:
# process X_train
X_train_processed = preprocess.fit_transform(X_train)

In [17]:
input_x = []
for i in np.unique(cases):
    A = np.zeros(shape = (np.unique(cases, return_counts=True)[1].max(), X_train_processed.shape[1]))
    A[:X_train_processed[cases == i].shape[0], :X_train_processed[cases == i].shape[1]] = X_train_processed[cases == i]
    input_x.append(A)

In [26]:
input_x = np.array(input_x)

In [71]:
input_y = []
y_trains = train.loc[:, train.columns == prediction_columns[2]]
for i in np.unique(cases):
    A = np.zeros(shape = (np.unique(cases, return_counts=True)[1].max(), 1))
    A[:y_train[cases == i].shape[0], ] = y_train[cases == i]
    input_y.append(A)

In [95]:
input_y = np.array(input_y)

In [147]:
# define LSTM
# model = Sequential()
# model.add(Bidirectional(LSTM(100, return_sequences=True,dropout=0.50), input_shape=(input_x.shape[1], input_x.shape[2]),merge_mode='concat'))
# model.add(TimeDistributed(Dense(100, activation='relu')))
# model.add(TimeDistributed(Flatten()))
# model.add(Dense(100,activation='relu'))
# model.add(Dense(1,activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', recall_m, precision_m, f1_m])

In [148]:
input = Input(shape=(input_x.shape[1], input_x.shape[2],))
model = Bidirectional(LSTM(100, return_sequences=True,dropout=0.50),merge_mode='concat')(input)
model = TimeDistributed(Dense(100,activation='relu'))(model)
model = TimeDistributed(Flatten())(model)
model = Dense(100,activation='relu')(model)
output = Dense(1,activation='sigmoid')(model)
model = Model(input,output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', recall_m, precision_m, f1_m])

In [149]:
# # main model
# input = Input(shape=(input_x.shape[1], input_x.shape[2],))
# model =  Bidirectional (LSTM (100,return_sequences=True,dropout=0.50),merge_mode='concat')(input)
# model = TimeDistributed(Dense(100,activation='relu'))(model)
# model = Flatten()(model)
# model = Dense(100,activation='relu')(model)
# output = Dense(1,activation='sigmoid')(model)
# model = Model(input,output)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', recall_m, precision_m, f1_m])

In [150]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 202, 7774)         0         
_________________________________________________________________
bidirectional_29 (Bidirectio (None, 202, 200)          6300000   
_________________________________________________________________
time_distributed_38 (TimeDis (None, 202, 100)          20100     
_________________________________________________________________
time_distributed_39 (TimeDis (None, 202, 100)          0         
_________________________________________________________________
dense_69 (Dense)             (None, 202, 100)          10100     
_________________________________________________________________
dense_70 (Dense)             (None, 202, 1)            101       
Total params: 6,330,301
Trainable params: 6,330,301
Non-trainable params: 0
_________________________________________________________________


In [151]:
model.fit(input_x, input_y, epochs=20, batch_size=16, verbose=1, validation_split = 0.2)

Train on 68 samples, validate on 18 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ac50d3b00>