### Import Libraries

In [1]:
import numpy as np
import glob, os
import pandas as pd
import inflect
import re
import time
# next we can import some sklearn libraries to start working with stuff
## transformers and pipline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
## model selectors
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, GroupKFold, GroupShuffleSplit
## models
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
## feature extractors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from average_word_length_extractor import AverageWordLengthExtractor
from question_extractor import QuestionExtractor
from int_to_words_extractor import NumberStringExtractor
from greeting_extractor import GreetingExtractor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from text_preprocessor import TextPreprocessor
from sentiment_extractor import SentimentExtractor
from ner_extractor import NERExtractor
# save model
from sklearn.externals import joblib

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

### Reading train data

In [3]:
# read data
train_files = glob.glob('../data/data_v1/TrainCSV_Updated/*.csv')
train = pd.concat([pd.read_csv(fp).assign(train_set=re.split('_|, ',os.path.basename(fp))[0]) for fp in train_files])

In [4]:
train.head(3)

Unnamed: 0,stringList,speakerID,semanticType,leading,Symptom,PMH,MEDS,ALLG,FAMHx,lifestyle,...,GS4089,GS4090,GS4091,GS4092,GS4093,GS4094,GS4095,supportProvision,stringedQuestion,train_set
0,so why don't you tell me what brings you here ...,doctor,openQuestion,no,no,no,no,no,no,no,...,0.070285,0.009144,-0.020626,0.031314,-0.003403,-0.006069,-0.008571,no,no,3
1,so I've been having this kind of random fast w...,patient,statement,no,yes,no,no,no,no,no,...,0.035766,-0.009635,0.036159,-0.019223,0.01171,0.054135,0.056419,no,no,3
2,okay that's pretty fast,doctor,statement,no,no,no,no,no,no,no,...,0.130121,-0.03389,0.023881,0.00209,0.000385,0.019991,0.093138,no,no,3


In [5]:
# convert yes to True(1) and no to False(0)
train = train.replace(to_replace={'yes': 1, 'no': 0})

In [6]:
# remove train_set columns
train.drop(['train_set'], axis = 1, inplace = True)

In [7]:
# features over which we have to predict
prediction_columns = ['stringedQuestion',
                   'leading', 'Symptom', 'PMH', 'MEDS', 'ALLG', 'FAMHx', 'lifestyle',
                   'pysch', 'SOCHx', 'sexualHistory', 'substanceUse', 'PE', 'FORM',
                   'supportProvision', 'transition']

In [8]:
# extract doctor interactions
train = train.loc[train.speakerID == 'doctor']

In [9]:
# get case values
cases = train['case_ID'].values

In [10]:
# drop nas for now
train.dropna(inplace=True, axis = 1)

### Feature Selection

In [11]:
# check the balance for each prediction feature
for n,i in enumerate(prediction_columns):
    balance = train.loc[:, train.columns == prediction_columns[n]][i].value_counts()[1] / train.shape[0]
    print('%s: %0.3f%%'%(i, balance*100))

stringedQuestion: 2.496%
leading: 0.263%
Symptom: 20.297%
PMH: 4.643%
MEDS: 1.239%
ALLG: 0.671%
FAMHx: 3.735%
lifestyle: 1.562%
pysch: 2.216%
SOCHx: 3.548%
sexualHistory: 0.475%
substanceUse: 2.496%
PE: 13.531%
FORM: 11.104%
supportProvision: 3.311%
transition: 8.124%


### Feature Engineering

In [12]:
# feature engineering
## count vectorizer
### word level
ngram_count_word = CountVectorizer(ngram_range=(1, 3), analyzer='word', token_pattern=r'\w{1,}', max_features= 2000)
### char level
ngram_count_char = CountVectorizer(ngram_range=(1, 2), analyzer='char')
## tf idf vectorizer
### word level
tf_idf_word = TfidfVectorizer(ngram_range=(1, 3), analyzer='word', token_pattern=r'\w{1,}', max_features= 2000)
### char level
tf_idf_char = TfidfVectorizer(ngram_range=(1, 2), analyzer='char')
text_preprocessor = TextPreprocessor()
avg_word = AverageWordLengthExtractor()
question_ex = QuestionExtractor()
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
numberstring= NumberStringExtractor()
std_scaler = StandardScaler()
greeting_ex = GreetingExtractor()
sent_extractor = SentimentExtractor()
ner_extractor = NERExtractor()

## Modelling

In [13]:
#convert numbers to text
num_to_str = ColumnTransformer([
    ('numberstring', numberstring, 3)
], remainder='passthrough')

# original feature encoders
original_feat = ColumnTransformer([
    ('avg_word', avg_word, 0),
    ('question', question_ex, 0),
    ('greeting', greeting_ex, 0),
    ('sentiment', sent_extractor, 0),
    ('ner', ner_extractor, 0)
], remainder = 'passthrough')

# text preprocessor
text_pre = ColumnTransformer([
    ('text_preprocessor', text_preprocessor, 0)
], remainder= 'passthrough')

# text encoders
encoders = ColumnTransformer([
    ('ngram_char', ngram_count_char, 0),
    ('ngram_word', ngram_count_word, 0),
    ('tdf_idf_char', tf_idf_char, 0),
    ('tdf_idf_word', tf_idf_char, 0)
], remainder = 'passthrough')

# one hot encoding
one_hot = ColumnTransformer([
    ('one_hot', one_hot_encoder, ['semanticType'])
], remainder = 'passthrough')

# text pipeline
text_features = Pipeline([
    ('num_to_str', num_to_str),
    ('original_feat', original_feat),
    ('text_pre', text_pre),
    ('encoders', encoders)
])

# preprocessing
preprocess = Pipeline([
    ('one_hot_encoder', one_hot),
    ('text_feat', text_features),
    ('std_scaler', std_scaler)
])

# ml pipeline
ml_pipeline = Pipeline([
    ('preprocessor', preprocess),
    ('model', LGBMClassifier(n_estimators = 100, n_jobs = -1))
])

### Grid Search

In [43]:
# model dictionary
models = {}
# select columns by data
X_train = train.drop(prediction_columns, axis=1)
unusable_columns = ['speakerID', 'case_ID']
X_train = X_train.drop(unusable_columns, axis=1)
# shuffle split generator
group_shuffle_split = GroupShuffleSplit(n_splits=1, test_size = 0.2, random_state = 10)

In [15]:
for n,i in enumerate(prediction_columns):
    # select y
    start = time.time()
    y_train = train.loc[:, train.columns == prediction_columns[n]]
    gen = group_shuffle_split.split(X_train,y_train,cases)
    
    print(i + ':' + str(cross_val_score(ml_pipeline, X_train, y_train, scoring = 'average_precision', cv = gen).mean()))
    print('Time for variable ' + i + ': ' + str(time.time()-start) + 'sec')

stringedQuestion:0.8014697734264332
Time for variable stringedQuestion: 229.39206719398499sec
leading:0.030889804276295313
Time for variable leading: 266.3376178741455sec
Symptom:0.9547048748032048
Time for variable Symptom: 295.232079744339sec
PMH:0.7885114023759484
Time for variable PMH: 299.2815730571747sec
MEDS:0.8615078717244583
Time for variable MEDS: 233.33761620521545sec
ALLG:1.0
Time for variable ALLG: 228.71168899536133sec
FAMHx:0.8320563896031345
Time for variable FAMHx: 255.08805584907532sec
lifestyle:0.8840681490477549
Time for variable lifestyle: 253.30153703689575sec
pysch:0.7780490757187416
Time for variable pysch: 284.6124269962311sec
SOCHx:0.7071571499405478
Time for variable SOCHx: 264.4002182483673sec
sexualHistory:0.7452115866589551
Time for variable sexualHistory: 248.07816982269287sec
substanceUse:0.9354087430257639
Time for variable substanceUse: 252.40762186050415sec
PE:0.9778911837393216
Time for variable PE: 277.16190814971924sec
FORM:0.8653104797926156
Time 

In [28]:
grid = {
    'model__n_estimators': np.logspace(1, 3.5, 6).astype(int),
    'model__max_depth': np.linspace(2, 10, 5).astype(int),
    'model__num_leaves': np.linspace(4, 100, 5).astype(int),
    'model__boosting_type': ['gbdt', 'dart'],
    'model__learning_rate': np.logspace(-3, 1, 9)
}

for n, i in enumerate(prediction_columns):
    # select y
    start = time.time()
    y_train = train.loc[:, train.columns == prediction_columns[n]]
    gen = group_shuffle_split.split(X_train,y_train,cases)
    
    grid_search = RandomizedSearchCV(ml_pipeline, param_distributions=grid, n_iter = 10, scoring = 'average_precision', cv = gen, random_state= 42)
    grid_search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    best_pipeline = grid_search.best_estimator_
    models[i] = best_pipeline
    print('Time for variable ' + i + ': ' + str(time.time()-start) + ' sec')

Best parameter (CV score=0.849):
Time for variable stringedQuestion: 3334.3416290283203 sec
Best parameter (CV score=0.286):
Time for variable leading: 3373.469489097595 sec
Best parameter (CV score=0.943):
Time for variable Symptom: 3757.107565164566 sec
Best parameter (CV score=0.769):
Time for variable PMH: 3517.8814380168915 sec
Best parameter (CV score=0.924):
Time for variable MEDS: 19912.929650068283 sec
Best parameter (CV score=1.000):
Time for variable ALLG: 3261.4875032901764 sec
Best parameter (CV score=0.894):
Time for variable FAMHx: 3195.3665862083435 sec
Best parameter (CV score=0.920):
Time for variable lifestyle: 3125.5118799209595 sec
Best parameter (CV score=0.785):
Time for variable pysch: 3242.5578072071075 sec
Best parameter (CV score=0.693):
Time for variable SOCHx: 3269.94765996933 sec
Best parameter (CV score=0.691):
Time for variable sexualHistory: 3021.43749499321 sec
Best parameter (CV score=0.895):
Time for variable substanceUse: 3012.010580062866 sec
Best 

In [52]:
# save model to directory
timestr = time.strftime("%Y%m%d-%H%M%S")
directory = '../model/' + timestr + '/'
if not os.path.exists(directory):
    os.makedirs(directory)

In [53]:
# save model in folder
file_name = 'LightGBM-grid-search-' + timestr + '.pkl'
joblib.dump(models, directory + file_name)

['../model/20190427-134642/LightGBM-grid-search-20190427-134642.pkl']