In [1]:
import pandas as pd
import pickle

from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score,  make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.utils import shuffle

# Prepare the data

In [2]:
# Root folder of the project. The cells below read from `<filepath>/data/`
# and write search results under `<filepath>/results/`; edit to match your setup.
filepath = "."

Load the datasets

In [3]:
# Read the pickled train and test frames from the data folder, in that order.
train_df, test_df = (
    pd.read_pickle(filepath + relative_path)
    for relative_path in ('/data/pharm_train_df.pkl', '/data/pharm_test_df.pkl')
)

# Each dictionary maps an original indicator column (key) to the coarser label
# (value) it is merged into by the recoding loops further down.

# Study design: every original design collapses to 'comparative' or 'review'.
design_recode_dict = {
    'casecontrol':'comparative',
    'cohortpro':'comparative',
    'cohortretro':'comparative',
    'metaanalysis':'review',
    'prepost':'comparative',
    'quasirando':'comparative',
    'rando':'comparative',
    'sysreview':'review',
}

# Clinical field: several original fields share one recoded label.
field_recode_dict = {
    'anticoag':'Anticoagulation',
    'army':'Other field',
    'asthmaallergypneumo':'Other field',
    'cardiology':'Cardiology',
    'critcare':'Critical care',
    'dermatology':'Other field',
    'diabetesendocrino':'Other field',
    'druginfo':'Other field',
    'emerg':'Emergency medicine',
    'gastro':'Other field',
    'geriatrics':'Geriatric',
    'infectio':'Infectious diseases',
    'neonat':'Maternal / pediatric / neonatal',
    'nephro':'Other field',
    'neuro':'Other field',
    'obgyn':'Maternal / pediatric / neonatal',
    'oncology':'Oncology',
    'ophtalmo':'Other field',
    'ortho':'Other field',
    'other':'Other field',
    'pall':'Palliative care',
    'ped':'Maternal / pediatric / neonatal',
    'psych':'Psychiatric',
    'publichealth':'Infectious diseases',
    'surgery':'Other field',
    'transplant':'Solid organ transplantation',
    'vaccin':'Infectious diseases'
}

# Practice setting.
setting_recode_dict = {
    'ambulatory':'Ambulatory',
    'community':'Community',
    'hospital':'Inpatient',
    'mixed':'Other setting',
    'other':'Other setting',
}

# Original columns feeding each design label (dictionary order is preserved).
comparative_columns = [orig for orig, label in design_recode_dict.items() if label == 'comparative']
review_columns = [orig for orig, label in design_recode_dict.items() if label == 'review']

# Unique recoded labels. They are sorted because iterating a set of strings
# can yield a different order on every kernel restart, which would make column
# selections and printed summaries differ from run to run.
design_labels = sorted(set(design_recode_dict.values()))
field_labels = sorted(set(field_recode_dict.values()))
setting_labels = sorted(set(setting_recode_dict.values()))

def _add_recoded_indicators(df, recode_dict):
    """Add one 0/1 column per recoded label to ``df`` (modified in place).

    A row gets 1 for a label when at least one of the original columns mapped
    to that label in ``recode_dict`` equals 1, and 0 otherwise.
    """
    # dict.fromkeys removes duplicate labels while keeping first-seen order,
    # so each label is computed once and columns are created in a stable order.
    for recoded_label in dict.fromkeys(recode_dict.values()):
        source_columns = [orig for orig, label in recode_dict.items() if label == recoded_label]
        # Row-wise "any source column equals 1"; this avoids indexing .loc
        # with a 2-D boolean array.
        df[recoded_label] = (df[source_columns] == 1).any(axis=1).astype('int64')


# NOTE(review): the key 'other' appears in both field_recode_dict and
# setting_recode_dict, so the same source column feeds 'Other field' and
# 'Other setting' -- confirm this is intended.
for df in [train_df, test_df]:
    for recode_dict in (design_recode_dict, field_recode_dict, setting_recode_dict):
        _add_recoded_indicators(df, recode_dict)

# Summarise the design labels in each split.
print('DESIGN DATA\n')
for df, dfname in zip([train_df, test_df], ['Train', 'Test']):
    print(dfname + ' dataset')
    print('Number of papers in dataset: {}'.format(len(df)))
    for design_label in ('comparative', 'review'):
        print('Number of {} papers: {}'.format(design_label, df[design_label].sum()))
    print('\n')

# Keep only the papers that carry at least one design label.
design_train_df = train_df.drop(train_df.loc[(train_df['comparative'] == 0) & (train_df['review'] == 0)].index)
design_test_df = test_df.drop(test_df.loc[(test_df['comparative'] == 0) & (test_df['review'] == 0)].index)

print('After dropping papers that are not comparative nor reviews:')
print('Number of papers in train dataset: {}'.format(len(design_train_df)))
print('Number of papers in test dataset: {}'.format(len(design_test_df)))

# Sanity check: the binary target is only well defined if no paper has both labels.
# Each comparison is parenthesised on its own; `&` binds tighter than `==`,
# so `(a == 1) & (b) == 1` would be parsed as `((a == 1) & b) == 1`.
for design_df, split_name in zip([design_train_df, design_test_df], ['train', 'test']):
    labeled_both = (design_df['comparative'] == 1) & (design_df['review'] == 1)
    print('Number of papers that are labeled both as comparative and review in {} dataset: {}'.format(
        split_name, len(design_df.loc[labeled_both])))

# Summarise the recoded field labels in each split.
print('\nFIELD DATA\n')
for df, dfname in zip([train_df, test_df], ['Train', 'Test']):
    print(dfname + ' dataset')
    print('Number of papers in dataset: {}'.format(len(df)))
    for recoded_field in field_labels:
        print('Number of {} papers: {}'.format(recoded_field, df[recoded_field].sum()))
    print('\n')

print('Distribution of papers by number of fields: \n{}'.format((train_df[field_labels] == 1).sum(axis=1).value_counts()))

# Multiclass target: keep only the papers tagged with exactly one field.
field_train_df = train_df.drop(train_df.loc[(train_df[field_labels] == 1).sum(axis=1) != 1].index)
field_test_df = test_df.drop(test_df.loc[(test_df[field_labels] == 1).sum(axis=1) != 1].index)

# Name of the single field column that equals 1 on each row.
# idxmax returns the column *label*; a positional argmax over field_labels
# must not be used to index the frame's full column list, because position 0
# there is an unrelated column.
field_train_df['field'] = field_train_df[field_labels].idxmax(axis=1)
field_test_df['field'] = field_test_df[field_labels].idxmax(axis=1)

print('After dropping papers that have no fields or more than one field:')
print('Number of papers in train dataset: {}'.format(len(field_train_df)))
print('Number of papers in test dataset: {}'.format(len(field_test_df)))

# Summarise the recoded setting labels in each split.
print('\nSETTING DATA\n')
for df, dfname in zip([train_df, test_df], ['Train', 'Test']):
    print(dfname + ' dataset')
    print('Number of papers in dataset: {}'.format(len(df)))
    for recoded_setting in setting_labels:
        print('Number of {} papers: {}'.format(recoded_setting, df[recoded_setting].sum()))
    print('\n')

print('Distribution of papers by number of settings: \n{}'.format((train_df[setting_labels] == 1).sum(axis=1).value_counts()))

# Keep the papers with exactly one setting, then discard the catch-all 'Other setting'.
setting_train_df = train_df.drop(train_df.loc[(train_df[setting_labels] == 1).sum(axis=1) != 1].index)
setting_train_df = setting_train_df.drop(setting_train_df.loc[setting_train_df['Other setting'] == 1].index)
setting_test_df = test_df.drop(test_df.loc[(test_df[setting_labels] == 1).sum(axis=1) != 1].index)
setting_test_df = setting_test_df.drop(setting_test_df.loc[setting_test_df['Other setting'] == 1].index)

# Name of the single setting column that equals 1 on each row.
# idxmax returns the column *label*; a positional argmax over setting_labels
# must not be used to index the frame's full column list.
setting_train_df['setting'] = setting_train_df[setting_labels].idxmax(axis=1)
setting_test_df['setting'] = setting_test_df[setting_labels].idxmax(axis=1)

print('After dropping papers that have no setting, more than one setting or "Other" setting:')
print('Number of papers in train dataset: {}'.format(len(setting_train_df)))
print('Number of papers in test dataset: {}'.format(len(setting_test_df)))


DESIGN DATA

Train dataset
Number of papers in dataset: 1982
Number of comparative papers: 898
Number of review papers: 157


Test dataset
Number of papers in dataset: 496
Number of comparative papers: 216
Number of review papers: 49


After dropping papers that are not comparative nor reviews:
Number of papers in train dataset: 1055
Number of papers in test dataset: 265
Number of papers that are labeled both as comparative and review in train dataset: 0
Number of papers that are labeled both as comparative and review in test dataset: 0

FIELD DATA

Train dataset
Number of papers in dataset: 1982
Number of Critical care papers: 78
Number of Emergency medicine papers: 75
Number of Infectious diseases papers: 251
Number of Maternal, pediatric and neonatal papers: 101
Number of Psychiatric papers: 85
Number of Other field papers: 656
Number of Cardiology papers: 270
Number of Anticoagulation papers: 84
Number of Geriatric papers: 143
Number of Solid organ transplantation papers: 25
Number

## For the fields and settings, encode the "field" and "setting" columns with LabelEncoder

In [4]:
# One encoder per task, fitted on the training labels only. The test labels are
# mapped with the *same* fitted encoder so that an integer code means the same
# class in both splits; re-fitting on the test split could assign different
# codes whenever the two splits do not contain the same set of classes.
# `transform` raises ValueError if the test split holds a label that was never
# seen during training.
field_le = LabelEncoder()
encoded_train_fields = field_le.fit_transform(field_train_df['field'])
encoded_test_fields = field_le.transform(field_test_df['field'])

setting_le = LabelEncoder()
encoded_train_settings = setting_le.fit_transform(setting_train_df['setting'])
encoded_test_settings = setting_le.transform(setting_test_df['setting'])

# Kept so that any other cell still referring to `le` finds an encoder
# for the setting labels, as before.
le = setting_le

# Helper functions


In [5]:
def grid_search(model_name, search_name, pipe, param, metrics_dict, refit, features, labels):
  """Run a 5-fold GridSearchCV, pickle its results table and print a summary.

  Parameters:
    model_name: sub-folder of `<filepath>/results/` that receives the results.
    search_name: file name (without extension) of the pickled results table.
    pipe: estimator or pipeline handed to GridSearchCV.
    param: parameter grid (dict or list of dicts) handed to GridSearchCV.
    metrics_dict: value passed as GridSearchCV's `scoring`; when it is a dict,
      its keys name the `mean_test_<key>` columns that are printed.
    refit: value passed as GridSearchCV's `refit`.
    features: training inputs passed to `fit`.
    labels: training targets passed to `fit`.

  Returns:
    (grid, results_df): the fitted GridSearchCV object and `cv_results_` as a
    DataFrame.
  """
  import os  # local import so the helper does not depend on the notebook's import cell

  results_path = filepath + '/results/' + model_name + '/' + search_name + '.pkl'
  # Create the output folder *before* the search: otherwise a missing folder
  # is only noticed after every fit has run, and the results are lost.
  os.makedirs(os.path.dirname(results_path), exist_ok=True)

  grid = GridSearchCV(pipe,param,scoring=metrics_dict, refit=refit, cv=5, verbose=2, n_jobs=-1)
  grid.fit(features,labels)
  results_df = pd.DataFrame.from_dict(grid.cv_results_)
  with open(results_path, mode='wb') as file:
    pickle.dump(results_df, file)

  # best_params_ / best_index_ are not set for every refit/scoring combination.
  if hasattr(grid, 'best_params_'):
    print('Best parameters: {}'.format(grid.best_params_))

  if isinstance(metrics_dict, dict) and refit is not True:
    # Multi-metric search: one mean_test_<name> column per scorer, in dict order.
    metrics_list = ['mean_test_' + name for name in metrics_dict]
  else:
    # Single metric (or a non-dict scoring spec): read the mean test columns
    # from the results table rather than assuming a dict.
    metrics_list = [col for col in results_df.columns if col.startswith('mean_test_')]
  if len(metrics_list) == 1:
    # A single column is printed as a Series.
    metrics_list = metrics_list[0]
  print(results_df[metrics_list])

  if hasattr(grid, 'best_index_'):
    print('Best index: {}'.format(grid.best_index_))
  return grid, results_df

# Explore possible models to tag design type

## Basic models (LSI, LDA) with a bunch of classifiers

### First grid search for design classification (binary, comparative vs review)

In [10]:
# Three-step skeleton; every step is a placeholder that the grid below fills in.
basic_pipe = Pipeline(steps=[
    ('feature_ext_1', 'passthrough'),
    ('feature_ext_2', 'passthrough'),
    ('classify', 'passthrough'),
])

# Word counts -> LDA topics -> one of six classifiers.
lda_candidates = {
    'feature_ext_1': [CountVectorizer()],
    'feature_ext_2': [LatentDirichletAllocation()],
    'feature_ext_2__n_components': [32, 64, 128],
    'classify': [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
    ],
}

# TF-IDF -> truncated SVD (LSI) -> one of six classifiers.
lsi_candidates = {
    'feature_ext_1': [TfidfVectorizer()],
    'feature_ext_2': [TruncatedSVD()],
    'feature_ext_2__n_components': [32, 64, 128, 256],
    'classify': [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
    ],
}

# Word counts fed directly to multinomial naive Bayes (second step left as passthrough).
naive_bayes_candidates = {
    'feature_ext_1': [CountVectorizer()],
    'classify': [MultinomialNB()],
}

basic_param = [lda_candidates, lsi_candidates, naive_bayes_candidates]

# Binary scorers; the 'review' column is the target.
design_scoring = {'f1':'f1', 'accuracy':'accuracy', 'recall':'recall', 'precision':'precision'}

basic_grid, basic_grid_df = grid_search(
    model_name='design/sklearn_hp_search',
    search_name='basic',
    pipe=basic_pipe,
    param=basic_param,
    metrics_dict=design_scoring,
    refit='f1',
    features=design_train_df['text'],
    labels=design_train_df['review'],
)

Fitting 5 folds for each of 43 candidates, totalling 215 fits
Best parameters: {'classify': SVC(), 'feature_ext_1': TfidfVectorizer(), 'feature_ext_2': TruncatedSVD(n_components=128), 'feature_ext_2__n_components': 128}
    mean_test_f1  mean_test_accuracy  mean_test_recall  mean_test_precision
0       0.676260            0.914584          0.630873             0.838095
1       0.716860            0.924107          0.630000             0.880000
2       0.661988            0.895644          0.569286             0.792857
3       0.738986            0.912756          0.795397             0.848485
4       0.835829            0.973410          0.869127             0.834692
5       0.815245            0.963918          0.885397             0.815414
6       0.699756            0.919350          0.635635             0.850739
7       0.803100            0.959138          0.761508             0.871573
8       0.828410            0.956299          0.733333             0.969231
9       0.797914    

## Second grid search

Try to fine-tune the SVC, which was the best classifier in the first grid search.

In [9]:
# Same three-step skeleton as the first search; the grid fills in each step.
svc_pipe = Pipeline(steps=[
    ('feature_ext_1', 'passthrough'),
    ('feature_ext_2', 'passthrough'),
    ('classify', 'passthrough'),
])

# TF-IDF -> 128-component truncated SVD is fixed; only the SVC's C and kernel vary.
svc_candidates = {
    'feature_ext_1': [TfidfVectorizer()],
    'feature_ext_2': [TruncatedSVD()],
    'feature_ext_2__n_components': [128],
    'classify': [SVC()],
    'classify__C': [0.1, 0.2, 0.5, 0.8, 1, 2, 10, 20, 60, 100, 150, 200],
    'classify__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
svc_param = [svc_candidates]

design_scoring = {'f1':'f1', 'accuracy':'accuracy', 'recall':'recall', 'precision':'precision'}

# Note: the results are stored under the same names as the first search.
basic_grid, basic_grid_df = grid_search(
    model_name='design/sklearn_hp_search',
    search_name='svc_finetune',
    pipe=svc_pipe,
    param=svc_param,
    metrics_dict=design_scoring,
    refit='f1',
    features=design_train_df['text'],
    labels=design_train_df['review'],
)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'classify': SVC(C=10, kernel='sigmoid'), 'classify__C': 10, 'classify__kernel': 'sigmoid', 'feature_ext_1': TfidfVectorizer(), 'feature_ext_2': TruncatedSVD(n_components=128), 'feature_ext_2__n_components': 128}
    mean_test_f1  mean_test_accuracy  mean_test_recall  mean_test_precision
0       0.030769            0.848025          0.016667             0.200000
1       0.094444            0.853740          0.053571             0.400000
2       0.346703            0.863259          0.224127             1.000000
3       0.508060            0.881350          0.370873             1.000000
4       0.563812            0.890864          0.428095             0.950000
5       0.467362            0.884180          0.323016             0.957143
6       0.835327            0.942126          0.734444             0.987234
7       0.858990            0.951614          0.768492             0.985965
8       0.885915        

# Explore models to tag fields (multiclass classification)

In [6]:
# Three-step skeleton; every step is a placeholder that the grid below fills in.
basic_pipe = Pipeline(steps=[
    ('feature_ext_1', 'passthrough'),
    ('feature_ext_2', 'passthrough'),
    ('classify', 'passthrough'),
])

# Word counts -> LDA topics -> one of six classifiers.
lda_candidates = {
    'feature_ext_1': [CountVectorizer()],
    'feature_ext_2': [LatentDirichletAllocation()],
    'feature_ext_2__n_components': [32, 64, 128],
    'classify': [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
    ],
}

# TF-IDF -> truncated SVD (LSI) -> one of six classifiers.
lsi_candidates = {
    'feature_ext_1': [TfidfVectorizer()],
    'feature_ext_2': [TruncatedSVD()],
    'feature_ext_2__n_components': [32, 64, 128, 256, 512, 1024, 2048],
    'classify': [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
    ],
}

# Word counts fed directly to multinomial naive Bayes (second step left as passthrough).
naive_bayes_candidates = {
    'feature_ext_1': [CountVectorizer()],
    'classify': [MultinomialNB()],
}

basic_param = [lda_candidates, lsi_candidates, naive_bayes_candidates]

# Multiclass target, so precision / recall / F1 are weighted averages over classes.
field_scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': 'f1_weighted',
}

basic_grid, basic_grid_df = grid_search(
    model_name='field/sklearn_hp_search',
    search_name='basic',
    pipe=basic_pipe,
    param=basic_param,
    metrics_dict=field_scoring,
    refit='f1',
    features=field_train_df['text'],
    labels=encoded_train_fields,
)

Fitting 5 folds for each of 61 candidates, totalling 305 fits
Best parameters: {'classify': SVC(), 'feature_ext_1': TfidfVectorizer(), 'feature_ext_2': TruncatedSVD(n_components=256), 'feature_ext_2__n_components': 256}
    mean_test_accuracy  mean_test_precision  mean_test_recall  mean_test_f1
0             0.392653             0.363987          0.392653      0.258384
1             0.393469             0.344970          0.393469      0.260474
2             0.415510             0.396006          0.415510      0.294085
3             0.395102             0.353401          0.395102      0.286062
4             0.449796             0.387689          0.449796      0.359058
..                 ...                  ...               ...           ...
56            0.693061             0.722532          0.693061      0.685698
57            0.668571             0.715639          0.668571      0.671158
58            0.691429             0.738883          0.691429      0.695904
59            0.6914

In [7]:
# Show the full results row of the best candidate. The row is looked up from
# the fitted search instead of a hard-coded number, so it stays correct if the
# grid or the winning candidate changes.
basic_grid_df.iloc[basic_grid.best_index_]

mean_fit_time                                                                 4.496799
std_fit_time                                                                  0.066972
mean_score_time                                                               0.182166
std_score_time                                                                0.026189
param_classify                                                                   SVC()
param_feature_ext_1                                                  TfidfVectorizer()
param_feature_ext_2                                     TruncatedSVD(n_components=256)
param_feature_ext_2__n_components                                                  256
params                               {'classify': SVC(), 'feature_ext_1': TfidfVect...
split0_test_accuracy                                                          0.808163
split1_test_accuracy                                                          0.791837
split2_test_accuracy                       

## Second grid search

Fine tune SVC

In [7]:
# Same three-step skeleton as the first search; the grid fills in each step.
svc_pipe = Pipeline(steps=[
    ('feature_ext_1', 'passthrough'),
    ('feature_ext_2', 'passthrough'),
    ('classify', 'passthrough'),
])

# TF-IDF -> 256-component truncated SVD is fixed; only the SVC's C and kernel vary.
svc_candidates = {
    'feature_ext_1': [TfidfVectorizer()],
    'feature_ext_2': [TruncatedSVD()],
    'feature_ext_2__n_components': [256],
    'classify': [SVC()],
    'classify__C': [0.1, 0.2, 0.5, 0.8, 1, 2, 10, 20, 60, 100, 150, 200],
    'classify__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
svc_param = [svc_candidates]

# Multiclass target, so precision / recall / F1 are weighted averages over classes.
field_scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': 'f1_weighted',
}

# Note: the results are stored under the same names as the first search.
basic_grid, basic_grid_df = grid_search(
    model_name='field/sklearn_hp_search',
    search_name='svc_finetune',
    pipe=svc_pipe,
    param=svc_param,
    metrics_dict=field_scoring,
    refit='f1',
    features=field_train_df['text'],
    labels=encoded_train_fields,
)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'classify': SVC(C=10), 'classify__C': 10, 'classify__kernel': 'rbf', 'feature_ext_1': TfidfVectorizer(), 'feature_ext_2': TruncatedSVD(n_components=256), 'feature_ext_2__n_components': 256}
    mean_test_accuracy  mean_test_precision  mean_test_recall  mean_test_f1
0             0.361633             0.133254          0.361633      0.194096
1             0.361633             0.133254          0.361633      0.194096
2             0.362449             0.153726          0.362449      0.195743
3             0.373061             0.281325          0.373061      0.216408
4             0.409796             0.436174          0.409796      0.280459
5             0.371429             0.329122          0.371429      0.213376
6             0.426122             0.440107          0.426122      0.304935
7             0.485714             0.453144          0.485714      0.390598
8             0.580408             0.690329   

In [8]:
# Show the full results row of the best candidate. The row is looked up from
# the fitted search instead of a hard-coded number, so it stays correct if the
# grid or the winning candidate changes.
basic_grid_df.iloc[basic_grid.best_index_]

mean_fit_time                                                                 4.076437
std_fit_time                                                                    0.1865
mean_score_time                                                               0.152739
std_score_time                                                                0.013468
param_classify                                                               SVC(C=10)
param_classify__C                                                                   10
param_classify__kernel                                                             rbf
param_feature_ext_1                                                  TfidfVectorizer()
param_feature_ext_2                                     TruncatedSVD(n_components=256)
param_feature_ext_2__n_components                                                  256
params                               {'classify': SVC(C=10), 'classify__C': 10, 'cl...
split0_test_accuracy                       

# Models for setting classification

In [6]:
# Three-step skeleton; every step is a placeholder that the grid below fills in.
basic_pipe = Pipeline(steps=[
    ('feature_ext_1', 'passthrough'),
    ('feature_ext_2', 'passthrough'),
    ('classify', 'passthrough'),
])

# Word counts -> LDA topics -> one of six classifiers.
lda_candidates = {
    'feature_ext_1': [CountVectorizer()],
    'feature_ext_2': [LatentDirichletAllocation()],
    'feature_ext_2__n_components': [32, 64, 128],
    'classify': [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
    ],
}

# TF-IDF -> truncated SVD (LSI) -> one of six classifiers.
lsi_candidates = {
    'feature_ext_1': [TfidfVectorizer()],
    'feature_ext_2': [TruncatedSVD()],
    'feature_ext_2__n_components': [32, 64, 128, 256, 512, 1024, 2048],
    'classify': [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
    ],
}

# Word counts fed directly to multinomial naive Bayes (second step left as passthrough).
naive_bayes_candidates = {
    'feature_ext_1': [CountVectorizer()],
    'classify': [MultinomialNB()],
}

basic_param = [lda_candidates, lsi_candidates, naive_bayes_candidates]

# Multiclass target, so precision / recall / F1 are weighted averages over classes.
setting_scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': 'f1_weighted',
}

basic_grid, basic_grid_df = grid_search(
    model_name='setting/sklearn_hp_search',
    search_name='basic',
    pipe=basic_pipe,
    param=basic_param,
    metrics_dict=setting_scoring,
    refit='f1',
    features=setting_train_df['text'],
    labels=encoded_train_settings,
)

Fitting 5 folds for each of 61 candidates, totalling 305 fits
Best parameters: {'classify': SVC(), 'feature_ext_1': TfidfVectorizer(), 'feature_ext_2': TruncatedSVD(n_components=512), 'feature_ext_2__n_components': 512}
    mean_test_accuracy  mean_test_precision  mean_test_recall  mean_test_f1
0             0.636851             0.656586          0.636851      0.588140
1             0.631328             0.649124          0.631328      0.584578
2             0.639992             0.640113          0.639992      0.591261
3             0.657223             0.647240          0.657223      0.632061
4             0.660315             0.654286          0.660315      0.635416
..                 ...                  ...               ...           ...
56            0.648598             0.649145          0.648598      0.645438
57            0.675105             0.675303          0.675105      0.669380
58            0.665240             0.665808          0.665240      0.659863
59            0.6701

In [7]:
# Show the full results row of the best candidate. The row is looked up from
# the fitted search instead of a hard-coded number, so it stays correct if the
# grid or the winning candidate changes.
basic_grid_df.iloc[basic_grid.best_index_]

mean_fit_time                                                                13.899044
std_fit_time                                                                  1.564539
mean_score_time                                                               0.710647
std_score_time                                                                0.182185
param_classify                                                                   SVC()
param_feature_ext_1                                                  TfidfVectorizer()
param_feature_ext_2                                     TruncatedSVD(n_components=512)
param_feature_ext_2__n_components                                                  512
params                               {'classify': SVC(), 'feature_ext_1': TfidfVect...
split0_test_accuracy                                                          0.787692
split1_test_accuracy                                                          0.787692
split2_test_accuracy                       

## Fine tune SVC

In [8]:
# Same three-step skeleton as the first search; the grid fills in each step.
svc_pipe = Pipeline(steps=[
    ('feature_ext_1', 'passthrough'),
    ('feature_ext_2', 'passthrough'),
    ('classify', 'passthrough'),
])

# TF-IDF -> 512-component truncated SVD is fixed; only the SVC's C and kernel vary.
svc_candidates = {
    'feature_ext_1': [TfidfVectorizer()],
    'feature_ext_2': [TruncatedSVD()],
    'feature_ext_2__n_components': [512],
    'classify': [SVC()],
    'classify__C': [0.1, 0.2, 0.5, 0.8, 1, 2, 10, 20, 60, 100, 150, 200],
    'classify__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
svc_param = [svc_candidates]

# Multiclass target, so precision / recall / F1 are weighted averages over classes.
setting_scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': 'f1_weighted',
}

# Note: the results are stored under the same names as the first search.
basic_grid, basic_grid_df = grid_search(
    model_name='setting/sklearn_hp_search',
    search_name='svc_finetune',
    pipe=svc_pipe,
    param=svc_param,
    metrics_dict=setting_scoring,
    refit='f1',
    features=setting_train_df['text'],
    labels=encoded_train_settings,
)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'classify': SVC(C=200), 'classify__C': 200, 'classify__kernel': 'rbf', 'feature_ext_1': TfidfVectorizer(), 'feature_ext_2': TruncatedSVD(n_components=512), 'feature_ext_2__n_components': 512}
    mean_test_accuracy  mean_test_precision  mean_test_recall  mean_test_f1
0             0.534517             0.521034          0.534517      0.387591
1             0.519113             0.270525          0.519113      0.355384
2             0.522811             0.456492          0.522811      0.363516
3             0.563493             0.517557          0.563493      0.442055
4             0.649193             0.723148          0.649193      0.576259
5             0.523425             0.458321          0.523425      0.365866
6             0.628237             0.748803          0.628237      0.543166
7             0.685576             0.721220          0.685576      0.630903
8             0.770044             0.781310 

In [9]:
# Show the full results row of the best candidate. The row is looked up from
# the fitted search instead of a hard-coded number, so it stays correct if the
# grid or the winning candidate changes.
basic_grid_df.iloc[basic_grid.best_index_]

mean_fit_time                                                                14.391287
std_fit_time                                                                  1.159286
mean_score_time                                                               0.382147
std_score_time                                                                0.143912
param_classify                                                              SVC(C=200)
param_classify__C                                                                  200
param_classify__kernel                                                             rbf
param_feature_ext_1                                                  TfidfVectorizer()
param_feature_ext_2                                     TruncatedSVD(n_components=512)
param_feature_ext_2__n_components                                                  512
params                               {'classify': SVC(C=200), 'classify__C': 200, '...
split0_test_accuracy                       