### Classifier development with questionnaire + IDP (qsidp) features

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier

In [2]:
from compare_hyperparams import full_labels, remove_subjs, load_feats, cv_classify



#### Extracting patients with a single condition (multi-condition patients removed)

In [3]:
df_out = full_labels('patients_pain', save=False)

/well/seymour/users/uhu195/python/extract_npy//labels/label_patients_pain.csv
(2724, 1)
(2724, 6)


In [5]:
bfl_dir = '/well/seymour/users/uhu195/python/pain/'
bfloutput_dir = os.path.join(bfl_dir, 'output_patients_50')
data_dir = os.path.join(bfloutput_dir, 'Result_IC30')

In [6]:
df_featout_ex = remove_subjs(data_dir, df_out) # remove multiple conditions

In [7]:
df_featout_ex.shape

(2280, 36)

In [10]:
bmrc_full_save = df_featout_ex['bmrc'].rename('eid')

In [14]:
bmrc_full_save.to_csv('./bmrc_full/subjs_patients_pain_exmult.csv', index=None)

In [19]:
subjs_full_save = df_featout_ex['eid']

In [22]:
subjs_full_save.to_csv('./subjs/subjs_patients_pain_exmult.csv', index=None)

In [15]:
dat = './npy/subjs_patients_pain_exmult/'

In [17]:
tt = np.load(os.path.join(dat, 'cope1.npy'))

In [18]:
tt.shape

(2280, 228453)

#### create pain minus matched for filter

In [116]:
# fname = 'subjs_patients_pain_exmult.csv'
fname = 'subjs_pain_minus_matched.csv'
df_original = pd.read_csv('./subjs/'+fname, header=None)
df_bmrc = pd.read_csv('./bmrc_full/'+fname)

In [111]:
df_original.shape

(14422, 1)

In [112]:
df_bmrc.shape

(14423, 1)

In [104]:
def revert_eid(df_bridge, in_csv, save=False):
    """Translate the BMRC ids listed in ``./bmrc_full/<in_csv>`` into project eids.

    Parameters
    ----------
    df_bridge : pandas.DataFrame
        Bridge table; the columns ``eid_8107`` (matched against the ``eid``
        column of the BMRC csv) and ``eid_45465`` (the returned ids) are read.
    in_csv : str
        File name, used both for the input under ``./bmrc_full/`` and, when
        saving, for the output under ``./subjs/``.
    save : bool
        When true, the ids are also written to ``./subjs/<in_csv>`` without
        index or header.

    Returns
    -------
    pandas.Series
        ``eid_45465`` values, cast to int, for every BMRC id found in the
        bridge table (inner join: ids without a bridge entry are dropped).
    """
    bmrc_ids = pd.read_csv('./bmrc_full/' + in_csv)
    matched = pd.merge(bmrc_ids, df_bridge, left_on='eid', right_on='eid_8107')
    project_eids = matched['eid_45465'].astype(int)
    if not save:
        return project_eids
    project_eids.to_csv('./subjs/' + in_csv, index=None, header=None)
    return project_eids

In [105]:
df_bridge = pd.read_csv('../bridge_file/bridge_8107_45465.csv')

In [106]:
df_original = revert_eid(df_bridge, fname, save=True)

In [118]:
# Pair every bmrc id with its project eid through the bridge table.
# Pairing by row position (pd.concat(..., axis=1)) is only valid when both lists
# have identical length and order; the recorded shapes (14422 eids vs 14423 bmrc
# ids) show one bmrc id has no bridge entry, which shifts every later pair.
# how='left' keeps all bmrc rows in their original order; an unmatched id gets a
# NaN eid instead of silently borrowing its neighbour's.
df_combine = (
    df_bmrc.rename(columns={'eid': 'bmrc'})
    .merge(df_bridge[['eid_8107', 'eid_45465']],
           left_on='bmrc', right_on='eid_8107', how='left')
    .rename(columns={'eid_45465': 'eid'})
    [['eid', 'bmrc']]
)

In [119]:
df_combine.shape

(14423, 2)

In [120]:
df_combine.to_csv('./subjs_bmrc/'+fname, index=None)

#### functions

In [3]:
def extract_qs(df_subjects, df_questionnaire, visits=(2,)):
    """Select the columns of ``df_subjects`` that belong to a questionnaire set.

    Column names are expected to look like ``<code>-<visit>.<array>``
    (e.g. ``1031-2.0``).

    Parameters
    ----------
    df_subjects : pandas.DataFrame
        Subject table; must contain an ``eid`` column.
    df_questionnaire : pandas.DataFrame
        Table with a ``code`` column listing the field codes of interest.
    visits : iterable or None
        Visit numbers to keep for fields that have more than one column.
        Fields with a single column are always kept, whatever its visit.
        ``None`` disables the visit filter.

    Returns
    -------
    pandas.DataFrame
        The selected columns in a deterministic order (codes in the order
        given, ``eid`` last), without duplicated column names.
    """
    visit_ids = None if visits is None else [str(visit) for visit in visits]

    field_cols = []
    for code in df_questionnaire['code'].to_list():
        code_root = str(code) + '-'
        code_cols = [col for col in df_subjects.columns
                     if str(col).startswith(code_root)]
        if visit_ids is not None and len(code_cols) > 1:
            # Compare the whole visit token: a substring test such as
            # '-2' in col would also select visits 20, 21, ...
            code_cols = [col
                         for visit in visit_ids
                         for col in code_cols
                         if str(col)[len(code_root):].split('.', 1)[0] == visit]
        field_cols += code_cols
    field_cols.append('eid')

    # dict.fromkeys de-duplicates while keeping first-seen order; set() would
    # shuffle the feature columns differently on every interpreter start.
    field_cols = list(dict.fromkeys(field_cols))
    df_qs = df_subjects[field_cols]
    # guard against duplicated column names already present in df_subjects
    return df_qs.loc[:, ~df_qs.columns.duplicated()]

In [4]:
def _load_code_sets(selection, known_sets, all_sets, prefix, base_dir, error_msg):
    """Read and stack the ``<prefix><name>_code.csv`` tables chosen by ``selection``."""
    if selection is None or len(selection) == 0:
        return pd.DataFrame()

    def read_set(name):
        return pd.read_csv(os.path.join(base_dir, prefix + name + '_code.csv'))

    if isinstance(selection, str):
        if selection == 'all':
            return pd.concat([read_set(name) for name in all_sets])
        if selection in known_sets:
            return read_set(selection)
        raise ValueError(error_msg)
    if isinstance(selection, (list, tuple)):
        # explicit lists are read as given; a missing file surfaces as the
        # FileNotFoundError raised by pandas
        return pd.concat([read_set(name) for name in selection])
    raise ValueError(error_msg)


def load_qscode(questionnaire='all', idp=None, base_dir='./bbk_codes/'):
    """Load questionnaire and/or IDP field-code tables.

    Parameters
    ----------
    questionnaire : str, list, tuple or None
        A single set name (``'lifestyle'``, ``'mental'``, ``'cognitive'``,
        ``'digestive'``, ``'cwp'``, ``'demographic'``), a list of set names,
        or ``'all'``. Note that ``'all'`` loads only lifestyle, mental,
        cognitive and demographic. ``None`` or an empty value loads nothing.
    idp : str, list, tuple or None
        Same convention for the IDP sets (``'dmri'``, ``'wdmri'``, ``'fast'``,
        ``'subcorticalvol'``, ``'t1vols'``, ``'t2star'``, ``'t2weighted'``,
        ``'taskfmri'``); ``'all'`` loads every one of them. Files are named
        ``idp_<name>_code.csv``.
    base_dir : str
        Directory holding the ``*_code.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Questionnaire rows followed by IDP rows.

    Raises
    ------
    ValueError
        If a single set name is unknown or the selection has an unsupported type.
    """
    questionnaire_sets = ['lifestyle', 'mental', 'cognitive', 'digestive', 'cwp', 'demographic']
    questionnaire_all = ['lifestyle', 'mental', 'cognitive', 'demographic']
    idp_sets = ['dmri', 'wdmri', 'fast', 'subcorticalvol', 't1vols', 't2star',
                't2weighted', 'taskfmri']

    df_qs = _load_code_sets(questionnaire, questionnaire_sets, questionnaire_all,
                            '', base_dir, 'Questionnaire code does not exist.')
    df_idp = _load_code_sets(idp, idp_sets, idp_sets,
                             'idp_', base_dir, 'IDP code does not exist.')
    # combine questionnaire with idp
    return pd.concat([df_qs, df_idp])

In [5]:
def impute_qs(df, nan_percent=0.9, freq_fill='median',
              transform=False, transform_fn='sqrt'):
    """Run the questionnaire imputation pipeline on a copy of ``df``.

    Steps, in order: ``replace_noans`` -> ``replace_multifield`` ->
    ``replace_specific`` -> ``replace_freq(use=freq_fill)`` -> optional
    ``apply_transform(use=transform_fn)`` -> drop sparse columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire table; it is not modified.
    nan_percent : float
        Despite the name, this is the minimum fraction of NON-missing rows a
        column needs to be kept (it is passed to ``dropna`` as ``thresh``).
    freq_fill : str
        Forwarded to ``replace_freq`` as ``use``.
    transform : bool
        Whether to call ``apply_transform``.
        NOTE(review): ``apply_transform`` is not defined in this part of the
        notebook - confirm it is available before enabling this flag.
    transform_fn : str
        Forwarded to ``apply_transform`` as ``use``.

    Returns
    -------
    pandas.DataFrame
        The imputed table.
    """
    imputed = replace_noans(df.copy())         # "prefer not to say" + object columns
    imputed = replace_multifield(imputed)      # multiple-choice fields
    imputed = replace_specific(imputed)        # field-specific defaults
    imputed = replace_freq(imputed, use=freq_fill)
    if transform:
        imputed = apply_transform(imputed, use=transform_fn)
    min_non_nan = int(nan_percent * imputed.shape[0])
    imputed.dropna(axis=1, thresh=min_non_nan, inplace=True)
    return imputed

def replace_noans(df):
    """Drop object columns and fill NaN with -818 where -818 is already in use.

    Every column except ``label`` is inspected:

    * object-dtype columns (e.g. time stamps) are removed;
    * columns that already contain -818 ("prefer not to say") get their
      missing values filled with -818.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy.
    """
    df_copy = df.copy()
    feature_cols = [col for col in df_copy.columns if col != 'label']
    object_cols = [col for col in feature_cols if df_copy[col].dtype == object]
    # remove time stamp cols in one go instead of dropping inside the loop
    df_copy = df_copy.drop(columns=object_cols)
    for col in feature_cols:
        if col in object_cols:
            continue
        if (df_copy[col] == -818).any():
            # Assign the result back: `df[col].replace(..., inplace=True)` acts on
            # a temporary Series and leaves the frame untouched under
            # pandas Copy-on-Write.
            df_copy[col] = df_copy[col].fillna(-818.)
    return df_copy

def replace_multifield(df):
    """Fill NaN in multiple-choice fields with -7 ("none of the above").

    Only columns whose name starts with ``<code>-`` for one of the listed
    field codes are touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with the multiple-choice columns filled.
    """
    df_copy = df.copy()
    categories_multi = [
        '6160',  # Leisure/social activities
        '6145',  # Illness, injury, bereavement, stress in last 2 years
    ]
    for cat in categories_multi:
        prefix = cat + '-'
        for col in df_copy.columns:
            if str(col).startswith(prefix):
                # Assign back instead of `df[col].replace(..., inplace=True)`,
                # which is a no-op under pandas Copy-on-Write.
                df_copy[col] = df_copy[col].fillna(-7.)
    return df_copy

def replace_specific(df):
    """Fill NaN in selected fields with a field-specific default value.

    A column is matched when its name starts with ``<code>-``. This is the
    same rule ``replace_multifield`` uses; a plain substring test would also
    hit longer field ids that merely contain the code.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with the listed fields filled.
    """
    df_copy = df.copy()
    categories_zero = [
        '20123',  # Single episode of probable major depression
        '20124',  # Probable recurrent major depression (moderate)
        '20125',  # Probable recurrent major depression (severe)
        '20481',  # Self-harmed in past year
        '20484',  # Attempted suicide in past year
        '20122',  # Bipolar disorder status
        '20126',  # Bipolar and major depression status
    ]
    categories_nts = [
        '20414',  # Frequency of drinking alcohol
    ]
    categories_to = [
        '20246',  # Trail making completion status
        '20245',  # Pairs matching completion status
        '20244',  # Symbol digit completion status
    ]
    fill_by_code = {}
    fill_by_code.update({cat: 0. for cat in categories_zero})
    fill_by_code.update({cat: -818. for cat in categories_nts})  # treat as prefer not to say
    fill_by_code.update({cat: 1. for cat in categories_to})      # treat as abandoned

    for c in df_copy.columns:
        code, sep, _ = str(c).partition('-')
        if sep and code in fill_by_code:
            # Assign back instead of `df[c].replace(..., inplace=True)`, which
            # is a no-op under pandas Copy-on-Write.
            df_copy[c] = df_copy[c].fillna(fill_by_code[code])
    return df_copy

def replace_freq(df, use='median'):
    """Fill NaN in frequency-like columns.

    Every column except ``label`` is handled according to its number of
    distinct non-missing values:

    * more than 7 distinct values (most likely a frequency or an IDP): NaN is
      replaced by the column median (``use='median'``) or mean
      (``use='mean'``); any other ``use`` leaves the column unchanged;
    * 7 or fewer distinct values: if the column contains -3 or -1
      ("prefer not to say" style codes), NaN is replaced by -3.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    use : str
        ``'median'`` or ``'mean'``.

    Returns
    -------
    pandas.DataFrame
        A copy with the fills applied.
    """
    max_categories = 7  # columns with more distinct values are treated as continuous
    df_copy = df.copy()
    for c in df_copy.columns:
        if c == 'label':  # never touch the target
            continue
        values = df_copy[c]
        # Results are assigned back: `df[c].fillna(..., inplace=True)` acts on a
        # temporary Series and is a no-op under pandas Copy-on-Write.
        if values.nunique() > max_categories:
            if use == 'median':
                df_copy[c] = values.fillna(values.median())
            elif use == 'mean':
                df_copy[c] = values.fillna(values.mean())
        elif ((values == -3.) | (values == -1.)).any():  # prefer not to say
            df_copy[c] = values.fillna(-3.)
        # NOTE: columns using -600 (degree of bother) are deliberately left as-is.
    return df_copy

In [181]:
bestIC = 50
bfl_dir = '/well/seymour/users/uhu195/python/pain/'
# bfloutput_dir = os.path.join(bfl_dir, 'output_patients_50')
bfloutput_dir = os.path.join(bfl_dir, 'output_patients_500')

In [182]:
curr_dir = '/well/seymour/users/uhu195/python/extract_npy'
# param_dir = os.path.join(curr_dir, 'hyperparam_cv', 'patients_optuna.csv')

In [183]:
bp_path = os.path.join(curr_dir, 'hyperparam_cv', f'best_params_IC{bestIC}.npy')
params = np.load(bp_path, allow_pickle='TRUE').item() # load dict

In [184]:
params

{'max_depth': 10,
 'n_estimators': 225,
 'max_features': 0.6326227223728347,
 'min_samples_split': 12,
 'min_samples_leaf': 14,
 'max_samples': 0.8879865707952637}

In [185]:
# load data (500 IC)
d = f'Result_IC{bestIC}'
data_dir = os.path.join(bfloutput_dir, d)
df_out = full_labels('patients_pain', save=False)
df_featout_ex = remove_subjs(data_dir, df_out) # remove multiple conditions
print(df_featout_ex.shape)
#     X_train, y_train, _, _ = load_feats(df_featout_ex, bestIC, train=True, balance=True, scaler=True)
# X_train, y_train = load_feats(df_featout_ex, bestIC, train=False, balance=True, scaler=True)
# # train clf
# forest = RandomForestClassifier(**params)
# # load data
# forest.fit(X_train, y_train)

cv_classify(df_featout_ex, bestIC, classifier='rforest', tuned_params=params, cv_fold=5, scaler=True, balance=True)

/well/seymour/users/uhu195/python/extract_npy//labels/label_patients_pain.csv
(2724, 1)
(2724, 6)
(2280, 56)
(2280, 50)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=10, max_features=0.6326227223728347,
                       max_samples=0.8879865707952637, min_samples_leaf=14,
                       min_samples_split=12, n_estimators=225):
test ROC AUC=0.5714, test accuracy=0.3307, test f1=0.3307


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,2.94885,0.063872,0.333333,0.333333,0.584607
1,2.887013,0.064186,0.326667,0.326667,0.569126
2,2.901357,0.063468,0.33,0.33,0.588207
3,2.900222,0.063643,0.296667,0.296667,0.530859
4,2.923391,0.064991,0.366667,0.366667,0.5844


In [10]:
# load data
d = f'Result_IC{bestIC}'
data_dir = os.path.join(bfloutput_dir, d)
df_out = full_labels('patients_pain', save=False)
df_featout_ex = remove_subjs(data_dir, df_out) # remove multiple conditions
print(df_featout_ex.shape)
#     X_train, y_train, _, _ = load_feats(df_featout_ex, bestIC, train=True, balance=True, scaler=True)
# X_train, y_train = load_feats(df_featout_ex, bestIC, train=False, balance=True, scaler=True)
# # train clf
# forest = RandomForestClassifier(**params)
# # load data
# forest.fit(X_train, y_train)

cv_classify(df_featout_ex, bestIC, classifier='rforest', tuned_params=params, cv_fold=5, scaler=True, balance=True)

/well/seymour/users/uhu195/python/extract_npy//labels/label_patients_pain.csv
(2724, 1)
(2724, 6)
(2280, 106)
(2280, 100)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=2, max_features=0.5236846481050532,
                       max_samples=0.6766668567081835, min_samples_leaf=14,
                       min_samples_split=6, n_estimators=250):
test ROC AUC=0.5680, test accuracy=0.3293, test f1=0.3293


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,1.661849,0.062518,0.333333,0.333333,0.576711
1,1.658394,0.061787,0.356667,0.356667,0.596844
2,1.661305,0.062138,0.326667,0.326667,0.574222
3,1.656882,0.060656,0.29,0.29,0.5436
4,1.651546,0.06068,0.34,0.34,0.548815


In [11]:
df_qsidp = pd.read_csv('./qsidp/qsidp_patients_pain.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [78]:
# select 
qs = load_qscode(questionnaire='all', idp=None)
# qs = load_qscode(questionnaire=None, idp='all')

In [79]:
df_qs = extract_qs(df_qsidp, df_questionnaire=qs, visits=[2])

In [80]:
df_qs.shape

(3363, 153)

In [81]:
# section used ones before imputing
df_qs_sec = df_qs[df_qs['eid'].isin(df_featout_ex['eid'])]
print(df_qs_sec.shape)

(2280, 153)


In [82]:
# impute qs
df_qs_imputed = impute_qs(df_qs_sec, nan_percent=0.9, freq_fill='median', 
              transform=False, transform_fn='sqrt')

In [157]:
df_qs_imputed.head()

Unnamed: 0,20403-0.0,20483-0.0,26431-0.0,20522-0.0,1031-2.0,20157-0.0,20523-0.0,6160-2.0,20195-0.0,20248-0.0,...,20499-0.0,2060-2.0,2020-2.0,6145-2.1,20124-0.0,2000-2.0,6160-2.1,20489-0.0,20247-0.0,20416-0.0
1,-818.0,-818.0,-0.735,-818.0,3.0,59.3975,-818.0,1.0,21.0,2.0,...,-818.0,1.0,0.0,3.0,0.0,1.0,-7.0,-818.0,2.0,-818.0
2,2.0,-818.0,-0.735,1.0,4.0,40.59,1.0,-7.0,22.0,1.0,...,1.0,4.0,0.0,-7.0,0.0,1.0,-7.0,3.0,2.0,1.0
3,3.0,-818.0,-0.735,4.0,2.0,59.3975,0.0,1.0,21.0,2.0,...,1.0,1.0,0.0,-7.0,0.0,0.0,2.0,3.0,2.0,4.0
5,1.0,-818.0,-0.735,4.0,2.0,59.3975,1.0,1.0,21.0,2.0,...,0.0,1.0,0.0,-7.0,0.0,0.0,-7.0,4.0,2.0,1.0
6,-818.0,-818.0,-0.735,-818.0,3.0,59.3975,-818.0,5.0,21.0,2.0,...,-818.0,1.0,0.0,-7.0,0.0,0.0,-7.0,-818.0,2.0,-818.0


In [167]:
# One-hot encode low-cardinality (categorical) columns; keep the rest as continuous.
# `DataFrame.iteritems` was removed in pandas 2.0, so iterate with `items`.
max_categories = 7  # same cut-off replace_freq uses for "categorical"
all_ls = []
for col_name, col_values in df_qs_imputed.items():
    # 'eid' is the merge key for the next step and must never be encoded
    if col_name != 'eid' and col_values.nunique() <= max_categories:
        encoded = pd.get_dummies(col_values, prefix=col_name, drop_first=True)
    else:
        encoded = col_values.to_frame()
    # positional index so the pieces line up in the axis=1 concat that follows
    all_ls.append(encoded.reset_index(drop=True))

In [168]:
df_qs_imputed_dum = pd.concat(all_ls, axis=1)

In [169]:
df_featout_ex.shape

(2280, 106)

In [170]:
df_featout_ex.head()

Unnamed: 0,eid,irritable bowel syndrome,migraine,back pain,osteoarthritis,bmrc,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,1009498,0,0,1,0,3374984,-20376930.0,18273630.0,-39743.553396,549538.377241,...,0.385746,0.185159,-0.078095,0.1621,-0.002303,0.0,0.0,0.0,0.0,0.0
1,1009687,0,0,0,1,5477409,-21135520.0,-1800645.0,382392.172648,469098.508579,...,0.404936,0.392089,0.289257,-0.057072,0.034051,0.0,0.0,0.0,0.0,0.0
2,1034646,0,1,0,0,1901794,-35357900.0,2854687.0,-30499.144031,168267.344017,...,-0.209228,0.07471,-0.483292,-0.007582,0.051398,0.0,0.0,0.0,0.0,0.0
3,1043722,0,1,0,0,4444357,-6854058.0,-13300450.0,967580.373336,128703.801067,...,0.287834,-0.007288,0.140404,0.05404,-0.014287,0.0,0.0,0.0,0.0,0.0
4,1052218,0,0,0,1,2745333,1288482.0,-5242134.0,479195.303494,-268740.399271,...,-0.223085,-0.625955,-0.043717,-0.163596,0.026179,0.0,0.0,0.0,0.0,0.0


In [171]:
# merge
df_bfl_qsidp = df_featout_ex.merge(df_qs_imputed_dum, left_on='eid', right_on='eid', how='left',indicator=False)

In [172]:
df_bfl_qsidp.shape

(2280, 395)

In [174]:
# retrain params
import optuna
from compare_hyperparams import objective, load_feats

X_train, y_train = load_feats(df_bfl_qsidp, bestIC, train=False, balance=True, scaler=True)
print(X_train.shape, y_train.shape)
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts.
# NOTE(review): `load_feats(balance=True)` and `objective` may still be
# stochastic on their own - confirm they are seeded inside compare_hyperparams.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=20)
bp = study.best_trial.params
print(bp)

[32m[I 2021-10-12 13:48:14,809][0m A new study created in memory with name: no-name-c2c12b90-d619-4958-bb14-57f426a8d04e[0m


(2280, 389)
(1500, 389) (1500,)


[32m[I 2021-10-12 13:48:27,015][0m Trial 0 finished with value: 0.34800000000000003 and parameters: {'max_depth': 17, 'n_estimators': 200, 'max_features': 0.9140052288539503, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_samples': 0.9178544254549668}. Best is trial 0 with value: 0.34800000000000003.[0m
[32m[I 2021-10-12 13:48:29,364][0m Trial 1 finished with value: 0.37666666666666665 and parameters: {'max_depth': 23, 'n_estimators': 200, 'max_features': 0.2011147607554608, 'min_samples_split': 5, 'min_samples_leaf': 11, 'max_samples': 0.7566969777567052}. Best is trial 1 with value: 0.37666666666666665.[0m
[32m[I 2021-10-12 13:48:33,601][0m Trial 2 finished with value: 0.35733333333333334 and parameters: {'max_depth': 7, 'n_estimators': 100, 'max_features': 0.8417361309180584, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_samples': 0.9354836128827977}. Best is trial 1 with value: 0.37666666666666665.[0m
[32m[I 2021-10-12 13:48:37,825][0m Trial 3 finished with 

{'max_depth': 23, 'n_estimators': 200, 'max_features': 0.2011147607554608, 'min_samples_split': 5, 'min_samples_leaf': 11, 'max_samples': 0.7566969777567052}


In [175]:
# qs alone, sectioned before merge, dummified
cv_classify(df_bfl_qsidp, bestIC, classifier='rforest', tuned_params=bp, cv_fold=5, scaler=True, balance=True)

(2280, 389)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=23, max_features=0.2011147607554608,
                       max_samples=0.7566969777567052, min_samples_leaf=11,
                       min_samples_split=5, n_estimators=200):
test ROC AUC=0.6132, test accuracy=0.3613, test f1=0.3613


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,2.012449,0.061226,0.333333,0.333333,0.615852
1,2.037176,0.06114,0.406667,0.406667,0.602652
2,1.984198,0.059746,0.376667,0.376667,0.621037
3,2.020768,0.060366,0.34,0.34,0.608281
4,2.006107,0.060413,0.35,0.35,0.618385


In [65]:
# qs alone, sectioned before merge
cv_classify(df_bfl_qsidp, bestIC, classifier='rforest', tuned_params=bp, cv_fold=5, scaler=True, balance=True)

(2280, 212)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=3, max_features=0.43559897216631016,
                       max_samples=0.9465356674703758, min_samples_leaf=14,
                       min_samples_split=4, n_estimators=200):
test ROC AUC=0.6114, test accuracy=0.3740, test f1=0.3740


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,2.22923,0.053102,0.373333,0.373333,0.603733
1,2.226943,0.052746,0.39,0.39,0.61003
2,2.220343,0.05263,0.376667,0.376667,0.636889
3,2.221489,0.053747,0.35,0.35,0.599704
4,2.219174,0.053535,0.38,0.38,0.6064


In [77]:
# idp alone, imputed, sectioned
cv_classify(df_bfl_qsidp, bestIC, classifier='rforest', tuned_params=bp, cv_fold=5, scaler=True, balance=True)

(2280, 975)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=10, max_features=0.41760145026665085,
                       max_samples=0.9409379942684949, min_samples_leaf=2,
                       min_samples_split=10, n_estimators=150):
test ROC AUC=0.5971, test accuracy=0.3473, test f1=0.3473


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,30.192394,0.056124,0.353333,0.353333,0.628163
1,30.070764,0.054839,0.343333,0.343333,0.573585
2,30.015521,0.055154,0.336667,0.336667,0.591378
3,29.921669,0.05468,0.35,0.35,0.562296
4,29.919734,0.05499,0.353333,0.353333,0.629852


In [40]:
# idp alone, imputed, not sectioned
cv_classify(df_bfl_qsidp, bestIC, classifier='rforest', tuned_params=bp, cv_fold=5, scaler=True, balance=True)

(2280, 975)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=4, max_features=0.9729649859360843,
                       max_samples=0.8255786576374825, min_samples_split=13,
                       n_estimators=75):
test ROC AUC=0.6028, test accuracy=0.3467, test f1=0.3467


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,16.663128,0.029437,0.346667,0.346667,0.625259
1,16.788806,0.032825,0.366667,0.366667,0.586548
2,16.660043,0.028785,0.35,0.35,0.5992
3,16.646502,0.028955,0.31,0.31,0.576089
4,16.663128,0.029003,0.36,0.36,0.626741


In [25]:
# with all qs, imputed but didn't dummy
cv_classify(df_bfl_qsidp, bestIC, classifier='rforest', tuned_params=bp, cv_fold=5, scaler=True, balance=True)

(2280, 216)
4
5-fold CV classification with classifier RandomForestClassifier(max_depth=2, max_features=0.15079269935030123,
                       max_samples=0.6936468178262939, min_samples_leaf=7):
test ROC AUC=0.6056, test accuracy=0.3667, test f1=0.3667


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_micro,test_roc_auc_ovo
0,0.313112,0.030555,0.35,0.35,0.591689
1,0.311915,0.029889,0.386667,0.386667,0.611096
2,0.312089,0.029962,0.373333,0.373333,0.616933
3,0.311238,0.029809,0.346667,0.346667,0.608119
4,0.314362,0.02969,0.376667,0.376667,0.600341


In [None]:
# TODO: does adding qs data make the classifier worse? Need to check how categorical vs continuous qs variables are handled