In [40]:
import os, glob, bcolz, gc

import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy import ndimage, misc
from scipy.stats import rankdata

from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from xgboost import XGBClassifier, XGBRegressor

import keras
from keras import backend as K
from keras import optimizers
from keras.models import Model, load_model

from keras.applications.inception_v3 import preprocess_input as preprocess_input_inceptionv3

from tensorflow.python.client import device_lib
device_lib.list_local_devices(), 'keras version: {}'.format(keras.__version__)

([name: "/cpu:0"
  device_type: "CPU"
  memory_limit: 268435456
  locality {
  }
  incarnation: 12693644352211927512, name: "/gpu:0"
  device_type: "GPU"
  memory_limit: 303824896
  locality {
    bus_id: 2
  }
  incarnation: 2529966118843386688
  physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:84:00.0"],
 'keras version: 2.0.6')

In [2]:
path = os.path.join('/scratch', 'yns207', 'data_invasive')
train_set = pd.read_csv(os.path.join(path, 'train_labels.csv'))
test_set = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

def delete_model(model, clear_session=True):
    '''removes model!
    '''
    del model
    gc.collect()
    if clear_session: K.clear_session()

def read_img(img_path, img_shape):
    img = misc.imread(img_path)
    img = misc.imresize(img, img_shape)
    return img

def read_imgs(img_height, img_width):
    train_img, test_img = [],[]
    for img_path in tqdm(train_set['name'].iloc[:]):
        train_img.append(read_img(os.path.join(path, 'train', str(img_path)+'.jpg'), (img_height, img_width)))

    for img_path in tqdm(test_set['name'].iloc[:]):
        test_img.append(read_img(os.path.join(path, 'test', str(img_path)+'.jpg'), (img_height, img_width)))
    return np.array(train_img), np.array(test_img)

In [3]:
train_img_224, test_img_224 = read_imgs(224,224)
train_img_299, test_img_299 = read_imgs(299,299)
train_img_450, test_img_450 = read_imgs(450,450)
train_labels = train_set['invasive'].values

100%|██████████| 2295/2295 [01:11<00:00, 32.31it/s]
100%|██████████| 1531/1531 [00:45<00:00, 33.47it/s]
100%|██████████| 2295/2295 [01:12<00:00, 31.58it/s]
100%|██████████| 1531/1531 [00:47<00:00, 32.43it/s]
100%|██████████| 2295/2295 [01:16<00:00, 29.90it/s]
100%|██████████| 1531/1531 [00:49<00:00, 30.41it/s]


In [4]:
preds_df = pd.DataFrame([], columns=['name'])
preds_df['name'] = test_set['name']

model_files = [
    'invasive_incepv3_aug10_kfolds_299x299_0.model',
    'invasive_incepv3_aug10_kfolds_299x299_1.model',
    'invasive_xcep_aug11_kfolds_224x224_0.model',
    'invasive_xcep_aug11_kfolds_224x224_1.model',
    'invasive_xcep_aug11_kfolds_224x224_2.model',
    'invasive_xcep_aug11_kfolds_224x224_3.model',
    'invasive_xcep_aug11_kfolds_224x224_4.model',
    'invasive_xcep_aug11_kfolds_299x299_0.model',
    'invasive_xcep_aug11_kfolds_299x299_1.model',
    'invasive_xcep_aug11_kfolds_299x299_2.model',
    'invasive_xcep_aug11_kfolds_299x299_3.model',
    'invasive_xcep_aug11_kfolds_299x299_4.model',
    'invasive_incepv3_aug11_kfolds_224x224_0.model',
    'invasive_incepv3_aug11_kfolds_224x224_1.model',
    'invasive_incepv3_aug11_kfolds_224x224_2.model',
    'invasive_incepv3_aug11_kfolds_224x224_3.model',
    'invasive_incepv3_aug11_kfolds_224x224_4.model',
    'invasive_incepv3_aug11_kfolds_299x299_0.model',
    'invasive_incepv3_aug11_kfolds_299x299_1.model',
    'invasive_incepv3_aug11_kfolds_299x299_2.model',
    'invasive_incepv3_aug11_kfolds_299x299_3.model',
    'invasive_incepv3_aug11_kfolds_299x299_4.model',
    'invasive_incepv3_aug11_kfolds_450x450_0.model',
    'invasive_incepv3_aug11_kfolds_450x450_1.model',
    'invasive_incepv3_aug11_kfolds_450x450_2.model',
    'invasive_incepv3_aug11_kfolds_450x450_3.model',
    'invasive_incepv3_aug11_kfolds_450x450_4.model'
]

for model_name in model_files:
    print(model_name)
    model = load_model(os.path.join(path,model_name))
    if '224' in model_name:
        proc_test_img = preprocess_input_inceptionv3(test_img_224.astype(np.float32))
    elif '299' in model_name:
        proc_test_img = preprocess_input_inceptionv3(test_img_299.astype(np.float32))
    else:
        proc_test_img = preprocess_input_inceptionv3(test_img_450.astype(np.float32))
    preds_df[model_name] = pd.Series(model.predict(proc_test_img).flatten())
    delete_model(model)

invasive_incepv3_aug10_kfolds_299x299_0.model




invasive_incepv3_aug10_kfolds_299x299_1.model
invasive_xcep_aug11_kfolds_224x224_0.model
invasive_xcep_aug11_kfolds_224x224_1.model
invasive_xcep_aug11_kfolds_224x224_2.model
invasive_xcep_aug11_kfolds_224x224_3.model
invasive_xcep_aug11_kfolds_224x224_4.model
invasive_xcep_aug11_kfolds_299x299_0.model
invasive_xcep_aug11_kfolds_299x299_1.model
invasive_xcep_aug11_kfolds_299x299_2.model
invasive_xcep_aug11_kfolds_299x299_3.model
invasive_xcep_aug11_kfolds_299x299_4.model
invasive_incepv3_aug11_kfolds_224x224_0.model
invasive_incepv3_aug11_kfolds_224x224_1.model
invasive_incepv3_aug11_kfolds_224x224_2.model
invasive_incepv3_aug11_kfolds_224x224_3.model
invasive_incepv3_aug11_kfolds_224x224_4.model
invasive_incepv3_aug11_kfolds_299x299_0.model
invasive_incepv3_aug11_kfolds_299x299_1.model
invasive_incepv3_aug11_kfolds_299x299_2.model
invasive_incepv3_aug11_kfolds_299x299_3.model
invasive_incepv3_aug11_kfolds_299x299_4.model
invasive_incepv3_aug11_kfolds_450x450_0.model
invasive_incepv3_a

In [5]:
preds_df.head()

Unnamed: 0,name,invasive_incepv3_aug10_kfolds_299x299_0.model,invasive_incepv3_aug10_kfolds_299x299_1.model,invasive_xcep_aug11_kfolds_224x224_0.model,invasive_xcep_aug11_kfolds_224x224_1.model,invasive_xcep_aug11_kfolds_224x224_2.model,invasive_xcep_aug11_kfolds_224x224_3.model,invasive_xcep_aug11_kfolds_224x224_4.model,invasive_xcep_aug11_kfolds_299x299_0.model,invasive_xcep_aug11_kfolds_299x299_1.model,...,invasive_incepv3_aug11_kfolds_299x299_0.model,invasive_incepv3_aug11_kfolds_299x299_1.model,invasive_incepv3_aug11_kfolds_299x299_2.model,invasive_incepv3_aug11_kfolds_299x299_3.model,invasive_incepv3_aug11_kfolds_299x299_4.model,invasive_incepv3_aug11_kfolds_450x450_0.model,invasive_incepv3_aug11_kfolds_450x450_1.model,invasive_incepv3_aug11_kfolds_450x450_2.model,invasive_incepv3_aug11_kfolds_450x450_3.model,invasive_incepv3_aug11_kfolds_450x450_4.model
0,1,0.99922,0.999937,0.999832,0.997136,0.998534,0.999998,0.999851,1.0,1.0,...,0.996456,0.992088,0.999838,0.999759,0.986,0.996039,0.977934,0.993849,0.999967,0.994448
1,2,0.006064,0.018489,0.29235,0.021871,0.045619,0.011755,0.04939,0.087101,0.000298,...,0.060021,0.079418,0.050304,0.025824,0.040866,0.005403,0.015407,0.005452,0.019659,0.02946
2,3,0.011985,0.269732,0.04708,0.077081,0.053459,0.055718,0.004568,0.012156,0.014143,...,0.064229,0.415717,0.058073,0.065356,0.002074,0.005325,0.088598,0.050874,0.011113,0.103452
3,4,0.017386,0.077561,7.2e-05,0.024709,0.000225,0.000183,0.023354,0.000373,0.005312,...,0.00195,0.270881,0.202681,0.031353,0.073497,0.023607,0.203233,0.051576,0.072977,0.028269
4,5,0.978103,0.991908,0.975208,0.998245,0.997167,0.989112,0.996995,0.999627,0.999979,...,0.986529,0.860889,0.860935,0.999671,0.969829,0.993084,0.971659,0.983782,0.99927,0.985751


In [8]:
# avg all preds
subm = pd.DataFrame([], columns=['name', 'invasive'])
subm['name'] = test_set['name']
subm['invasive'] = preds_df[[col for col in preds_df.columns if not col is 'name']].mean(axis=1)
subm.head()

Unnamed: 0,name,invasive
0,1,0.994363
1,2,0.033531
2,3,0.080738
3,4,0.042286
4,5,0.976823


In [None]:
subm.to_csv(os.path.join(path, 'results', 'sumb_aug12_0.gz'), index=False, compression='gzip')

that did really well! 0.99452 (14th place)

we will try some forset ensembling, xgboost + random forset (maybe mix both), then we will use wtvr is best to psuedo label and train a new set of models.

# random forest

In [126]:
# make preds on training data
def make_preds(model_files, img_224, img_299, img_450):
    preds_df = pd.DataFrame([])
    
    for model_name in model_files:
        print(model_name)
        model = load_model(os.path.join(path,model_name))
        if '224' in model_name:
            proc_img = preprocess_input_inceptionv3(img_224.astype(np.float32))
        elif '299' in model_name:
            proc_img = preprocess_input_inceptionv3(img_299.astype(np.float32))
        else:
            proc_img = preprocess_input_inceptionv3(img_450.astype(np.float32))
        preds_df[model_name] = pd.Series(model.predict(proc_img).flatten())
        delete_model(model)
    return preds_df

In [11]:
preds = make_preds(model_files, train_img_224, train_img_299, train_img_450)

invasive_incepv3_aug10_kfolds_299x299_0.model




invasive_incepv3_aug10_kfolds_299x299_1.model
invasive_xcep_aug11_kfolds_224x224_0.model
invasive_xcep_aug11_kfolds_224x224_1.model
invasive_xcep_aug11_kfolds_224x224_2.model
invasive_xcep_aug11_kfolds_224x224_3.model
invasive_xcep_aug11_kfolds_224x224_4.model
invasive_xcep_aug11_kfolds_299x299_0.model
invasive_xcep_aug11_kfolds_299x299_1.model
invasive_xcep_aug11_kfolds_299x299_2.model
invasive_xcep_aug11_kfolds_299x299_3.model
invasive_xcep_aug11_kfolds_299x299_4.model
invasive_incepv3_aug11_kfolds_224x224_0.model
invasive_incepv3_aug11_kfolds_224x224_1.model
invasive_incepv3_aug11_kfolds_224x224_2.model
invasive_incepv3_aug11_kfolds_224x224_3.model
invasive_incepv3_aug11_kfolds_224x224_4.model
invasive_incepv3_aug11_kfolds_299x299_0.model
invasive_incepv3_aug11_kfolds_299x299_1.model
invasive_incepv3_aug11_kfolds_299x299_2.model
invasive_incepv3_aug11_kfolds_299x299_3.model
invasive_incepv3_aug11_kfolds_299x299_4.model
invasive_incepv3_aug11_kfolds_450x450_0.model
invasive_incepv3_a

In [12]:
preds.head()

Unnamed: 0,name,invasive_incepv3_aug10_kfolds_299x299_0.model,invasive_incepv3_aug10_kfolds_299x299_1.model,invasive_xcep_aug11_kfolds_224x224_0.model,invasive_xcep_aug11_kfolds_224x224_1.model,invasive_xcep_aug11_kfolds_224x224_2.model,invasive_xcep_aug11_kfolds_224x224_3.model,invasive_xcep_aug11_kfolds_224x224_4.model,invasive_xcep_aug11_kfolds_299x299_0.model,invasive_xcep_aug11_kfolds_299x299_1.model,...,invasive_incepv3_aug11_kfolds_299x299_0.model,invasive_incepv3_aug11_kfolds_299x299_1.model,invasive_incepv3_aug11_kfolds_299x299_2.model,invasive_incepv3_aug11_kfolds_299x299_3.model,invasive_incepv3_aug11_kfolds_299x299_4.model,invasive_incepv3_aug11_kfolds_450x450_0.model,invasive_incepv3_aug11_kfolds_450x450_1.model,invasive_incepv3_aug11_kfolds_450x450_2.model,invasive_incepv3_aug11_kfolds_450x450_3.model,invasive_incepv3_aug11_kfolds_450x450_4.model
0,1,0.001253,0.011497,5.127164e-07,0.000289,1e-06,3e-06,5.9e-05,3.701978e-06,8.1e-05,...,0.000438,0.012741,0.040401,0.077121,0.08363286,0.006519,0.035461,0.082165,0.006436,0.031972
1,2,0.00293,8.8e-05,0.03061252,0.004832,0.114906,0.373535,0.074218,0.02146954,0.001395,...,0.145233,0.218515,0.053029,0.076973,6.538212e-09,0.07175,0.128591,0.013319,0.008619,0.007797
2,3,0.996199,0.99982,0.9999363,0.981235,0.990044,0.999787,0.99989,0.9938554,0.995244,...,0.99997,0.99671,0.999981,0.998054,0.9986544,0.990071,0.960965,0.999823,0.999852,0.999066
3,4,0.000678,0.007591,5.8695e-05,3e-06,2e-06,0.000106,0.004041,9.471538e-07,2e-06,...,0.003871,0.029723,0.061424,0.033346,0.0630205,0.005201,0.049514,0.072159,0.046485,0.021436
4,5,0.998524,1.0,1.0,0.997346,1.0,0.999998,0.999999,1.0,1.0,...,0.999096,0.997274,0.996581,0.997158,0.9594099,0.995446,0.993305,0.980781,0.999997,0.999043


In [60]:
%cd $path

model_name = 'invasive_forest_ensembler_aug12'
kf = KFold(n_splits=5, shuffle=True, random_state=78)
i = 0

train_data = preds.values[:,1:]

for train_ixs, valid_ixs in kf.split(train_data):
    print('kfold: {}'.format(i))
    x_train = train_data[train_ixs]
    x_valid = train_data[valid_ixs]
    y_train = train_labels[train_ixs]
    y_valid = train_labels[valid_ixs]
    
    forest = RandomForestRegressor(random_state=40, n_jobs=-1, n_estimators=27, max_depth=7)
    forest.fit(x_train, y_train)
    forest_preds_train = forest.predict(x_train).flatten()
    forest_preds_valid = forest.predict(x_valid).flatten()
    
    joblib.dump(forest, '{}_{}.pkl'.format(model_name, i))
    
    print('forest train loss:')
    print('{}'.format(log_loss(y_train, forest_preds_train)))
    print('forest train aroc:')
    print('{}'.format(roc_auc_score(y_train, forest_preds_train)))
    print('forest train accuracy:')
    print('{}'.format(accuracy_score(y_train, np.around(forest_preds_train))))    
    print('forest valid loss:')
    print('{}'.format(log_loss(y_valid, forest_preds_valid)))
    print('forest valid aroc:')
    print('{}'.format(roc_auc_score(y_valid, forest_preds_valid)))
    print('forest valid accuracy:')
    print('{}'.format(accuracy_score(y_valid, np.around(forest_preds_valid))))
    print('\n')
    
    i += 1

/scratch/yns207/data_invasive
kfold: 0
forest train loss:
0.0013623471817160108
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.0026148094896038558
forest valid aroc:
1.0
forest valid accuracy:
1.0


kfold: 1
forest train loss:
0.0015890919251152933
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.0066453497585176385
forest valid aroc:
1.0
forest valid accuracy:
1.0


kfold: 2
forest train loss:
0.0017617653882842554
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.0061148682694591675
forest valid aroc:
1.0
forest valid accuracy:
1.0


kfold: 3
forest train loss:
0.0008691026434795442
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.008875525063376558
forest valid aroc:
1.0
forest valid accuracy:
0.9967320261437909


kfold: 4
forest train loss:
0.0011748368623181531
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.00012333440517369845
forest valid aroc:
1.0
forest valid accura

# xgboost ensembler

In [102]:
%cd $path

model_name = 'invasive_xgb_ensembler_aug12'
kf = KFold(n_splits=5, shuffle=True, random_state=78)
i = 0

for train_ixs, valid_ixs in kf.split(train_data):
    print('kfold: {}'.format(i))
    x_train = train_data[train_ixs]
    x_valid = train_data[valid_ixs]
    y_train = train_labels[train_ixs]
    y_valid = train_labels[valid_ixs]
    
    forest = XGBRegressor()
    forest.fit(x_train, y_train)
    # this max gets rid of negative preds
    forest_preds_train = np.maximum(forest.predict(x_train).flatten(), np.random.rand()*0.000001)
    forest_preds_valid = np.maximum(forest.predict(x_valid).flatten(), np.random.rand()*0.000001)
    
    joblib.dump(forest, '{}_{}.pkl'.format(model_name, i))
    
    print('xgb train loss:')
    print('{}'.format(log_loss(y_train, forest_preds_train)))
    print('xgb train aroc:')
    print('{}'.format(roc_auc_score(y_train, forest_preds_train)))
    print('xgb train accuracy:')
    print('{}'.format(accuracy_score(y_train, np.around(forest_preds_train))))
    print('xgb valid loss:')
    print('{}'.format(log_loss(y_valid, forest_preds_valid)))
    print('xgb valid aroc:')
    print('{}'.format(roc_auc_score(y_valid, forest_preds_valid)))
    print('xgb valid accuracy:')
    print('{}'.format(accuracy_score(y_valid, np.around(forest_preds_valid))))
    print('\n')
    
    i += 1

/scratch/yns207/data_invasive
kfold: 0
xgb train loss:
nan
xgb train aroc:
0.9999999999999999
xgb train accuracy:
1.0
xgb valid loss:
nan
xgb valid aroc:
1.0
xgb valid accuracy:
0.996742671009772


kfold: 1


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


xgb train loss:
nan
xgb train aroc:
1.0
xgb train accuracy:
1.0
xgb valid loss:
nan
xgb valid aroc:
1.0
xgb valid accuracy:
0.9967320261437909


kfold: 2
xgb train loss:
nan
xgb train aroc:
0.9999999999999999
xgb train accuracy:
1.0
xgb valid loss:
nan
xgb valid aroc:
1.0
xgb valid accuracy:
1.0


kfold: 3
xgb train loss:
2.7143164212359304e-05
xgb train aroc:
1.0
xgb train accuracy:
1.0
xgb valid loss:
0.03380374891493571
xgb valid aroc:
0.9974358974358974
xgb valid accuracy:
0.9967320261437909


kfold: 4
xgb train loss:
nan
xgb train aroc:
1.0
xgb train accuracy:
1.0
xgb valid loss:
nan
xgb valid aroc:
1.0
xgb valid accuracy:
1.0




  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


it kinda seems like they all  folds learned the same shit in training which is a good sign. rf and xgb had very similar results but some differ across the same folds.

lets ensemble all these trees:



In [109]:
ensembler_preds = pd.DataFrame([], columns=['name'])
ensembler_preds['name'] = test_set['name']

In [110]:
ensembler_files = [
#  'invasive_forest_ensembler_aug12_0.pkl',
#  'invasive_forest_ensembler_aug12_1.pkl',
#  'invasive_forest_ensembler_aug12_2.pkl',
#  'invasive_forest_ensembler_aug12_3.pkl',
#  'invasive_forest_ensembler_aug12_4.pkl',
 'invasive_xgb_ensembler_aug12_0.pkl',
 'invasive_xgb_ensembler_aug12_1.pkl',
 'invasive_xgb_ensembler_aug12_2.pkl',
 'invasive_xgb_ensembler_aug12_3.pkl',
 'invasive_xgb_ensembler_aug12_4.pkl',
]

# preds_df is the test data
# predictions we want to feed ot the 
# ensembler
test_data = preds_df.values[:,1:]

for ensembler_file in ensembler_files:
    ensembler = joblib.load(ensembler_file)
    ensembler_preds[ensembler_file] = pd.Series(ensembler.predict(test_data).flatten())
# it loooks like there are negative valuse but they are all small.
# so well just abs() them before hand. this is a reult of using regression
# trees!
ensembler_preds = ensembler_preds.abs()
ensembler_preds.head()

Unnamed: 0,name,invasive_xgb_ensembler_aug12_0.pkl,invasive_xgb_ensembler_aug12_1.pkl,invasive_xgb_ensembler_aug12_2.pkl,invasive_xgb_ensembler_aug12_3.pkl,invasive_xgb_ensembler_aug12_4.pkl
0,1.0,0.99998,0.999988,0.999991,0.999976,0.999987
1,2.0,1.7e-05,2.8e-05,1.8e-05,3.2e-05,1.2e-05
2,3.0,1.7e-05,6e-06,0.000127,3.2e-05,2.6e-05
3,4.0,1.7e-05,6e-06,0.000552,3.2e-05,1.2e-05
4,5.0,0.99998,0.999988,0.999991,0.999976,0.999987


In [111]:
subm = pd.DataFrame([], columns=['name', 'invasive'])
subm['name'] = test_set['name']
subm['invasive'] = ensembler_preds[[col for col in ensembler_preds.columns if not col is 'name']].mean(axis=1)
subm.head()

Unnamed: 0,name,invasive
0,1,0.999984
1,2,2.1e-05
2,3,4.1e-05
3,4,0.000124
4,5,0.999984


In [114]:
subm.to_csv(os.path.join(path, 'results', 'sumb_aug12_2.gz'), index=False, compression='gzip')

i tried doing just the xgboosted trees ensemblers only and it only scored: 0.99260, ok fuck lol. so i screwed up the function and it was only returning 1500 ish of the training examples. that's why none of this shit worked.


what ill do is try:

1 single xgb ensembler on all training data for all models. (submission 1) - xgb classifier, predict_proba

then if that works multiple with full training data. (submission 2) - xgb + random forest classifiers, predict_proba

then whatever is best ill use to psuedo label - psuedo labeling (submission 3 after retraining w/ extra data)

In [127]:
preds_train = make_preds(model_files, train_img_224, train_img_299, train_img_450)

invasive_incepv3_aug10_kfolds_299x299_0.model




invasive_incepv3_aug10_kfolds_299x299_1.model
invasive_xcep_aug11_kfolds_224x224_0.model
invasive_xcep_aug11_kfolds_224x224_1.model
invasive_xcep_aug11_kfolds_224x224_2.model
invasive_xcep_aug11_kfolds_224x224_3.model
invasive_xcep_aug11_kfolds_224x224_4.model
invasive_xcep_aug11_kfolds_299x299_0.model
invasive_xcep_aug11_kfolds_299x299_1.model
invasive_xcep_aug11_kfolds_299x299_2.model
invasive_xcep_aug11_kfolds_299x299_3.model
invasive_xcep_aug11_kfolds_299x299_4.model
invasive_incepv3_aug11_kfolds_224x224_0.model
invasive_incepv3_aug11_kfolds_224x224_1.model
invasive_incepv3_aug11_kfolds_224x224_2.model
invasive_incepv3_aug11_kfolds_224x224_3.model
invasive_incepv3_aug11_kfolds_224x224_4.model
invasive_incepv3_aug11_kfolds_299x299_0.model
invasive_incepv3_aug11_kfolds_299x299_1.model
invasive_incepv3_aug11_kfolds_299x299_2.model
invasive_incepv3_aug11_kfolds_299x299_3.model
invasive_incepv3_aug11_kfolds_299x299_4.model
invasive_incepv3_aug11_kfolds_450x450_0.model
invasive_incepv3_a

In [128]:
preds_train

Unnamed: 0,invasive_incepv3_aug10_kfolds_299x299_0.model,invasive_incepv3_aug10_kfolds_299x299_1.model,invasive_xcep_aug11_kfolds_224x224_0.model,invasive_xcep_aug11_kfolds_224x224_1.model,invasive_xcep_aug11_kfolds_224x224_2.model,invasive_xcep_aug11_kfolds_224x224_3.model,invasive_xcep_aug11_kfolds_224x224_4.model,invasive_xcep_aug11_kfolds_299x299_0.model,invasive_xcep_aug11_kfolds_299x299_1.model,invasive_xcep_aug11_kfolds_299x299_2.model,...,invasive_incepv3_aug11_kfolds_299x299_0.model,invasive_incepv3_aug11_kfolds_299x299_1.model,invasive_incepv3_aug11_kfolds_299x299_2.model,invasive_incepv3_aug11_kfolds_299x299_3.model,invasive_incepv3_aug11_kfolds_299x299_4.model,invasive_incepv3_aug11_kfolds_450x450_0.model,invasive_incepv3_aug11_kfolds_450x450_1.model,invasive_incepv3_aug11_kfolds_450x450_2.model,invasive_incepv3_aug11_kfolds_450x450_3.model,invasive_incepv3_aug11_kfolds_450x450_4.model
0,0.001253,0.011497,5.127164e-07,0.000289,0.000001,0.000003,0.000059,3.701978e-06,8.079002e-05,2.598423e-05,...,0.000438,0.012741,0.040401,0.077121,8.363286e-02,0.006519,0.035461,0.082165,0.006436,0.031972
1,0.002930,0.000088,3.061252e-02,0.004832,0.114906,0.373535,0.074218,2.146954e-02,1.394692e-03,2.559908e-04,...,0.145233,0.218515,0.053029,0.076973,6.538212e-09,0.071750,0.128591,0.013319,0.008619,0.007797
2,0.996199,0.999820,9.999363e-01,0.981235,0.990044,0.999787,0.999890,9.938554e-01,9.952442e-01,9.993926e-01,...,0.999970,0.996710,0.999981,0.998054,9.986544e-01,0.990071,0.960965,0.999823,0.999852,0.999066
3,0.000678,0.007591,5.869500e-05,0.000003,0.000002,0.000106,0.004041,9.471538e-07,2.109078e-06,1.312236e-07,...,0.003871,0.029723,0.061424,0.033346,6.302050e-02,0.005201,0.049514,0.072159,0.046485,0.021436
4,0.998524,1.000000,1.000000e+00,0.997346,1.000000,0.999998,0.999999,1.000000e+00,1.000000e+00,9.999477e-01,...,0.999096,0.997274,0.996581,0.997158,9.594099e-01,0.995446,0.993305,0.980781,0.999997,0.999043
5,0.021603,0.024955,2.507766e-04,0.000110,0.005429,0.000028,0.000226,1.980803e-04,4.708151e-06,5.143657e-05,...,0.001747,0.008225,0.040571,0.226596,3.806963e-04,0.032457,0.010189,0.048264,0.001897,0.014457
6,0.999989,1.000000,9.999998e-01,0.999144,1.000000,1.000000,0.999993,1.000000e+00,1.000000e+00,1.000000e+00,...,0.995856,1.000000,0.999993,0.999835,1.000000e+00,0.999998,0.980857,0.998116,0.999942,0.999719
7,0.999943,0.996385,9.999197e-01,0.999583,0.999822,0.999986,1.000000,9.990876e-01,1.000000e+00,9.999660e-01,...,0.986009,0.967489,0.999691,0.999855,9.977244e-01,0.998300,0.981616,0.984599,0.999417,0.993163
8,0.105246,0.033701,4.296451e-03,0.004311,0.003983,0.013100,0.152102,2.923114e-03,2.833724e-03,9.577100e-04,...,0.450576,0.301734,0.183440,0.123232,4.327516e-02,0.189339,0.304217,0.124207,0.071665,0.027651
9,0.001673,0.008157,4.531498e-04,0.004736,0.000006,0.017762,0.001283,7.720639e-05,2.135407e-04,6.165037e-05,...,0.004038,0.055513,0.048956,0.062582,1.145148e-04,0.009742,0.050041,0.034627,0.005176,0.025511


In [164]:
x_train = preds_train.values
y_train = train_labels

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=45)

forest = XGBClassifier()
forest.fit(x_train, y_train)
forest_preds_train = forest.predict_proba(x_train)
forest_preds_train = forest_preds_train[:,1] # get the likelihood of the 1 class (0 is at index 0)
forest_preds_valid = forest.predict_proba(x_valid)
forest_preds_valid = forest_preds_valid[:,1] # get the likelihood of the 1 class (0 is at index 0)

joblib.dump(forest, '{}_{}.pkl'.format('invasive_xgb_ensembler_aug13', 'single'))

print('xgb train loss:')
print('{}'.format(log_loss(y_train, forest_preds_train)))
print('xgb train aroc:')
print('{}'.format(roc_auc_score(y_train, forest_preds_train)))
print('xgb train accuracy:')
print('{}'.format(accuracy_score(y_train, np.around(forest_preds_train))))

print('xgb valid loss:')
print('{}'.format(log_loss(y_valid, forest_preds_valid)))
print('xgb valid aroc:')
print('{}'.format(roc_auc_score(y_valid, forest_preds_valid)))
print('xgb valid accuracy:')
print('{}'.format(accuracy_score(y_valid, np.around(forest_preds_valid))))

xgb train loss:
0.001084963534844528
xgb train aroc:
1.0
xgb train accuracy:
1.0
xgb valid loss:
0.0016745277221389683
xgb valid aroc:
1.0
xgb valid accuracy:
1.0


In [166]:
x_train = preds_train.values
y_train = train_labels

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=45)

forest = RandomForestClassifier()
forest.fit(x_train, y_train)
forest_preds_train = forest.predict_proba(x_train)
forest_preds_train = forest_preds_train[:,1] # get the likelihood of the 1 class (0 is at index 0)
forest_preds_valid = forest.predict_proba(x_valid)
forest_preds_valid = forest_preds_valid[:,1] # get the likelihood of the 1 class (0 is at index 0)

joblib.dump(forest, '{}_{}.pkl'.format('invasive_forest_ensembler_aug13', 'single'))

print('forest train loss:')
print('{}'.format(log_loss(y_train, forest_preds_train)))
print('forest train aroc:')
print('{}'.format(roc_auc_score(y_train, forest_preds_train)))
print('forest train accuracy:')
print('{}'.format(accuracy_score(y_train, np.around(forest_preds_train))))

print('forest valid loss:')
print('{}'.format(log_loss(y_valid, forest_preds_valid)))
print('forest valid aroc:')
print('{}'.format(roc_auc_score(y_valid, forest_preds_valid)))
print('forest valid accuracy:')
print('{}'.format(accuracy_score(y_valid, np.around(forest_preds_valid))))

forest train loss:
0.001918738597290155
forest train aroc:
1.0
forest train accuracy:
0.9994553376906318
forest valid loss:
0.0054413908116923565
forest valid aroc:
1.0
forest valid accuracy:
1.0


ok so it looks like xgb is superior to random forest in generalizing to the validation set. now im not sure if i should train on the whole set or try to just submit xgb form above. i think ll try xgb form above.

In [167]:
single_xgb = '{}_{}.pkl'.format('invasive_xgb_ensembler_aug13', 'single')

In [168]:
preds_df.head()

Unnamed: 0,name,invasive_incepv3_aug10_kfolds_299x299_0.model,invasive_incepv3_aug10_kfolds_299x299_1.model,invasive_xcep_aug11_kfolds_224x224_0.model,invasive_xcep_aug11_kfolds_224x224_1.model,invasive_xcep_aug11_kfolds_224x224_2.model,invasive_xcep_aug11_kfolds_224x224_3.model,invasive_xcep_aug11_kfolds_224x224_4.model,invasive_xcep_aug11_kfolds_299x299_0.model,invasive_xcep_aug11_kfolds_299x299_1.model,...,invasive_incepv3_aug11_kfolds_299x299_0.model,invasive_incepv3_aug11_kfolds_299x299_1.model,invasive_incepv3_aug11_kfolds_299x299_2.model,invasive_incepv3_aug11_kfolds_299x299_3.model,invasive_incepv3_aug11_kfolds_299x299_4.model,invasive_incepv3_aug11_kfolds_450x450_0.model,invasive_incepv3_aug11_kfolds_450x450_1.model,invasive_incepv3_aug11_kfolds_450x450_2.model,invasive_incepv3_aug11_kfolds_450x450_3.model,invasive_incepv3_aug11_kfolds_450x450_4.model
0,1,0.99922,0.999937,0.999832,0.997136,0.998534,0.999998,0.999851,1.0,1.0,...,0.996456,0.992088,0.999838,0.999759,0.986,0.996039,0.977934,0.993849,0.999967,0.994448
1,2,0.006064,0.018489,0.29235,0.021871,0.045619,0.011755,0.04939,0.087101,0.000298,...,0.060021,0.079418,0.050304,0.025824,0.040866,0.005403,0.015407,0.005452,0.019659,0.02946
2,3,0.011985,0.269732,0.04708,0.077081,0.053459,0.055718,0.004568,0.012156,0.014143,...,0.064229,0.415717,0.058073,0.065356,0.002074,0.005325,0.088598,0.050874,0.011113,0.103452
3,4,0.017386,0.077561,7.2e-05,0.024709,0.000225,0.000183,0.023354,0.000373,0.005312,...,0.00195,0.270881,0.202681,0.031353,0.073497,0.023607,0.203233,0.051576,0.072977,0.028269
4,5,0.978103,0.991908,0.975208,0.998245,0.997167,0.989112,0.996995,0.999627,0.999979,...,0.986529,0.860889,0.860935,0.999671,0.969829,0.993084,0.971659,0.983782,0.99927,0.985751


that looks like the right test set predictions.

In [175]:
ensembler_preds = pd.DataFrame([], columns=['name'])
ensembler_preds['name'] = test_set['name']

# preds_df is the test data
# predictions we want to feed ot the 
# ensembler
test_data = preds_df.values[:,1:]

# load the single xgb ensembler
ensembler = joblib.load(single_xgb)
ensembler_preds['invasive'] = pd.Series(ensembler.predict_proba(test_data)[:,1])

ensembler_preds.head()

Unnamed: 0,name,invasive
0,1,0.999246
1,2,0.001211
2,3,0.00243
3,4,0.001211
4,5,0.999246


In [176]:
ensembler_preds.to_csv(os.path.join(path, 'results', 'subm_aug13_0.gz'), index=False, compression='gzip')

that just did 0.9221, not better. lets try the kfolds version as a last attempt then we will psuedo label.

In [184]:
%cd $path

model_name = 'invasive_xgb_ensembler_aug13'
kf = KFold(n_splits=5, shuffle=True, random_state=78)
i = 0

for train_ixs, valid_ixs in kf.split(train_data):
    print('kfold: {}'.format(i))
    x_train = train_data[train_ixs]
    x_valid = train_data[valid_ixs]
    y_train = train_labels[train_ixs]
    y_valid = train_labels[valid_ixs]
    
    forest = XGBClassifier()#gamma=0.1, min_child_weight=1, n_estimators=50)
    forest.fit(x_train, y_train)
    # this max gets rid of negative preds
    forest_preds_train = forest.predict_proba(x_train)
    forest_preds_train = forest_preds_train[:,1] # get the likelihood of the 1 class (0 is at index 0)
    forest_preds_valid = forest.predict_proba(x_valid)
    forest_preds_valid = forest_preds_valid[:,1] # get the likelihood of the 1 class (0 is at index 0)
    
    joblib.dump(forest, '{}_{}.pkl'.format(model_name, i))
    
    print('forest train loss:')
    print('{}'.format(log_loss(y_train, forest_preds_train)))
    print('forest train aroc:')
    print('{}'.format(roc_auc_score(y_train, forest_preds_train)))
    print('forest train accuracy:')
    print('{}'.format(accuracy_score(y_train, np.around(forest_preds_train))))
    print('\n')
    
    print('forest valid loss:')
    print('{}'.format(log_loss(y_valid, forest_preds_valid)))
    print('forest valid aroc:')
    print('{}'.format(roc_auc_score(y_valid, forest_preds_valid)))
    print('forest valid accuracy:')
    print('{}'.format(accuracy_score(y_valid, np.around(forest_preds_valid))))
    print('\n')
    
    i += 1

/scratch/yns207/data_invasive
kfold: 0
forest train loss:
0.0015888466894264435
forest train aroc:
1.0
forest train accuracy:
1.0


forest valid loss:
0.0017457529523255187
forest valid aroc:
1.0
forest valid accuracy:
1.0


kfold: 1
forest train loss:
0.0016192816093335955
forest train aroc:
1.0
forest train accuracy:
1.0


forest valid loss:
0.0053715554360464655
forest valid aroc:
1.0
forest valid accuracy:
0.9967320261437909


kfold: 2
forest train loss:
0.0015736058741161714
forest train aroc:
1.0
forest train accuracy:
1.0


forest valid loss:
0.0029742010371467354
forest valid aroc:
1.0
forest valid accuracy:
1.0


kfold: 3
forest train loss:
0.0016103450344799428
forest train aroc:
1.0
forest train accuracy:
1.0


forest valid loss:
0.02097103835442682
forest valid aroc:
1.0
forest valid accuracy:
0.9967320261437909


kfold: 4
forest train loss:
0.0015998975827111578
forest train aroc:
1.0
forest train accuracy:
1.0


forest valid loss:
0.0015458191495413094
forest valid aroc:


In [178]:
ensembler_preds = pd.DataFrame([], columns=['name'])
ensembler_preds['name'] = test_set['name']

ensembler_files = [
 'invasive_xgb_ensembler_aug13_0.pkl',
 'invasive_xgb_ensembler_aug13_1.pkl',
 'invasive_xgb_ensembler_aug13_2.pkl',
 'invasive_xgb_ensembler_aug13_3.pkl',
 'invasive_xgb_ensembler_aug13_4.pkl',
]

# preds_df is the test data
# predictions we want to feed ot the 
# ensembler
test_data = preds_df.values[:,1:]

for ensembler_file in ensembler_files:
    ensembler = joblib.load(ensembler_file)
    ensembler_preds[ensembler_file] = pd.Series(ensembler.predict_proba(test_data)[:,1])

ensembler_preds.head()

Unnamed: 0,name,invasive_xgb_ensembler_aug13_0.pkl,invasive_xgb_ensembler_aug13_1.pkl,invasive_xgb_ensembler_aug13_2.pkl,invasive_xgb_ensembler_aug13_3.pkl,invasive_xgb_ensembler_aug13_4.pkl
0,1,0.998863,0.998804,0.998853,0.998742,0.998854
1,2,0.001991,0.002075,0.00198,0.002187,0.001964
2,3,0.002183,0.002075,0.002204,0.002187,0.002173
3,4,0.001991,0.002075,0.00198,0.002187,0.001964
4,5,0.998863,0.998804,0.998747,0.998742,0.998744


In [179]:
subm = pd.DataFrame([], columns=['name', 'invasive'])
subm['name'] = test_set['name']
subm['invasive'] = ensembler_preds[[col for col in ensembler_preds.columns if not col is 'name']].mean(axis=1)
subm.head()

Unnamed: 0,name,invasive
0,1,0.998823
1,2,0.002039
2,3,0.002164
3,4,0.002039
4,5,0.99878


In [186]:
subm.to_csv(os.path.join(path, 'results', 'subm_aug13_1.gz'), index=False, compression='gzip')

the thing is ik this is going to be very very similar to the prior submission...hmmm. alright fuck it. lets submit

it scored: 0.99357 which ipmrovement on the other xgb but not an improvement overall... ok ok so one thing i could try to get my 3rd rsubmission in before 8 is:

i take the average predictions (the bestm odel so far) and i pseudo label the test set. then i take that test set and use ti validate xgb. not sure if i need to round these predictions or leave as is. if i leave as is im basically telling xgb to approximate the 'average' function i did before. if i round that seems a bit better.

In [189]:
# these are my trainign perdictions
# and my test set pseudo labels
preds_df.shape, preds_train.shape

((1531, 28), (2295, 27))

In [206]:
#create pseudo_labels
pseudo_labels = preds_df[[col for col in preds_df.columns if not col is 'name']].mean(axis=1)
pseudo_labels = np.around(pseudo_labels)

options:

fit training validation on pseudo (doesnt really make sense, overfitting trianing set)

fit pseudo valdiate on training (makes more sense, overfitting pseudo test set)

fit different types of xgb to training and see which does best on pseudo (make most sense)

lets see which validates best.


In [212]:
x_valid.shape, x_train.shape

((2295,), (1531, 27))

In [224]:
x_train = preds_train.values
y_train = train_labels

x_valid = preds_df.values[:,1:]
y_valid = pseudo_labels

print('subsample: {}'.format(hp))
forest = XGBClassifier(gamma=0.3, min_child_weight=1, max_depth=3, subsample=0.1)
forest.fit(x_train, y_train)
forest_preds_train = forest.predict_proba(x_train)
forest_preds_train = forest_preds_train[:,1] # get the likelihood of the 1 class (0 is at index 0)
forest_preds_valid = forest.predict_proba(x_valid)
forest_preds_valid = forest_preds_valid[:,1] # get the likelihood of the 1 class (0 is at index 0)

joblib.dump(forest, '{}_{}.pkl'.format('invasive_xgb_ensembler_aug13', 'pseudo'))

print('forest train loss:')
print('{}'.format(log_loss(y_train, forest_preds_train)))
print('forest train aroc:')
print('{}'.format(roc_auc_score(y_train, forest_preds_train)))
print('forest train accuracy:')
print('{}'.format(accuracy_score(y_train, np.around(forest_preds_train))))

print('forest valid loss:')
print('{}'.format(log_loss(np.around(y_valid), forest_preds_valid)))
print('forest valid aroc:')
print('{}'.format(roc_auc_score(y_valid, forest_preds_valid)))
print('forest valid accuracy:')
print('{}'.format(accuracy_score(y_valid, np.around(forest_preds_valid))))
print('\n')

subsample: 1.0
forest train loss:
0.007368865142087401
forest train aroc:
1.0
forest train accuracy:
0.9995642701525055
forest valid loss:
0.01688389374207216
forest valid aroc:
0.999965318600675
forest valid accuracy:
0.9967341606792945




5 to 30 for max_depth, from 1 to 10 for min_child_weight and from 0.8 to 1 for subsample, 0-1 for gamma

**gamma variance (conclusion choose gamma = 0.3):**

```
gamma: 0
forest train loss:
0.0008504493089244663
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014233999597888068
forest valid aroc:
0.9999375734812148
forest valid accuracy:
0.9947746570868713


gamma: 0.1
forest train loss:
0.0008509125571935346
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.01398027742024241
forest valid aroc:
0.9999358394112485
forest valid accuracy:
0.9947746570868713


gamma: 0.2
forest train loss:
0.0008508751850853186
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.01429368357928707
forest valid aroc:
0.9999289031313836
forest valid accuracy:
0.9947746570868713


gamma: 0.3
forest train loss:
0.0008511137505658458
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.013959487113593102
forest valid aroc:
0.9999358394112486
forest valid accuracy:
0.9960809928151535


gamma: 0.4
forest train loss:
0.0008521989819569379
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014109791050004152
forest valid aroc:
0.9999323712713162
forest valid accuracy:
0.9954278249510125


gamma: 0.5
forest train loss:
0.0008529682369285906
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.013693470405055797
forest valid aroc:
0.9999358394112485
forest valid accuracy:
0.9947746570868713


gamma: 0.6
forest train loss:
0.0008520413955829811
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.013434322249741372
forest valid aroc:
0.9999375734812148
forest valid accuracy:
0.9947746570868713


gamma: 0.7
forest train loss:
0.0008560999465521936
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.01326545017964105
forest valid aroc:
0.999939307551181
forest valid accuracy:
0.9947746570868713


gamma: 0.8
forest train loss:
0.0008560999465521936
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.01326545017964105
forest valid aroc:
0.999939307551181
forest valid accuracy:
0.9947746570868713


gamma: 0.9
forest train loss:
0.0008560209260129396
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014168850513577223
forest valid aroc:
0.9999393075511811
forest valid accuracy:
0.9954278249510125


gamma: 1.0
forest train loss:
0.0009366764403191803
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014018150247552248
forest valid aroc:
0.9999393075511811
forest valid accuracy:
0.9954278249510125
```

**min child weight of 1 is best:**

```
min child weight: 1
forest train loss:
0.0008511137505658458
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.013959487113593102
forest valid aroc:
0.9999358394112486
forest valid accuracy:
0.9960809928151535


min child weight: 2
forest train loss:
0.0017329625831511123
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014703686693169245
forest valid aroc:
0.9999375734812148
forest valid accuracy:
0.9954278249510125


min child weight: 3
forest train loss:
0.0025972231038620852
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.015169726322897164
forest valid aroc:
0.9999401745861642
forest valid accuracy:
0.9954278249510125


min child weight: 4
forest train loss:
0.003467222465169748
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.016114689190741226
forest valid aroc:
0.9999341053412824
forest valid accuracy:
0.9954278249510125


min child weight: 5
forest train loss:
0.004120813273826781
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.01583588911706471
forest valid aroc:
0.9999462438310461
forest valid accuracy:
0.9954278249510125


min child weight: 6
forest train loss:
0.0051483446381142065
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.017286655685002414
forest valid aroc:
0.9999375734812148
forest valid accuracy:
0.9947746570868713


min child weight: 7
forest train loss:
0.006039873449931283
forest train aroc:
0.9999999999999999
forest train accuracy:
1.0
forest valid loss:
0.01790839181677636
forest valid aroc:
0.9999375734812148
forest valid accuracy:
0.9947746570868713


min child weight: 8
forest train loss:
0.006659184929088959
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.018763743281354706
forest valid aroc:
0.9999375734812148
forest valid accuracy:
0.9947746570868713


min child weight: 9
forest train loss:
0.007773602408334646
forest train aroc:
0.9999999999999999
forest train accuracy:
1.0
forest valid loss:
0.019702517373364474
forest valid aroc:
0.9999228338865016
forest valid accuracy:
0.9954278249510125


min child weight: 10
forest train loss:
0.0085119923286562
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.02069233495688551
forest valid aroc:
0.9999271690614173
forest valid accuracy:
0.9954278249510125
```

**max treep depth dosent mak ea dif leave at default of 3**

**subsample of 0.1 is best**

```
subsample: 0
forest train loss:
0.6931471824645996
forest train aroc:
0.5
forest train accuracy:
0.3690631808278867
forest valid loss:
0.6931471824645996
forest valid aroc:
0.5
forest valid accuracy:
0.5630306988896147


subsample: 0.1
forest train loss:
0.007368865142087401
forest train aroc:
1.0
forest train accuracy:
0.9995642701525055
forest valid loss:
0.01688389374207216
forest valid aroc:
0.999965318600675
forest valid accuracy:
0.9967341606792945


subsample: 0.2
forest train loss:
0.004110291987263916
forest train aroc:
1.0
forest train accuracy:
0.9995642701525055
forest valid loss:
0.018468767648032344
forest valid aroc:
0.9998976898719909
forest valid accuracy:
0.991508817766166


subsample: 0.3
forest train loss:
0.002534753442610044
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.015635455454636432
forest valid aroc:
0.9999098283617548
forest valid accuracy:
0.9947746570868713


subsample: 0.4
forest train loss:
0.0018954371974635098
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014884565783750355
forest valid aroc:
0.999911562431721
forest valid accuracy:
0.9954278249510125


subsample: 0.5
forest train loss:
0.001577378528736418
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.012517065536425462
forest valid aroc:
0.99994450976108
forest valid accuracy:
0.9960809928151535


subsample: 0.6
forest train loss:
0.0013354961939910854
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.01260048144248787
forest valid aroc:
0.9999410416211473
forest valid accuracy:
0.9954278249510125


subsample: 0.7
forest train loss:
0.0011508407621605885
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.0136139180799347
forest valid aroc:
0.9999427756911136
forest valid accuracy:
0.9954278249510125


subsample: 0.8
forest train loss:
0.0010170401617517382
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.013009485556230929
forest valid aroc:
0.999946243831046
forest valid accuracy:
0.9954278249510125


subsample: 0.9
forest train loss:
0.0009442541762488471
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.014238709547646054
forest valid aroc:
0.9999323712713161
forest valid accuracy:
0.9954278249510125


subsample: 1.0
forest train loss:
0.0008511137505658458
forest train aroc:
1.0
forest train accuracy:
1.0
forest valid loss:
0.013959487113593102
forest valid aroc:
0.9999358394112486
forest valid accuracy:
0.9960809928151535

```

In [226]:
ensembler_preds = pd.DataFrame([], columns=['name'])
ensembler_preds['name'] = test_set['name']

# preds_df is the test data
# predictions we want to feed ot the 
# ensembler
test_data = preds_df.values[:,1:]

# load the single xgb ensembler
ensembler = joblib.load('{}_{}.pkl'.format('invasive_xgb_ensembler_aug13', 'pseudo'))
ensembler_preds['invasive'] = pd.Series(ensembler.predict_proba(test_data)[:,1])

ensembler_preds.head()

Unnamed: 0,name,invasive
0,1,0.995993
1,2,0.007416
2,3,0.007416
3,4,0.007416
4,5,0.995993


In [227]:
ensembler_preds.to_csv(os.path.join(path, 'results', 'subm_aug13_2.gz'), index=False, compression='gzip')

ok that scored: 0.99120, lets just take the psuedo labels and retrain the model w/ that exta info.