In [13]:
import os, glob, bcolz, gc

import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy import ndimage, misc
from scipy.stats import rankdata

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

import keras
from keras import backend as K
from keras import optimizers
from keras.models import Model, load_model

from keras.applications.inception_v3 import preprocess_input as preprocess_input_inceptionv3

from tensorflow.python.client import device_lib
# Show the devices TensorFlow reports together with the installed Keras version.
(
    device_lib.list_local_devices(),
    'keras version: {}'.format(keras.__version__),
)

([name: "/cpu:0"
  device_type: "CPU"
  memory_limit: 268435456
  locality {
  }
  incarnation: 4758454870926726169, name: "/gpu:0"
  device_type: "GPU"
  memory_limit: 303824896
  locality {
    bus_id: 2
  }
  incarnation: 9472406446308518386
  physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:85:00.0"],
 'keras version: 2.0.6')

In [2]:
# Root folder holding the competition files. It defaults to the original
# scratch location but can be redirected with the INVASIVE_DATA_DIR environment
# variable, so the notebook also runs on machines without that directory.
path = os.environ.get('INVASIVE_DATA_DIR', os.path.join('/scratch', 'yns207', 'data_invasive'))
# Both CSVs carry a 'name' column that is used later to build image file names.
train_set = pd.read_csv(os.path.join(path, 'train_labels.csv'))
test_set = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

def delete_model(model, clear_session=True):
    '''Release a model that is no longer needed.

    Drops this function's own reference to ``model``, runs the garbage
    collector and, when ``clear_session`` is true, also calls
    ``K.clear_session()``. The ``del`` only removes the local name; any
    reference the caller still holds is unaffected.
    '''
    del model
    gc.collect()
    if not clear_session:
        return
    K.clear_session()

def read_img(img_path, img_shape):
    '''Read the image file at ``img_path`` and return it resized to ``img_shape``.

    ``img_shape`` is handed straight to ``misc.imresize``; callers in this
    notebook pass a ``(height, width)`` tuple.
    '''
    return misc.imresize(misc.imread(img_path), img_shape)

def read_imgs(img_height, img_width):
    '''Load every train and test image at the requested size.

    Files are looked up as ``<path>/train/<name>.jpg`` and
    ``<path>/test/<name>.jpg`` for the names listed in ``train_set`` and
    ``test_set``. Returns ``(train_images, test_images)`` as two numpy arrays.
    '''
    target_shape = (img_height, img_width)

    def load_split(names, subdir):
        # One tqdm progress bar per split.
        return [
            read_img(os.path.join(path, subdir, str(name) + '.jpg'), target_shape)
            for name in tqdm(names)
        ]

    train_img = load_split(train_set['name'].iloc[:], 'train')
    test_img = load_split(test_set['name'].iloc[:], 'test')
    return np.array(train_img), np.array(test_img)

In [3]:
# Labels for the training images (the 'invasive' column of train_labels.csv).
train_labels = train_set['invasive'].values

# Load the full train/test image sets once per input resolution.
train_img_299, test_img_299 = read_imgs(img_height=299, img_width=299)
train_img_450, test_img_450 = read_imgs(img_height=450, img_width=450)

100%|██████████| 2295/2295 [01:13<00:00, 31.25it/s]
100%|██████████| 1531/1531 [00:47<00:00, 32.18it/s]
100%|██████████| 2295/2295 [01:18<00:00, 29.73it/s]
100%|██████████| 1531/1531 [00:50<00:00, 30.57it/s]


In [4]:
# One row per test image; every model adds a column holding its predictions.
preds_df = test_set[['name']].copy()

model_files = [
    'invasive_incepv3_aug10_kfolds_299x299_0.model',
    'invasive_incepv3_aug10_kfolds_299x299_1.model',
    'invasive_incepv3_aug10_kfolds_0.model',
]

for model_name in model_files:
    print(model_name)
    model = load_model(os.path.join(path, model_name))
    # Checkpoints with '299' in the file name are fed the 299x299 images,
    # every other checkpoint gets the 450x450 ones.
    raw_test_img = test_img_299 if '299' in model_name else test_img_450
    proc_test_img = preprocess_input_inceptionv3(raw_test_img.astype(np.float32))
    preds_df[model_name] = pd.Series(model.predict(proc_test_img).flatten())
    # Free the model before the next checkpoint is loaded.
    delete_model(model)

invasive_incepv3_aug10_kfolds_299x299_0.model




invasive_incepv3_aug10_kfolds_299x299_1.model
invasive_incepv3_aug10_kfolds_0.model


In [5]:
# Peek at the first rows of the per-model predictions.
preds_df.head(n=5)

Unnamed: 0,name,invasive_incepv3_aug10_kfolds_299x299_0.model,invasive_incepv3_aug10_kfolds_299x299_1.model,invasive_incepv3_aug10_kfolds_0.model
0,1,0.99922,0.999937,0.999953
1,2,0.006064,0.018489,0.003858
2,3,0.011985,0.269733,0.045479
3,4,0.017386,0.077561,0.013823
4,5,0.978104,0.991908,0.995183


In [6]:
# Ensemble by taking the plain mean of every model's predicted probability.
# Prediction columns are picked by value comparison (`!=`); an identity test
# (`is`) against a string literal only works when both strings are interned.
model_cols = [col for col in preds_df.columns if col != 'name']
subm = pd.DataFrame([], columns=['name', 'invasive'])
subm['name'] = test_set['name']
subm['invasive'] = preds_df[model_cols].mean(axis=1)
subm.head()

Unnamed: 0,name,invasive
0,1,0.999703
1,2,0.00947
2,3,0.109066
3,4,0.036257
4,5,0.988398


In [7]:
# Write the averaged predictions as a gzip-compressed CSV submission file.
subm_path = os.path.join(path, 'results', 'subm_aug11_0.gz')
subm.to_csv(subm_path, index=False, compression='gzip')

That did really well: 0.99279 vs 0.9869. Cool.

In [14]:
# Rank-average ensemble: turn each model's predictions into ranks, average the
# ranks per image, then rescale that average into the range [0.01, 0.99].
subm = pd.DataFrame([], columns=['name', 'invasive'])
subm['name'] = test_set['name']
rank_cols = []
for col in preds_df.columns:
    if col == 'name':
        continue
    subm['r_' + col] = rankdata(preds_df[col])
    rank_cols.append('r_' + col)
# Average the rank columns only. Selecting "everything except 'name'" would
# also pull in the still-empty 'invasive' placeholder, and an identity test
# (`is 'name'`) is not a reliable way to compare strings.
subm['r_avg'] = subm[rank_cols].mean(axis=1)
# fit_transform returns an (n, 1) array; flatten it so it is stored as a plain column.
subm['invasive'] = MinMaxScaler(feature_range=(0.01, 0.99)).fit_transform(
    subm['r_avg'].values.reshape(-1, 1)
).ravel()
subm.head()

Unnamed: 0,name,invasive,r_invasive_incepv3_aug10_kfolds_299x299_0.model,r_invasive_incepv3_aug10_kfolds_299x299_1.model,r_invasive_incepv3_aug10_kfolds_0.model,r_avg
0,1,0.857325,1329.0,1167.0,1377.0,1291.0
1,2,0.302892,495.0,559.0,343.0,465.666667
2,3,0.469043,597.0,835.0,707.0,713.0
3,4,0.422243,643.0,766.0,521.0,643.333333
4,5,0.654675,1004.0,967.0,997.0,989.333333


In [16]:
# Only the image id and the rescaled rank average go into the submission file.
subm_path = os.path.join(path, 'results', 'subm_aug11_1.gz')
subm[['name', 'invasive']].to_csv(subm_path, index=False, compression='gzip')

That was worse: 0.99230 (vs 0.99279 for the plain average).