In [1]:
import os, gc, sys, h5py, json, math, time
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from matplotlib import pyplot as plt
from PIL import Image
from os import listdir
from collections import defaultdict
from keras import backend as K
from keras.models import Sequential, Model, load_model
from keras.utils import to_categorical
from keras.layers import Flatten, Dense, Dropout, GlobalAveragePooling2D
from keras.optimizers import SGD,Adam
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.resnet50 import ResNet50, preprocess_input

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Print size of any object
def memo_obj(obj):
    """Print the size of ``obj`` in megabytes.

    The size is whatever ``sys.getsizeof`` reports for the object, divided
    by 1024**2. Nothing is returned.
    """
    bytes_per_mb = 1024 ** 2
    size_in_mb = sys.getsizeof(obj) / bytes_per_mb
    print(size_in_mb, " MB")

def change_datatype(df):
    """Downcast the integer columns of ``df`` to the narrowest signed type.

    Every column selected by ``select_dtypes(include=['int'])`` is cast to
    the first of int8 / int16 / int32 whose range contains both the column's
    minimum and maximum; columns that fit none of them are cast to int64.

    The frame is modified in place and also returned.
    """
    # Candidate dtypes, tried from narrowest to widest.
    narrow_types = (np.int8, np.int16, np.int32)
    for name in df.select_dtypes(include=['int']).columns:
        highest = np.max(df[name])
        lowest = np.min(df[name])
        chosen = np.int64
        for candidate in narrow_types:
            limits = np.iinfo(candidate)
            if highest <= limits.max and lowest >= limits.min:
                chosen = candidate
                break
        df[name] = df[name].astype(chosen)
    return df

def show_img(img_path):
    """Load the image at ``img_path`` resized to 224x224 and display it."""
    target = (224, 224)
    picture = image.load_img(img_path, target_size=target)
    plt.imshow(picture, interpolation='nearest')
    plt.show()

def flip_axis(x, axis):
    """Return ``x`` (converted to an ndarray) reversed along ``axis``.

    The requested axis is swapped to the front, reversed with a negative
    step slice, and then swapped back to its original position.
    """
    fronted = np.asarray(x).swapaxes(axis, 0)
    return fronted[::-1, ...].swapaxes(0, axis)

def get_class_weights(y):
    """Compute per-class weights inversely proportional to class frequency.

    Parameters
    ----------
    y : iterable of hashable labels

    Returns
    -------
    dict
        Maps each distinct label to ``majority_count / label_count`` as a
        float, so the most frequent class gets weight 1.0. An empty ``y``
        yields an empty dict.
    """
    # Imported locally: the notebook's import cell only pulls `defaultdict`
    # from collections, so `Counter` was previously an undefined name here.
    from collections import Counter

    counter = Counter(y)
    if not counter:
        # max() on an empty sequence would raise; no labels means no weights.
        return {}
    majority = max(counter.values())
    return {cls: float(majority / count) for cls, count in counter.items()}

def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target',
                      objective='multiclass', metrics='multi_logloss',  # alternative: 'multi_error'
                      feval=None, early_stopping_rounds=20, num_boost_round=3000,
                      verbose_eval=10, categorical_features=None):
    """Train a LightGBM booster on a fixed train/validation split (no CV).

    Parameters
    ----------
    params : dict
        Overrides merged on top of the default parameter set built below.
    dtrain, dvalid : frames indexable by ``predictors`` and ``target``
        Training and validation data; ``.values`` of the selected columns
        is handed to ``lgb.Dataset``.
    predictors : list of str
        Feature column names (also used as LightGBM feature names).
    target : str
        Name of the label column.
    objective, metrics : str
        Stored under the ``'objective'`` and ``'metric'`` parameter keys.
    feval : callable or None
        Custom evaluation function forwarded to ``lgb.train``.
    early_stopping_rounds, num_boost_round, verbose_eval
        Forwarded unchanged to ``lgb.train``.
    categorical_features : list or None
        Forwarded as ``categorical_feature`` to both datasets.

    Returns
    -------
    (booster, evals_results)
        The trained booster and the dict of recorded evaluation histories
        for the ``'train'`` and ``'valid'`` sets.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        'num_leaves': 30,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequency of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 8,
        'verbose': 0
    }

    # Caller-supplied values win over the defaults above.
    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)

    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)

    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     # Honour the caller's setting (was hard-coded to 10).
                     verbose_eval=verbose_eval,
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)

    # Report whichever metric was actually recorded for the validation set
    # instead of assuming 'multi_logloss'; fall back to the first recorded
    # metric when the requested name is not a key of the history.
    valid_scores = evals_results['valid']
    if isinstance(metrics, str) and metrics in valid_scores:
        metric_key = metrics
    else:
        metric_key = next(iter(valid_scores))
    print(metric_key + ":", valid_scores[metric_key][n_estimators-1])

    return bst1, evals_results

# Loading the data

In [3]:
# Data directories, relative to the notebook's working directory.
# Built with os.path.join/os.pardir so the separator matches the host OS
# (on Windows this yields the same strings as the previous hard-coded "\\").
base_path = os.path.join(os.pardir, os.pardir, os.pardir, "train_data")
file_path = os.path.join(base_path, "file_labels")
img_path = os.path.join(base_path, "bin-images")
lgb_path = os.path.join(base_path, "lgb_data")

In [4]:
# Column dtypes for the split files. The key must be spelled 'label' (the
# column name used by the rest of the notebook); with the previous 'lable'
# spelling the int8 dtype was never applied to the label column.
split_dtypes = {'file_name': str, 'label': np.int8}

df_train = pd.read_csv(os.path.join(file_path, 'moderate_train.csv'), dtype=split_dtypes)
df_val = pd.read_csv(os.path.join(file_path, 'moderate_val.csv'), dtype=split_dtypes)
df_test = pd.read_csv(os.path.join(file_path, 'moderate_test.csv'), dtype=split_dtypes)

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(289573, 3)
(36197, 3)
(36197, 3)


# Loading in the model

In [5]:
# Row/column axis numbers, plus the same axes shifted down by one.
row_axis, col_axis = 1, 2
img_row_axis, img_col_axis = row_axis - 1, col_axis - 1

In [6]:
def batch_generator(df, batch_size, img_path, num_classes,
                    horizontal_flip = True, vertical_flip = True):
    """Endlessly yield ``(x_batch, y_batch)`` built from images listed in ``df``.

    Images are read from ``img_path + '/' + <file_name> + '.jpg'``, resized
    to 224x224, passed through ``preprocess_input`` (caffe mode) and, when
    the corresponding flag is set, flipped along the column and/or row axis.
    The flips are applied to every image, not at random.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``file_name`` and ``label`` columns. Rows are read by
        position, so the index does not have to be 0..N-1.
    batch_size : int
        Number of images per batch; must satisfy ``1 <= batch_size <= len(df)``.
    img_path : str
        Directory holding the ``.jpg`` files.
    num_classes : int
        Width of the one-hot encoded label array.
    horizontal_flip, vertical_flip : bool
        Whether to flip each image along ``img_col_axis`` / ``img_row_axis``.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        A float32 image batch and the one-hot labels. Only full batches are
        produced: the trailing ``len(df) % batch_size`` rows are skipped on
        every pass over the frame.

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is not in ``1..len(df)``.
        Without this check the loop below would never yield.
    """
    N = df.shape[0]
    if batch_size <= 0 or batch_size > N:
        raise ValueError(
            "batch_size must be between 1 and len(df)=%d, got %r" % (N, batch_size))

    # Positional views of the two columns that are read for every sample.
    file_names = df['file_name'].values
    labels = df['label'].values

    while True:
        # Stop early enough that every batch is complete.
        for start in range(0, N - batch_size + 1, batch_size):
            x_batch = []
            y_batch = []
            for ind in range(start, start + batch_size):
                img = image.load_img(img_path+'/' + str(file_names[ind]) + '.jpg', target_size=(224, 224))
                img = image.img_to_array(img)
                x = preprocess_input(img, data_format='channels_last', mode='caffe')
                if horizontal_flip:
                    x = flip_axis(x, img_col_axis)
                if vertical_flip:
                    x = flip_axis(x, img_row_axis)
                x_batch.append(x)
                y_batch.append(labels[ind])
            x_batch = np.array(x_batch, dtype = np.float32)
            y_batch = to_categorical(y_batch, num_classes = num_classes)
            yield (x_batch, y_batch)

In [7]:
# Output directories for saved models and plots
model_output_path = "..\\saved_models\\"
plots_output_path = "..\\visualization\\"

# Run constants
NUM_CLASSES, NUM_EPOCHS = 6, 100
BATCH_SIZE, PATIENCE = 32, 20
INIT_LR = 5e-3

# The model name embeds the batch size; the saved-model path is derived from it.
model_name = "resnet(caffe)_adam_{}_random_augment".format(BATCH_SIZE)
base_filepath = "".join([model_output_path, model_name, ".hdf5"])

In [None]:
# base_model.summary()

In [None]:
# Load the saved network, then build a second model that shares its input
# but outputs the 'flatten_1' layer instead of the original final layer.
base_model = load_model(base_filepath)
flatten_output = base_model.get_layer('flatten_1').output
model = Model(inputs=base_model.input, outputs=flatten_output)

In [None]:
# Extract features for the training set in four consecutive chunks and
# write one CSV per chunk.
cut_points = [-1,len(df_train)//4,len(df_train)//2,len(df_train)*3//4,len(df_train)]
for i in range(len(cut_points)-1):
    chunk_start, chunk_stop = cut_points[i]+1, cut_points[i+1]
    # .loc slices by label and includes both end points, so consecutive
    # chunks do not overlap. The index is reset so the chunk starts at 0.
    df_train_samp = df_train.loc[chunk_start:chunk_stop, :].reset_index(drop=True)
    print("Starting:", chunk_start, "to", chunk_stop)
    print("len(df_train_samp):", len(df_train_samp))
    chunk_outputs = model.predict_generator(batch_generator(df_train_samp,
                                              batch_size=BATCH_SIZE,
                                              img_path=img_path,
                                              num_classes=NUM_CLASSES,
                                              horizontal_flip = False, vertical_flip = False),
                                          steps = len(df_train_samp) // BATCH_SIZE,
                                          verbose = 1)
    # One "res_clf_<k>" column per extracted feature (no hard-coded width).
    feature_cols = ["res_clf_" + str(k) for k in range(chunk_outputs.shape[1])]
    train_features = pd.DataFrame(chunk_outputs, columns=feature_cols)
    # Only steps * BATCH_SIZE rows were predicted; take exactly that many
    # metadata rows by position (the old inclusive .loc[:n] took n + 1).
    meta = df_train_samp.iloc[:len(train_features)]
    train_features['label'] = meta['label'].values
    train_features['sharpness'] = meta['sharpness'].values
    train_features['file_name'] = meta['file_name'].astype(str).values
    change_datatype(train_features).to_csv(
        os.path.join(lgb_path, model_name + '_train_' + 'orig_' + str(i) + '.csv'), index = None)

In [None]:
def _extract_features(df, horizontal_flip=False, vertical_flip=False):
    """Run ``model`` over the images listed in ``df`` and return a feature frame.

    The result has one ``res_clf_<k>`` column per model output plus the
    ``label``, ``file_name`` and ``sharpness`` columns copied from ``df``.
    Only ``(len(df) // BATCH_SIZE) * BATCH_SIZE`` rows are predicted, so the
    metadata is truncated to the same number of leading rows.
    """
    outputs = model.predict_generator(batch_generator(df,
                                                      batch_size=BATCH_SIZE,
                                                      img_path=img_path,
                                                      num_classes=NUM_CLASSES,
                                                      horizontal_flip=horizontal_flip,
                                                      vertical_flip=vertical_flip),
                                      steps=len(df) // BATCH_SIZE,
                                      verbose=1)
    # Column names are derived from the prediction itself rather than from
    # `train_features`, so this cell no longer needs the training cell to
    # have been run first.
    features = pd.DataFrame(outputs,
                            columns=["res_clf_" + str(k) for k in range(outputs.shape[1])])
    # Positional slice: exactly as many metadata rows as predicted rows.
    meta = df.iloc[:len(features)]
    features['label'] = meta['label'].values
    features['file_name'] = meta['file_name'].astype(str).values
    features['sharpness'] = meta['sharpness'].values
    return features


val_features = _extract_features(df_val)
test_features = _extract_features(df_test)

change_datatype(val_features).to_csv(os.path.join(lgb_path, model_name + '_val' + '.csv'), index = None)
change_datatype(test_features).to_csv(os.path.join(lgb_path, model_name + '_test' + '.csv'), index = None)

In [None]:
# For test-time augmentation for lgb model
# Each entry is (file suffix, horizontal_flip, vertical_flip); one CSV of
# test-set features is written per entry.
tta_variants = [('hflip', True, False), ('vflip', False, True)]
for tta_suffix, tta_hflip, tta_vflip in tta_variants:
    tta_outputs = model.predict_generator(batch_generator(df_test,
                                                  batch_size=BATCH_SIZE,
                                                  img_path=img_path,
                                                  num_classes=NUM_CLASSES,
                                                  horizontal_flip = tta_hflip, vertical_flip = tta_vflip),
                                              steps = len(df_test) // BATCH_SIZE,
                                              verbose = 1)
    # Feature column names come from the prediction itself, so this cell
    # does not depend on `train_features` existing in the kernel.
    test_features = pd.DataFrame(
        tta_outputs,
        columns=["res_clf_" + str(k) for k in range(tta_outputs.shape[1])])
    # Only steps * BATCH_SIZE rows were predicted; take the same number of
    # metadata rows by position (the old inclusive .loc[:n] took n + 1).
    tta_meta = df_test.iloc[:len(test_features)]
    test_features['label'] = tta_meta['label'].values
    test_features['file_name'] = tta_meta['file_name'].astype(str).values
    test_features['sharpness'] = tta_meta['sharpness'].values
    change_datatype(test_features).to_csv(
        os.path.join(lgb_path, model_name + '_test_' + tta_suffix + '.csv'), index = None)