In [None]:
"""
Code Version resnet
Apply code to the real data

# make image size as 200 x 200 (center crop 100x100, then resize to 200x200)
# RGB channels (resnet required)

# v3
1) class and functionalize
2) train / test have already been split into two files

# v4
1) modify train generator
  - make it able to generate images from a directory
    in two ways:
      a) separate train / validation / test
      b) separate train / test (in this case, the validation set is generated from the training set)

# v4b
1) implement resnet with dropout in the conv layers

# v5
1) Add hard negative mining protocol

"""

In [1]:
# Expose only GPU index 4 to this kernel. This cell runs before the Keras /
# TensorFlow imports in the following cells; n_gpu_use in the config cell is
# meant to stay consistent with the number of devices listed here.
%env CUDA_VISIBLE_DEVICES=4

env: CUDA_VISIBLE_DEVICES=4


In [2]:
from PIL import Image
import skimage.io as skio
import glob
import numpy as np
import scipy as sp
import scipy.stats 
import pandas as pd
import os
import re
import csv
from scipy.stats import percentileofscore
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, f1_score

# basic libraries
from __future__ import print_function
import random
import matplotlib.pyplot as plt


In [3]:
# NN libraries
import keras
from keras.models import Sequential, Model
from keras.models import load_model, save_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.layers import Convolution2D, MaxPooling2D, Conv2D, BatchNormalization
from keras.utils import np_utils
from keras import backend as K
from keras.optimizers import SGD, Adam, Adagrad
from keras.regularizers import l1, l2
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback
from keras.applications import resnet50
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import identity_block
from keras.applications.resnet50 import conv_block
from keras.applications.resnet50 import preprocess_input
from keras.layers import AveragePooling2D, ZeroPadding2D, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Select TensorFlow-style image dimension ordering for Keras; the
# input_shape = (200, 200, 3) used when the ResNet50 backbone is built later
# in this notebook is written in that (height, width, channels) layout.
K.set_image_dim_ordering('tf')

Using TensorFlow backend.


In [4]:
"""
self function calling
"""
# Execute the project-local helper scripts inside this notebook's namespace
# (%run -i), so the names they define are available to the cells below.
# NOTE(review): LossHistory, LogAUC, f1sc, init_data_from_directory,
# get_training_set_mean and call_generators are used later but are not defined
# in this notebook -- presumably they come from these three scripts; confirm
# which file provides which name.
%run -i 'bin/callbacks_miscs.py'
%run -i 'bin/py_init_data.py'
%run -i 'bin/py_generator_for_model.py'
#from keras.applications.resnet50 import _obtain_input_shape

# Disabled: custom ResNet variant (see the "v4b" note in the header) is not loaded.
#%run -i 'bin/resnet_with_drp.py'


In [5]:
def resnet_model_build(resnet_model, use_stage, freeze_stage, acti,
                       use_merge = False, 
                       n_meta = 0,
                       fc_drop_rate = 0.2):
    """Attach a 2-class softmax head to a (partially frozen) ResNet50 backbone.

    Parameters
    ----------
    resnet_model : keras Model
        Backbone whose layers are looked up by name. Its layers' ``trainable``
        flags are modified in place.
    use_stage : int
        Which backbone output feeds the classifier. 1-4 pick an intermediate
        layer by name (followed by global average pooling); any other value
        picks the layer named "global_avg_pooling2d_1" and uses it as is.
    freeze_stage : int
        0 leaves every layer trainable. 1-4 freeze the first 5 / 37 / 79 / 141
        layers; any other value freezes the first 176 layers.
    acti : str
        Activation type used inside the backbone; it determines the prefix of
        the auto-generated activation layer names ('relu' -> 'activation_').
    use_merge : bool
        When True, an extra input of ``n_meta`` features is concatenated with
        the pooled image features before the output layer.
    n_meta : int
        Number of meta features; must be positive when ``use_merge`` is True.
    fc_drop_rate : float
        Dropout rate of the fully connected head. That head is currently
        disabled, so the value is only validated (cast to float).

    Returns
    -------
    keras Model
        Inputs are ``[resnet_model.input]`` (plus the meta input when
        ``use_merge`` is True); output is a Dense(2, softmax) layer named
        "output".

    Raises
    ------
    ValueError
        If ``use_merge`` is True but ``n_meta`` is not positive.
    """
    fc_drop_rate = float(fc_drop_rate)

    # if use merge should always check n_meta
    if use_merge and n_meta <= 0:
        raise ValueError('use_merge=True requires n_meta > 0, got ' + str(n_meta))

    # Start from a fully trainable backbone, then freeze the requested prefix.
    for layer in resnet_model.layers:
        layer.trainable = True
    #resnet_model.summary()

    # design for using different activation function that will change the layer name
    to_get = 'activation_' if acti == 'relu' else acti + "_"

    # NOTE(review): these layer names rely on Keras' auto-numbering of layers
    # (e.g. "activation_22"), so they presumably only match a backbone built
    # in a fresh session -- confirm against resnet_model.summary().
    activation_index_by_stage = {2: '10', 3: '22', 4: '40'}
    if use_stage == 1:
        get_layer = "max_pooling2d"
    elif use_stage in activation_index_by_stage:
        get_layer = to_get + activation_index_by_stage[use_stage]
    else:
        get_layer = "global_avg_pooling2d_1"

    # Number of leading layers to freeze for each freeze_stage.
    frozen_layer_count_by_stage = {1: 5, 2: 37, 3: 79, 4: 141}
    if freeze_stage == 0:
        print('all parameter tunable')
    else:
        free_layer_num = frozen_layer_count_by_stage.get(freeze_stage, 176)
        for layer in resnet_model.layers[:free_layer_num]:
            layer.trainable = False

    x = resnet_model.get_layer(get_layer).output
    if use_stage != 5:
        # Intermediate feature maps are pooled to one vector per image;
        # for use_stage == 5 the selected layer's output is used unchanged.
        x = GlobalAveragePooling2D()(x)

    model_inputs = [resnet_model.input]
    if use_merge:
        meta_info = Input(shape = (n_meta, )) # n_meta: numbers of features from meta
        x = keras.layers.concatenate([x, meta_info])
        # The meta input must be declared as a model input, otherwise the
        # concatenated tensor is not reachable from the declared inputs.
        model_inputs.append(meta_info)

    # Disabled fully connected head (kept for reference; it is the only user
    # of fc_drop_rate):
    # x = Dense(64, name = 'dense1')(x)
    # x = BatchNormalization(axis = -1, name = 'dense1_bn')(x)
    # x = Activation('relu', name = 'dense1_activation')(x)
    # x = Dropout(fc_drop_rate, name = 'd1_drop')(x)
    #
    # x = Dense(32, name = 'dense2')(x)
    # x = BatchNormalization(axis = -1, name = 'dense2_bn')(x)
    # x = Activation('relu', name = 'dense2_activation')(x)
    # x = Dropout(fc_drop_rate, name = 'd2_drop')(x)

    out = Dense(2, activation="softmax", name = "output")(x)

    model_final = Model(inputs = model_inputs, outputs = [out])
    return model_final

In [6]:
# config -- put parameters here
# Every tunable setting of the run is defined in this cell; the later cells
# read these names directly from the notebook namespace.
from keras.preprocessing.image import ImageDataGenerator

### fold number and naming
#i_fold = 1 # fold number on naming
model_output_prefix = 'resnetMean_DateCut10_boosting_append' # Remember to modify the parameter below, this line is only about file naming
### controlling parameters
n_gpu_use = 1 # this should be compatible with the number of devices in CUDA_VISIBLE_DEVICES

### model related parameters
fs = 0 # passed as freeze_stage to resnet_model_build (0 = all layers trainable)
us = 4 # passed as use_stage to resnet_model_build (which backbone stage feeds the classifier)
lr = 0.00017 # learning rate at begin
drp = 0 # dropout ratio in the Resnet Conv layers (not referenced by the cells visible in this notebook)
batch_size = 64 * n_gpu_use
nb_epoch = 150 # maximum number of epochs for the training process
n_batch = 400 # number of updates (steps) per epoch
use_merge = False # Ignore it, designed to merge meta-data
mini_batch_method = "shuffle" # shuffle or random
nn_activation = 'relu' # activation type in the resnet (default should be relu, option: relu / leakyrelu / elu)
dataset_mean_ratio = 1 # divisor applied to the dataset mean; only matters if the dataset mean is used

### data information
dir_out_csv = '/home/seanyu/project/CCP/res_csv/' # result csv output location
dir_out_model = '/home/seanyu/project/CCP/model/' # model output location

### data initialize parameters
# dir_train: training set location
# dir_valid: validation set location (if left empty, the validation set is taken from the training set by valid_ratio)
# dir_test: testing set location
data_params = {
            'dir_train': {'d_class0': '/data/put_data/seanyu/ccp/clean_date_cut/thres10/non_copper_train/',
                                'd_class1': '/data/put_data/seanyu/ccp/clean_date_cut/thres10/copper_train/'
                 },
            # leave the value empty if there is no validation dir
            # NOTE(review): the original note asked for a white space " ", but the
            # values below are empty strings -- confirm which one
            # init_data_from_directory expects
            'dir_valid': {'d_class0': '',
                                'd_class1': ''
                 },
            'dir_test': {'d_class0': '/data/put_data/seanyu/ccp/clean_date_cut/thres10/non_copper_test/',
                               'd_class1': '/data/put_data/seanyu/ccp/clean_date_cut/thres10/copper_test/'
                 },
            'valid_ratio' : 0.1
        }

### model input information
# tags: is copper defect or not (Y: copper, N: non-copper; watch out for the ordering trap ... alphabetical ordering)
# crop_w/h: crop size from input image
# img_w/h: image size for the model (resizing)
# img_channels: RGB = 3
# use_self_improc: True (-selfmean) / False (-imagenet mean) / dataset (-dataset mean)
generator_params_dict = {'tags' : ['N', 'Y'],
                                 'crop_w': 100,
                                 'crop_h': 100,
                                 'img_w': 200,
                                 'img_h': 200,
                                 'img_channels': 3,
                                 'use_self_improc' : False # True / False / 'dataset'
                                }

### parameters for train generator
# parameters reference: https://keras.io/preprocessing/image/
datagen = ImageDataGenerator(
        rotation_range=45,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.1,
        zoom_range=[0.25, 2.25],
        horizontal_flip=True, vertical_flip = True,
        fill_mode='wrap')

### parameters for validation augmentation
# Set datagen_val to None (line below) to disable augmentation of the validation
# images -- presumably; confirm how get_validation_data treats use_im_gen=None.
#datagen_val = None

# Milder augmentation than the training generator: no shear, narrower zoom range.
datagen_val = ImageDataGenerator(
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0,
    zoom_range=[0.5, 1.5],
    horizontal_flip=True, vertical_flip = True,
    fill_mode='wrap')


In [7]:
# Fold id; only used to name the output files.
i_fold = 3
opt = model_output_prefix  + '_k' + str(i_fold)
model_file_name = dir_out_model + "/model_" + opt + ".h5"

# Initialize the data
data_cla = init_data_from_directory(data_params)
train_nonC, val_nonC, test_nonC, train_C, val_C, test_C = data_cla.get_train_val_test_df()
print('Non_copper training/validation/testing ' + str(len(train_nonC)) + "/" + str(len(val_nonC)) + "/" + str(len(test_nonC)))
print('copper training/validation/testing ' + str(len(train_C)) + "/" + str(len(val_C)) + "/" + str(len(test_C)))

# Check table independence here: train and test must not share any pid.
# A non-empty intersection means data contamination, so stop with a real
# exception (raising a plain string is itself a TypeError in Python 3).
if len(set(train_C.pid).intersection(test_C.pid)) != 0:
    print('die')
    raise ValueError('YOU MUST ERROR HERE! copper train/test sets share pid values')
if len(set(train_nonC.pid).intersection(test_nonC.pid)) != 0:
    print('die')
    raise ValueError('YOU MUST ERROR HERE! non-copper train/test sets share pid values')


# Get training set mean of rgb
if generator_params_dict['use_self_improc'] == 'dataset':
    print('use dataset mean')
    # class0 is non-copper and class1 is copper in every other call of this
    # notebook; pass the frames in that same order here.
    avg_dataset = get_training_set_mean(df_class0= train_nonC, df_class1= train_C, n_core=8)
    generator_params_dict['dataset_mean'] = avg_dataset / np.float32(dataset_mean_ratio)
    # write the self_mean information to a txt file (the unscaled mean is stored)
    csv_file_name = dir_out_model + "/rgbConfig_" + opt + ".txt"
    np.savetxt(csv_file_name, avg_dataset)
else:
    generator_params_dict['dataset_mean'] = None
    print('do not use dataset mean')  

gen_data = call_generators(generator_params_dict, dta_gen= datagen)

# Fixed validation arrays, built once and reused for every epoch / generation.
x_val, y_val = gen_data.get_validation_data(df_class0= val_nonC, df_class1= val_C,
                                                   class_0_ratio = 1,  use_im_gen = datagen_val, n_gen_loop = 3)

# shuffle dataframe
train_C = train_C.sample(frac=1).reset_index(drop = True)
train_nonC = train_nonC.sample(frac=1).reset_index(drop = True)

"""
Create n partitions for training -- do boosting
"""
n_split = 3
partition_size = len(train_nonC) // n_split  # rows beyond n_split * partition_size are left out
# The list is 1-indexed on purpose: the training loop addresses partitions by
# generation number (1..n_split), so index 0 holds an empty placeholder frame.
train_nonC_list = [train_nonC.iloc[0:0]]
print(train_nonC_list[0].shape)
for i in range(1, n_split + 1):
    i_start = partition_size * (i - 1)
    i_end = partition_size * i
    train_nonC_list.append(train_nonC.iloc[i_start:i_end])
    print(train_nonC_list[i].shape)

print(len(train_nonC_list))

# use for checking
# set(tmp[1].pid).intersection(tmp[2].pid)

use inner split k-folds
Non_copper training/validation/testing 535714/59524/76704
copper training/validation/testing 9749/1084/977
do not use dataset mean
2168
(8672, 200, 200, 3)
use imagenet mean
(0, 3)
(178571, 3)
(178571, 3)
(178571, 3)
4


In [8]:
### should implement hard-mining
### for each generation, reload trained model, reset all parameter, change trainable data list
# Hard-negative-mining training loop.
# Each generation trains on the copper set plus the current non-copper pool,
# then scores the next non-copper partition with the freshly trained model and
# adds a score-weighted sample of it to the pool used by the next generation.
glist = [1,2,3]
for generation in glist:
    print('We are now at generation' + str(generation))
    K.clear_session()
    if generation == 1:
        # First generation: ImageNet-initialised backbone with a fresh 2-class head.
        resnet_model = ResNet50(include_top=False, weights = "imagenet", input_shape = (200, 200, 3), pooling ='avg')
        model = resnet_model_build(resnet_model, freeze_stage= fs, use_stage= us, acti = nn_activation)
        model.summary()
    else:
        # Later generations: continue from the checkpoint written by the previous
        # generation (model_file_name still holds that path at this point).
        print('loading exist model: ' + model_file_name)
        model = load_model(model_file_name)
    # From here on opt / model_file_name refer to the current generation.
    opt = 'hard_negative_thresh10_gen' + str(generation) + '_k' + str(i_fold)
    model_file_name = "model/model_" + opt + ".h5"

    # Re-compile with a new optimizer every generation, starting again from the
    # configured initial learning rate `lr` (config cell).
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=6)
    myoptimizer = Adam(lr= lr)
    model.compile(loss='binary_crossentropy', optimizer=myoptimizer, metrics=['acc'])

    earlystop = EarlyStopping(monitor= 'val_loss', 
                                  min_delta= 0.0001, 
                                  patience= nb_epoch // 10,  # integer number of epochs
                                  verbose=0, mode='auto')
    # Only the best model (lowest val_loss) of this generation is kept on disk.
    checkpoint = ModelCheckpoint(model_file_name,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='auto')
    loss_history = LossHistory()
    
    history_model = model.fit_generator(gen_data.train_generator(df_class0= train_nonC_list[generation], 
                                                                 df_class1=train_C,
                                                                 class_0_ratio = 1,
                                                                 class_1_ratio = 1,
                                                                 bz = batch_size),
                                            steps_per_epoch = n_batch,
                                            epochs= nb_epoch,
                                            validation_data=(x_val, y_val),
                                            callbacks = [reduce_lr,
                                                         loss_history, 
                                                         checkpoint, 
                                                         earlystop,
                                                         LogAUC(), 
                                                         f1sc()  ])
    # save training process
    # NOTE(review): the val_auc / val_f1sc / val_tp / val_tn / val_fp / val_fn
    # entries are presumably added to the history by the LogAUC and f1sc
    # callbacks -- confirm in bin/callbacks_miscs.py.
    train_loss = history_model.history.get("loss")
    train_acc = history_model.history.get("acc")

    val_loss = history_model.history.get("val_loss")
    val_acc = history_model.history.get("val_acc")
    val_auc = history_model.history.get("val_auc")
    val_f1 = history_model.history.get('val_f1sc')
    val_tp = np.array(history_model.history.get('val_tp')).astype('float32')
    val_tn = np.array(history_model.history.get('val_tn')).astype('float32')
    val_fp = np.array(history_model.history.get('val_fp')).astype('float32')
    val_fn = np.array(history_model.history.get('val_fn')).astype('float32')

    pd_tmp = pd.DataFrame({'train_loss': train_loss,
                           'valid_loss': val_loss,
                           'train_acc': train_acc,
                           'valid_acc': val_acc,
                           'valid_f1': val_f1,
                           'valid_auc': val_auc,
                           'valid_TP': val_tp,
                           'valid_TN': val_tn,
                           'valid_FP': val_fp,
                           'valid_FN': val_fn})
    pd_tmp.to_csv(opt + '_training_process_gen' + str(generation) + '.csv')

    # make prediction on the test set with the checkpoint saved for this generation
    pred_out = gen_data.model_predict_testing(model_name = model_file_name, 
                                              df_class0 = test_nonC, 
                                              df_class1 = test_C, 
                                              testing_batch_size= 12500)
    pred_out.to_csv('res_csv/testing_' + opt + '.csv', index = False)
    
    ###
    # do hard-negative-mining #
    # random predict negative samples (non-copper)? or do all prediction
    ###
    print('doing training set prediction')
    
    if generation < glist[-1]:
        # Score the next, not yet used, non-copper partition with the current model.
        mining = gen_data.model_predict_testing(model_name = model_file_name,
                                                df_class0 = train_nonC_list[generation + 1],
                                                df_class1 = train_C,
                                                testing_batch_size = 12500)
        # Keep true negatives with a probability that grows with their predicted
        # score: (lower bound, upper bound, fraction kept) per score band.
        # NOTE(review): negatives scored >= 0.95 are not selected by any band --
        # confirm this exclusion is intentional (e.g. suspected label noise).
        negatives = mining[mining['y_true'] == 0]
        mining_bands = [(0.5, 0.95, 1.0),
                        (0.4, 0.5, 0.8),
                        (0.3, 0.4, 0.6),
                        (0.2, 0.3, 0.4),
                        (0.0, 0.2, 0.2)]
        xxx = pd.concat([negatives[(negatives['y_pred'] >= band_lo) & (negatives['y_pred'] < band_hi)].sample(frac = band_frac)
                         for band_lo, band_hi, band_frac in mining_bands])
        
        # Restrict the next partition to the mined images (matched by file path).
        mined_next = train_nonC_list[generation + 1]
        mined_next = mined_next[mined_next.im_path.isin(list(xxx.png_name))]
        
        # if want to keep original data (rather than purely use new fold as training samples)
        # reset_index returns a new frame, so its result has to be stored.
        train_nonC_list[generation + 1] = pd.concat([train_nonC_list[generation], mined_next]).reset_index(drop = True)
        print('Next generation nonC size: ' +  str(train_nonC_list[generation + 1].shape))
    else:
        print('Generation' + str(generation) + 'is the last one: done')
    

We are now at generation1
all parameter tunable
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 200, 200, 3)   0                                            
____________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D) (None, 206, 206, 3)   0           input_1[0][0]                    
____________________________________________________________________________________________________
conv1 (Conv2D)                   (None, 100, 100, 64)  9472        zero_padding2d_1[0][0]           
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, 100, 100, 64)  256         conv1[0][0]                      
___________________________________________

____________________________________________________________________________________________________
bn4a_branch2b (BatchNormalizatio (None, 13, 13, 256)   1024        res4a_branch2b[0][0]             
____________________________________________________________________________________________________
activation_24 (Activation)       (None, 13, 13, 256)   0           bn4a_branch2b[0][0]              
____________________________________________________________________________________________________
res4a_branch2c (Conv2D)          (None, 13, 13, 1024)  263168      activation_24[0][0]              
____________________________________________________________________________________________________
res4a_branch1 (Conv2D)           (None, 13, 13, 1024)  525312      activation_22[0][0]              
____________________________________________________________________________________________________
bn4a_branch2c (BatchNormalizatio (None, 13, 13, 1024)  4096        res4a_branch2c[0][0]    

Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150


Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150


Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
6.21448
runung index: 0
39.17853236198425
runung index: 1
37.29506039619446
runung index: 2
37.4403190612793
runung index: 3
37.37554216384888
runung index: 4
37.380677938461304
runung index: 5
37.44952845573425
runung index: 6
8.060314178466797
doing training set prediction
15.0656
runung index: 0
40.149373292922974
runung index: 1
37.41170120239258
runung index: 2
37.33612775802612
runung index: 3
37.209747076034546
runung index: 4
37.07895374298096
runung index: 5
37.26848363876343
runung index: 6
37.31501531600952
runung index: 7
37.20440649986267
runung index: 8
37.22254252433777
runung index: 9
37.12575626373291
runung index: 10
37.33307886123657
runung index: 11
37.321831941604614
runung index: 12
37.19296598434448
runung index: 13
37.2

Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150


Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150


6.21448
runung index: 0
39.92813968658447
runung index: 1
37.04754567146301
runung index: 2
37.16007161140442
runung index: 3
37.155561685562134
runung index: 4
37.06194019317627
runung index: 5
37.1908655166626
runung index: 6
8.000077962875366
doing training set prediction
15.0656
runung index: 0
40.605916261672974
runung index: 1
37.33028864860535
runung index: 2
37.210145473480225
runung index: 3
37.264198303222656
runung index: 4
37.355626821517944
runung index: 5
37.33407187461853
runung index: 6
37.35201859474182
runung index: 7
37.343751192092896
runung index: 8
37.25987195968628
runung index: 9
37.396915435791016
runung index: 10
37.33290147781372
runung index: 11
37.338616132736206
runung index: 12
37.23989534378052
runung index: 13
37.34236264228821
runung index: 14
37.24082827568054
runung index: 15
2.426393985748291
Next generation nonC size: (253090, 3)
We are now at generation3
loading exist model: model/model_hard_negative_thresh10_gen2_k3.h5
Epoch 1/150
Epoch 2/150
Epo

Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
6.21448
runung index: 0
39.98738145828247
runung index: 1
37.1593496799469
runung index: 2
37.18761992454529
runung index: 3
37.04566764831543
runung index: 4
37.19725680351257
runung index: 5
37.10509133338928
runung index: 6


7.936277151107788
doing training set prediction
Generation3is the last one: done
