## Imports and Utilities

In [1]:
from cnn_utils import *

import matplotlib.pyplot as plt
%matplotlib inline


# Restrict the GPUs visible to this process to device index "0".
# NOTE(review): the original note talked about running on two GPUs, but the
# value set here names a single device — confirm the intended setting.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Configuration

In [None]:
# Path of the JSON configuration file that the later cells load their parameters from.
config_path = 'configuration.config'

In [6]:
# Build the full parameter set from scratch and persist it as JSON.

params_dict = {'model_params':{},
            'data_params':{},
            'train_params':{}}

# parameters of the data
params_dict['data_params']['df_dir'] = 'NIH_Chest_X_ray/Data_Entry_2017.csv' # CSV file describing the training data
params_dict['data_params']['img_dir'] = 'NIH_Chest_X_ray/images/images/' # directory holding the training images
params_dict['data_params']['file_col'] = 'Image Index'
params_dict['data_params']['target_col'] = 'View Position'
params_dict['data_params']['data_save_path'] = 'ResNet50_sample5000_preproc2_data.pkl'


# parameters of present model
params_dict['model_params']['pretrain_model'] = 'imagenet:ResNet50' # file_path or imagenet pretrain model
params_dict['model_params']['model_dir'] = 'saved_models' # directory the model is saved in
params_dict['model_params']['model_name'] = 'ResNet50_can_be_deleted' # name of the saved model (free choice)
params_dict['model_params']['weight_init'] = 'glorot_uniform'


# parameters of training
params_dict['train_params']['histogram_equalization'] = True
params_dict['train_params']['scale_range'] = (-1,1)
params_dict['train_params']['img_augmentation'] = True
params_dict['train_params']['batch_size'] = 64
params_dict['train_params']['epochs'] = 200
params_dict['train_params']['img_shape'] = (224,224,3)
params_dict['train_params']['weight_regularization'] = {'l1':0, 'l2':0}
params_dict['train_params']['dropout_rate'] = 0.5
params_dict['train_params']['early_stop_round'] = 5
params_dict['train_params']['optimizer'] = {'optimizer':'Adam',
                                            'momentum':None,
                                            'lr' : 1e-5,
                                            'decay': 0.0,  #SGD & RMSprop & Adagrad & Adam
                                            'nesterov': None,  #SGD
                                            'rho': None,  #RMSprop
                                            'epsilon': None  #RMSprop & Adagrad 
                                           }
params_dict['train_params']['loss'] = 'categorical_crossentropy'
params_dict['train_params']['freeze_layer'] = None
params_dict['train_params']['random_state'] = 1048


# Write to the same path the loading cells read from (config_path), so the
# saved and the loaded configuration cannot drift apart.
# Note: JSON has no tuple type, so scale_range / img_shape come back as lists
# when the file is loaded again.
with open(config_path, 'w') as f:
    json.dump(params_dict, f, indent=3)

In [27]:
# Load the parameters back from the configuration file.
# The context manager closes the file handle as soon as parsing is done.
with open(config_path, 'r') as f:
    params_dict = json.load(f)

---

## Data preprocessing
切分訓練集、驗證集、測試集並將檔案存出

In [None]:
# Split the data into training / validation / testing sets and save them to disk.

df_source = params_dict['data_params']['df_dir']
split_seed = params_dict['train_params']['random_state']

# Built regardless of how the splits are obtained: the generator cells below
# always need `preproc`.
preproc = get_preproc_function(scale_range = params_dict['train_params']['scale_range'],
                               histogram_equalization = params_dict['train_params']['histogram_equalization'])

if isinstance(df_source, str):
    # df_dir is a path to a CSV file: read it and create the splits here.
    data = pd.read_csv(df_source)

    #setting

    n_samples = 5000

    # Ids of the rows held out as testing data.
    # NOTE(review): the original note says 20% of the data; the ratio is
    # decided inside take_test_id — confirm.
    test_id = take_test_id(data, random_state = split_seed,
                           target_col = params_dict['data_params']['target_col'])

    train_dat = data.drop(test_id)
    test_dat = data.loc[test_id]

    # Keep the first n_samples of the remaining rows for training + validation.
    train_dat = train_dat[:n_samples]

    # Shuffle and split again into training and validation sets (8 : 2).
    t_dat, v_dat = train_test_split(train_dat, shuffle = True, test_size = 0.2,
                                    random_state = split_seed)

    print(t_dat.shape)
    print(v_dat.shape)

elif isinstance(df_source, dict):
    # df_dir already carries the three splits.
    t_dat = df_source['training_set']
    v_dat = df_source['validation_set']
    test_dat = df_source['testing_set']

else:
    # Fail here with a clear message instead of a NameError further down.
    raise TypeError('data_params.df_dir should be a csv path (str) or a dict of splits, got {}'.format(type(df_source).__name__))

dat_dict = {'training_set': t_dat,
            'validation_set': v_dat,
            'testing_set': test_dat}


# Save the three sets into one pickle file (skipped when no path is configured).
if params_dict['data_params']['data_save_path']!='':

    with open(params_dict['data_params']['data_save_path'], 'wb') as f:
        pickle.dump(dat_dict, f)
    

In [18]:
# Keyword arguments shared by the training and validation generators.
shared_gen_kwargs = dict(
    img_dir = params_dict['data_params']['img_dir'],
    file_col = params_dict['data_params']['file_col'],
    target_col = params_dict['data_params']['target_col'],
    batch_size = params_dict['train_params']['batch_size'],
    re_size = params_dict['train_params']['img_shape'][0:2],
    preprocess_function = preproc,
)

# Augmentation follows the configuration for training and is always off for validation.
train_gen = image_data_generator(
    t_dat,
    augmentation = params_dict['train_params']['img_augmentation'],
    **shared_gen_kwargs
)

valid_gen = image_data_generator(
    v_dat,
    augmentation = False,
    **shared_gen_kwargs
)

(4000, 12)
(1000, 12)


## Build up model and training

In [27]:

# Build the network (from a saved model file or an ImageNet-pretrained backbone)
# and compile it with the configured optimizer and loss.
pretrain_spec = params_dict['model_params']['pretrain_model']

if '/' in pretrain_spec:
    # A value containing '/' is treated as the path of a saved model.
    model = load_model(pretrain_spec)
else:
    # Otherwise the value is expected to look like 'imagenet:<architecture>'.
    # ResNet50 is the default base model (can be swapped for the others below).
    spec_parts = pretrain_spec.split(':')
    imagenet_pretrain_model = spec_parts[1] if len(spec_parts) > 1 else ''

    # img_shape is a list after the JSON round trip; hand it over as a tuple,
    # the same way the prediction cell converts it.
    base_kwargs = dict(weights='imagenet', include_top=False,
                       input_shape=tuple(params_dict['train_params']['img_shape']))

    if imagenet_pretrain_model=='ResNet50':
        base_model = ResNet50(**base_kwargs)
    elif imagenet_pretrain_model=='VGG16':
        base_model = VGG16(**base_kwargs)
    elif imagenet_pretrain_model=='InceptionV3':
        base_model = InceptionV3(**base_kwargs)
    elif imagenet_pretrain_model=='InceptionResNetV2':
        base_model = InceptionResNetV2(**base_kwargs)
    elif imagenet_pretrain_model=='DenseNet201':
        base_model = DenseNet201(**base_kwargs)
    elif imagenet_pretrain_model=='NASNetLarge':
        base_model = NASNetLarge(**base_kwargs)
    else:
        # Stop here: continuing would only fail later on the undefined base_model.
        raise ValueError('parameter in config.model_params.pretrain_model should be either a path or imagenet:\'pretain_model_name\'')

    # One output unit per distinct target value found in the training split.
    n_class = t_dat[params_dict['data_params']['target_col']].nunique()

    # layer freeze: freeze_layer, when set, is iterated as layer indices of the base model
    if params_dict['train_params']['freeze_layer']:
        for i in params_dict['train_params']['freeze_layer'] :
            base_model.layers[i].trainable = False

    # Classification head: flatten -> dropout -> softmax.
    x = base_model.output
    x = Flatten()(x)
    x = Dropout(params_dict['train_params']['dropout_rate'])(x)

    if params_dict['train_params']['weight_regularization']:
        reg = keras.regularizers.l1_l2(l1 = params_dict['train_params']['weight_regularization']['l1'],
                                       l2 = params_dict['train_params']['weight_regularization']['l2'])
    else:
        reg = None
    predictions = Dense(n_class, activation='softmax', kernel_regularizer=reg)(x)

    model = Model(inputs=base_model.input, outputs=predictions)

# Each optimizer only receives the hyper-parameters this notebook configures for it.
opt_cfg = params_dict['train_params']['optimizer']

if opt_cfg['optimizer']=='SGD':
    optimizer = keras.optimizers.SGD(lr=opt_cfg['lr'],
                                     momentum=opt_cfg['momentum'],
                                     decay=opt_cfg['decay'],
                                     nesterov=opt_cfg['nesterov'])
elif opt_cfg['optimizer']=='RMSprop':
    optimizer = keras.optimizers.RMSprop(lr=opt_cfg['lr'],
                                         rho=opt_cfg['rho'],
                                         epsilon=opt_cfg['epsilon'],
                                         decay=opt_cfg['decay'])
elif opt_cfg['optimizer']=='Adagrad':
    optimizer = keras.optimizers.Adagrad(lr=opt_cfg['lr'],
                                         epsilon=opt_cfg['epsilon'],
                                         decay=opt_cfg['decay'])
elif opt_cfg['optimizer']=='Adam':
    optimizer = keras.optimizers.Adam(lr=opt_cfg['lr'],
                                      epsilon=opt_cfg['epsilon'],
                                      decay=opt_cfg['decay'])
else:
    # Without this branch an unknown name surfaces as a NameError at compile().
    raise ValueError('unsupported optimizer in config.train_params.optimizer: {!r}'.format(opt_cfg['optimizer']))

model.compile(loss=params_dict['train_params']['loss'],
              optimizer=optimizer, metrics=['accuracy'])




In [None]:
# Location the model checkpoint is saved to. Built with os.path.join so it is
# the same path the testing / prediction cells load the model from.
model_dir = params_dict['model_params']['model_dir']
model_path = os.path.join(model_dir, '{}.h5'.format(params_dict['model_params']['model_name']))

# Make sure the target directory exists before the checkpoint callback tries to write into it.
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

early_stop_round = params_dict['train_params']['early_stop_round']

if early_stop_round is not None:
    # Early stopping configured: checkpoint and stop based on the validation loss.
    earlystop = EarlyStopping(monitor='val_loss', patience=early_stop_round, verbose=1)
    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
    callbacks = [checkpoint, earlystop]
else:
    # No early stopping: checkpoint on the training loss only.
    checkpoint = ModelCheckpoint(model_path, monitor='loss', save_best_only=True, verbose=1)
    callbacks = [checkpoint]


train_batch_size = params_dict['train_params']['batch_size']

# Steps are rounded up so the last, possibly smaller, batch of each split is counted.
model_history = model.fit_generator(train_gen,
                                    epochs = params_dict['train_params']['epochs'],
                                    validation_data = valid_gen,
                                    callbacks = callbacks,
                                    verbose=1,
                                    steps_per_epoch = math.ceil(len(t_dat)/train_batch_size),
                                    validation_steps = math.ceil(len(v_dat)/train_batch_size))

Epoch 1/200

In [None]:
# Once training has finished and model_history is available, plot how the
# loss evolved over the epochs.

history = model_history.history
training_loss = history['loss']

fig, ax = plt.subplots()
ax.plot(training_loss, label="training_loss")

# Draw the validation curve only when the history actually contains one;
# an explicit key check replaces the former bare `except`, which also hid real errors.
if 'val_loss' in history:
    val_loss = history['val_loss']
    ax.plot(val_loss, label="validation_loss")

ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Learning Curve")
ax.legend(loc='best')
plt.show()

---

## Model testing - log

In [28]:
# Accumulator for the evaluation results; the cells below fill it per data set
# and write it out as a JSON log file.
log_dict = {}

In [4]:
# Load parameters and the saved data splits.
# Context managers close both files as soon as they have been read.

with open(config_path, 'r') as f:
    params_dict = json.load(f)

# NOTE: unpickling can execute arbitrary code — only load split files produced by this notebook.
with open(params_dict['data_params']['data_save_path'], 'rb') as f:
    dat_dict = pickle.load(f)

# Load the already-trained model.
model = load_model(os.path.join(params_dict['model_params']['model_dir'], params_dict['model_params']['model_name']+'.h5'))

preproc = get_preproc_function(scale_range = params_dict['train_params']['scale_range'],
                               histogram_equalization = params_dict['train_params']['histogram_equalization'])

In [29]:
# Evaluate the model on every saved split and collect the metrics in log_dict.
log_dict['general'] = {}
log_dict['general']['model_path'] = os.path.join(params_dict['model_params']['model_dir'], params_dict['model_params']['model_name']+'.h5')
log_dict['general']['config_path'] = config_path

eval_target_col = params_dict['data_params']['target_col']
eval_batch_size = params_dict['train_params']['batch_size']

for dat_type, dat in dat_dict.items():

    log_dict[dat_type] = {}

    # No augmentation and no shuffling, so predictions line up with the rows of `dat`.
    gen = image_data_generator(dat, img_dir = params_dict['data_params']['img_dir'],
                               file_col = params_dict['data_params']['file_col'],
                               target_col = eval_target_col,
                               batch_size = eval_batch_size,
                               re_size = params_dict['train_params']['img_shape'][0:2],
                               preprocess_function=preproc,
                               augmentation = False, shuffle_ = False)

    # Integer class index per row, derived from the one-hot encoding of the target column.
    # NOTE(review): assumes this encoding orders the classes the same way the
    # generator did during training — confirm.
    true_ = np.argmax(pd.get_dummies(dat[eval_target_col]).values, axis = 1)
    pred = model.predict_generator(gen, steps = math.ceil(len(dat)/eval_batch_size))

    # Computed once and reused by every metric; trimmed to the number of rows
    # in case the generator produced more predictions than there are samples.
    pred_label = np.argmax(pred[:len(dat)], axis = 1)

    cm = confusion_matrix(y_true = true_, y_pred = pred_label)
    acc = accuracy_score(y_true = true_, y_pred = pred_label)
    # average=None gives one score per class; convert to plain lists for JSON.
    precision = precision_score(y_true = true_, y_pred = pred_label, average = None).tolist()
    recall = recall_score(y_true = true_, y_pred = pred_label, average = None).tolist()

    # save result to log_dict
    log_dict[dat_type]['length'] = dat.shape[0]
    log_dict[dat_type]['accuracy'] = acc
    log_dict[dat_type]['confusion_matrix'] = cm.tolist()
    log_dict[dat_type]['precision'] = precision
    log_dict[dat_type]['recall'] = recall


In [30]:
# The log file is named after the model; the collected metrics are stored as indented JSON.
log_name = params_dict['model_params']['model_name']+'.log'

with open(log_name, 'w') as log_file:
    serialized_log = json.dumps(log_dict, indent=3)
    log_file.write(serialized_log)

---

## Model prediction

In [3]:
# Load parameters; the context manager closes the config file once it has been read.

with open(config_path, 'r') as f:
    params_dict = json.load(f)

# Load the already-trained model.
model = load_model(os.path.join(params_dict['model_params']['model_dir'], params_dict['model_params']['model_name']+'.h5'))

preproc = get_preproc_function(scale_range = params_dict['train_params']['scale_range'],
                               histogram_equalization = params_dict['train_params']['histogram_equalization'])

In [8]:
# setting

# Batch size used while predicting
batch_size = 16

# Directory the images are read from
test_image_path = 'NIH_Chest_X_ray/images/images/'

# Name of the CSV file the predictions are written to
out_file_name = 'test.csv'

###########################################

# test data generator
test_gen = generator_from_dir(test_image_path,
                              target_size = tuple(params_dict['train_params']['img_shape'][:2]),
                              batch_size = batch_size,
                              preprocess_function = preproc)


test_prediction = []
file_list = []

# testing data prediction: each item from the generator is unpacked as (images, file names)
for i, (image_batch, file_batch) in enumerate(test_gen):
    prediction = model.predict(np.array(image_batch))

    test_prediction.extend(prediction)
    file_list.extend(file_batch)

    if i%10==0:
        print('{} batchs had been processed.'.format(i))

# argmax over axis 1 needs a 2-D array, which an empty prediction list does not
# give; fall back to an empty result when the generator yielded nothing.
if test_prediction:
    predicted_class = np.argmax(np.array(test_prediction), axis = 1)
else:
    predicted_class = np.array([], dtype=int)

# Write the result file
out_file = pd.DataFrame({
    'file_name' : file_list,
    'prediction' : predicted_class,
})

out_file.to_csv(out_file_name, index = False)

112120 images found in directory
0 images had been processed.
1000 images had been processed.
2000 images had been processed.
3000 images had been processed.
4000 images had been processed.
5000 images had been processed.
6000 images had been processed.
7000 images had been processed.
