# Step 1. List all train image paths

In [2]:
import os
############################################
# Get train image list, not numpy array    #
############################################
def get_train_img_list(train_img_path):
    train_img_path_list = []
    
    type_path_list = map(lambda type_name: "/".join([train_img_path, type_name]), os.listdir(train_img_path))
    print("all type files below:")
    for p_idx in xrange(len(type_path_list)):
        print "".join(
            map(str, ["type", (p_idx+1), " dir:", type_path_list[p_idx]])
        )

    for p_idx in xrange(len(type_path_list)):
        type_img_name_list = map(lambda img_name:
                                 "/".join([type_path_list[p_idx], img_name]),
                                 os.listdir(type_path_list[p_idx]))
        print "".join(
            map(str, ["type", (p_idx+1), " file number:", len(type_img_name_list)])
        )
        train_img_path_list.extend(type_img_name_list)
        
    # filter no-image files
    train_img_path_list = filter(lambda f: f[-4:] == ".jpg", train_img_path_list)
    
    for p_idx in xrange(len(train_img_path_list[:10])): 
        print "".join(
            map(str, ["sample ", p_idx+1, ":", train_img_path_list[p_idx]])
        )    
    
    return train_img_path_list

In [3]:
###############################
# just for test               #
###############################

train_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224"
img_path_list = get_train_img_list(train_img_path)
print len(img_path_list)

all type files below:
type1 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224
type2 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_2_seg_224
type3 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224
type1 file number:7200
type2 file number:21730
type3 file number:12130
sample 1:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0.jpg
sample 2:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10 (2).jpg
sample 3:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10.jpg
sample 4:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1000.jpg
sample 5:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1928.jpg
sample 6:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1930.jpg
sample 7:/media/yuens/WIN10-ENTERT

# Step 2. Extract Features Using Pretrained Models

## Step 2.1 Attaining label name

In [4]:
def get_synset(synset_path="../model/synset.txt"):
    #synset_path = "../model/synset.txt"
    with open(synset_path) as f:
        synsets = [l.rstrip() for l in f]
    #print synsets
    return synsets

## Step 2.2 All layers of resneXt-50 and last 10 layers:

First, have a look to observe the last 10 layers of a pretrained model:

In [5]:
###############################
# just for test               #
###############################

import mxnet as mx

prefix = "../model/finetune-resnext-50-train-add-seg-224-lr-0.01"
epoch = 3
sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)

all_layers = sym.get_internals()
all_layers.list_outputs()[-10:-1]

['stage4_unit3_bn3_output',
 '_plus15_output',
 'stage4_unit3_relu_output',
 'pool1_output',
 'flatten0_output',
 'fc1_weight',
 'fc1_bias',
 'fc1_output',
 'softmax_label']

  chunks = self.iterencode(o, _one_shot=True)


## Step 2.3 Attaining feature of flatten0_output layer from symbol:

In [6]:
###############################
# load model                  #
###############################
def get_feature_extractor(prefix, epoch):
    
    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)

    all_layers = sym.get_internals()
    flatten_output_sym = all_layers['flatten0_output']
    mod_output = mx.mod.Module(symbol=flatten_output_sym, label_names=None, context=mx.cpu())
    mod_output.bind(for_training=False, data_shapes=[('data', (1,3,224,224))])
    mod_output.set_params(arg_params, aux_params)
    
    feature_extractor = mod_output
    return feature_extractor

  chunks = self.iterencode(o, _one_shot=True)


## Step 2.4 Attaining image and make resize

In [7]:
import cv2
import numpy as np

def get_image(filename, input_shape=(224, 224)):
    img = cv2.imread(filename)  # read image in b,g,r order
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # change to r,g,b order
    img = cv2.resize(img, input_shape)  # resize to 224*224 to fit model
    img = np.swapaxes(img, 0, 2)
    img = np.swapaxes(img, 1, 2)  # change to (channel, height, width)
    img = img[np.newaxis, :]  # extend to (example, channel, heigth, width)
    return img

  chunks = self.iterencode(o, _one_shot=True)


## Step 2.5 Feature Extraction

Extract and save features for each image of three types.

### Step 2.5.1 Feature extraction on train set

In [None]:
import mxnet as mx
from collections import namedtuple
import time, datetime

################################################
# parameter initialization                     #
################################################

####################################
# extract feature on training set  #
####################################

Batch = namedtuple('Batch', ['data'])
train_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224"
img_path_list = get_train_img_list(train_img_path)
feats_save_path_list = map(lambda img_path: img_path.replace("jpg", "npy"), img_path_list)
model_path = "../model"
prefix_list = ["finetune-resnet-152-train-add-seg-224",\
               "finetune-resnet-200-train-add-seg-224",\
               "finetune-resnext-50-train-add-seg-224-lr-0.01"]
epoch_list = [3, 3, 3]
feats_save_prefix_list = ["resnet-152",\
                          "resnet-200",\
                          "resnext-50"]
synset_path="../model/synset.txt"
synsets = get_synset(synset_path)

################################################
# extract feature using each pretrained model  #
################################################

for m_idx in xrange(len(prefix_list)):
    prefix = "/".join([model_path, prefix_list[m_idx]])
    epoch = epoch_list[m_idx]
    feats_save_prefix = feats_save_prefix_list[m_idx]
    feature_extractor = get_feature_extractor(prefix, epoch)
    
    for img_idx in xrange(len(img_path_list)):
        time_stamp = datetime.datetime.now()
        print("{} {}th/{} img".format(time_stamp.strftime('%Y-%m-%d %H:%M:%S'), img_idx+1, len(img_path_list))),
        
        start_extract = time.time()
        
        img_path = img_path_list[img_idx]
        feats_save_path = feats_save_path_list[img_idx]
        img = get_image(img_path)
        feature_extractor.forward(Batch([mx.nd.array(img)]))
        feats = feature_extractor.get_outputs()[0].asnumpy()
        
        #####################
        # check features    #
        #####################
        #print feats.shape
        
        feats_save_path = feats_save_path.replace(".", "-"+feats_save_prefix+".")
        #####################
        # save features     #
        #####################
        #np.save(feats_save_path, feats)
        
        finish_extract = time.time()
        duration_extract = finish_extract - start_extract
        
        print("calc and save feats:{:.4f} path:{}".format(duration_extract, feats_save_path))

all type paths below:
type1 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224
type2 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_2_seg_224
type3 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224
type1 images number:5760
type2 images number:17384
type3 images number:9704
sample 1:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/100-resnet-152.npy
sample 2:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/100-resnet-200.npy
sample 3:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/100-resnext-50.npy
sample 4:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/100.jpg
sample 5:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/1000-resnet-152.npy
sample 6:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg

### 2.5.2 Feature extraction on test set

Extract test image features.

In [8]:
import os

def get_test_img_list(test_img_path):
    img_list = filter(lambda p: p[-4:] == ".jpg", os.listdir(test_img_path))
    return map(lambda img_name: "/".join([test_img_path, img_name]), img_list)

  chunks = self.iterencode(o, _one_shot=True)


In [9]:
###############################
# just for test               #
###############################

test_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224"
test_img_list = get_test_img_list(test_img_path)
print len(test_img_list)
print test_img_list[:10]

512
['/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/0.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/1.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/10.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/100.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/118.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/119.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/12.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/120.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/138.jpg', '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/139.jpg']


  chunks = self.iterencode(o, _one_shot=True)


In [80]:
import os
import mxnet as mx
from collections import namedtuple
import time, datetime
import cv2

################################################
# parameter initialization                     #
################################################

###############################
# extract feature on test set #
###############################

test_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224"
img_path_list = get_test_img_list(test_img_path)
Batch = namedtuple('Batch', ['data'])
feats_save_path_list = map(lambda img_path: img_path.replace("jpg", "npy"), img_path_list)
model_path = "../model"
prefix_list = ["finetune-resnet-152-train-add-seg-224",\
               "finetune-resnet-200-train-add-seg-224",\
               "finetune-resnext-50-train-add-seg-224-lr-0.01"]
epoch_list = [3, 3, 3]
feats_save_prefix_list = ["resnet-152",\
                          "resnet-200",\
                          "resnext-50"]
synset_path="../model/synset.txt"
synsets = get_synset(synset_path)

################################################
# extract feature using each pretrained model  #
################################################

for m_idx in xrange(len(prefix_list)):
    prefix = "/".join([model_path, prefix_list[m_idx]])
    epoch = epoch_list[m_idx]
    feats_save_prefix = feats_save_prefix_list[m_idx]
    feature_extractor = get_feature_extractor(prefix, epoch)
    
    for img_idx in xrange(len(img_path_list)):
        time_stamp = datetime.datetime.now()
        print("{} {}th/{} img".format(time_stamp.strftime('%Y-%m-%d %H:%M:%S'), img_idx+1, len(img_path_list))),
        
        start_extract = time.time()
        
        img_path = img_path_list[img_idx]
        feats_save_path = feats_save_path_list[img_idx]
        img = get_image(img_path)

        feature_extractor.forward(Batch([mx.nd.array(img)]))
        feats = feature_extractor.get_outputs()[0].asnumpy()
        
        #####################
        # check features    #
        #####################
        #print feats.shape
        
        #####################
        # save features     #
        #####################
        feats_save_path = feats_save_path.replace(".", "-"+feats_save_prefix+".")
        np.save(feats_save_path, feats)
        
        finish_extract = time.time()
        duration_extract = finish_extract - start_extract
        
        print("calc and save feats:{:.4f} path:{}".format(duration_extract, feats_save_path))

 2017-06-09 23:02:27 1th/512 img calc and save feats:1.0504 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/0-resnet-152.npy
2017-06-09 23:02:28 2th/512 img calc and save feats:0.9712 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/1-resnet-152.npy
2017-06-09 23:02:29 3th/512 img calc and save feats:1.0169 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/10-resnet-152.npy
2017-06-09 23:02:30 4th/512 img calc and save feats:1.3943 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/100-resnet-152.npy
2017-06-09 23:02:32 5th/512 img calc and save feats:1.1143 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/101-resnet-152.npy
2017-06-09 23:02:33 6th/512 img calc and save feats:1.2884 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/102-resnet-152.npy
2017-06-09 23:02:34 7th/512 img calc and save feats:1.0203 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/I

  chunks = self.iterencode(o, _one_shot=True)


# Step 2.5 Feature Concate

In [9]:
import numpy as np

np1 = '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/3070-resnet-152.npy'
np2 = '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/3070-resnet-200.npy'
np3 = '/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/3070-resnext-50.npy'

a = np.load(np1)
b = np.load(np2)
c = np.load(np3)

print a.shape
print b.shape
print c.shape

print type(a)
print type(b)
print type(c)

# concate feats as axis=0 (feats map) for following dl model
print np.concatenate((a,b,c), axis=0).shape
# concate feats as axis=1 (row vector) for following boost model
print np.concatenate((a,b,c), axis=1).shape

?np.concatenate

(1, 2048)
(1, 2048)
(1, 2048)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(3, 2048)
(1, 6144)


  chunks = self.iterencode(o, _one_shot=True)


# 3. XGBoost

## 3.1 Train and validat boost model

In [11]:
def get_train_dataframe(train_img_path, suffix="-resnext-50.npy"):
    #######################################################
    # Organize feats path and label using train_img_path  #
    #######################################################    
    
    img_path_list = get_train_img_list(train_img_path)
    print("len(img_path_list):{}".format(len(img_path_list)))    
    train_feats_np_path_list = map(lambda p: p.replace(".jpg", suffix), img_path_list)
    #for idx in xrange(len(train_feats_resnet50_np_path_list[:10])): print idx, train_feats_resnet50_np_path_list[:10][idx]    
    import re
    train_type_list = map(lambda p: str(int(re.findall(".*Type_(.*)_seg_", p)[0])-1), img_path_list)
    print("len(train_type_list):{0}".format(len(train_type_list)))
    #for idx in xrange(len(train_type_list[:10])): print idx, train_type_list[:10][idx]
    
    ####################################################################
    # Organize dataframe for train data (feature path and sample type) #
    ####################################################################
    
    import pandas as pd
    train_df = pd.DataFrame({"feats_path": [], "type": []})
    train_df['feats_path'] = train_feats_np_path_list
    train_df['type'] = train_type_list
    
    return train_df

  chunks = self.iterencode(o, _one_shot=True)


In [12]:
def train_xgboost(train_img_path, feats_suffix="-resnext-50.npy", model_save_path=None, num_round=200):
    train_df = get_train_dataframe(train_img_path, suffix=feats_suffix)
    
    import numpy as np
    import time
    start_load = time.time()
    x = np.array([
                  np.mean(\
                          np.load(feats_path),\
                          axis=0) 
                  for feats_path in train_df['feats_path'].tolist()\
                 ]\
                )
    y = np.array(\
                 map(int, train_df['type'].tolist())\
                )#.as_matrix()
    finish_load = time.time()
    duration_load = finish_load - start_load
    print("load train data time: {:.4f}".format(duration_load))
    print("x.shape:{0}".format(x.shape))
    print("y.shape:{0}".format(y.shape))
    print("type(x[0][0]:{}".format(type(x[0][0])))
    print("type(y[0]):{}".format(type(y[0])))
    
    #######################################################
    # Split data into train and val parts from train data #
    #######################################################
    from sklearn import cross_validation
    train_X, val_X, train_Y, val_Y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)#0.20
    print("train_X.shape:{}, train_Y.shape:{}".format(train_X.shape, train_Y.shape))
    print("val_X.shape:{}, val_Y.shape:{}".format(val_X.shape, val_Y.shape))
    
    import xgboost as xgb
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_val = xgb.DMatrix(val_X, label=val_Y)
    
    #######################################################
    # Create XGBClassifier or XGBRegressor                #
    #######################################################
    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softprob' #multi:softprob or multi:softmax
    # scale weight of positive examples
    param['eta'] = 0.05
    param['max_depth'] = 7
    param['silent'] = 0
    param['nthread'] = 8
    param['num_class'] = 3
    
    watchlist = [(xg_train, 'train'), (xg_val, 'val')]
    num_round = num_round
    bst = xgb.train(param, xg_train, num_round, watchlist)
    # get prediction
    pred = bst.predict(xg_val)
    error_rate = np.sum(pred != val_Y) / val_Y.shape[0]
    print('Test error using softmax = {}'.format(error_rate))    
    
#     clf = xgb.XGBClassifier(max_depth=1,#10
#                            n_estimators=20500,
#                            min_child_weight=15,#9
#                            learning_rate=0.00750,
#                            nthread=8,
#                            subsample=0.80,#0.8
#                            colsample_bytree=0.80,
#                            seed=4242)
    
    #######################################################
    # Train XGBoosting classifier or regressor            #
    #######################################################
#     clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)],\
#             verbose=True,\
#             eval_metric='logloss',\
#             early_stopping_rounds=300)
#     # eval_metric='logloss' or 'error' 

    ########################
    # Save boost model     #
    ########################
    if model_save_path == None:
        import datetime
        cur_time = str(datetime.datetime.now()).replace(" ", "-")[:-7]
        model_save_path = feats_suffix.replace(".npy", cur_time) + "-" + str(num_round) + ".xgbmodel"
    print("model_save_path:{}".format(model_save_path))
    bst.save_model(model_save_path)
    
    return bst

  chunks = self.iterencode(o, _one_shot=True)


In [12]:
import datetime
print datetime.datetime.now()
print str(datetime.datetime.now()).replace(" ", "-")[:-7]

2017-06-17 22:35:21.536362
2017-06-17-22:35:21


  chunks = self.iterencode(o, _one_shot=True)


In [58]:
bst = train_xgboost(train_img_path, feats_suffix, model_save_path=None)

all type paths below:
type1 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224
type2 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_2_seg_224
type3 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224
type1 images number:5760
type2 images number:17384
type3 images number:9704
sample 1:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0.jpg
sample 2:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10 (2).jpg
sample 3:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10.jpg
sample 4:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1000.jpg
sample 5:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1928.jpg
sample 6:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1930.jpg
sample 7:/media/yuens/WIN10-E



In [None]:
##############################################
# save model using bst after train finished  #
##############################################
bst_model_save_path = "../resnxet50-xgbmodel"
bst.save_model(bst_model_save_path)
# bst.load_model(bst_model_save_path)

## 3.2 Save and load boost model

In [25]:
###############################
# Load boost model            #
###############################
import xgboost as xgb

bst_model_save_path = "../resnext50-xgbmodel"
bst = xgb.Booster(model_file=bst_model_save_path)

  chunks = self.iterencode(o, _one_shot=True)


# 4. Predict test set and make submit

Do the same thing again as below, but output probabilities

In [82]:
def get_test_dataframe(test_img_path):
    #######################################################
    # Organize feats path using test_img_path             #
    #######################################################    
    
    img_path_list = get_test_img_list(test_img_path)
    print("len(img_path_list):{}".format(len(img_path_list)))
    test_feats_np_path_list = map(lambda p: p.replace(".jpg", "-resnext-50.npy"), img_path_list)
    
    import pandas as pd
    test_df = pd.DataFrame({"feats_path": [], "type": []})
    test_df['feats_path'] = test_feats_np_path_list
    #test_df['type'] = test_type_list
    print("test_df.shape:{}".format(test_df.shape))
    
    return test_df

  chunks = self.iterencode(o, _one_shot=True)


In [115]:
def make_submit(model_file, test_df, submission_save_path, submission_head=['image_name','Type_1','Type_2','Type_3']):    
    ######################################################
    # Load model and data                                #
    ######################################################
    import xgboost as xgb
    bst = xgb.Booster(model_file=model_file)
    
    import pandas as pd
    x = np.array([
                  np.mean(\
                          np.load(feats_path),\
                          axis=0) 
                  for feats_path in test_df['feats_path'].tolist()\
                 ]\
                )
    xg_test = xgb.DMatrix(x)    
    
    ##############################################################
    # Predict label for features of test set using trained model #
    ##############################################################
    pred_prob = bst.predict(xg_test)
    print("type(pred_prob):{}".format(type(pred_prob)))
    print("pred_prob.shape:{}".format(pred_prob.shape))
    print("pred_prob[:10]:{}".format(pred_prob[:10])) 

    ###############################################################    
    # Save predict result of test set and prepare submission file #
    ###############################################################
    sub_dict = dict()
    for idx in xrange(len(submission_head)):
        sub_dict[submission_head[idx]] = []
    print("submission_head:{}".format(submission_head))
    print("sub_dict:{}".format(sub_dict))
    sub_df = pd.DataFrame(sub_dict)
    
    sub_df[submission_head[0]] = map(lambda idx: str(idx)+".jpg", xrange(int(x.shape[0])))
    sub_df[submission_head[1]] = pred_prob[:, 0]
    sub_df[submission_head[2]] = pred_prob[:, 1]
    sub_df[submission_head[3]] = pred_prob[:, 2]
        
    sub_df.to_csv(submission_save_path, index=False)
    print(sub_df.head())

  chunks = self.iterencode(o, _one_shot=True)


In [114]:
############################################
# initialize parameters                    #
############################################
model_file = "../resnext50-xgbmodel"
test_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224"
submission_save_path = "./boost-result.csv"
submission_head=['image_name','Type_1','Type_2','Type_3']
############################################
# load model,test data and make prediction #
############################################
test_df = get_test_dataframe(test_img_path)
make_submit(model_file, test_df, submission_save_path, submission_head)

len(img_path_list):512
test_df.shape:(512, 2)
type(pred_prob):<type 'numpy.ndarray'>
pred_prob.shape:(512, 3)
pred_prob[:10]:[[  1.51826581e-02   9.63989556e-01   2.08277777e-02]
 [  2.29575369e-03   8.56581926e-01   1.41122356e-01]
 [  2.26456355e-02   9.77246642e-01   1.07705739e-04]
 [  8.84332582e-02   9.11412895e-01   1.53848960e-04]
 [  3.96229886e-03   5.72716780e-02   9.38766062e-01]
 [  2.20657382e-02   5.52931249e-01   4.25003022e-01]
 [  3.32365534e-03   2.71506488e-01   7.25169837e-01]
 [  7.97186419e-03   9.90950584e-01   1.07749156e-03]
 [  1.21626994e-02   8.37532461e-01   1.50304839e-01]
 [  4.38916788e-04   9.98275757e-01   1.28537591e-03]]
submission_head:['image_name', 'Type_1', 'Type_2', 'Type_3']
sub_dict:{'Type_3': [], 'Type_2': [], 'Type_1': [], 'image_name': []}
     Type_1    Type_2    Type_3 image_name
0  0.015183  0.963990  0.020828      0.jpg
1  0.002296  0.856582  0.141122      1.jpg
2  0.022646  0.977247  0.000108      2.jpg
3  0.088433  0.911413  0.000154

  chunks = self.iterencode(o, _one_shot=True)


In [38]:
import re

train_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224"


train_feats_resnet50_np_path_list = map(lambda p: p.replace(".jpg", "-resnext-50.npy"), img_path_list)
for idx in xrange(len(train_feats_resnet50_np_path_list[:10])): print idx, train_feats_resnet50_np_path_list[:10][idx]

train_type_list = map(lambda p: re.findall(".*Type_(.*)_seg_", p)[0], img_path_list)
print("len(train_type_list):{0}".format(len(train_type_list)))
for idx in xrange(len(train_type_list[:10])): print idx, train_type_list[:10][idx]


0 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0-resnext-50.npy
1 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10 (2)-resnext-50.npy
2 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10-resnext-50.npy
3 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1000-resnext-50.npy
4 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1928-resnext-50.npy
5 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1930-resnext-50.npy
6 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1932-resnext-50.npy
7 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1936-resnext-50.npy
8 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/3070-resnext-50.npy
9 /media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_22

  chunks = self.iterencode(o, _one_shot=True)


In [32]:
###############################
# just for test               #
###############################
import pandas as pd

a = [1,2,3]
b = [3,4,5]

id_list = range(len(a)+1)

print id_list

pd.DataFrame(a,b)

[0, 1, 2, 3]


Unnamed: 0,0
3,1
4,2
5,3


  chunks = self.iterencode(o, _one_shot=True)


# 5. Predict single sample

load model and single sample feature, then make prediction on single model using trained boost model.

In [31]:
# load sample feature as numpy format
x = np.load("/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/100-resnet-152.npy")
# transform numpy format as DMatrix format (LIBSVM format)
xg_test = xgb.DMatrix(x)
# make prob prediction on sample
pred = bst.predict(xg_test)
# make prob as label
pred_label = np.argmax(pred_prob, axis=1)
print("pred:{}".format(pred))
print("pred_label:{}".format(pred_label))

pred:[[ 0.04447474  0.76827097  0.18725429]]
pred_label:[1]


  chunks = self.iterencode(o, _one_shot=True)


# 6. Re-organize features to make prediction based on boost model

Concate mulit-features from some deep learning models and train a boost model based on multi-features.

## 6.1 concate multi-features based on train set

In [11]:
import os
############################################
# Get train feature list, numpy array      #
############################################
def get_train_feature_list(train_img_path):
    train_feature_path_list = []
    
    type_path_list = map(lambda type_name: "/".join([train_img_path, type_name]), os.listdir(train_img_path))
    print("all directories below:")
    for i in type_path_list: print i

    for p_idx in xrange(len(type_path_list)):
        type_img_name_list = map(lambda img_name:
                                 "/".join([type_path_list[p_idx], img_name]),
                                 os.listdir(type_path_list[p_idx]))
        print "".join(
            map(str, ["type", (p_idx+1), " file number:", len(type_img_name_list)])
        )
        type_feature_path_list = filter(lambda p: p[-4:] == ".npy", type_img_name_list)
        train_feature_path_list.append(type_feature_path_list)
    
    return train_feature_path_list

  chunks = self.iterencode(o, _one_shot=True)


In [12]:
train_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224"
train_feature_path_list = get_train_feature_list(train_img_path)
print("len(train_feature_path_list):{}".format(len(train_feature_path_list)))

for i in xrange(3):
    print("len(train_feature_path_list[{}]):{}".format(i ,len(train_feature_path_list[i])))
    
for i in xrange(10):
    print("{}".format(train_feature_path_list[0][i]))

all directories below:
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_2_seg_224
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224
type1 file number:7200
type2 file number:21730
type3 file number:12130
len(train_feature_path_list):3
len(train_feature_path_list[0]):5760
len(train_feature_path_list[1]):17384
len(train_feature_path_list[2]):9704
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0-resnet-152-resnet-200-resnext-50vec.npy
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0-resnet-152.npy
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0-resnet-200.npy
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0-resnext-50.npy
/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10 (2)-resnet-

  chunks = self.iterencode(o, _one_shot=True)


In [157]:
def concate_multi_feature_for_train(train_img_path, feats_save_prefix_list, \
                          suffix=".jpg", type_name_list=["Type_1", "Type_2", "Type_3",]):
    #######################################################
    # Attain all feats path for different type and image  #
    #######################################################
    train_img_list = get_train_img_list(train_img_path)
    print("len(train_img_list):{}".format(len(train_img_list)))
    
    img_feats_path_3_type_3d_list = []
    for type_idx in xrange(len(type_name_list)):
        img_path_cur_type_list = filter(lambda p: type_name_list[type_idx] in p, train_img_list)
        multi_feats_2d_list = map(lambda p:\
                                  [\
                                   p[:-len(suffix)] + "-" + feats_save_prefix_list[0] + ".npy",\
                                   p[:-len(suffix)] + "-" + feats_save_prefix_list[1] + ".npy",\
                                   p[:-len(suffix)] + "-" + feats_save_prefix_list[2] + ".npy",\
                                  ],
                                  img_path_cur_type_list)
        print("len(multi_feats_2d_list):{}".format(len(multi_feats_2d_list)))
        img_feats_path_3_type_3d_list.append(multi_feats_2d_list)
    
    print("len(img_feats_path_3_type_3d_list):{}".format(len(img_feats_path_3_type_3d_list)))
    print("len(img_feats_path_3_type_3d_list[0]):{}".format(len(img_feats_path_3_type_3d_list[0])))
    print("len(img_feats_path_3_type_3d_list[0][0]):{}".format(len(img_feats_path_3_type_3d_list[0][0])))
    
    ##################################################################
    # Load mulit feats for one image/sample and concate as one feats #
    ##################################################################
    import numpy as np
    import time, datetime
    for type_idx in xrange(len(img_feats_path_3_type_3d_list)):
        print("type_idx+1:{}".format(type_idx+1))
        for img_idx in xrange(len(img_feats_path_3_type_3d_list[type_idx])):
            ######################
            # Timer start        #
            ######################
            time_stamp = datetime.datetime.now()
            start_concate = time.time()
            print("{} {}th/{} img".format(time_stamp.strftime('%Y-%m-%d %H:%M:%S'), img_idx+1, len(img_feats_path_3_type_3d_list[type_idx]))),

            ######################
            # Concate feats      #
            ######################                    
            img_feats_path_list = img_feats_path_3_type_3d_list[type_idx][img_idx]
            multi_feats_vec = np.concatenate( ( np.load(img_feats_path_list[0]),\
                                                np.load(img_feats_path_list[1]),\
                                                np.load(img_feats_path_list[2]),\
                                              ),\
                                             axis=1,\
                                            )
            #####################
            # save features     #
            #####################
            feats_save_path = img_feats_path_list[0]\
                                .replace(feats_save_prefix_list[0], "-".join(feats_save_prefix_list)+"vec")
            np.save(feats_save_path, multi_feats_vec)
            
            ######################
            # Timer finish       #
            ######################
            finish_concate = time.time()
            duration_concate = finish_concate - start_concate
            print("concate feats:{:.4f} path:{}".format(duration_concate, feats_save_path))

########################
# initialize parameter #
########################
train_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224"
feats_save_prefix_list = ["resnet-152",\
                          "resnet-200",\
                          "resnext-50"]    
concate_multi_feature_for_train(train_img_path, feats_save_prefix_list)

all type files below:
type1 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224
type2 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_2_seg_224
type3 dir:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224
type1 file number:5760
type2 file number:17384
type3 file number:9704
sample 1:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/0.jpg
sample 2:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10 (2).jpg
sample 3:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/10.jpg
sample 4:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1000.jpg
sample 5:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1928.jpg
sample 6:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_1_seg_224/1930.jpg
sample 7:/media/yuens/WIN10-ENTERTE

  chunks = self.iterencode(o, _one_shot=True)


In [158]:
#######################################
# Check multi-features concate result #
#######################################
import numpy as np
vec = np.load("/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224/Type_3_seg_224/129-resnet-152-resnet-200-resnext-50vec.npy")
print vec.shape

(1, 6144)


  chunks = self.iterencode(o, _one_shot=True)


## 6.2 concate multi-features based on test set

In [161]:
def concate_multi_feature_for_test(test_img_path, feats_save_prefix_list, \
                          suffix=".jpg", type_name_list=["Type_1", "Type_2", "Type_3",]):
    #######################################################
    # Attain all feats path according image paths         #
    #######################################################
    test_img_list = get_test_img_list(test_img_path)
    #for img_idx in xrange(len(test_img_list)):
    multi_feats_2d_list = map(lambda p:\
                                      [\
                                       p[:-len(suffix)] + "-" + feats_save_prefix_list[0] + ".npy",\
                                       p[:-len(suffix)] + "-" + feats_save_prefix_list[1] + ".npy",\
                                       p[:-len(suffix)] + "-" + feats_save_prefix_list[2] + ".npy",\
                                      ],
                                      test_img_list)
    print("len(multi_feats_2d_list):{}".format(len(multi_feats_2d_list)))
    print("len(multi_feats_2d_list[0]):{}".format(len(multi_feats_2d_list[0])))
    
    ##################################################################
    # Load multi feats for one image/sample and concate as one feats #
    ##################################################################
    import numpy as np
    import time, datetime
    
    for img_idx in xrange(len(multi_feats_2d_list)):
        ######################
        # Timer start        #
        ######################
        time_stamp = datetime.datetime.now()
        start_concate = time.time()
        print("{} {}th/{} img".format(time_stamp.strftime('%Y-%m-%d %H:%M:%S'), img_idx+1, len(multi_feats_2d_list))),

        ######################
        # Concate feats      #
        ######################        
        img_feats_path_list = multi_feats_2d_list[img_idx]
        multi_feats_vec = np.concatenate( ( np.load(img_feats_path_list[0]),\
                                            np.load(img_feats_path_list[1]),\
                                            np.load(img_feats_path_list[2]),\
                                          ),\
                                         axis=1,\
                                        )
        #####################
        # save features     #
        #####################
        feats_save_path = img_feats_path_list[0]\
                            .replace(feats_save_prefix_list[0], "-".join(feats_save_prefix_list)+"vec")
        np.save(feats_save_path, multi_feats_vec)

        ######################
        # Timer finish       #
        ######################
        finish_concate = time.time()
        duration_concate = finish_concate - start_concate
        print("concate feats:{:.4f} path:{}".format(duration_concate, feats_save_path))
               
########################
# initialize parameter #
########################
test_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224"
feats_save_prefix_list = ["resnet-152",\
                          "resnet-200",\
                          "resnext-50"] 
concate_multi_feature_for_test(test_img_path, feats_save_prefix_list)

len(multi_feats_2d_list):512
len(multi_feats_2d_list[0]):3
2017-06-14 22:34:40 1th/512 img concate feats:0.0477 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/0-resnet-152-resnet-200-resnext-50vec.npy
2017-06-14 22:34:40 2th/512 img concate feats:0.0017 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/1-resnet-152-resnet-200-resnext-50vec.npy
2017-06-14 22:34:40 3th/512 img concate feats:0.0130 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/10-resnet-152-resnet-200-resnext-50vec.npy
2017-06-14 22:34:40 4th/512 img concate feats:0.0021 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/100-resnet-152-resnet-200-resnext-50vec.npy
2017-06-14 22:34:40 5th/512 img concate feats:0.0019 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/118-resnet-152-resnet-200-resnext-50vec.npy
2017-06-14 22:34:40 6th/512 img concate feats:0.0019 path:/media/yuens/WIN10-ENTERTENMENT/Kaggle/Inte

  chunks = self.iterencode(o, _one_shot=True)


In [162]:
#######################################
# Check multi-features concate result #
#######################################
import numpy as np
test_vec = np.load("/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224/85-resnet-152-resnet-200-resnext-50vec.npy")
print test_vec.shape

(1, 6144)


  chunks = self.iterencode(o, _one_shot=True)


# 7. Train boost model on multi-features

Repeat for Step.3 - XGBoost: train and validate boost model

In [10]:
####################################
# Train boost model on multi-feats #
####################################
train_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/train_add_seg_224"
feats_save_prefix_list = ["resnet-152",\
                          "resnet-200",\
                          "resnext-50"]
feats_suffix = "-"+"-".join(feats_save_prefix_list) + "vec.npy"
num_round = 5
bst = train_xgboost(train_img_path, feats_suffix, model_save_path=None, num_round=num_round)

NameError: name 'train_xgboost' is not defined

  chunks = self.iterencode(o, _one_shot=True)


# 8. Test boost model based on test set and make submit

In [190]:
####################################
# Test boost model                 #
####################################

############################################
# initialize parameters                    #
############################################
model_file = "-resnet-152-resnet-200-resnext-50vec2017-06-14-23:26:215.xgbmodel"
test_img_path = "/media/yuens/WIN10-ENTERTENMENT/Kaggle/Intel-CCS/test/test_seg_224"
submission_save_path = "./-resnet-152-resnet-200-resnext-50vec2017-06-14-23:26:215.xgbmodel.csv"
submission_head=['image_name','Type_1','Type_2','Type_3']
############################################
# load model,test data and make prediction #
############################################
test_df = get_test_dataframe(test_img_path)
make_submit(model_file, test_df, submission_save_path, submission_head)

len(img_path_list):512
test_df.shape:(512, 2)
type(pred_prob):<type 'numpy.ndarray'>
pred_prob.shape:(512, 3)
pred_prob[:10]:[[ 0.23845677  0.37364027  0.38790298]
 [ 0.27836341  0.40760386  0.31403267]
 [ 0.26757988  0.39246807  0.33995199]
 [ 0.28474393  0.36851722  0.34673885]
 [ 0.26004237  0.41081354  0.32914412]
 [ 0.25444546  0.45187786  0.29367673]
 [ 0.23906817  0.3821964   0.37873539]
 [ 0.30135506  0.29906088  0.39958405]
 [ 0.23419271  0.42738485  0.33842245]
 [ 0.27186921  0.38034436  0.34778646]]
submission_head:['image_name', 'Type_1', 'Type_2', 'Type_3']
sub_dict:{'Type_3': [], 'Type_2': [], 'Type_1': [], 'image_name': []}
     Type_1    Type_2    Type_3 image_name
0  0.238457  0.373640  0.387903      0.jpg
1  0.278363  0.407604  0.314033      1.jpg
2  0.267580  0.392468  0.339952      2.jpg
3  0.284744  0.368517  0.346739      3.jpg
4  0.260042  0.410814  0.329144      4.jpg


  chunks = self.iterencode(o, _one_shot=True)
