In [3]:
import numpy as np
import pandas as pd
# from bayes_opt import BayesianOptimization
import xgboost as xgb
from itertools import product
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn import model_selection,ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from datetime import datetime
import random
# from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer, MultiLabelBinarizer,LabelEncoder
from sklearn.cluster import KMeans
from scipy.stats.mstats import gmean
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
#input data
train_df=pd.read_json('../input/train.json').reset_index(drop = True)
test_df=pd.read_json('../input/test.json').reset_index(drop = True)

print train_df.shape
print test_df.shape

(49352, 15)
(74659, 14)


In [5]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')

print train_X_0322.shape
print test_X_0322.shape

(49352, 322)
(74659, 322)


In [6]:
target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

In [7]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_CV_MS_52571.csv')
test_X = pd.read_csv(data_path + 'test_CV_MS_52571.csv')

In [8]:
time_feature = pd.read_csv(data_path + 'listing_image_time.csv')
time_feature.columns = ['listing_id','time_stamp']
train_X = train_X.merge(time_feature,on='listing_id',how='left')
test_X = test_X.merge(time_feature,on='listing_id',how='left')

print train_X.shape
print test_X.shape

(49352, 223)
(74659, 223)


In [9]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0,randomseed=1234):
    N_params = len(estimators)
#     print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,shuffle=True,random_state=randomseed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.03)
        est.set_params(n_estimators=1000000)
        
#         print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
#             print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
#             print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print "Score: ", score
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fm" % (j+1,i+1, (time.time() - fold_start)/60))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
#         print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [10]:
train_total = np.zeros((train_X.shape[0], 3))
test_total = np.zeros((test_X.shape[0], 3))
score_total = 0
count = 1

for n in range(count):
    randomseed = n
    estimators = [
                 xgb.XGBClassifier(max_depth = 7,
                                  min_child_weight = 10,
                                  colsample_bytree = 0.208806,
                                  subsample = 0.99,
                                  gamma = 2.879361),  
                 ]

    (train_blend_x_xgb,
     test_blend_x_xgb_mean,
     test_blend_x_xgb_gmean,
     blend_scores_xgb,
     best_rounds_xgb) = xgb_blend(estimators,
                                  train_X,train_y,
                                  test_X,
                                  30,
                                  200,randomseed)
    train_total += train_blend_x_xgb
    test_total += test_blend_x_xgb_mean
    score_total += np.mean(blend_scores_xgb)
    
train_total = train_total / count
test_total = test_total / count
score_total = score_total / count

Score:  0.481109955485
Model 1 fold 1 fitting finished in 3.381m
Score:  0.497521168757
Model 1 fold 2 fitting finished in 2.481m
Score:  0.495334819649
Model 1 fold 3 fitting finished in 6.195m
Score:  0.494366048195
Model 1 fold 4 fitting finished in 3.330m
Score:  0.502407864257
Model 1 fold 5 fitting finished in 4.739m
Score:  0.494571012439
Model 1 fold 6 fitting finished in 3.644m
Score:  0.489081670296
Model 1 fold 7 fitting finished in 4.406m
Score:  0.492299406588
Model 1 fold 8 fitting finished in 5.238m
Score:  0.488501610621
Model 1 fold 9 fitting finished in 4.347m
Score:  0.516944923787
Model 1 fold 10 fitting finished in 3.518m
Score:  0.507401606008
Model 1 fold 11 fitting finished in 3.430m
Score:  0.499368847664
Model 1 fold 12 fitting finished in 6.076m
Score:  0.532518093513
Model 1 fold 13 fitting finished in 2.656m
Score:  0.487977828688
Model 1 fold 14 fitting finished in 2.973m
Score:  0.483608002218
Model 1 fold 15 fitting finished in 3.812m
Score:  0.517255905

In [11]:
train_blend_x_xgb = pd.DataFrame(train_total)
train_blend_x_xgb.columns = ["low", "medium", "high"]
train_blend_x_xgb["listing_id"] = train_X.listing_id.values

test_blend_x_xgb_mean = pd.DataFrame(test_total)
test_blend_x_xgb_mean.columns = ["low", "medium", "high"]
test_blend_x_xgb_mean["listing_id"] = test_X.listing_id.values

In [12]:
tmp_train = train_X_0322[['listing_id']].merge(train_blend_x_xgb,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_mean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values

In [13]:
now = datetime.now()

name_train_blend = '../blend/train_blend_XGB_last_30fold_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_XGB_last_30fold_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print score_total
# print (np.mean(best_rounds_RFC,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")

0.498757576037


In [39]:
sub_name = '../output/sub_XGB_2bagging_CV_MS_52571_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_0322.listing_id.values
out_df.to_csv(sub_name, index=False)