In [37]:
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
import xgboost as xgb
from itertools import product
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn import model_selection,ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
import random
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer, MultiLabelBinarizer,LabelEncoder
from sklearn.cluster import KMeans
from scipy.stats.mstats import gmean
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def cv_train(train,y,max_depth = 6,min_child_weight = 1,colsample_bytree = 1, subsample = 1, gamma = 0 , verbose_eval = None,
            seed = 0, early_stop = 50, nfold = 5, eta=0.3):
    xgtrain = xgb.DMatrix(train, label=y)
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=0
    params['eta'] = eta
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['colsample_bytree'] = colsample_bytree
    params['subsample'] = subsample
    params['gamma'] = gamma
#     params['booster'] = 'dart'
#     params['rate_drop'] = 0.1
#     params['skip_drop'] = 0.5
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=nfold,
        metrics = 'mlogloss', verbose_eval = verbose_eval,
        seed=seed,callbacks=[xgb.callback.early_stop(early_stop)]
    )

    return cv_result['test-mlogloss-mean'].values[-1]

In [3]:
def CV_st(train,test,feature1,feature2):
    index=list(range(train.shape[0]))
    random.shuffle(index)
    kf = KFold(n_splits=5,shuffle=True, random_state=0)
    
    # median feature names
    features_tmp = []
    features_tmp.append(feature1 + '_' + feature2 + '_low_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_median')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].median().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[0]] = f_low
    train[features_tmp[1]] = f_medium
    train[features_tmp[2]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].median().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[0]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[1]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[2]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    

    # mean feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_mean')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].mean().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[3]] = f_low
    train[features_tmp[4]] = f_medium
    train[features_tmp[5]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].mean().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[3]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[4]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[5]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
    # max feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_max')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].max().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[6]] = f_low
    train[features_tmp[7]] = f_medium
    train[features_tmp[8]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].max().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[6]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[7]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[8]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
    # min feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_min')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].min().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[9]] = f_low
    train[features_tmp[10]] = f_medium
    train[features_tmp[11]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].min().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[9]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[10]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[11]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 

#     # std feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_std') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_std') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_std')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].std().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[12]] = f_low
#     train[features_tmp[13]] = f_medium
#     train[features_tmp[14]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].std().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[12]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[13]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[14]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
#     # var feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_var')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].var().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[15]] = f_low
#     train[features_tmp[16]] = f_medium
#     train[features_tmp[17]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].var().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[15]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[16]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[17]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
#     # ratio/diff feature
#     cols = features_tmp[:]
# #     features_tmp = []
#     for col in cols:
#         new_feature = col+'_ratio'
#         train[new_feature] = train[col] / train[feature2]
#         test[new_feature] = test[col] / test[feature2]
#         features_tmp.append(new_feature)
        
    print feature1,' vs ', feature2,'Done!'
    return train,test,features_tmp

In [4]:
#input data
train_df=pd.read_json('../input/train.json').reset_index(drop = True)
test_df=pd.read_json('../input/test.json').reset_index(drop = True)
test_df.loc[test_df.bathrooms == 112.0,'bathrooms'] = 1.5    
test_df.loc[test_df.bathrooms == 20.0,'bathrooms'] = 2.0
test_df.loc[test_df.listing_id == 7220763,'bedrooms'] = 3
test_df.loc[test_df.listing_id == 7047074,'bedrooms'] = 6
print train_df.shape
print test_df.shape

(49352, 15)
(74659, 14)


In [5]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["num_photo_count"] = df["photos"].apply(len)
    df["street_address"] = df['street_address'].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    df["num_desc_wordcount"] = df["description"].apply(len)
    df["num_pricePerBed"] = df['price'] / df['bedrooms']
    df["num_pricePerBath"] = df['price'] / df['bathrooms']
    df["num_pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["num_bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["num_bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["num_bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["num_bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])

    df = df.fillna(-1).replace(np.inf, -1)
    return df

# Add common features
train_df = add_features(train_df)
test_df = add_features(test_df) 


# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df['num_desc_length_null'] = (train_df.description.str.len()==0).astype(float)
test_df['num_desc_length_null'] = (test_df.description.str.len()==0).astype(float)
    
features_to_use=[
    "latitude", "longitude","num_pricePerBed",
    'num_bedBathSum','num_pricePerBath','num_pricePerRoom','num_bedPerBath',
    'num_bedBathDiff','num_bedsPerc',
    "num_photo_count", "num_features", "num_desc_wordcount",'num_desc_length_null',
    "listing_id"]

print 'Done!'

Done!


In [6]:
# Location features: Latitude, longitude
precision = 3
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2)
train_df['num_dist_from_center'] = x.values
x = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2)
test_df['num_dist_from_center'] = x.values
train_df['position'] = train_df.longitude.round(precision).astype(str) + '_' + train_df.latitude.round(precision).astype(str)
test_df['position'] = test_df.longitude.round(precision).astype(str) + '_' + test_df.latitude.round(precision).astype(str)

new_feature = ['num_dist_from_center']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
print 'Done!'

Done!


In [7]:
# Degree of "outlierness"
OutlierAggregated = (train_df.bedrooms > 4).astype(float)
OutlierAggregated2 = (test_df.bedrooms > 4).astype(float)
OutlierAggregated += (train_df.bathrooms > 3).astype(float)
OutlierAggregated2 += (test_df.bathrooms > 3).astype(float)
OutlierAggregated += (train_df.bathrooms < 1).astype(float)
OutlierAggregated2 += (test_df.bathrooms < 1).astype(float)
x = np.abs((train_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.abs((test_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated2 += x2.astype(float)
x = np.log1p(train_df.price/(train_df.bedrooms.clip(1,3) + train_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated += x.astype(float)
x2 = np.log1p(test_df.price/(test_df.bedrooms.clip(1,3) + test_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated2 += x2.astype(float)
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated2 += x2.astype(float)
train_df['num_OutlierAggregated'] = OutlierAggregated.values
test_df['num_OutlierAggregated'] = OutlierAggregated2.values


new_feature = ['num_OutlierAggregated']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [8]:
# Density in unique locations at given precision
vals = train_df['position'].value_counts()
dvals = vals.to_dict()
train_df['num_pos_density'] = train_df['position'].apply(lambda x: dvals.get(x, vals.min()))
test_df['num_pos_density'] = test_df['position'].apply(lambda x: dvals.get(x, vals.min()))

# Building null
train_df['num_building_null'] = (train_df.building_id=='0').astype(float)
test_df['num_building_null'] = (test_df.building_id=='0').astype(float)


new_feature = ['num_pos_density','num_building_null']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [9]:
# Creation time features
train_df['created'] = pd.to_datetime(train_df.created)
train_df['num_created_weekday'] = train_df.created.dt.dayofweek.astype(float)
train_df['num_created_weekofyear'] = train_df.created.dt.weekofyear
train_df['num_created_day'] = train_df.created.dt.day
train_df['num_created_month'] = train_df.created.dt.month
train_df['num_created_hour'] = train_df.created.dt.hour
  
test_df['created'] = pd.to_datetime(test_df.created)
test_df['num_created_weekday'] = test_df.created.dt.dayofweek
test_df['num_created_weekofyear'] = test_df.created.dt.weekofyear
test_df['num_created_day'] = test_df.created.dt.day
test_df['num_created_month'] = test_df.created.dt.month
test_df['num_created_hour'] = test_df.created.dt.hour


new_feature = ['num_created_weekday','num_created_weekofyear','num_created_day','num_created_month','num_created_hour']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [10]:
# Bedrooms/Bathrooms/Price
train_df['num_bathrooms'] = train_df.bathrooms.clip_upper(4)
test_df['num_bathrooms'] = test_df.bathrooms.clip_upper(4)

train_df['num_bedrooms'] = train_df.bedrooms.clip_upper(5)
test_df['num_bedrooms'] = test_df.bedrooms.clip_upper(5)

train_df['num_price'] = train_df.price.clip_upper(10000)
test_df['num_price'] = test_df.price.clip_upper(10000)

bins = train_df.price.quantile(np.arange(0.05, 1, 0.05))
train_df['num_price_q'] = np.digitize(train_df.price, bins)
test_df['num_price_q'] = np.digitize(test_df.price, bins)


new_feature = ['num_bathrooms','num_bedrooms','num_price','num_price_q']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [11]:
# Composite features based on: 
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
train_df['num_priceXroom'] = (train_df.price / (1 + train_df.bedrooms.clip(1, 4) + 0.5*train_df.bathrooms.clip(0, 2))).values
test_df['num_priceXroom'] = (test_df.price / (1 + test_df.bedrooms.clip(1, 4) + 0.5*test_df.bathrooms.clip(0, 2))).values

train_df['num_even_bathrooms'] = ((np.round(train_df.bathrooms) - train_df.bathrooms)==0).astype(float)
test_df['num_even_bathrooms'] = ((np.round(test_df.bathrooms) - test_df.bathrooms)==0).astype(float)

new_feature = ['num_priceXroom','num_even_bathrooms']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

print 'Done!'

Done!


In [12]:
categorical = ["display_address", "manager_id", "building_id", "street_address",'position']
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            if f not in features_to_use:
                features_to_use.append(f)

In [13]:
dftemp = train_df.copy()
for i in ['latitude', 'longitude']:
    while(1):
        x = dftemp[i].median()
        ix = abs(dftemp[i] - x) > 3*dftemp[i].std()
        if ix.sum()==0:
            break
        dftemp.loc[ix, i] = np.nan
dftemp = dftemp.loc[dftemp[['latitude', 'longitude']].isnull().sum(1) == 0, :]

dfm = DataFrameMapper([(['latitude'], [StandardScaler()]), (['longitude'], [StandardScaler()])])

for i in [6]:
    pipe_location = make_pipeline(dfm, KMeans(n_clusters=i, random_state=1))
    pipe_location.fit(dftemp);
    train_df['location_'+str(i)] = pipe_location.predict(train_df).astype(str)
    test_df['location_'+str(i)] = pipe_location.predict(test_df).astype(str)
for i in train_df.location_6.unique():
    f = 'num_location_6_'+str(i)
    train_df[f] = (train_df.location_6==i).astype(float)
    test_df[f] = (test_df.location_6==i).astype(float)
    if f not in features_to_use:
        features_to_use.append(f)
    
    
train_df['tmp_bathrooms'] = train_df.bathrooms.clip_upper(2)
test_df['tmp_bathrooms'] = test_df.bathrooms.clip_upper(2)
train_df['tmp_bedrooms'] = train_df.bedrooms.clip_upper(4)
test_df['tmp_bedrooms'] = test_df.bedrooms.clip_upper(4)
train_df['roomcal'] = train_df.tmp_bedrooms.astype(str) + '_' + train_df.tmp_bathrooms.astype(str)    
test_df['roomcal'] = test_df.tmp_bedrooms.astype(str) + '_' + test_df.tmp_bathrooms.astype(str)    

room_lb = LabelBinarizer()
room_lb.fit(train_df['roomcal'])
room_col = ['num_room_type_' + str(x) for x in range(len(train_df['roomcal'].unique()))]
for f in room_col:
    if f not in features_to_use:
        features_to_use.append(f)

train_df = train_df.join(pd.DataFrame(room_lb.transform(train_df['roomcal']),columns=room_col,index=train_df.index))
test_df = test_df.join(pd.DataFrame(room_lb.transform(test_df['roomcal']),columns=room_col,index=test_df.index))

tmp = train_df.groupby(['roomcal','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price'})
    
train_df = train_df.merge(tmp,on=['roomcal','location_6'],how='left')
test_df = test_df.merge(tmp,on=['roomcal','location_6'],how='left')

test_df.loc[27462,'num_6_median_price'] =  7200.0

train_df['num_6_price_ratio'] = train_df['num_price'] / train_df['num_6_median_price']
train_df['num_6_price_diff'] = train_df['num_price'] - train_df['num_6_median_price']
test_df['num_6_price_ratio'] = test_df['num_price'] / test_df['num_6_median_price']
test_df['num_6_price_diff'] = test_df['num_price'] - test_df['num_6_median_price']


for f in ['num_6_median_price','num_6_price_ratio','num_6_price_diff']:
    if f not in features_to_use:
        features_to_use.append(f)
        
        
print 'Done!'

Done!


In [14]:
tmp = train_df.groupby(['num_bedrooms','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price_bedroom'})
    
train_df = train_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')
test_df = test_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')

train_df['num_6_price_ratio_bedroom'] = train_df['num_price'] / train_df['num_6_median_price_bedroom']
train_df['num_6_price_diff_bedroom'] = train_df['num_price'] - train_df['num_6_median_price_bedroom']
test_df['num_6_price_ratio_bedroom'] = test_df['num_price'] / test_df['num_6_median_price_bedroom']
test_df['num_6_price_diff_bedroom'] = test_df['num_price'] - test_df['num_6_median_price_bedroom']


for f in ['num_6_median_price_bedroom','num_6_price_ratio_bedroom','num_6_price_diff_bedroom']:
    if f not in features_to_use:
        features_to_use.append(f)

In [15]:
def create_binary_features(df):
    bows = {
        "dogs": ("dogs", "dog",'pet friendly','pets'),
        "cats": ("cats",'pet friendly','pets'),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",'equipped'),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included"),
        'fireplace': ('fireplace','fireplaces'),
        'elevator': ('elevator'),
        'pool':('pool'),
        'loft':('loft'),
        'luxury':('luxury','valet'),
        'marble':('marble'),
        'onemounthfree': ('1 month free','one month free'),
        'washer':('washer','dryer')
    }

    def indicator(bow):
        return lambda s: int(any([x in s for x in bow]))

    features = df["features"].apply(lambda f: " ".join(f).lower())   # convert features to string
    for key in bows:
        tmp_key = "feature_" + key
        df[tmp_key] = features.apply(indicator(bows[key]))
        if tmp_key not in features_to_use:
            features_to_use.append(tmp_key)
    return df

# Create binarized features
train_df = create_binary_features(train_df)
test_df = create_binary_features(test_df)


In [16]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')

print train_X_0322.shape
print test_X_0322.shape

(49352, 322)
(74659, 322)


In [17]:
sentiment = [
    'building_id_mean_med','building_id_mean_high', 
    'manager_id_mean_med','manager_id_mean_high',
    'median_price_bed', 'ratio_bed',
       'compound', 'neg', 'neu', 'pos', 'street',
       'avenue', 'east', 'west', 'north', 'south', 'other_address',
       'Zero_building_id', 'top_10_building', 'top_25_building',
       'top_5_building', 'top_50_building', 'top_1_building',
       'top_2_building', 'top_15_building', 'top_20_building',
       'top_30_building','listing_id'
]

train_df = train_df.merge(train_X_0322[sentiment],on='listing_id', how='left')
test_df = test_df.merge(test_X_0322[sentiment],on='listing_id', how='left')

for f in sentiment:
    if f not in features_to_use:
        features_to_use.append(f)

In [18]:
train_df.isnull().values.any()

False

In [19]:
test_df.isnull().values.any()

False

# CV statistics

In [20]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','price')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  price Done!


In [21]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','price')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)  

building_id  vs  price Done!


In [22]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_dist_from_center')

for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_dist_from_center Done!


In [23]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_created_hour')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_created_hour Done!


In [24]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_created_hour')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

building_id  vs  num_created_hour Done!


In [25]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_desc_wordcount')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_desc_wordcount Done!


In [26]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff_bedroom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)        

manager_id  vs  num_6_price_diff_bedroom Done!


In [27]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_6_price_diff_bedroom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)  

building_id  vs  num_6_price_diff_bedroom Done!


In [28]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_bedrooms')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)    

manager_id  vs  num_bedrooms Done!


# --------------------------------------

In [47]:
for f in new_feature:
    features_to_use.remove(f)

In [42]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_6_price_diff Done!


In [36]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_bedrooms')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

building_id  vs  num_bedrooms Done!


In [70]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','building_id_mean_med')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  building_id_mean_med Done!


In [71]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','building_id_mean_high')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  building_id_mean_high Done!


In [76]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','manager_id_mean_med')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

building_id  vs  manager_id_mean_med Done!


In [77]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','manager_id_mean_high')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

building_id  vs  manager_id_mean_high Done!


In [104]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'feature_nofee','num_price')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

feature_nofee  vs  num_price Done!


In [98]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'num_price_q','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

num_price_q  vs  num_priceXroom Done!


In [97]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'num_building_null','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

In [78]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

building_id  vs  num_priceXroom Done!


In [67]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','feature_nofee')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  feature_nofee Done!


In [51]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','manager_id_mean_med')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  manager_id_mean_med Done!


In [42]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'position','price')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

position  vs  price Done!


In [32]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','pos')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  pos Done!


In [146]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','median_price_bed')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)        

manager_id  vs  median_price_bed Done!


In [140]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerBed')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_pricePerBed Done!


In [133]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pos_density')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_pos_density Done!


In [127]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','longitude')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  longitude Done!


In [122]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','latitude')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  latitude Done!


In [112]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_ratio')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_6_price_ratio Done!


In [107]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_features')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_features Done!


In [101]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','ratio_bed')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  ratio_bed Done!


In [96]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff_bedroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_6_price_diff_bedroom Done!


In [49]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_photo_count')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_photo_count Done!


In [39]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_priceXroom Done!


In [68]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerBath')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_pricePerBath Done!


In [71]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerRoom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_pricePerRoom Done!


# Val

In [49]:
features_to_use[-5:]

['manager_id_num_bedrooms_medium_max',
 'manager_id_num_bedrooms_high_max',
 'manager_id_num_bedrooms_low_min',
 'manager_id_num_bedrooms_medium_min',
 'manager_id_num_bedrooms_high_min']

In [29]:
train_X = train_df[features_to_use]
test_X = test_df[features_to_use]

train_X.replace(np.inf, np.nan)
test_X.replace(np.inf, np.nan)

train_X.loc[:,'num_nan'] = train_X.isnull().sum(axis=1)
test_X.loc[:,'num_nan'] = test_X.isnull().sum(axis=1)

target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print train_X.shape, test_X.shape 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(49352, 223) (74659, 223)


In [45]:
train_X[new_feature].head()

Unnamed: 0,manager_id_num_6_price_diff_low_median,manager_id_num_6_price_diff_medium_median,manager_id_num_6_price_diff_high_median,manager_id_num_6_price_diff_low_mean,manager_id_num_6_price_diff_medium_mean,manager_id_num_6_price_diff_high_mean,manager_id_num_6_price_diff_low_max,manager_id_num_6_price_diff_medium_max,manager_id_num_6_price_diff_high_max,manager_id_num_6_price_diff_low_min,manager_id_num_6_price_diff_medium_min,manager_id_num_6_price_diff_high_min
0,-75.0,-25.0,,-7.735849,30.277778,,875.0,900.0,,-1700.0,-700.0,
1,1052.5,-3121.5,,1197.735714,-3121.5,,4300.0,-3121.5,,-1150.0,-3121.5,
2,-50.0,-125.0,-322.5,223.050847,-43.225,-327.5,3300.0,3025.0,495.0,-1505.0,-2510.0,-1340.0
3,135.0,-300.0,-550.0,255.252066,-207.264706,-69.375,4100.0,1100.0,2950.0,-2100.0,-2516.5,-965.0
4,-55.0,,,15.0,,,1000.0,,,-300.0,,


In [46]:
now = time.time()
print cv_train(train_X,train_y,verbose_eval = 5, early_stop = 20)
print '\nTraining :{:0.2f}s'.format(time.time() - now)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[0]	train-mlogloss:0.9081+0.000439152	test-mlogloss:0.914806+0.00097818
[5]	train-mlogloss:0.600289+0.000821237	test-mlogloss:0.631917+0.00300072
[10]	train-mlogloss:0.526073+0.000777907	test-mlogloss:0.575912+0.00432575
[15]	train-mlogloss:0.489517+0.00182507	test-mlogloss:0.556177+0.00400821
[20]	train-mlogloss:0.464523+0.00310209	test-mlogloss:0.546892+0.00445571
[25]	train-mlogloss:0.443435+0.00276597	test-mlogloss:0.541393+0.00449292
[30]	train-mlogloss:0.425666+0.00255616	test-mlogloss:0.537842+0.00496247
[35]	train-mlogloss:0.410124+0.00238644	test-mlogloss:0.535211+0.00466594
[40]	train-mlogloss:0.396522+0.0016213	test-mlogloss:0.533484+0.00509716
[45]	train-mlogloss:0.382935+0.00242961	test-mlogloss:0.532501+0.00489936
[50]	train-mlogloss:0.371589+0.00210863	test-mlogloss:0.531685+0.00502912
[55]	train-mlogloss:0.360453+0.002675

In [None]:
# [81]	train-mlogloss:0.354253+0.00211099	test-mlogloss:0.536977+0.00608719 no cv feature
# [65]	train-mlogloss:0.367155+0.00211633	test-mlogloss:0.534893+0.0062382 price
# [72]	train-mlogloss:0.346812+0.00328497	test-mlogloss:0.53305+0.00688662 building_id  vs  price
# [64]	train-mlogloss:0.357396+0.00288563	test-mlogloss:0.531704+0.00625184 num_dist_from_center
# [76]	train-mlogloss:0.331548+0.00150892	test-mlogloss:0.530707+0.00719767 num_created_hour
# [78]	train-mlogloss:0.327187+0.00326905	test-mlogloss:0.530522+0.00662889 building_id  vs  num_created_hour
# [69]	train-mlogloss:0.341823+0.0009883	test-mlogloss:0.530539+0.00755474 num_desc_wordcount
# del [64]	train-mlogloss:0.348393+0.00437276	test-mlogloss:0.532399+0.00560396 building_id  vs  num_desc_wordcount
# [78]	train-mlogloss:0.322085+0.00255577	test-mlogloss:0.530573+0.00706681 num_6_price_diff_bedroom
# [76]	train-mlogloss:0.322108+0.00188804	test-mlogloss:0.53018+0.00868863 building_id  vs  num_6_price_diff_bedroom
# [57]	train-mlogloss:0.35853+0.0039214	test-mlogloss:0.530243+0.00654921 bedroom
# del [80]	train-mlogloss:0.313073+0.00177185	test-mlogloss:0.530923+0.00642305 building_id  vs  bedroom
# [80]	train-mlogloss:0.311336+0.00318315	test-mlogloss:0.530211+0.00570428 num_6_price_diff

In [None]:
# [82]	train-mlogloss:0.373255+0.00424222	test-mlogloss:0.551611+0.00772184 no cv feature
# [69]	train-mlogloss:0.364704+0.00323687	test-mlogloss:0.536954+0.00792152 price 
# [67]	train-mlogloss:0.363512+0.0026217	test-mlogloss:0.536145+0.0072402 num_dist_from_center
# [81]	train-mlogloss:0.335269+0.00262769	test-mlogloss:0.534375+0.00789059 num_created_hour
# [71]	train-mlogloss:0.349284+0.00352997	test-mlogloss:0.534292+0.00880248 num_desc_wordcount
# [68]	train-mlogloss:0.349212+0.00212014	test-mlogloss:0.532903+0.00747242 num_6_price_ratio_bedroom
# [74]	train-mlogloss:0.337387+0.00425913	test-mlogloss:0.533063+0.00678103 building_id_mean_med
# [83]	train-mlogloss:0.320669+0.00289614	test-mlogloss:0.532428+0.00780398 building_id_mean_high
# [76]	train-mlogloss:0.330415+0.00167131	test-mlogloss:0.531293+0.00759899 'manager_id_mean_med','manager_id_mean_high',
# [62]	train-mlogloss:0.348022+0.00308306	test-mlogloss:0.5307+0.00586143 building_id  vs  num_6_price_diff_bedroom

# del [69]	train-mlogloss:0.331928+0.00270211	test-mlogloss:0.531707+0.00621119 building_id  vs  price
# del [61]	train-mlogloss:0.349982+0.00391241	test-mlogloss:0.537226+0.00635442 feature_nofee  vs  num_price
# del [69]	train-mlogloss:0.333043+0.00387723	test-mlogloss:0.531759+0.0078066 num_price_q  vs  num_priceXroom
# del [64]	train-mlogloss:0.345669+0.00295346	test-mlogloss:0.531806+0.00678929 num_building_null  vs  num_priceXroom
# del [73]	train-mlogloss:0.326921+0.00344616	test-mlogloss:0.532193+0.00673751 building_id  vs  num_priceXroom
# del [68]	train-mlogloss:0.342177+0.00136703	test-mlogloss:0.531945+0.00650216 feature_nofee
# del [78]	train-mlogloss:0.324279+0.00234253	test-mlogloss:0.531399+0.00642477 num_bedrooms
# del [67]	train-mlogloss:0.343537+0.0019745	test-mlogloss:0.533178+0.00505587 pos
# del [84]	train-mlogloss:0.323042+0.00215233	test-mlogloss:0.533643+0.00763431 median_price_bed
# del [66]	train-mlogloss:0.350688+0.00226534	test-mlogloss:0.533783+0.00685066 num_pricePerBed
# del [76]	train-mlogloss:0.333929+0.00233681	test-mlogloss:0.533241+0.00765436 num_pos_density
# del  [74]	train-mlogloss:0.336716+0.00472892	test-mlogloss:0.533678+0.00700685 longitude
# del [70]	train-mlogloss:0.343931+0.00190522	test-mlogloss:0.533995+0.00708828 latitude
# del [66]	train-mlogloss:0.353195+0.00347634	test-mlogloss:0.533698+0.00813442 num_6_price_ratio
# del[76]	train-mlogloss:0.335395+0.00269504	test-mlogloss:0.534366+0.0072334 num_features
# del [70]	train-mlogloss:0.344731+0.0040112	test-mlogloss:0.533613+0.00852658 ratio_bed
# del [63]	train-mlogloss:0.359234+0.00282059	test-mlogloss:0.533871+0.00691217 num_6_price_diff_bedroom
# del [84]	train-mlogloss:0.322303+0.00294342	test-mlogloss:0.534586+0.0076548 num_priceXroom
# del [71]	train-mlogloss:0.347942+0.00429741	test-mlogloss:0.535784+0.00840754 num_photo_count

In [118]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=2016)
rgr = xgb.XGBClassifier(
            objective='multi:softprob',
            seed = 0, # use a fixed seed during tuning so we can reproduce the results
            learning_rate = 0.2,
            n_estimators = 62,
            max_depth= 6,
            nthread = -1,
            colsample_bytree = 0.3,
            subsample =0.7,
            silent = 1
        )
rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         early_stopping_rounds=20,
        verbose=20
    )

[0]	validation_0-mlogloss:0.976351
[20]	validation_0-mlogloss:0.570767
[40]	validation_0-mlogloss:0.547269
[60]	validation_0-mlogloss:0.539594


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=62, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.7)

In [119]:
import xgbfir
xgbfir.saveXgbFI(rgr, feature_names=X_train.columns, OutputXlsxFile = '../FE/FI.xlsx')

# Tune XGBoost

In [27]:
# X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
# print X_train.shape
# print X_val.shape
# # xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 394)
(9871, 394)


In [53]:
best_score = 1000
for x in [3,4,5,6,7,8,9,10,11,12,13,14,15]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= x,
#         nthread = -1,
#         silent = False
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    tmp = cv_train(train_X,train_y,max_depth = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[312]	train-mlogloss:0.395492+0.00170774	test-mlogloss:0.527702+0.00663205

3 	0.5277018
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[141]	train-mlogloss:0.396861+0.00183942	test-mlogloss:0.527621+0.00736431

4 	0.5276214
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[102]	train-mlogloss:0.357724+0.00219303	test-mlogloss:0.528282+0.00654119

5 	0.5282822
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[57]	train-mlogloss:0.358

In [54]:
# max_depth = train_param
max_depth = train_param
print max_depth

4


In [55]:
train_param = 1
for x in [2,4,8,12,16,20,24,28,32,40,48,64,80,128]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    
    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[186]	train-mlogloss:0.367066+0.00186211	test-mlogloss:0.527145+0.0074838

2 	0.5271452
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[213]	train-mlogloss:0.352706+0.00230145	test-mlogloss:0.526443+0.00732842

4 	0.5264426
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[159]	train-mlogloss:0.390464+0.00279415	test-mlogloss:0.527391+0.00724981

8 	0.5273912
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[177]	train-mlogloss:0.381

In [56]:
min_child_weight = train_param
print min_child_weight

16


In [57]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )

    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, colsample_bytree = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[262]	train-mlogloss:0.395672+0.00192811	test-mlogloss:0.532532+0.00697024

0.05 	0.5325316
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[251]	train-mlogloss:0.38311+0.00308902	test-mlogloss:0.529113+0.00529031

0.1 	0.5291126
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[212]	train-mlogloss:0.387354+0.0026151	test-mlogloss:0.526019+0.00693851

0.2 	0.5260188
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[205]	train-mlogloss

In [58]:
colsample_bytree = train_param
print colsample_bytree

1


In [59]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = colsample_bytree,
#         subsample = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, 
                   colsample_bytree = colsample_bytree, subsample = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[114]	train-mlogloss:0.429981+0.00163237	test-mlogloss:0.535683+0.0063162

0.5 	0.5356834
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[127]	train-mlogloss:0.417173+0.00106604	test-mlogloss:0.532641+0.0071292

0.6 	0.5326408
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[117]	train-mlogloss:0.422807+0.00248712	test-mlogloss:0.530766+0.00658263

0.7 	0.5307656
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[138]	train-mlogloss:

In [60]:
subsample = train_param
print subsample

1


In [61]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = colsample_bytree,
#         subsample = subsample,
#         gamma = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )

    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, 
                   colsample_bytree = colsample_bytree, subsample = subsample, gamma = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[164]	train-mlogloss:0.393296+0.00247187	test-mlogloss:0.526782+0.00697394

0.3 	0.5267822
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[186]	train-mlogloss:0.378468+0.00338923	test-mlogloss:0.526772+0.00705368

0.6 	0.526772
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[175]	train-mlogloss:0.38681+0.00159871	test-mlogloss:0.526846+0.00696989

0.9 	0.526846
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[167]	train-mlogloss:0

In [62]:
gamma = train_param
print gamma

0


In [31]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma): #
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = 0.99# max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=1234,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (3,10),
        'min_child_weight': (8,80),
        'colsample_bytree': (0.2,1),
        'subsample': (0.7,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1030]	train-mlogloss:0.404719+0.00154409	test-mlogloss:0.525766+0.00730057

    1 | 36m59s | [35m  -0.52577[0m | [32m            0.9657[0m | [32m   0.5937[0m | [32m     3.6952[0m | [32m           53.7510[0m | [32m     0.8887[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[306]	train-mlogloss:0.331036+0.00100352	test-mlogloss:0.523922+0.00655783

    2 | 11m06s | [35m  -0.52392[0m | [32m            0.2498[0m |

  " state: %s" % convergence_dict)


   11 | 09m13s |   -0.52467 |             0.2801 |    1.2652 |      9.9552 |             8.0470 |      0.9948 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[418]	train-mlogloss:0.310456+0.00136334	test-mlogloss:0.521622+0.00676404



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   12 | 14m51s | [35m  -0.52162[0m | [32m            0.2312[0m | [32m   2.9760[0m | [32m     9.9279[0m | [32m           32.8581[0m | [32m     0.7750[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[527]	train-mlogloss:0.31345+0.00208761	test-mlogloss:0.522371+0.00672728



  " state: %s" % convergence_dict)


   13 | 16m20s |   -0.52237 |             0.2031 |    2.9745 |      9.5747 |            43.2279 |      0.8012 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[208]	train-mlogloss:0.262656+0.00148398	test-mlogloss:0.524278+0.00803445



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   14 | 08m15s |   -0.52428 |             0.2114 |    0.1048 |      9.8273 |            22.2451 |      0.7493 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1331]	train-mlogloss:0.435051+0.00326607	test-mlogloss:0.52577+0.00659671



  " state: %s" % convergence_dict)


   15 | 17m57s |   -0.52577 |             0.2675 |    2.8381 |      3.0131 |            40.4910 |      0.9524 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1154]	train-mlogloss:0.394056+0.00124302	test-mlogloss:0.52437+0.00779506

   16 | 14m31s |   -0.52437 |             0.2299 |    0.4023 |      3.0882 |             8.5025 |      0.8048 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[238]	train-mlogloss:0.305061+0.00163014	test-mlogloss:0.52328+0.00618194

   17 | 09m07s |   -0.52328 |             0.2208 |    0.0167 |      9.7664 |            50.1199 |      0.7391 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

  " state: %s" % convergence_dict)


   19 | 21m27s |   -0.52416 |             0.2205 |    2.9564 |      3.3751 |            17.0361 |      0.9441 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1093]	train-mlogloss:0.418861+0.00141002	test-mlogloss:0.526003+0.00688117

   20 | 13m37s |   -0.52600 |             0.2256 |    0.3522 |      3.0597 |            79.9431 |      0.9976 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[241]	train-mlogloss:0.295872+0.00102268	test-mlogloss:0.523703+0.00673072

   21 | 29m48s |   -0.52370 |             0.9330 |    2.9849 |      9.1945 |            26.5030 |      0.7375 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Sto

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   25 | 27m09s |   -0.52412 |             0.2673 |    2.8450 |      3.8539 |             8.0205 |      0.7101 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[237]	train-mlogloss:0.322747+0.00155381	test-mlogloss:0.523257+0.00577917



  " state: %s" % convergence_dict)


   26 | 08m56s |   -0.52326 |             0.2477 |    0.0493 |      8.7521 |            43.1011 |      0.7986 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[446]	train-mlogloss:0.328456+0.00148537	test-mlogloss:0.522107+0.00582033



  " state: %s" % convergence_dict)


   27 | 14m12s |   -0.52211 |             0.2036 |    2.9264 |      9.5995 |            55.4239 |      0.8143 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[406]	train-mlogloss:0.344176+0.00172654	test-mlogloss:0.522368+0.00626433



  " state: %s" % convergence_dict)


   28 | 11m49s |   -0.52237 |             0.2055 |    2.9293 |      8.4159 |            48.4203 |      0.7171 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[674]	train-mlogloss:0.384479+0.00172066	test-mlogloss:0.523107+0.00695791

   29 | 10m24s |   -0.52311 |             0.2043 |    0.0990 |      4.0908 |            24.6326 |      0.8489 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[310]	train-mlogloss:0.311127+0.00202267	test-mlogloss:0.522195+0.00694825

   30 | 09m09s |   -0.52219 |             0.2202 |    0.1870 |      7.3179 |            30.4401 |      0.7319 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 10m42s |   -0.52179 |             0.2039 |    2.9285 |      7.1774 |            32.8201 |      0.7794 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[408]	train-mlogloss:0.311563+0.00109869	test-mlogloss:0.521969+0.00699273

   32 | 13m42s |   -0.52197 |             0.2065 |    2.9887 |      9.5413 |            29.2594 |      0.7878 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[267]	train-mlogloss:0.309185+0.0020634	test-mlogloss:0.523225+0.00591925

   33 | 10m06s |   -0.52323 |             0.2160 |    0.0679 |      9.3548 |            61.0663 |      0.9894 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

  " state: %s" % convergence_dict)


   34 | 13m09s | [35m  -0.52129[0m | [32m            0.2054[0m | [32m   2.9595[0m | [32m     6.2395[0m | [32m           25.6201[0m | [32m     0.7927[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[426]	train-mlogloss:0.316246+0.00146418	test-mlogloss:0.521847+0.00681168

   35 | 14m26s |   -0.52185 |             0.2131 |    2.9618 |      9.5358 |            37.2910 |      0.9113 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[563]	train-mlogloss:0.354285+0.000954589	test-mlogloss:0.521773+0.00677928

   36 | 12m15s |   -0.52177 |             0.2011 |    2.8887 |      6.5776 |            27.9240 |      0.8063 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train

  " state: %s" % convergence_dict)


   38 | 11m31s |   -0.52139 |             0.2058 |    2.7369 |      7.6812 |            25.8403 |      0.9935 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1333]	train-mlogloss:0.433136+0.00270341	test-mlogloss:0.525211+0.00717826

   39 | 15m50s |   -0.52521 |             0.2040 |    2.8709 |      3.1960 |            26.3924 |      0.9562 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[508]	train-mlogloss:0.288489+0.00166711	test-mlogloss:0.521452+0.00681815

   40 | 16m43s |   -0.52145 |             0.2071 |    2.9937 |      9.6745 |            15.4274 |      0.9495 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Sto

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   41 | 08m53s |   -0.52217 |             0.2128 |    0.0419 |      6.8210 |            13.4813 |      0.7043 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[734]	train-mlogloss:0.330029+0.00162452	test-mlogloss:0.521386+0.00755979

   42 | 16m04s |   -0.52139 |             0.2072 |    2.9495 |      6.9237 |            13.7162 |      0.7173 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[488]	train-mlogloss:0.342518+0.00173216	test-mlogloss:0.522851+0.00667372

   43 | 15m17s |   -0.52285 |             0.2011 |    2.9888 |      9.9404 |            75.9973 |      0.7020 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   45 | 12m30s |   -0.52194 |             0.2379 |    2.9856 |      8.9818 |            12.2886 |      0.7533 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1876]	train-mlogloss:0.433604+0.00194479	test-mlogloss:0.525569+0.00649456



  " state: %s" % convergence_dict)


   46 | 21m53s |   -0.52557 |             0.2134 |    2.8300 |      3.1496 |            73.0748 |      0.8203 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[423]	train-mlogloss:0.290714+0.000595318	test-mlogloss:0.521793+0.00672945

   47 | 13m01s |   -0.52179 |             0.2074 |    2.6126 |      8.3304 |            15.2826 |      0.9684 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[435]	train-mlogloss:0.288839+0.00196188	test-mlogloss:0.522274+0.0069791



  " state: %s" % convergence_dict)


   48 | 16m57s |   -0.52227 |             0.2554 |    2.9282 |      9.5541 |            19.1993 |      0.7799 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[441]	train-mlogloss:0.326479+0.00125082	test-mlogloss:0.522097+0.0065175



  " state: %s" % convergence_dict)


   49 | 15m53s |   -0.52210 |             0.2347 |    2.8738 |      9.6917 |            60.8899 |      0.7387 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[547]	train-mlogloss:0.353443+0.0018016	test-mlogloss:0.522208+0.00722251



  " state: %s" % convergence_dict)


   50 | 10m46s |   -0.52221 |             0.2041 |    0.0582 |      5.2912 |            35.3540 |      0.7538 | 


In [33]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
23,6.239472,25.620069,0.20538,0.792672,2.959502,-0.521289
13,7.570794,29.442225,0.211788,0.994441,2.917059,-0.521371
31,6.923701,13.716162,0.207217,0.71732,2.949494,-0.521386
27,7.681158,25.840275,0.205797,0.99348,2.736861,-0.521392
29,9.674519,15.42737,0.207118,0.949473,2.99367,-0.521452
33,7.584293,17.532455,0.236574,0.724688,2.904289,-0.521585
1,9.927876,32.858122,0.231155,0.774997,2.976026,-0.521622
25,6.577559,27.923958,0.201063,0.806311,2.888736,-0.521773
36,8.330396,15.282628,0.207412,0.968408,2.61265,-0.521793
20,7.177386,32.820085,0.203923,0.779411,2.928484,-0.521795


In [39]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=1234)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [40]:
estimators = [
#     xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 24,
#                               colsample_bytree = 0.309861 ,
#                               subsample = 0.998132 ,
#                               gamma = 2.211859),
#              xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 19,
#                               colsample_bytree = 0.432358,
#                               subsample = 0.949350,
#                               gamma = 2.976848),
#              xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 23,
#                               colsample_bytree = 0.214791,
#                               subsample = 0.997197,
#                               gamma = 2.163581),         
#              xgb.XGBClassifier(max_depth = 8,
#                               min_child_weight = 23,
#                               colsample_bytree = 0.5,
#                               subsample = 0.988002,
#                               gamma = 3.0),  
             xgb.XGBClassifier(max_depth = 6,
                              min_child_weight = 25,
                              colsample_bytree = 0.205380,
                              subsample = 0.792672,
                              gamma = 2.959502)              
             ]

#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	score
# 23 	6.239472 	25.620069 	 	 	0.205380 	 	 	0.792672 	2.959502 	-0.521289
# 13 	7.570794 	29.442225 	 	 	0.211788 	 	 	0.994441 	2.917059 	-0.521371
# 31 	6.923701 	13.716162 	 	 	0.207217 	 	 	0.717320 	2.949494 	-0.521386
# 27 	7.681158 	25.840275 	 	 	0.205797 	 	 	0.993480 	2.736861 	-0.521392
# 29 	9.674519 	15.427370 	 	 	0.207118 	 	 	0.949473 	2.993670 	-0.521452


(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              5,
                              500)


Blend 1 estimators for 5 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.20538,
       gamma=2.959502, learning_rate=0.02, max_delta_step=0, max_depth=6,
       min_child_weight=25, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.792672)
Model 1 fold 1
best round 3789
('Score: ', 0.52623984795218892)
Model 1 fold 1 fitting finished in 1048.941s
Model 1 fold 2
best round 2465
('Score: ', 0.51521617151629873)
Model 1 fold 2 fitting finished in 724.615s
Model 1 fold 3
best round 3246
('Score: ', 0.50918119850995902)
Model 1 fold 3 fitting finished in 918.114s
Model 1 fold 4
best round 2825
('Score: ', 0.51632242760436764)
Model 1 fold 4 fitting finished in 822.807s
Model 1 fold 5
best round 3384
('Score: ', 0.53591464058234262)
Model 1 fold 5 fitting finished in 949.518s
Score for model 1 is 0.520575
Score for blended models is 0.52

In [41]:
train_blend_x_xgb = pd.DataFrame(train_blend_x_xgb)
train_blend_x_xgb.columns = ["low", "medium", "high"]
train_blend_x_xgb["listing_id"] = train_X.listing_id.values

test_blend_x_xgb_mean = pd.DataFrame(test_blend_x_xgb_mean)
test_blend_x_xgb_mean.columns = ["low", "medium", "high"]
test_blend_x_xgb_mean["listing_id"] = test_X.listing_id.values

test_blend_x_xgb_gmean = pd.DataFrame(test_blend_x_xgb_gmean)
test_blend_x_xgb_gmean.columns = ["low", "medium", "high"]
test_blend_x_xgb_gmean["listing_id"] = test_X.listing_id.values

In [42]:
tmp_train = train_X_0322[['listing_id']].merge(train_blend_x_xgb,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_mean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_gmean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_gmean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values

In [44]:
from datetime import datetime
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_cv_price_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_cv_price_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_cv_price_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,tmp_test_gmean, delimiter=",")

[ 0.52057486]
[ 3141.8]


In [45]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_cv223_5blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_0322.listing_id.values
out_df.to_csv(sub_name, index=False)