In [1]:
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
import xgboost as xgb
from itertools import product
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn import model_selection,ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
import random
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer, MultiLabelBinarizer,LabelEncoder
from sklearn.cluster import KMeans
from scipy.stats.mstats import gmean
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:
def cv_train(train,y,max_depth = 6,min_child_weight = 1,colsample_bytree = 1, subsample = 1, gamma = 0 , verbose_eval = None,
            seed = 0, early_stop = 50, nfold = 5, eta=0.3):
    xgtrain = xgb.DMatrix(train, label=y)
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=0
    params['eta'] = eta
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['colsample_bytree'] = colsample_bytree
    params['subsample'] = subsample
    params['gamma'] = gamma
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=nfold,
        metrics = 'mlogloss', verbose_eval = verbose_eval,
        seed=seed,callbacks=[xgb.callback.early_stop(early_stop, maximize=False, verbose=False)]
    )

    return cv_result['test-mlogloss-mean'].values[-1]

In [3]:
def CV_st(train,test,feature1,feature2):
    index=list(range(train.shape[0]))
    random.seed(a=0)
    random.shuffle(index)
    kf = KFold(n_splits=5,shuffle=True, random_state=0)
    
    # median feature names
    features_tmp = []
    features_tmp.append(feature1 + '_' + feature2 + '_low_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_median')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].median().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values.astype('float32')   
    train[features_tmp[0]] = f_low
    train[features_tmp[1]] = f_medium
    train[features_tmp[2]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].median().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[0]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[1]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[2]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values.astype('float32') 
    

    # mean feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_mean')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].mean().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values.astype('float32')   
    train[features_tmp[3]] = f_low
    train[features_tmp[4]] = f_medium
    train[features_tmp[5]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].mean().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[3]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[4]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[5]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values.astype('float32') 
    
    # max feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_max')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].max().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values.astype('float32')   
    train[features_tmp[6]] = f_low
    train[features_tmp[7]] = f_medium
    train[features_tmp[8]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].max().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[6]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[7]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[8]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values.astype('float32') 
    
    # min feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_min')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].min().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values.astype('float32')
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values.astype('float32')   
    train[features_tmp[9]] = f_low
    train[features_tmp[10]] = f_medium
    train[features_tmp[11]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].min().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[9]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[10]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values.astype('float32')
    test[features_tmp[11]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values.astype('float32') 

#     # std feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_std') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_std') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_std')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].std().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[12]] = f_low
#     train[features_tmp[13]] = f_medium
#     train[features_tmp[14]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].std().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[12]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[13]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[14]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
#     # var feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_var')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].var().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[15]] = f_low
#     train[features_tmp[16]] = f_medium
#     train[features_tmp[17]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].var().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[15]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[16]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[17]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
#     # ratio/diff feature
#     cols = features_tmp[:]
# #     features_tmp = []
#     for col in cols:
#         new_feature = col+'_ratio'
#         train[new_feature] = train[col] / train[feature2]
#         test[new_feature] = test[col] / test[feature2]
#         features_tmp.append(new_feature)
        
#     print feature1,' vs ', feature2,'Done!'
    return train,test,features_tmp

In [4]:
#input data
train_df=pd.read_json('../input/train.json').reset_index(drop = True)
test_df=pd.read_json('../input/test.json').reset_index(drop = True)
test_df.loc[test_df.bathrooms == 112.0,'bathrooms'] = 1.5    
test_df.loc[test_df.bathrooms == 20.0,'bathrooms'] = 2.0
test_df.loc[test_df.listing_id == 7220763,'bedrooms'] = 3
test_df.loc[test_df.listing_id == 7047074,'bedrooms'] = 6
print train_df.shape
print test_df.shape

(49352, 15)
(74659, 14)


In [5]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["num_photo_count"] = df["photos"].apply(len)
    df["street_address"] = df['street_address'].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    df["num_desc_wordcount"] = df["description"].apply(len)
    df["num_pricePerBed"] = df['price'] / df['bedrooms']
    df["num_pricePerBath"] = df['price'] / df['bathrooms']
    df["num_pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["num_bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["num_bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["num_bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["num_bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])

    df = df.fillna(-1).replace(np.inf, -1)
    return df

# Add common features
train_df = add_features(train_df)
test_df = add_features(test_df) 


# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df['num_desc_length_null'] = (train_df.description.str.len()==0).astype(float)
test_df['num_desc_length_null'] = (test_df.description.str.len()==0).astype(float)
    
features_to_use=[
    "latitude", "longitude","num_pricePerBed",
    'num_bedBathSum','num_pricePerBath','num_pricePerRoom','num_bedPerBath',
    'num_bedBathDiff','num_bedsPerc',
    "num_photo_count", "num_features", "num_desc_wordcount",'num_desc_length_null',
    "listing_id"]

print 'Done!'

Done!


In [6]:
# Location features: Latitude, longitude
precision = 3
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2)
train_df['num_dist_from_center'] = x.values
x = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2)
test_df['num_dist_from_center'] = x.values
train_df['position'] = train_df.longitude.round(precision).astype(str) + '_' + train_df.latitude.round(precision).astype(str)
test_df['position'] = test_df.longitude.round(precision).astype(str) + '_' + test_df.latitude.round(precision).astype(str)

new_feature = ['num_dist_from_center']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
print 'Done!'

Done!


In [7]:
# Degree of "outlierness"
OutlierAggregated = (train_df.bedrooms > 4).astype(float)
OutlierAggregated2 = (test_df.bedrooms > 4).astype(float)
OutlierAggregated += (train_df.bathrooms > 3).astype(float)
OutlierAggregated2 += (test_df.bathrooms > 3).astype(float)
OutlierAggregated += (train_df.bathrooms < 1).astype(float)
OutlierAggregated2 += (test_df.bathrooms < 1).astype(float)
x = np.abs((train_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.abs((test_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated2 += x2.astype(float)
x = np.log1p(train_df.price/(train_df.bedrooms.clip(1,3) + train_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated += x.astype(float)
x2 = np.log1p(test_df.price/(test_df.bedrooms.clip(1,3) + test_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated2 += x2.astype(float)
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated2 += x2.astype(float)
train_df['num_OutlierAggregated'] = OutlierAggregated.values
test_df['num_OutlierAggregated'] = OutlierAggregated2.values


new_feature = ['num_OutlierAggregated']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [8]:
# Density in unique locations at given precision
vals = train_df['position'].value_counts()
dvals = vals.to_dict()
train_df['num_pos_density'] = train_df['position'].apply(lambda x: dvals.get(x, vals.min()))
test_df['num_pos_density'] = test_df['position'].apply(lambda x: dvals.get(x, vals.min()))

# Building null
train_df['num_building_null'] = (train_df.building_id=='0').astype(float)
test_df['num_building_null'] = (test_df.building_id=='0').astype(float)


new_feature = ['num_pos_density','num_building_null']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [9]:
# Creation time features
train_df['created'] = pd.to_datetime(train_df.created)
train_df['num_created_weekday'] = train_df.created.dt.dayofweek.astype(float)
train_df['num_created_weekofyear'] = train_df.created.dt.weekofyear
train_df['num_created_day'] = train_df.created.dt.day
train_df['num_created_month'] = train_df.created.dt.month
train_df['num_created_hour'] = train_df.created.dt.hour
  
test_df['created'] = pd.to_datetime(test_df.created)
test_df['num_created_weekday'] = test_df.created.dt.dayofweek
test_df['num_created_weekofyear'] = test_df.created.dt.weekofyear
test_df['num_created_day'] = test_df.created.dt.day
test_df['num_created_month'] = test_df.created.dt.month
test_df['num_created_hour'] = test_df.created.dt.hour


new_feature = ['num_created_weekday','num_created_weekofyear','num_created_day','num_created_month','num_created_hour']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [10]:
# Bedrooms/Bathrooms/Price
train_df['num_bathrooms'] = train_df.bathrooms.clip_upper(4)
test_df['num_bathrooms'] = test_df.bathrooms.clip_upper(4)

train_df['num_bedrooms'] = train_df.bedrooms.clip_upper(5)
test_df['num_bedrooms'] = test_df.bedrooms.clip_upper(5)

train_df['num_price'] = train_df.price.clip_upper(10000)
test_df['num_price'] = test_df.price.clip_upper(10000)

bins = train_df.price.quantile(np.arange(0.05, 1, 0.05))
train_df['num_price_q'] = np.digitize(train_df.price, bins)
test_df['num_price_q'] = np.digitize(test_df.price, bins)


new_feature = ['num_bathrooms','num_bedrooms','num_price','num_price_q']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [11]:
# Composite features based on: 
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
train_df['num_priceXroom'] = (train_df.price / (1 + train_df.bedrooms.clip(1, 4) + 0.5*train_df.bathrooms.clip(0, 2))).values
test_df['num_priceXroom'] = (test_df.price / (1 + test_df.bedrooms.clip(1, 4) + 0.5*test_df.bathrooms.clip(0, 2))).values

train_df['num_even_bathrooms'] = ((np.round(train_df.bathrooms) - train_df.bathrooms)==0).astype(float)
test_df['num_even_bathrooms'] = ((np.round(test_df.bathrooms) - test_df.bathrooms)==0).astype(float)

new_feature = ['num_priceXroom','num_even_bathrooms']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

print 'Done!'

Done!


In [12]:
categorical = ["display_address", "manager_id", "building_id", "street_address",'position']
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            if f not in features_to_use:
                features_to_use.append(f)

In [13]:
dftemp = train_df.copy()
for i in ['latitude', 'longitude']:
    while(1):
        x = dftemp[i].median()
        ix = abs(dftemp[i] - x) > 3*dftemp[i].std()
        if ix.sum()==0:
            break
        dftemp.loc[ix, i] = np.nan
dftemp = dftemp.loc[dftemp[['latitude', 'longitude']].isnull().sum(1) == 0, :]

dfm = DataFrameMapper([(['latitude'], [StandardScaler()]), (['longitude'], [StandardScaler()])])

for i in [6]:
    pipe_location = make_pipeline(dfm, KMeans(n_clusters=i, random_state=1))
    pipe_location.fit(dftemp);
    train_df['location_'+str(i)] = pipe_location.predict(train_df).astype(str)
    test_df['location_'+str(i)] = pipe_location.predict(test_df).astype(str)
for i in train_df.location_6.unique():
    f = 'num_location_6_'+str(i)
    train_df[f] = (train_df.location_6==i).astype(float)
    test_df[f] = (test_df.location_6==i).astype(float)
    if f not in features_to_use:
        features_to_use.append(f)
    
    
train_df['tmp_bathrooms'] = train_df.bathrooms.clip_upper(2)
test_df['tmp_bathrooms'] = test_df.bathrooms.clip_upper(2)
train_df['tmp_bedrooms'] = train_df.bedrooms.clip_upper(4)
test_df['tmp_bedrooms'] = test_df.bedrooms.clip_upper(4)
train_df['roomcal'] = train_df.tmp_bedrooms.astype(str) + '_' + train_df.tmp_bathrooms.astype(str)    
test_df['roomcal'] = test_df.tmp_bedrooms.astype(str) + '_' + test_df.tmp_bathrooms.astype(str)    

room_lb = LabelBinarizer()
room_lb.fit(train_df['roomcal'])
room_col = ['num_room_type_' + str(x) for x in range(len(train_df['roomcal'].unique()))]
for f in room_col:
    if f not in features_to_use:
        features_to_use.append(f)

train_df = train_df.join(pd.DataFrame(room_lb.transform(train_df['roomcal']),columns=room_col,index=train_df.index))
test_df = test_df.join(pd.DataFrame(room_lb.transform(test_df['roomcal']),columns=room_col,index=test_df.index))

tmp = train_df.groupby(['roomcal','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price'})
    
train_df = train_df.merge(tmp,on=['roomcal','location_6'],how='left')
test_df = test_df.merge(tmp,on=['roomcal','location_6'],how='left')

test_df.loc[27462,'num_6_median_price'] =  7200.0

train_df['num_6_price_ratio'] = train_df['num_price'] / train_df['num_6_median_price']
train_df['num_6_price_diff'] = train_df['num_price'] - train_df['num_6_median_price']
test_df['num_6_price_ratio'] = test_df['num_price'] / test_df['num_6_median_price']
test_df['num_6_price_diff'] = test_df['num_price'] - test_df['num_6_median_price']


for f in ['num_6_median_price','num_6_price_ratio','num_6_price_diff']:
    if f not in features_to_use:
        features_to_use.append(f)
        
        
print 'Done!'

Done!


In [14]:
tmp = train_df.groupby(['num_bedrooms','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price_bedroom'})
    
train_df = train_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')
test_df = test_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')

train_df['num_6_price_ratio_bedroom'] = train_df['num_price'] / train_df['num_6_median_price_bedroom']
train_df['num_6_price_diff_bedroom'] = train_df['num_price'] - train_df['num_6_median_price_bedroom']
test_df['num_6_price_ratio_bedroom'] = test_df['num_price'] / test_df['num_6_median_price_bedroom']
test_df['num_6_price_diff_bedroom'] = test_df['num_price'] - test_df['num_6_median_price_bedroom']


for f in ['num_6_median_price_bedroom','num_6_price_ratio_bedroom','num_6_price_diff_bedroom']:
    if f not in features_to_use:
        features_to_use.append(f)

In [15]:
def create_binary_features(df):
    bows = {
        "dogs": ("dogs", "dog",'pet friendly','pets'),
        "cats": ("cats",'pet friendly','pets'),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",'equipped'),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included"),
        'fireplace': ('fireplace','fireplaces'),
        'elevator': ('elevator'),
        'pool':('pool'),
        'loft':('loft'),
        'luxury':('luxury','valet'),
        'marble':('marble'),
        'onemounthfree': ('1 month free','one month free'),
        'washer':('washer','dryer')
    }

    def indicator(bow):
        return lambda s: int(any([x in s for x in bow]))

    features = df["features"].apply(lambda f: " ".join(f).lower())   # convert features to string
    for key in bows:
        tmp_key = "feature_" + key
        df[tmp_key] = features.apply(indicator(bows[key]))
        if tmp_key not in features_to_use:
            features_to_use.append(tmp_key)
    return df

# Create binarized features
train_df = create_binary_features(train_df)
test_df = create_binary_features(test_df)


In [16]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')

print train_X_0322.shape
print test_X_0322.shape

(49352, 322)
(74659, 322)


In [17]:
sentiment = [
#     'building_id_mean_med','building_id_mean_high', 
#     'manager_id_mean_med','manager_id_mean_high',
    'median_price_bed', 'ratio_bed',
       'compound', 'neg', 'neu', 'pos', 'street',
       'avenue', 'east', 'west', 'north', 'south', 'other_address',
       'Zero_building_id', 'top_10_building', 'top_25_building',
       'top_5_building', 'top_50_building', 'top_1_building',
       'top_2_building', 'top_15_building', 'top_20_building',
       'top_30_building','listing_id'
]

train_df = train_df.merge(train_X_0322[sentiment],on='listing_id', how='left')
test_df = test_df.merge(test_X_0322[sentiment],on='listing_id', how='left')

for f in sentiment:
    if f not in features_to_use:
        features_to_use.append(f)

In [18]:
train_df.isnull().values.any()

False

In [19]:
test_df.isnull().values.any()

False

In [21]:
# init_feature = features_to_use[:]

# CV statistics

In [22]:
# CV_feature = []

In [20]:
index=list(range(train_df.shape[0]))
random.seed(a=0)
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

# features_to_use = []
features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')
# CV_feature.append(features_to_use)

In [21]:
index=list(range(train_df.shape[0]))
random.seed(a=0)
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['building_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['building_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['building_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['building_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['building_id']])!=0:
            a[j]=building_level[temp['building_id']][0]*1.0/sum(building_level[temp['building_id']])
            b[j]=building_level[temp['building_id']][1]*1.0/sum(building_level[temp['building_id']])
            c[j]=building_level[temp['building_id']][2]*1.0/sum(building_level[temp['building_id']])
train_df['building_level_low']=a
train_df['building_level_medium']=b
train_df['building_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['building_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['building_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['building_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['building_id']][2]+=1

for i in test_df['building_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['building_level_low']=a
test_df['building_level_medium']=b
test_df['building_level_high']=c


# features_to_use = []
features_to_use.append('building_level_low') 
features_to_use.append('building_level_medium') 
features_to_use.append('building_level_high')
# CV_feature.append(features_to_use)

In [22]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','ratio_bed')
features_to_use.extend(new_feature)

In [23]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_price_q')
features_to_use.extend(new_feature)

In [86]:
used_feature1 = used_feature[:]
used_feature1

['latitude',
 'longitude',
 'num_pricePerBed',
 'num_bedBathSum',
 'num_pricePerBath',
 'num_pricePerRoom',
 'num_bedPerBath',
 'num_bedBathDiff',
 'num_bedsPerc',
 'num_photo_count',
 'num_features',
 'num_desc_wordcount',
 'num_desc_length_null',
 'listing_id',
 'num_dist_from_center',
 'num_OutlierAggregated',
 'num_pos_density',
 'num_building_null',
 'num_created_weekday',
 'num_created_weekofyear',
 'num_created_day',
 'num_created_month',
 'num_created_hour',
 'num_bathrooms',
 'num_bedrooms',
 'num_price',
 'num_price_q',
 'num_priceXroom',
 'num_even_bathrooms',
 'display_address',
 'manager_id',
 'building_id',
 'street_address',
 'position',
 'num_location_6_3',
 'num_location_6_1',
 'num_location_6_0',
 'num_location_6_5',
 'num_location_6_4',
 'num_location_6_2',
 'num_room_type_0',
 'num_room_type_1',
 'num_room_type_2',
 'num_room_type_3',
 'num_room_type_4',
 'num_room_type_5',
 'num_room_type_6',
 'num_room_type_7',
 'num_room_type_8',
 'num_room_type_9',
 'num_room_ty

In [98]:
best_score

0.52888760000000001

# Val

In [26]:
train_X = train_df[features_to_use]
test_X = test_df[features_to_use]

train_X.replace(np.inf, np.nan)
test_X.replace(np.inf, np.nan)

train_X.loc[:,'num_nan'] = train_X.isnull().sum(axis=1)
test_X.loc[:,'num_nan'] = test_X.isnull().sum(axis=1)

target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print train_X.shape, test_X.shape 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(49352, 141) (74659, 141)


In [27]:
now = time.time()
print cv_train(train_X,train_y,verbose_eval = 10, early_stop = 20)
print '\nTraining :{:0.2f}s'.format(time.time() - now)

[0]	train-mlogloss:0.905261+0.000641652	test-mlogloss:0.910695+0.00110889
[10]	train-mlogloss:0.528161+0.00146897	test-mlogloss:0.572251+0.00323927
[20]	train-mlogloss:0.471311+0.00206303	test-mlogloss:0.545762+0.00446619
[30]	train-mlogloss:0.434981+0.00233531	test-mlogloss:0.537474+0.00501406
[40]	train-mlogloss:0.407295+0.00276496	test-mlogloss:0.533319+0.00574307
[50]	train-mlogloss:0.384502+0.00375174	test-mlogloss:0.53113+0.00580746
[60]	train-mlogloss:0.363385+0.00261623	test-mlogloss:0.52972+0.00608787
[70]	train-mlogloss:0.344761+0.00298249	test-mlogloss:0.528976+0.00589441
[80]	train-mlogloss:0.326979+0.00329575	test-mlogloss:0.529332+0.00600917
[90]	train-mlogloss:0.309887+0.0032997	test-mlogloss:0.529932+0.00615072
0.528882

Training :227.23s


In [32]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=2016)
rgr = xgb.XGBClassifier(
            objective='multi:softprob',
            seed = 0, # use a fixed seed during tuning so we can reproduce the results
            learning_rate = 0.3,
            n_estimators = 80,
            max_depth= 6,
            nthread = -1,
            colsample_bytree = 0.3,
            subsample =0.7,
            silent = 1
        )
rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         early_stopping_rounds=20,
        verbose=20
    )

[0]	validation_0-mlogloss:0.929519
[20]	validation_0-mlogloss:0.576712
[40]	validation_0-mlogloss:0.561279
[60]	validation_0-mlogloss:0.555683


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=80, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.7)

In [33]:
import xgbfir
xgbfir.saveXgbFI(rgr, feature_names=X_train.columns, OutputXlsxFile = '../FE/FI.xlsx')

# Tune XGBoost

In [27]:
# X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
# print X_train.shape
# print X_val.shape
# # xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 394)
(9871, 394)


In [53]:
best_score = 1000
for x in [3,4,5,6,7,8,9,10]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= x,
#         nthread = -1,
#         silent = False
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    tmp = cv_train(train_X,train_y,max_depth = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[312]	train-mlogloss:0.395492+0.00170774	test-mlogloss:0.527702+0.00663205

3 	0.5277018
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[141]	train-mlogloss:0.396861+0.00183942	test-mlogloss:0.527621+0.00736431

4 	0.5276214
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[102]	train-mlogloss:0.357724+0.00219303	test-mlogloss:0.528282+0.00654119

5 	0.5282822
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[57]	train-mlogloss:0.358

In [54]:
# max_depth = train_param
max_depth = train_param
print max_depth

4


In [55]:
train_param = 1
for x in [8,12,16,20,24,28,32,40,48,64,80,128]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    
    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[186]	train-mlogloss:0.367066+0.00186211	test-mlogloss:0.527145+0.0074838

2 	0.5271452
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[213]	train-mlogloss:0.352706+0.00230145	test-mlogloss:0.526443+0.00732842

4 	0.5264426
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[159]	train-mlogloss:0.390464+0.00279415	test-mlogloss:0.527391+0.00724981

8 	0.5273912
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[177]	train-mlogloss:0.381

In [56]:
min_child_weight = train_param
print min_child_weight

16


In [57]:
train_param = 1
for x in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )

    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, colsample_bytree = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[262]	train-mlogloss:0.395672+0.00192811	test-mlogloss:0.532532+0.00697024

0.05 	0.5325316
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[251]	train-mlogloss:0.38311+0.00308902	test-mlogloss:0.529113+0.00529031

0.1 	0.5291126
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[212]	train-mlogloss:0.387354+0.0026151	test-mlogloss:0.526019+0.00693851

0.2 	0.5260188
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[205]	train-mlogloss

In [58]:
colsample_bytree = train_param
print colsample_bytree

1


In [59]:
train_param = 1
for x in [0.7,0.8,0.9]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = colsample_bytree,
#         subsample = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, 
                   colsample_bytree = colsample_bytree, subsample = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[114]	train-mlogloss:0.429981+0.00163237	test-mlogloss:0.535683+0.0063162

0.5 	0.5356834
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[127]	train-mlogloss:0.417173+0.00106604	test-mlogloss:0.532641+0.0071292

0.6 	0.5326408
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[117]	train-mlogloss:0.422807+0.00248712	test-mlogloss:0.530766+0.00658263

0.7 	0.5307656
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[138]	train-mlogloss:

In [60]:
subsample = train_param
print subsample

1


In [61]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = colsample_bytree,
#         subsample = subsample,
#         gamma = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )

    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, 
                   colsample_bytree = colsample_bytree, subsample = subsample, gamma = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[164]	train-mlogloss:0.393296+0.00247187	test-mlogloss:0.526782+0.00697394

0.3 	0.5267822
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[186]	train-mlogloss:0.378468+0.00338923	test-mlogloss:0.526772+0.00705368

0.6 	0.526772
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[175]	train-mlogloss:0.38681+0.00159871	test-mlogloss:0.526846+0.00696989

0.9 	0.526846
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[167]	train-mlogloss:0

In [62]:
gamma = train_param
print gamma

0


In [31]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, gamma): #, subsample
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = 0.99     # max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=1234,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,15),
        'min_child_weight': (8,80),
        'colsample_bytree': (0.2,0.8),
#         'subsample': (0.7,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[157]	train-mlogloss:0.23395+0.00122047	test-mlogloss:0.524207+0.00616704

    1 | 18m19s | [35m  -0.52421[0m | [32m            0.3187[0m | [32m   1.8873[0m | [32m    14.5286[0m | [32m           15.1511[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[761]	train-mlogloss:0.399624+0.00264638	test-mlogloss:0.524972+0.00588274

    2 | 19m25s |   -0.52497 |             0.6128 |    1.5493 |      4.6842 |            71.0705 | 
Multiple eval metrics 

  " state: %s" % convergence_dict)


   12 | 14m08s |   -0.52319 |             0.2640 |    2.9058 |     14.9508 |            66.8396 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1334]	train-mlogloss:0.418507+0.00200147	test-mlogloss:0.524462+0.00647836

   13 | 12m30s |   -0.52446 |             0.2136 |    2.8573 |      4.0338 |            51.0298 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1049]	train-mlogloss:0.396552+0.00368615	test-mlogloss:0.523874+0.00596379

   14 | 21m59s |   -0.52387 |             0.7004 |    2.9854 |      4.0457 |            20.4812 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[233]	train-mloglo

  " state: %s" % convergence_dict)


   15 | 07m34s |   -0.52440 |             0.2268 |    0.0235 |     14.6446 |            79.4545 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[757]	train-mlogloss:0.37502+0.000834533	test-mlogloss:0.523353+0.0065462

   16 | 08m25s |   -0.52335 |             0.2603 |    0.0289 |      4.3906 |            10.8162 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[90]	train-mlogloss:0.179749+0.00297028	test-mlogloss:0.534121+0.00831265

   17 | 05m06s |   -0.53412 |             0.2494 |    0.6160 |     14.8996 |             8.0272 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[739]	train-mlogloss:0

  " state: %s" % convergence_dict)


   19 | 10m03s |   -0.52196 |             0.2231 |    2.9957 |     14.6389 |            33.2639 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1404]	train-mlogloss:0.396343+0.00268244	test-mlogloss:0.52397+0.00633921

   20 | 28m49s |   -0.52397 |             0.7037 |    2.9951 |      4.1527 |            40.6643 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[394]	train-mlogloss:0.338903+0.00173118	test-mlogloss:0.522425+0.00721242



  " state: %s" % convergence_dict)


   21 | 11m12s |   -0.52242 |             0.2334 |    2.9479 |     12.6802 |            57.2975 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[459]	train-mlogloss:0.332138+0.00115371	test-mlogloss:0.521091+0.00701603



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   22 | 08m50s | [35m  -0.52109[0m | [32m            0.2203[0m | [32m   2.9524[0m | [32m     8.0637[0m | [32m           14.5202[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[461]	train-mlogloss:0.33277+0.00138585	test-mlogloss:0.521888+0.00664911

   23 | 10m26s |   -0.52189 |             0.2173 |    2.9724 |     10.8705 |            39.5027 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[428]	train-mlogloss:0.324331+0.00113049	test-mlogloss:0.520749+0.00630795



  " state: %s" % convergence_dict)


   24 | 09m04s | [35m  -0.52075[0m | [32m            0.2196[0m | [32m   2.9991[0m | [32m     9.9786[0m | [32m           17.9067[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[508]	train-mlogloss:0.326938+0.000692585	test-mlogloss:0.52163+0.00584595



  " state: %s" % convergence_dict)


   25 | 11m05s |   -0.52163 |             0.2209 |    2.9016 |      9.7721 |            31.3829 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[750]	train-mlogloss:0.373866+0.00227905	test-mlogloss:0.521902+0.00690022



  " state: %s" % convergence_dict)


   26 | 11m19s |   -0.52190 |             0.2107 |    2.9886 |      7.6344 |            61.3300 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[252]	train-mlogloss:0.336469+0.00218109	test-mlogloss:0.522949+0.00666208

   27 | 05m42s |   -0.52295 |             0.2092 |    0.0029 |      9.1155 |            60.5460 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1297]	train-mlogloss:0.402724+0.00187718	test-mlogloss:0.522849+0.00631957



  " state: %s" % convergence_dict)


   28 | 14m38s |   -0.52285 |             0.2665 |    2.9725 |      4.0401 |            13.8616 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[198]	train-mlogloss:0.271867+0.00145997	test-mlogloss:0.522183+0.00691938



  " state: %s" % convergence_dict)


   29 | 06m13s |   -0.52218 |             0.2430 |    0.0136 |      9.5056 |            18.3178 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[317]	train-mlogloss:0.295096+0.00105552	test-mlogloss:0.523096+0.00605109



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   30 | 19m02s |   -0.52310 |             0.7740 |    2.9917 |      9.2091 |            16.3239 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[303]	train-mlogloss:0.320382+0.0021877	test-mlogloss:0.522868+0.00622288



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 05m57s |   -0.52287 |             0.2014 |    0.0477 |      8.5258 |            44.3389 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[656]	train-mlogloss:0.348866+0.00212292	test-mlogloss:0.521771+0.00663447



  " state: %s" % convergence_dict)


   32 | 10m22s |   -0.52177 |             0.2070 |    2.9618 |      7.3106 |            23.5246 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[548]	train-mlogloss:0.34419+0.00189499	test-mlogloss:0.522579+0.00646543

   33 | 12m28s |   -0.52258 |             0.2098 |    2.9181 |     11.0907 |            71.9354 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[342]	train-mlogloss:0.289751+0.00167157	test-mlogloss:0.521939+0.00695795



  " state: %s" % convergence_dict)


   34 | 11m14s |   -0.52194 |             0.2141 |    2.8797 |     13.7639 |            20.5322 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[187]	train-mlogloss:0.271868+0.00204199	test-mlogloss:0.523538+0.00687634



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   35 | 05m46s |   -0.52354 |             0.2102 |    0.1569 |     11.0299 |            32.1360 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[356]	train-mlogloss:0.336891+0.00153242	test-mlogloss:0.523284+0.00716312

   36 | 12m29s |   -0.52328 |             0.2847 |    2.7754 |     14.8875 |            79.9029 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[582]	train-mlogloss:0.345618+0.00146564	test-mlogloss:0.522022+0.00678958



  " state: %s" % convergence_dict)


   37 | 12m33s |   -0.52202 |             0.2175 |    2.9719 |     10.3356 |            64.7402 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[499]	train-mlogloss:0.334555+0.00142179	test-mlogloss:0.522136+0.00695774

   38 | 10m39s |   -0.52214 |             0.2029 |    2.8065 |     10.3466 |            50.1483 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[433]	train-mlogloss:0.362511+0.00196944	test-mlogloss:0.522037+0.00631406



  " state: %s" % convergence_dict)


   39 | 08m00s |   -0.52204 |             0.2014 |    2.9767 |      8.7965 |            35.7313 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[694]	train-mlogloss:0.396012+0.00174065	test-mlogloss:0.523436+0.00683814

   40 | 10m46s |   -0.52344 |             0.2257 |    0.2491 |      4.0482 |            24.1539 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[385]	train-mlogloss:0.332956+0.00125381	test-mlogloss:0.521049+0.00672689

   41 | 14m52s |   -0.52105 |             0.2191 |    2.6892 |      8.4803 |            19.8283 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[707]	train-mlogloss

  " state: %s" % convergence_dict)


   42 | 10m58s |   -0.52129 |             0.2223 |    2.9244 |      6.9990 |            11.2221 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[374]	train-mlogloss:0.302449+0.00141864	test-mlogloss:0.521144+0.00642974



  " state: %s" % convergence_dict)


   43 | 10m15s |   -0.52114 |             0.2206 |    2.9852 |     11.9499 |            17.4165 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[326]	train-mlogloss:0.302866+0.000916377	test-mlogloss:0.521396+0.00659012



  " state: %s" % convergence_dict)


   44 | 08m17s |   -0.52140 |             0.2133 |    2.6109 |     10.3001 |            19.8886 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[346]	train-mlogloss:0.290526+0.00123105	test-mlogloss:0.522693+0.0065156



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   45 | 11m24s |   -0.52269 |             0.2262 |    2.8142 |     14.9305 |            28.4110 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[443]	train-mlogloss:0.376112+0.00197498	test-mlogloss:0.521707+0.00691315

   46 | 08m35s |   -0.52171 |             0.2135 |    2.9574 |      8.0057 |            57.1056 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[491]	train-mlogloss:0.329543+0.00140547	test-mlogloss:0.520369+0.00691078

   47 | 09m26s | [35m  -0.52037[0m | [32m            0.2151[0m | [32m   2.8569[0m | [32m     8.1622[0m | [32m           18.5155[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
S

  " state: %s" % convergence_dict)


   50 | 08m13s |   -0.52480 |             0.2362 |    0.0639 |      4.0108 |            59.2135 | 


In [None]:
#  Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight | 
# Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

# Will train until test-mlogloss hasn't improved in 50 rounds.
# Stopping. Best iteration:
# [201]	train-mlogloss:0.293189+0.00192919	test-mlogloss:0.521722+0.00743877

#     1 | 08m41s |   -0.52172 |             0.3187 |    1.8873 |      9.7000 |            15.1511 | 

In [32]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
#                                s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
#                                        'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,gamma,score
36,8.162198,18.515538,0.215099,2.856949,-0.520369
13,9.978596,17.906659,0.219601,2.99913,-0.520749
30,8.480326,19.828347,0.219113,2.689181,-0.521049
11,8.06368,14.520219,0.220268,2.952387,-0.521091
32,11.949935,17.416496,0.220621,2.985171,-0.521144
31,6.999021,11.222119,0.222311,2.924356,-0.521293
33,10.300054,19.888567,0.213301,2.610908,-0.521396
14,9.772146,31.382917,0.220899,2.901584,-0.52163
35,8.0057,57.105623,0.21353,2.957402,-0.521707
21,7.310599,23.524609,0.206995,2.961841,-0.521771


In [47]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0,randomseed=1234):
    N_params = len(estimators)
#     print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,shuffle=True,random_state=randomseed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=1000000)
        
#         print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
#             print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
#             print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print "Score: ", score
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fm" % (j+1,i+1, (time.time() - fold_start)/60))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
#         print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [48]:
train_total = np.zeros((train_X.shape[0], 3))
test_total = np.zeros((test_X.shape[0], 3))
score_total = 0
count = 5

for n in range(count):
    randomseed = n
    estimators = [
                 xgb.XGBClassifier(max_depth = 8,
                                  min_child_weight = 18,
                                  colsample_bytree = 0.215099,
                                  subsample = 0.99,
                                  gamma = 2.856949),  
                 ]

    (train_blend_x_xgb,
     test_blend_x_xgb_mean,
     test_blend_x_xgb_gmean,
     blend_scores_xgb,
     best_rounds_xgb) = xgb_blend(estimators,
                                  train_X,train_y,
                                  test_X,
                                  6,
                                  500,randomseed)
    train_total += train_blend_x_xgb
    test_total += test_blend_x_xgb_mean
    score_total += np.mean(blend_scores_xgb)
    
train_total = train_total / count
test_total = test_total / count
score_total = score_total / count

#  	 	max_depth 	min_child_weight 	colsample_bytree 	gamma 	score
# 36 	8.162198 	18.515538 	 	 	0.215099 	 	 	2.856949 	-0.520369
# 13 	9.978596 	17.906659 	 	 	0.219601 	 	 	2.999130 	-0.520749
# 30 	8.480326 	19.828347 	 	 	0.219113 	 	 	2.689181 	-0.521049
# 11 	8.063680 	14.520219 	 	 	0.220268 	 	 	2.952387 	-0.521091

Score:  0.515082505834
Model 1 fold 1 fitting finished in 10.988m
Score:  0.512235477029
Model 1 fold 2 fitting finished in 26.918m
Score:  0.520604877548
Model 1 fold 3 fitting finished in 9.655m
Score:  0.511695108693
Model 1 fold 4 fitting finished in 12.752m
Score:  0.511981690547
Model 1 fold 5 fitting finished in 17.613m
Score:  0.521202828917
Model 1 fold 6 fitting finished in 11.254m
Score for blended models is 0.515467
Score:  0.512940925513
Model 1 fold 1 fitting finished in 13.341m
Score:  0.515271860963
Model 1 fold 2 fitting finished in 9.429m
Score:  0.524370546769
Model 1 fold 3 fitting finished in 14.211m
Score:  0.514223065154
Model 1 fold 4 fitting finished in 14.095m
Score:  0.50739124694
Model 1 fold 5 fitting finished in 17.170m
Score:  0.518891196996
Model 1 fold 6 fitting finished in 17.798m
Score for blended models is 0.515515
Score:  0.522438349286
Model 1 fold 1 fitting finished in 9.580m
Score:  0.517003901371
Model 1 fold 2 fitting finished in 14.412m
Score:

In [49]:
score_total

0.5157806614905075

In [50]:
train_blend_x_xgb = pd.DataFrame(train_total)
train_blend_x_xgb.columns = ["low", "medium", "high"]
train_blend_x_xgb["listing_id"] = train_X.listing_id.values

test_blend_x_xgb_mean = pd.DataFrame(test_total)
test_blend_x_xgb_mean.columns = ["low", "medium", "high"]
test_blend_x_xgb_mean["listing_id"] = test_X.listing_id.values

In [51]:
tmp_train = train_X_0322[['listing_id']].merge(train_blend_x_xgb,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_mean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values

In [52]:
from datetime import datetime
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_141bagging_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_141bagging_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")

[ 0.51590411]
[ 3839.66666667]


In [53]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_141bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_0322.listing_id.values
out_df.to_csv(sub_name, index=False)