In [1]:
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
import xgboost as xgb
from itertools import product
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn import model_selection,ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
import random
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer, MultiLabelBinarizer,LabelEncoder
from sklearn.cluster import KMeans
from scipy.stats.mstats import gmean
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:
def cv_train(train,y,max_depth = 6,min_child_weight = 1,colsample_bytree = 1, subsample = 1, gamma = 0 , verbose_eval = None,
            seed = 0, early_stop = 50, nfold = 5, eta=0.3):
    xgtrain = xgb.DMatrix(train, label=y)
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=0
    params['eta'] = eta
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['colsample_bytree'] = colsample_bytree
    params['subsample'] = subsample
    params['gamma'] = gamma
#     params['booster'] = 'dart'
#     params['rate_drop'] = 0.1
#     params['skip_drop'] = 0.5
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=nfold,
        metrics = 'mlogloss', verbose_eval = verbose_eval,
        seed=seed,callbacks=[xgb.callback.early_stop(early_stop)]
    )

    return cv_result['test-mlogloss-mean'].values[-1]

In [3]:
def CV_st(train,test,feature1,feature2):
    index=list(range(train.shape[0]))
    random.shuffle(index)
    kf = KFold(n_splits=5,shuffle=True, random_state=0)
    
    # median feature names
    features_tmp = []
    features_tmp.append(feature1 + '_' + feature2 + '_low_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_median')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].median().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[0]] = f_low
    train[features_tmp[1]] = f_medium
    train[features_tmp[2]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].median().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[0]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[1]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[2]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    

    # mean feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_mean')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].mean().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[3]] = f_low
    train[features_tmp[4]] = f_medium
    train[features_tmp[5]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].mean().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[3]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[4]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[5]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
    # max feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_max')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].max().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[6]] = f_low
    train[features_tmp[7]] = f_medium
    train[features_tmp[8]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].max().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[6]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[7]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[8]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
    # min feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_min')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].min().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[9]] = f_low
    train[features_tmp[10]] = f_medium
    train[features_tmp[11]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].min().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[9]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[10]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[11]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 

#     # std feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_std') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_std') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_std')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].std().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[12]] = f_low
#     train[features_tmp[13]] = f_medium
#     train[features_tmp[14]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].std().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[12]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[13]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[14]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
#     # var feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_var')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].var().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[15]] = f_low
#     train[features_tmp[16]] = f_medium
#     train[features_tmp[17]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].var().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[15]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[16]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[17]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
#     # ratio/diff feature
#     cols = features_tmp[:]
# #     features_tmp = []
#     for col in cols:
#         new_feature = col+'_ratio'
#         train[new_feature] = train[col] / train[feature2]
#         test[new_feature] = test[col] / test[feature2]
#         features_tmp.append(new_feature)
        
    print feature1,' vs ', feature2,'Done!'
    return train,test,features_tmp

In [33]:
#input data
train_df=pd.read_json('../input/train.json').reset_index(drop = True)
test_df=pd.read_json('../input/test.json').reset_index(drop = True)
test_df.loc[test_df.bathrooms == 112.0,'bathrooms'] = 1.5    
test_df.loc[test_df.bathrooms == 20.0,'bathrooms'] = 2.0
test_df.loc[test_df.listing_id == 7220763,'bedrooms'] = 3
test_df.loc[test_df.listing_id == 7047074,'bedrooms'] = 6
print train_df.shape
print test_df.shape

(49352, 15)
(74659, 14)


In [34]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["num_photo_count"] = df["photos"].apply(len)
    df["street_address"] = df['street_address'].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    df["num_desc_wordcount"] = df["description"].apply(len)
    df["num_pricePerBed"] = df['price'] / df['bedrooms']
    df["num_pricePerBath"] = df['price'] / df['bathrooms']
    df["num_pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["num_bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["num_bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["num_bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["num_bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])

    df = df.fillna(-1).replace(np.inf, -1)
    return df

# Add common features
train_df = add_features(train_df)
test_df = add_features(test_df) 


# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df['num_desc_length_null'] = (train_df.description.str.len()==0).astype(float)
test_df['num_desc_length_null'] = (test_df.description.str.len()==0).astype(float)
    
features_to_use=[
    "latitude", "longitude","num_pricePerBed",
    'num_bedBathSum','num_pricePerBath','num_pricePerRoom','num_bedPerBath',
    'num_bedBathDiff','num_bedsPerc',
    "num_photo_count", "num_features", "num_desc_wordcount",'num_desc_length_null',
    "listing_id"]

print 'Done!'

Done!


In [35]:
# Location features: Latitude, longitude
precision = 3
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2)
train_df['num_dist_from_center'] = x.values
x = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2)
test_df['num_dist_from_center'] = x.values
train_df['position'] = train_df.longitude.round(precision).astype(str) + '_' + train_df.latitude.round(precision).astype(str)
test_df['position'] = test_df.longitude.round(precision).astype(str) + '_' + test_df.latitude.round(precision).astype(str)

new_feature = ['num_dist_from_center']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
print 'Done!'

Done!


In [36]:
# Degree of "outlierness"
OutlierAggregated = (train_df.bedrooms > 4).astype(float)
OutlierAggregated2 = (test_df.bedrooms > 4).astype(float)
OutlierAggregated += (train_df.bathrooms > 3).astype(float)
OutlierAggregated2 += (test_df.bathrooms > 3).astype(float)
OutlierAggregated += (train_df.bathrooms < 1).astype(float)
OutlierAggregated2 += (test_df.bathrooms < 1).astype(float)
x = np.abs((train_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.abs((test_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated2 += x2.astype(float)
x = np.log1p(train_df.price/(train_df.bedrooms.clip(1,3) + train_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated += x.astype(float)
x2 = np.log1p(test_df.price/(test_df.bedrooms.clip(1,3) + test_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated2 += x2.astype(float)
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated2 += x2.astype(float)
train_df['num_OutlierAggregated'] = OutlierAggregated.values
test_df['num_OutlierAggregated'] = OutlierAggregated2.values


new_feature = ['num_OutlierAggregated']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [37]:
# Density in unique locations at given precision
vals = train_df['position'].value_counts()
dvals = vals.to_dict()
train_df['num_pos_density'] = train_df['position'].apply(lambda x: dvals.get(x, vals.min()))
test_df['num_pos_density'] = test_df['position'].apply(lambda x: dvals.get(x, vals.min()))

# Building null
train_df['num_building_null'] = (train_df.building_id=='0').astype(float)
test_df['num_building_null'] = (test_df.building_id=='0').astype(float)


new_feature = ['num_pos_density','num_building_null']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [38]:
# Creation time features
train_df['created'] = pd.to_datetime(train_df.created)
train_df['num_created_weekday'] = train_df.created.dt.dayofweek.astype(float)
train_df['num_created_weekofyear'] = train_df.created.dt.weekofyear
train_df['num_created_day'] = train_df.created.dt.day
train_df['num_created_month'] = train_df.created.dt.month
train_df['num_created_hour'] = train_df.created.dt.hour
  
test_df['created'] = pd.to_datetime(test_df.created)
test_df['num_created_weekday'] = test_df.created.dt.dayofweek
test_df['num_created_weekofyear'] = test_df.created.dt.weekofyear
test_df['num_created_day'] = test_df.created.dt.day
test_df['num_created_month'] = test_df.created.dt.month
test_df['num_created_hour'] = test_df.created.dt.hour


new_feature = ['num_created_weekday','num_created_weekofyear','num_created_day','num_created_month','num_created_hour']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [39]:
# Bedrooms/Bathrooms/Price
train_df['num_bathrooms'] = train_df.bathrooms.clip_upper(4)
test_df['num_bathrooms'] = test_df.bathrooms.clip_upper(4)

train_df['num_bedrooms'] = train_df.bedrooms.clip_upper(5)
test_df['num_bedrooms'] = test_df.bedrooms.clip_upper(5)

train_df['num_price'] = train_df.price.clip_upper(10000)
test_df['num_price'] = test_df.price.clip_upper(10000)

bins = train_df.price.quantile(np.arange(0.05, 1, 0.05))
train_df['num_price_q'] = np.digitize(train_df.price, bins)
test_df['num_price_q'] = np.digitize(test_df.price, bins)


new_feature = ['num_bathrooms','num_bedrooms','num_price','num_price_q']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [40]:
# Composite features based on: 
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
train_df['num_priceXroom'] = (train_df.price / (1 + train_df.bedrooms.clip(1, 4) + 0.5*train_df.bathrooms.clip(0, 2))).values
test_df['num_priceXroom'] = (test_df.price / (1 + test_df.bedrooms.clip(1, 4) + 0.5*test_df.bathrooms.clip(0, 2))).values

train_df['num_even_bathrooms'] = ((np.round(train_df.bathrooms) - train_df.bathrooms)==0).astype(float)
test_df['num_even_bathrooms'] = ((np.round(test_df.bathrooms) - test_df.bathrooms)==0).astype(float)

new_feature = ['num_priceXroom','num_even_bathrooms']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

print 'Done!'

Done!


In [41]:
categorical = ["display_address", "manager_id", "building_id", "street_address",'position']
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            if f not in features_to_use:
                features_to_use.append(f)

In [42]:
dftemp = train_df.copy()
for i in ['latitude', 'longitude']:
    while(1):
        x = dftemp[i].median()
        ix = abs(dftemp[i] - x) > 3*dftemp[i].std()
        if ix.sum()==0:
            break
        dftemp.loc[ix, i] = np.nan
dftemp = dftemp.loc[dftemp[['latitude', 'longitude']].isnull().sum(1) == 0, :]

dfm = DataFrameMapper([(['latitude'], [StandardScaler()]), (['longitude'], [StandardScaler()])])

for i in [6]:
    pipe_location = make_pipeline(dfm, KMeans(n_clusters=i, random_state=1))
    pipe_location.fit(dftemp);
    train_df['location_'+str(i)] = pipe_location.predict(train_df).astype(str)
    test_df['location_'+str(i)] = pipe_location.predict(test_df).astype(str)
for i in train_df.location_6.unique():
    f = 'num_location_6_'+str(i)
    train_df[f] = (train_df.location_6==i).astype(float)
    test_df[f] = (test_df.location_6==i).astype(float)
    if f not in features_to_use:
        features_to_use.append(f)
    
    
train_df['tmp_bathrooms'] = train_df.bathrooms.clip_upper(2)
test_df['tmp_bathrooms'] = test_df.bathrooms.clip_upper(2)
train_df['tmp_bedrooms'] = train_df.bedrooms.clip_upper(4)
test_df['tmp_bedrooms'] = test_df.bedrooms.clip_upper(4)
train_df['roomcal'] = train_df.tmp_bedrooms.astype(str) + '_' + train_df.tmp_bathrooms.astype(str)    
test_df['roomcal'] = test_df.tmp_bedrooms.astype(str) + '_' + test_df.tmp_bathrooms.astype(str)    

room_lb = LabelBinarizer()
room_lb.fit(train_df['roomcal'])
room_col = ['num_room_type_' + str(x) for x in range(len(train_df['roomcal'].unique()))]
for f in room_col:
    if f not in features_to_use:
        features_to_use.append(f)

train_df = train_df.join(pd.DataFrame(room_lb.transform(train_df['roomcal']),columns=room_col,index=train_df.index))
test_df = test_df.join(pd.DataFrame(room_lb.transform(test_df['roomcal']),columns=room_col,index=test_df.index))

tmp = train_df.groupby(['roomcal','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price'})
    
train_df = train_df.merge(tmp,on=['roomcal','location_6'],how='left')
test_df = test_df.merge(tmp,on=['roomcal','location_6'],how='left')

test_df.loc[27462,'num_6_median_price'] =  7200.0

train_df['num_6_price_ratio'] = train_df['num_price'] / train_df['num_6_median_price']
train_df['num_6_price_diff'] = train_df['num_price'] - train_df['num_6_median_price']
test_df['num_6_price_ratio'] = test_df['num_price'] / test_df['num_6_median_price']
test_df['num_6_price_diff'] = test_df['num_price'] - test_df['num_6_median_price']


for f in ['num_6_median_price','num_6_price_ratio','num_6_price_diff']:
    if f not in features_to_use:
        features_to_use.append(f)
        
        
print 'Done!'

Done!


In [43]:
tmp = train_df.groupby(['num_bedrooms','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price_bedroom'})
    
train_df = train_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')
test_df = test_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')

train_df['num_6_price_ratio_bedroom'] = train_df['num_price'] / train_df['num_6_median_price_bedroom']
train_df['num_6_price_diff_bedroom'] = train_df['num_price'] - train_df['num_6_median_price_bedroom']
test_df['num_6_price_ratio_bedroom'] = test_df['num_price'] / test_df['num_6_median_price_bedroom']
test_df['num_6_price_diff_bedroom'] = test_df['num_price'] - test_df['num_6_median_price_bedroom']


for f in ['num_6_median_price_bedroom','num_6_price_ratio_bedroom','num_6_price_diff_bedroom']:
    if f not in features_to_use:
        features_to_use.append(f)

In [44]:
def create_binary_features(df):
    bows = {
        "dogs": ("dogs", "dog",'pet friendly','pets'),
        "cats": ("cats",'pet friendly','pets'),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",'equipped'),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included"),
        'fireplace': ('fireplace','fireplaces'),
        'elevator': ('elevator'),
        'pool':('pool'),
        'loft':('loft'),
        'luxury':('luxury','valet'),
        'marble':('marble'),
        'onemounthfree': ('1 month free','one month free'),
        'washer':('washer','dryer')
    }

    def indicator(bow):
        return lambda s: int(any([x in s for x in bow]))

    features = df["features"].apply(lambda f: " ".join(f).lower())   # convert features to string
    for key in bows:
        tmp_key = "feature_" + key
        df[tmp_key] = features.apply(indicator(bows[key]))
        if tmp_key not in features_to_use:
            features_to_use.append(tmp_key)
    return df

# Create binarized features
train_df = create_binary_features(train_df)
test_df = create_binary_features(test_df)


In [45]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')

print train_X_0322.shape
print test_X_0322.shape

(49352, 322)
(74659, 322)


In [46]:
sentiment = [
#     'building_id_mean_med','building_id_mean_high', 
#     'manager_id_mean_med','manager_id_mean_high',
    'median_price_bed', 'ratio_bed',
       'compound', 'neg', 'neu', 'pos', 'street',
       'avenue', 'east', 'west', 'north', 'south', 'other_address',
       'Zero_building_id', 'top_10_building', 'top_25_building',
       'top_5_building', 'top_50_building', 'top_1_building',
       'top_2_building', 'top_15_building', 'top_20_building',
       'top_30_building','listing_id'
]

train_df = train_df.merge(train_X_0322[sentiment],on='listing_id', how='left')
test_df = test_df.merge(test_X_0322[sentiment],on='listing_id', how='left')

for f in sentiment:
    if f not in features_to_use:
        features_to_use.append(f)

In [47]:
train_df.isnull().values.any()

False

In [48]:
test_df.isnull().values.any()

False

# CV statistics

In [49]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [28]:
# index=list(range(train_df.shape[0]))
# random.shuffle(index)
# a=[np.nan]*len(train_df)
# b=[np.nan]*len(train_df)
# c=[np.nan]*len(train_df)

# for i in range(5):
#     building_level={}
#     for j in train_df['building_id'].values:
#         building_level[j]=[0,0,0]
#     test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
#     train_index=list(set(index).difference(test_index))
#     for j in train_index:
#         temp=train_df.iloc[j]
#         if temp['interest_level']=='low':
#             building_level[temp['building_id']][0]+=1
#         if temp['interest_level']=='medium':
#             building_level[temp['building_id']][1]+=1
#         if temp['interest_level']=='high':
#             building_level[temp['building_id']][2]+=1
#     for j in test_index:
#         temp=train_df.iloc[j]
#         if sum(building_level[temp['building_id']])!=0:
#             a[j]=building_level[temp['building_id']][0]*1.0/sum(building_level[temp['building_id']])
#             b[j]=building_level[temp['building_id']][1]*1.0/sum(building_level[temp['building_id']])
#             c[j]=building_level[temp['building_id']][2]*1.0/sum(building_level[temp['building_id']])
# train_df['building_level_low']=a
# train_df['building_level_medium']=b
# train_df['building_level_high']=c



# a=[]
# b=[]
# c=[]
# building_level={}
# for j in train_df['building_id'].values:
#     building_level[j]=[0,0,0]
# for j in range(train_df.shape[0]):
#     temp=train_df.iloc[j]
#     if temp['interest_level']=='low':
#         building_level[temp['building_id']][0]+=1
#     if temp['interest_level']=='medium':
#         building_level[temp['building_id']][1]+=1
#     if temp['interest_level']=='high':
#         building_level[temp['building_id']][2]+=1

# for i in test_df['building_id'].values:
#     if i not in building_level.keys():
#         a.append(np.nan)
#         b.append(np.nan)
#         c.append(np.nan)
#     else:
#         a.append(building_level[i][0]*1.0/sum(building_level[i]))
#         b.append(building_level[i][1]*1.0/sum(building_level[i]))
#         c.append(building_level[i][2]*1.0/sum(building_level[i]))
# test_df['building_level_low']=a
# test_df['building_level_medium']=b
# test_df['building_level_high']=c

# features_to_use.append('building_level_low') 
# features_to_use.append('building_level_medium') 
# features_to_use.append('building_level_high')

In [57]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','price')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  price Done!


In [75]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_created_hour')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_created_hour Done!


In [106]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff_bedroom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)        

manager_id  vs  num_6_price_diff_bedroom Done!


In [115]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','bedrooms')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)    

manager_id  vs  bedrooms Done!


In [151]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_photo_count')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_photo_count Done!


In [176]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','Zero_building_id')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  Zero_building_id Done!


In [183]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','top_25_building')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  top_25_building Done!


In [189]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','feature_nofee')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  feature_nofee Done!


In [193]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','longitude')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  longitude Done!


In [197]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','latitude')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  latitude Done!


# --------------------------------------

In [188]:
for f in new_feature:
    features_to_use.remove(f)

In [170]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerRoom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_pricePerRoom Done!


In [162]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_features')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_features Done!


In [96]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_6_price_diff_bedroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)  

building_id  vs  num_6_price_diff_bedroom Done!


In [24]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_created_hour')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

building_id  vs  num_created_hour Done!


In [156]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','ratio_bed')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  ratio_bed Done!


In [134]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)  

manager_id  vs  num_6_price_diff Done!


In [145]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_priceXroom Done!


In [128]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','pos')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  pos Done!


In [122]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','bathrooms')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  bathrooms Done!


In [104]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_ratio_bedroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)        

manager_id  vs  num_6_price_ratio_bedroom Done!


In [80]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_desc_wordcount')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_desc_wordcount Done!


In [69]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_dist_from_center')

# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_dist_from_center Done!


In [63]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','price')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)  

building_id  vs  price Done!


In [42]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_6_price_diff Done!


In [70]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','building_id_mean_med')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  building_id_mean_med Done!


In [71]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','building_id_mean_high')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  building_id_mean_high Done!


In [76]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','manager_id_mean_med')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

building_id  vs  manager_id_mean_med Done!


In [77]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','manager_id_mean_high')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

building_id  vs  manager_id_mean_high Done!


In [104]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'feature_nofee','num_price')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

feature_nofee  vs  num_price Done!


In [97]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'num_building_null','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

In [78]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'building_id','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

building_id  vs  num_priceXroom Done!


In [67]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','feature_nofee')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  feature_nofee Done!


In [51]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','manager_id_mean_med')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)    

manager_id  vs  manager_id_mean_med Done!


In [42]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'position','price')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

position  vs  price Done!


In [146]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','median_price_bed')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)        

manager_id  vs  median_price_bed Done!


In [140]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerBed')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_pricePerBed Done!


In [133]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pos_density')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_pos_density Done!


In [112]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_ratio')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_6_price_ratio Done!


In [96]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff_bedroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_6_price_diff_bedroom Done!


In [39]:
# train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_priceXroom')
# for f in new_feature:
#     if f not in features_to_use:
#         features_to_use.append(f)

manager_id  vs  num_priceXroom Done!


# Val

In [198]:
features_to_use[-14:]

['manager_id_longitude_medium_min',
 'manager_id_longitude_high_min',
 'manager_id_latitude_low_median',
 'manager_id_latitude_medium_median',
 'manager_id_latitude_high_median',
 'manager_id_latitude_low_mean',
 'manager_id_latitude_medium_mean',
 'manager_id_latitude_high_mean',
 'manager_id_latitude_low_max',
 'manager_id_latitude_medium_max',
 'manager_id_latitude_high_max',
 'manager_id_latitude_low_min',
 'manager_id_latitude_medium_min',
 'manager_id_latitude_high_min']

In [199]:
train_X = train_df[features_to_use]
test_X = test_df[features_to_use]

train_X.replace(np.inf, np.nan)
test_X.replace(np.inf, np.nan)

train_X.loc[:,'num_nan'] = train_X.isnull().sum(axis=1)
test_X.loc[:,'num_nan'] = test_X.isnull().sum(axis=1)

target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print train_X.shape, test_X.shape 

(49352, 222) (74659, 222)


In [200]:
now = time.time()
print cv_train(train_X,train_y,verbose_eval = 10, early_stop = 20)
print '\nTraining :{:0.2f}s'.format(time.time() - now)

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[0]	train-mlogloss:0.907156+0.000620376	test-mlogloss:0.912654+0.000696885
[10]	train-mlogloss:0.533056+0.00131796	test-mlogloss:0.575999+0.00303284
[20]	train-mlogloss:0.472661+0.00146713	test-mlogloss:0.547372+0.0042181
[30]	train-mlogloss:0.434887+0.00160214	test-mlogloss:0.537361+0.00440458
[40]	train-mlogloss:0.406874+0.0019503	test-mlogloss:0.533256+0.00486721
[50]	train-mlogloss:0.381179+0.00233676	test-mlogloss:0.530998+0.00515014
[60]	train-mlogloss:0.3616+0.00254957	test-mlogloss:0.529678+0.00591926
[70]	train-mlogloss:0.341874+0.00284534	test-mlogloss:0.529622+0.00651918
[80]	train-mlogloss:0.323864+0.00320415	test-mlogloss:0.529633+0.00723308
[90]	train-mlogloss:0.307327+0.00272611	test-mlogloss:0.530297+0.00755162
Stopping. Best iteration:
[72]	train-mlogloss:0.338725+0.00259565	test-mlogloss:0.529448+0.00668102

0.5294482



In [None]:
# [80]	train-mlogloss:0.382361+0.00375901	test-mlogloss:0.551796+0.00869628 no cv feature
# [68]	train-mlogloss:0.375532+0.00356265	test-mlogloss:0.536175+0.00677244 manager_level
# [63]	train-mlogloss:0.374778+0.0033805	test-mlogloss:0.532757+0.00591715 manager_id_price
# [74]	train-mlogloss:0.351646+0.00247557	test-mlogloss:0.531748+0.0057647 manager_id_num_created_hour
# [75]	train-mlogloss:0.343642+0.00376395	test-mlogloss:0.530804+0.00643943 manager_id_num_6_price_diff_bedroom
# [64]	train-mlogloss:0.3622+0.00462603	test-mlogloss:0.52977+0.00690296 manager_id_bedrooms
# [70]	train-mlogloss:0.348793+0.00420376	test-mlogloss:0.529611+0.00564118 manager_id_num_photo_count
# [77]	train-mlogloss:0.332559+0.00268203	test-mlogloss:0.529099+0.00596648 manager_id_Zero_building_id
# [72]	train-mlogloss:0.342933+0.00285729	test-mlogloss:0.529299+0.00573414 manager_id_feature_nofee
# [77]	train-mlogloss:0.332598+0.00263115	test-mlogloss:0.528572+0.00725079 manager_id_longitude


# del
# [76]	train-mlogloss:0.337109+0.00469833	test-mlogloss:0.529907+0.00624846 manager_id_top_25_building
# [61]	train-mlogloss:0.362698+0.00230359	test-mlogloss:0.531051+0.00675724 manager_id_num_pricePerRoom
# [75]	train-mlogloss:0.337831+0.00452436	test-mlogloss:0.531305+0.00772622 manager_id_num_features
# [79]	train-mlogloss:0.331988+0.00347054	test-mlogloss:0.530181+0.00789851 manager_id_ratio_bed
# [69]	train-mlogloss:0.349364+0.00295152	test-mlogloss:0.531117+0.0070665 manager_id_num_priceXroom
# [78]	train-mlogloss:0.334089+0.00247944	test-mlogloss:0.530873+0.00635053 manager_id_num_6_price_diff
# [73]	train-mlogloss:0.341803+0.00279442	test-mlogloss:0.530613+0.00647791 manager_id_pos
# [73]	train-mlogloss:0.347647+0.0021376	test-mlogloss:0.53096+0.00579744 manager_id_bathrooms
# [71]	train-mlogloss:0.348451+0.00293866	test-mlogloss:0.53033+0.0060853 manager_id_num_bedrooms
# [67]	train-mlogloss:0.356988+0.00434169	test-mlogloss:0.531804+0.00627801 manager_id_num_6_price_ratio_bedroom
# [74]	train-mlogloss:0.347034+0.00154416	test-mlogloss:0.532934+0.00687755 manager_id_num_desc_wordcount
# [64]	train-mlogloss:0.36658+0.00283495	test-mlogloss:0.532886+0.00594258 manager_id_num_dist_from_center
# [68]	train-mlogloss:0.356656+0.0036973	test-mlogloss:0.533344+0.0061344 building_id_price
# [72]	train-mlogloss:0.361483+0.00316415	test-mlogloss:0.536714+0.00584744 building_level
# [67]	train-mlogloss:0.373729+0.00305937	test-mlogloss:0.536255+0.00708668 'building_id_mean_med','building_id_mean_high'

In [None]:
# [81]	train-mlogloss:0.354253+0.00211099	test-mlogloss:0.536977+0.00608719 no cv feature
# [65]	train-mlogloss:0.367155+0.00211633	test-mlogloss:0.534893+0.0062382 price
# [72]	train-mlogloss:0.346812+0.00328497	test-mlogloss:0.53305+0.00688662 building_id  vs  price
# [64]	train-mlogloss:0.357396+0.00288563	test-mlogloss:0.531704+0.00625184 num_dist_from_center
# [76]	train-mlogloss:0.331548+0.00150892	test-mlogloss:0.530707+0.00719767 num_created_hour
# [78]	train-mlogloss:0.327187+0.00326905	test-mlogloss:0.530522+0.00662889 building_id  vs  num_created_hour
# [69]	train-mlogloss:0.341823+0.0009883	test-mlogloss:0.530539+0.00755474 num_desc_wordcount
# del [64]	train-mlogloss:0.348393+0.00437276	test-mlogloss:0.532399+0.00560396 building_id  vs  num_desc_wordcount
# [78]	train-mlogloss:0.322085+0.00255577	test-mlogloss:0.530573+0.00706681 num_6_price_diff_bedroom
# [76]	train-mlogloss:0.322108+0.00188804	test-mlogloss:0.53018+0.00868863 building_id  vs  num_6_price_diff_bedroom
# [57]	train-mlogloss:0.35853+0.0039214	test-mlogloss:0.530243+0.00654921 bedroom
# del [80]	train-mlogloss:0.313073+0.00177185	test-mlogloss:0.530923+0.00642305 building_id  vs  bedroom
# [80]	train-mlogloss:0.311336+0.00318315	test-mlogloss:0.530211+0.00570428 num_6_price_diff

In [181]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=2016)
rgr = xgb.XGBClassifier(
            objective='multi:softprob',
            seed = 0, # use a fixed seed during tuning so we can reproduce the results
            learning_rate = 0.3,
            n_estimators = 77,
            max_depth= 6,
            nthread = -1,
            colsample_bytree = 0.3,
            subsample =0.7,
            silent = 1
        )
rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         early_stopping_rounds=20,
        verbose=20
    )

[0]	validation_0-mlogloss:0.921154
[20]	validation_0-mlogloss:0.558762
[40]	validation_0-mlogloss:0.544735
[60]	validation_0-mlogloss:0.543177


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=77, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.7)

In [182]:
import xgbfir
xgbfir.saveXgbFI(rgr, feature_names=X_train.columns, OutputXlsxFile = '../FE/FI.xlsx')

# Tune XGBoost

In [53]:
best_score = 1000
for x in [3,4,5,6,7,8,9,10,11,12,13,14,15]:

    tmp = cv_train(train_X,train_y,max_depth = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[312]	train-mlogloss:0.395492+0.00170774	test-mlogloss:0.527702+0.00663205

3 	0.5277018
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[141]	train-mlogloss:0.396861+0.00183942	test-mlogloss:0.527621+0.00736431

4 	0.5276214
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[102]	train-mlogloss:0.357724+0.00219303	test-mlogloss:0.528282+0.00654119

5 	0.5282822
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[57]	train-mlogloss:0.358

In [54]:
# max_depth = train_param
max_depth = train_param
print max_depth

4


In [55]:
train_param = 1
for x in [2,4,8,12,16,20,24,28,32,40,48,64,80,128]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    
    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[186]	train-mlogloss:0.367066+0.00186211	test-mlogloss:0.527145+0.0074838

2 	0.5271452
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[213]	train-mlogloss:0.352706+0.00230145	test-mlogloss:0.526443+0.00732842

4 	0.5264426
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[159]	train-mlogloss:0.390464+0.00279415	test-mlogloss:0.527391+0.00724981

8 	0.5273912
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[177]	train-mlogloss:0.381

In [56]:
min_child_weight = train_param
print min_child_weight

16


In [57]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )

    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, colsample_bytree = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[262]	train-mlogloss:0.395672+0.00192811	test-mlogloss:0.532532+0.00697024

0.05 	0.5325316
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[251]	train-mlogloss:0.38311+0.00308902	test-mlogloss:0.529113+0.00529031

0.1 	0.5291126
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[212]	train-mlogloss:0.387354+0.0026151	test-mlogloss:0.526019+0.00693851

0.2 	0.5260188
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[205]	train-mlogloss

In [58]:
colsample_bytree = train_param
print colsample_bytree

1


In [59]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = colsample_bytree,
#         subsample = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )
    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, 
                   colsample_bytree = colsample_bytree, subsample = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[114]	train-mlogloss:0.429981+0.00163237	test-mlogloss:0.535683+0.0063162

0.5 	0.5356834
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[127]	train-mlogloss:0.417173+0.00106604	test-mlogloss:0.532641+0.0071292

0.6 	0.5326408
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[117]	train-mlogloss:0.422807+0.00248712	test-mlogloss:0.530766+0.00658263

0.7 	0.5307656
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[138]	train-mlogloss:

In [60]:
subsample = train_param
print subsample

1


In [61]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
#     rgr = xgb.XGBClassifier(
#         objective='multi:softprob',
#         seed = 1234, # use a fixed seed during tuning so we can reproduce the results
#         learning_rate = learning_rate,
#         n_estimators = 10000,
#         max_depth= max_depth,
#         nthread = -1,
#         silent = False,
#         min_child_weight = min_child_weight,
#         colsample_bytree = colsample_bytree,
#         subsample = subsample,
#         gamma = x
#     )
#     rgr.fit(
#         X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
#         early_stopping_rounds=50,
#         verbose=False
#     )

    tmp = cv_train(train_X,train_y,max_depth = max_depth,min_child_weight = min_child_weight, 
                   colsample_bytree = colsample_bytree, subsample = subsample, gamma = x)
    if  tmp < best_score:
        best_score = tmp
        train_param = x

    print x, '\t', tmp

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[164]	train-mlogloss:0.393296+0.00247187	test-mlogloss:0.526782+0.00697394

0.3 	0.5267822
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[186]	train-mlogloss:0.378468+0.00338923	test-mlogloss:0.526772+0.00705368

0.6 	0.526772
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[175]	train-mlogloss:0.38681+0.00159871	test-mlogloss:0.526846+0.00696989

0.9 	0.526846
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[167]	train-mlogloss:0

In [62]:
gamma = train_param
print gamma

0


In [201]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma): #
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = 0.99# max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=1234,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (3,10),
        'min_child_weight': (8,80),
        'colsample_bytree': (0.2,1),
        'subsample': (0.7,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[683]	train-mlogloss:0.385311+0.00256115	test-mlogloss:0.524352+0.0086797

    1 | 25m35s | [35m  -0.52435[0m | [32m            0.7361[0m | [32m   0.6388[0m | [32m     4.7833[0m | [32m           42.5554[0m | [32m     0.7012[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[379]	train-mlogloss:0.371498+0.00244822	test-mlogloss:0.523748+0.00712916

    2 | 19m33s | [35m  -0.52375[0m | [32m            0.6535[0m | 

  " state: %s" % convergence_dict)


   12 | 14m22s |   -0.52382 |             0.2361 |    0.3772 |      3.0095 |             8.7551 |      0.9196 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[258]	train-mlogloss:0.353133+0.00235446	test-mlogloss:0.524525+0.00757331

   13 | 30m55s |   -0.52453 |             0.9911 |    2.9391 |      9.9607 |            67.0525 |      0.7844 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[494]	train-mlogloss:0.330845+0.00187125	test-mlogloss:0.521793+0.00796687

   14 | 13m54s | [35m  -0.52179[0m | [32m            0.2144[0m | [32m   2.9898[0m | [32m     8.4786[0m | [32m           18.8402[0m | [32m     0.7721[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train 

  " state: %s" % convergence_dict)


   19 | 26m12s |   -0.52432 |             0.2673 |    2.8450 |      3.8539 |             8.0205 |      0.7101 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[242]	train-mlogloss:0.340356+0.00241948	test-mlogloss:0.524328+0.00751484

   20 | 27m22s |   -0.52433 |             0.9215 |    0.0042 |      9.8197 |            79.6447 |      0.8523 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[267]	train-mlogloss:0.325633+0.00157508	test-mlogloss:0.524159+0.00733517



  " state: %s" % convergence_dict)


   21 | 09m15s |   -0.52416 |             0.2160 |    0.0679 |      9.3548 |            61.0663 |      0.9894 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[238]	train-mlogloss:0.305969+0.00234088	test-mlogloss:0.524582+0.0072658

   22 | 29m34s |   -0.52458 |             0.9920 |    2.8891 |      9.6503 |            21.7810 |      0.9462 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1734]	train-mlogloss:0.438638+0.00255533	test-mlogloss:0.525621+0.00777304



  " state: %s" % convergence_dict)


   23 | 20m54s |   -0.52562 |             0.2447 |    2.8440 |      3.0257 |            34.4976 |      0.7415 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[263]	train-mlogloss:0.326697+0.00184619	test-mlogloss:0.523681+0.00706285



  " state: %s" % convergence_dict)


   24 | 09m14s |   -0.52368 |             0.2120 |    1.9111 |      9.9499 |            44.9476 |      0.7173 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[949]	train-mlogloss:0.416746+0.00235725	test-mlogloss:0.524177+0.00773736



  " state: %s" % convergence_dict)


   25 | 13m23s |   -0.52418 |             0.2044 |    2.9839 |      4.7569 |            23.0357 |      0.9776 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[699]	train-mlogloss:0.351606+0.0013223	test-mlogloss:0.522573+0.00768256



  " state: %s" % convergence_dict)


   26 | 19m27s |   -0.52257 |             0.2011 |    2.9888 |      9.9404 |            75.9973 |      0.7020 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1719]	train-mlogloss:0.443561+0.00269171	test-mlogloss:0.526098+0.00765076

   27 | 21m27s |   -0.52610 |             0.2579 |    2.9177 |      3.2913 |            47.1084 |      0.7582 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[568]	train-mlogloss:0.321001+0.00131872	test-mlogloss:0.522378+0.0079632



  " state: %s" % convergence_dict)


   28 | 16m50s |   -0.52238 |             0.2065 |    2.9887 |      9.5413 |            29.2594 |      0.7878 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[518]	train-mlogloss:0.345841+0.00149309	test-mlogloss:0.522665+0.00721251

   29 | 15m28s |   -0.52266 |             0.2036 |    2.9264 |      9.5995 |            55.4239 |      0.8143 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[285]	train-mlogloss:0.329229+0.00192084	test-mlogloss:0.523928+0.00766047

   30 | 11m14s |   -0.52393 |             0.2617 |    0.0178 |      9.9774 |            72.3185 |      0.9314 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   32 | 16m19s |   -0.52216 |             0.3361 |    2.9502 |      7.0673 |            40.0506 |      0.7166 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1240]	train-mlogloss:0.402477+0.00193444	test-mlogloss:0.525229+0.00842347

   33 | 14m38s |   -0.52523 |             0.2180 |    0.1209 |      3.0021 |            19.2456 |      0.7396 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[717]	train-mlogloss:0.354072+0.00129081	test-mlogloss:0.520578+0.00757638



  " state: %s" % convergence_dict)


   34 | 14m57s | [35m  -0.52058[0m | [32m            0.2072[0m | [32m   2.9495[0m | [32m     6.9237[0m | [32m           13.7162[0m | [32m     0.7173[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[458]	train-mlogloss:0.325628+0.00128807	test-mlogloss:0.52091+0.00684728

   35 | 14m17s |   -0.52091 |             0.2379 |    2.9856 |      8.9818 |            12.2886 |      0.7533 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[554]	train-mlogloss:0.332979+0.00097861	test-mlogloss:0.520635+0.00822586

   36 | 13m44s |   -0.52063 |             0.2088 |    2.8794 |      7.2221 |            10.0298 |      0.7043 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train u

  " state: %s" % convergence_dict)


   37 | 15m05s |   -0.52324 |             0.2022 |    2.9717 |      4.1377 |            13.1347 |      0.7430 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[339]	train-mlogloss:0.324518+0.00134196	test-mlogloss:0.523164+0.00766708

   38 | 31m03s |   -0.52316 |             0.9989 |    2.9543 |      7.0194 |            11.9620 |      0.7634 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[336]	train-mlogloss:0.325949+0.00248427	test-mlogloss:0.522305+0.00839462

   39 | 09m37s |   -0.52231 |             0.2721 |    0.3063 |      6.7152 |             8.0713 |      0.7114 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   40 | 16m19s |   -0.52326 |             0.2027 |    2.5613 |      9.5142 |            79.9297 |      0.7890 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[812]	train-mlogloss:0.400845+0.00256943	test-mlogloss:0.52363+0.00775763



  " state: %s" % convergence_dict)


   41 | 14m37s |   -0.52363 |             0.2135 |    2.8322 |      5.3262 |            63.4206 |      0.7262 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[596]	train-mlogloss:0.370898+0.00193831	test-mlogloss:0.521622+0.00796374

   42 | 12m28s |   -0.52162 |             0.2011 |    2.8887 |      6.5776 |            27.9240 |      0.8063 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[609]	train-mlogloss:0.334307+0.00154035	test-mlogloss:0.520606+0.00790453



  " state: %s" % convergence_dict)


   43 | 14m58s |   -0.52061 |             0.2080 |    2.9926 |      7.0594 |             8.7081 |      0.9898 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[768]	train-mlogloss:0.350951+0.00157895	test-mlogloss:0.5212+0.0079181

   44 | 16m09s |   -0.52120 |             0.2159 |    2.9281 |      6.4299 |            15.7984 |      0.9352 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[459]	train-mlogloss:0.317504+0.0012678	test-mlogloss:0.521366+0.00774674



  " state: %s" % convergence_dict)


   45 | 13m02s |   -0.52137 |             0.2057 |    2.8321 |      8.3789 |            10.7667 |      0.8651 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[565]	train-mlogloss:0.30838+0.00172829	test-mlogloss:0.522001+0.00718506



  " state: %s" % convergence_dict)


   46 | 17m12s |   -0.52200 |             0.2071 |    2.9937 |      9.6745 |            15.4274 |      0.9495 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[408]	train-mlogloss:0.339893+0.00236381	test-mlogloss:0.522586+0.00807417

   47 | 10m50s |   -0.52259 |             0.2536 |    0.0966 |      6.5241 |            33.2196 |      0.7376 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[970]	train-mlogloss:0.380553+0.00163913	test-mlogloss:0.522622+0.00759671



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   48 | 18m57s |   -0.52262 |             0.2013 |    2.9689 |      6.5378 |            70.6606 |      0.7252 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[586]	train-mlogloss:0.319867+0.000900069	test-mlogloss:0.520595+0.00714989

   49 | 19m07s |   -0.52060 |             0.3144 |    2.9352 |      7.2775 |             8.9591 |      0.7007 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[368]	train-mlogloss:0.348874+0.00094506	test-mlogloss:0.521468+0.00740512



  " state: %s" % convergence_dict)


   50 | 10m26s |   -0.52147 |             0.2218 |    2.7296 |      7.8701 |            14.6104 |      0.7245 | 


In [202]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
23,6.923701,13.716162,0.207217,0.71732,2.949494,-0.520578
38,7.277497,8.959092,0.314392,0.700725,2.935174,-0.520595
32,7.059368,8.708084,0.208029,0.98982,2.99259,-0.520606
25,7.222086,10.029754,0.208806,0.704291,2.879361,-0.520635
7,8.557833,25.544802,0.250313,0.706447,2.988536,-0.520883
24,8.981806,12.288637,0.237923,0.753265,2.985601,-0.52091
33,6.429874,15.798436,0.215938,0.935211,2.928114,-0.5212
34,8.378914,10.766739,0.205679,0.865052,2.832059,-0.521366
39,7.870134,14.610424,0.221777,0.724519,2.729583,-0.521468
31,6.577559,27.923958,0.201063,0.806311,2.888736,-0.521622


In [203]:
train_X.to_csv(data_path + 'train_CV_MS_52571.csv',index=False)
test_X.to_csv(data_path + 'test_CV_MS_52571.csv',index=False)

In [204]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=1234)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [206]:
estimators = [
             xgb.XGBClassifier(max_depth = 6,
                              min_child_weight = 13,
                              colsample_bytree = 0.207217 ,
                              subsample = 0.99 ,
                              gamma = 2.949494),
             xgb.XGBClassifier(max_depth = 7,
                              min_child_weight = 8,
                              colsample_bytree = 0.314392,
                              subsample = 0.99,
                              gamma = 2.935174),
             xgb.XGBClassifier(max_depth = 7,
                              min_child_weight = 8,
                              colsample_bytree = 0.208029,
                              subsample = 0.99,
                              gamma = 2.992590),         
             xgb.XGBClassifier(max_depth = 7,
                              min_child_weight = 10,
                              colsample_bytree = 0.208806,
                              subsample = 0.99,
                              gamma = 2.879361),  
             xgb.XGBClassifier(max_depth = 8,
                              min_child_weight = 25,
                              colsample_bytree = 0.250313,
                              subsample = 0.99,
                              gamma = 2.988536)              
             ]

#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	score
# 23 	6.923701 	13.716162 	 	 	0.207217 	 	 	0.717320 	2.949494 	-0.520578
# 38 	7.277497 	8.959092 	 	 	0.314392 	 	 	0.700725 	2.935174 	-0.520595
# 32 	7.059368 	8.708084 	 	 	0.208029 	 	 	0.989820 	2.992590 	-0.520606
# 25 	7.222086 	10.029754 	 	 	0.208806 	 	 	0.704291 	2.879361 	-0.520635
# 7 	8.557833 	25.544802 	 	 	0.250313 	 	 	0.706447 	2.988536 	-0.520883


(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              10,
                              500)


Blend 5 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.207217,
       gamma=2.949494, learning_rate=0.02, max_delta_step=0, max_depth=6,
       min_child_weight=13, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.99)
Model 1 fold 1
best round 4052
('Score: ', 0.52759085893433222)
Model 1 fold 1 fitting finished in 1156.973s
Model 1 fold 2
best round 3687
('Score: ', 0.51287864554181262)
Model 1 fold 2 fitting finished in 1061.310s
Model 1 fold 3
best round 6012
('Score: ', 0.50368627371380925)
Model 1 fold 3 fitting finished in 1626.184s
Model 1 fold 4
best round 5425
('Score: ', 0.51214417568970272)
Model 1 fold 4 fitting finished in 1469.969s
Model 1 fold 5
best round 5905
('Score: ', 0.49936669306178966)
Model 1 fold 5 fitting finished in 1587.994s
Model 1 fold 6
best round 2615
('Score: ', 0.517199549583216

In [209]:
# train_blend_x_xgb = pd.DataFrame(train_blend_x_xgb)
train_blend_x_xgb.columns = ["low0", "medium0", "high0","low1", "medium1", "high1",
                             "low2", "medium2", "high2","low3", "medium3", "high3",
                             "low4", "medium4", "high4",]
train_blend_x_xgb["listing_id"] = train_X.listing_id.values

test_blend_x_xgb_mean = pd.DataFrame(test_blend_x_xgb_mean)
test_blend_x_xgb_mean.columns = ["low0", "medium0", "high0","low1", "medium1", "high1",
                             "low2", "medium2", "high2","low3", "medium3", "high3",
                             "low4", "medium4", "high4",]
test_blend_x_xgb_mean["listing_id"] = test_X.listing_id.values

test_blend_x_xgb_gmean = pd.DataFrame(test_blend_x_xgb_gmean)
test_blend_x_xgb_gmean.columns = ["low0", "medium0", "high0","low1", "medium1", "high1",
                             "low2", "medium2", "high2","low3", "medium3", "high3",
                             "low4", "medium4", "high4",]
test_blend_x_xgb_gmean["listing_id"] = test_X.listing_id.values

In [210]:
tmp_train = train_X_0322[['listing_id']].merge(train_blend_x_xgb,on = 'listing_id', how = 'left')[["low0", "medium0", "high0","low1", "medium1", "high1",
                             "low2", "medium2", "high2","low3", "medium3", "high3",
                             "low4", "medium4", "high4",]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_mean,on = 'listing_id', how = 'left')[["low0", "medium0", "high0","low1", "medium1", "high1",
                             "low2", "medium2", "high2","low3", "medium3", "high3",
                             "low4", "medium4", "high4",]].values
tmp_test_gmean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_gmean,on = 'listing_id', how = 'left')[["low0", "medium0", "high0","low1", "medium1", "high1",
                             "low2", "medium2", "high2","low3", "medium3", "high3",
                             "low4", "medium4", "high4",]].values

In [211]:
from datetime import datetime
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_CV_MS_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_CV_MS_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_CV_MS_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,tmp_test_gmean, delimiter=",")

[ 0.51577033  0.51496813  0.51500373  0.51514879  0.51510358]
[ 4310.4  3456.5  3890.8  3474.2  3226. ]


In [212]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_CV_MS222_10blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean[:,3:6])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_0322.listing_id.values
out_df.to_csv(sub_name, index=False)