In [59]:
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
import xgboost as xgb
from itertools import product
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn import model_selection,ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
import random
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer, MultiLabelBinarizer,LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def cv_train(train,y):
    xgtrain = xgb.DMatrix(train, label=y)
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=0
    params['eta'] = 0.3
    params['verbose_eval'] = False
    params['max_depth'] = 6


    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=0,callbacks=[xgb.callback.early_stop(20)]
    )

    return cv_result['test-mlogloss-mean'].values[-1]

In [20]:
def CV_st(train,test,feature1,feature2):
    index=list(range(train.shape[0]))
    random.shuffle(index)
    kf = KFold(n_splits=5,shuffle=True, random_state=0)
    
    # median feature names
    features_tmp = []
    features_tmp.append(feature1 + '_' + feature2 + '_low_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_median') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_median')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].median().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[0]] = f_low
    train[features_tmp[1]] = f_medium
    train[features_tmp[2]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].median().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[0]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[1]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[2]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    

    # mean feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_mean') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_mean')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].mean().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[3]] = f_low
    train[features_tmp[4]] = f_medium
    train[features_tmp[5]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].mean().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[3]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[4]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[5]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
    # max feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_max') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_max')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].max().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[6]] = f_low
    train[features_tmp[7]] = f_medium
    train[features_tmp[8]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].max().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[6]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[7]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[8]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
    # min feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_min') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_min')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].min().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[9]] = f_low
    train[features_tmp[10]] = f_medium
    train[features_tmp[11]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].min().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[9]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[10]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[11]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 

    # std feature names
    features_tmp.append(feature1 + '_' + feature2 + '_low_std') 
    features_tmp.append(feature1 + '_' + feature2 + '_medium_std') 
    features_tmp.append(feature1 + '_' + feature2 + '_high_std')
    
    # train data 
    f_low=pd.Series([np.nan]*len(train))
    f_medium=pd.Series([np.nan]*len(train))
    f_high=pd.Series([np.nan]*len(train))

    for train_index, test_index in kf.split(index):
        tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].std().\
                reset_index().rename(columns={feature2:'new'})
        f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
                                                            on=feature1,how='left')['new'].values
        f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
                                                            on=feature1,how='left')['new'].values
        f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
                                                            on=feature1,how='left')['new'].values   
    train[features_tmp[12]] = f_low
    train[features_tmp[13]] = f_medium
    train[features_tmp[14]] = f_high

    # test data
    tmp = train.groupby(['interest_level',feature1])[feature2].std().\
            reset_index().rename(columns={feature2:'new'})    
    test[features_tmp[12]] = test.merge(tmp[tmp.interest_level == 'low'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[13]] = test.merge(tmp[tmp.interest_level == 'medium'],
                                       on=feature1,how='left')['new'].values
    test[features_tmp[14]] = test.merge(tmp[tmp.interest_level == 'high'],
                                       on=feature1,how='left')['new'].values 
    
#     # var feature names
#     features_tmp.append(feature1 + '_' + feature2 + '_low_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_medium_var') 
#     features_tmp.append(feature1 + '_' + feature2 + '_high_var')
    
#     # train data 
#     f_low=pd.Series([np.nan]*len(train))
#     f_medium=pd.Series([np.nan]*len(train))
#     f_high=pd.Series([np.nan]*len(train))

#     for train_index, test_index in kf.split(index):
#         tmp = train.iloc[train_index].groupby(['interest_level',feature1])[feature2].var().\
#                 reset_index().rename(columns={feature2:'new'})
#         f_low[test_index]    = train.iloc[test_index].merge(tmp[tmp.interest_level == 'low'],
#                                                             on=feature1,how='left')['new'].values
#         f_medium[test_index] = train.iloc[test_index].merge(tmp[tmp.interest_level == 'medium'],
#                                                             on=feature1,how='left')['new'].values
#         f_high[test_index]   = train.iloc[test_index].merge(tmp[tmp.interest_level == 'high'],
#                                                             on=feature1,how='left')['new'].values   
#     train[features_tmp[15]] = f_low
#     train[features_tmp[16]] = f_medium
#     train[features_tmp[17]] = f_high

#     # test data
#     tmp = train.groupby(['interest_level',feature1])[feature2].var().\
#             reset_index().rename(columns={feature2:'new'})    
#     test[features_tmp[15]] = test.merge(tmp[tmp.interest_level == 'low'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[16]] = test.merge(tmp[tmp.interest_level == 'medium'],
#                                        on=feature1,how='left')['new'].values
#     test[features_tmp[17]] = test.merge(tmp[tmp.interest_level == 'high'],
#                                        on=feature1,how='left')['new'].values 
    
    # ratio/diff feature
    cols = features_tmp[:]
#     features_tmp = []
    for col in cols:
        new_feature = col+'_ratio'
        train[new_feature] = train[col] / train[feature2]
        test[new_feature] = test[col] / test[feature2]
        features_tmp.append(new_feature)
        
    print feature1,' vs ', feature2,'Done!'
    return train,test,features_tmp

In [4]:
#input data
train_df=pd.read_json('../input/train.json').reset_index(drop = True)
test_df=pd.read_json('../input/test.json').reset_index(drop = True)
test_df.loc[test_df.bathrooms == 112.0,'bathrooms'] = 1.5    
test_df.loc[test_df.bathrooms == 20.0,'bathrooms'] = 2.0
test_df.loc[test_df.listing_id == 7220763,'bedrooms'] = 3
test_df.loc[test_df.listing_id == 7047074,'bedrooms'] = 6
print train_df.shape
print test_df.shape

(49352, 15)
(74659, 14)


In [5]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["num_photo_count"] = df["photos"].apply(len)
    df["street_address"] = df['street_address'].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    df["num_desc_wordcount"] = df["description"].apply(len)
    df["num_pricePerBed"] = df['price'] / df['bedrooms']
    df["num_pricePerBath"] = df['price'] / df['bathrooms']
    df["num_pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["num_bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["num_bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["num_bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["num_bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])

    df = df.fillna(-1).replace(np.inf, -1)
    return df

# Add common features
train_df = add_features(train_df)
test_df = add_features(test_df) 


# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df['num_desc_length_null'] = (train_df.description.str.len()==0).astype(float)
test_df['num_desc_length_null'] = (test_df.description.str.len()==0).astype(float)
    
features_to_use=[
    "latitude", "longitude","num_pricePerBed",
    'num_bedBathSum','num_pricePerBath','num_pricePerRoom','num_bedPerBath',
    'num_bedBathDiff','num_bedsPerc',
    "num_photo_count", "num_features", "num_desc_wordcount",'num_desc_length_null',
    "listing_id"]

print 'Done!'

Done!


In [6]:
# Location features: Latitude, longitude
precision = 3
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2)
train_df['num_dist_from_center'] = x.values
x = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2)
test_df['num_dist_from_center'] = x.values
train_df['position'] = train_df.longitude.round(precision).astype(str) + '_' + train_df.latitude.round(precision).astype(str)
test_df['position'] = test_df.longitude.round(precision).astype(str) + '_' + test_df.latitude.round(precision).astype(str)

new_feature = ['num_dist_from_center']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
print 'Done!'

Done!


In [7]:
# Degree of "outlierness"
OutlierAggregated = (train_df.bedrooms > 4).astype(float)
OutlierAggregated2 = (test_df.bedrooms > 4).astype(float)
OutlierAggregated += (train_df.bathrooms > 3).astype(float)
OutlierAggregated2 += (test_df.bathrooms > 3).astype(float)
OutlierAggregated += (train_df.bathrooms < 1).astype(float)
OutlierAggregated2 += (test_df.bathrooms < 1).astype(float)
x = np.abs((train_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.abs((test_df.price - train_df.price.median())/train_df.price.std()) > 0.30
OutlierAggregated2 += x2.astype(float)
x = np.log1p(train_df.price/(train_df.bedrooms.clip(1,3) + train_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated += x.astype(float)
x2 = np.log1p(test_df.price/(test_df.bedrooms.clip(1,3) + test_df.bathrooms.clip(1,2))) > 8.2
OutlierAggregated2 += x2.astype(float)
x = np.sqrt(((train_df.latitude - train_df.latitude.median())**2) + (train_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.sqrt(((test_df.latitude - train_df.latitude.median())**2) + (test_df.longitude - train_df.longitude.median())**2) > 0.30
OutlierAggregated2 += x2.astype(float)
train_df['num_OutlierAggregated'] = OutlierAggregated.values
test_df['num_OutlierAggregated'] = OutlierAggregated2.values


new_feature = ['num_OutlierAggregated']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [8]:
# Density in unique locations at given precision
vals = train_df['position'].value_counts()
dvals = vals.to_dict()
train_df['num_pos_density'] = train_df['position'].apply(lambda x: dvals.get(x, vals.min()))
test_df['num_pos_density'] = test_df['position'].apply(lambda x: dvals.get(x, vals.min()))

# Building null
train_df['num_building_null'] = (train_df.building_id=='0').astype(float)
test_df['num_building_null'] = (test_df.building_id=='0').astype(float)


new_feature = ['num_pos_density','num_building_null']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [9]:
# Creation time features
train_df['created'] = pd.to_datetime(train_df.created)
train_df['num_created_weekday'] = train_df.created.dt.dayofweek.astype(float)
train_df['num_created_weekofyear'] = train_df.created.dt.weekofyear
train_df['num_created_day'] = train_df.created.dt.day
train_df['num_created_month'] = train_df.created.dt.month
train_df['num_created_hour'] = train_df.created.dt.hour
  
test_df['created'] = pd.to_datetime(test_df.created)
test_df['num_created_weekday'] = test_df.created.dt.dayofweek
test_df['num_created_weekofyear'] = test_df.created.dt.weekofyear
test_df['num_created_day'] = test_df.created.dt.day
test_df['num_created_month'] = test_df.created.dt.month
test_df['num_created_hour'] = test_df.created.dt.hour


new_feature = ['num_created_weekday','num_created_weekofyear','num_created_day','num_created_month','num_created_hour']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [10]:
# Bedrooms/Bathrooms/Price
train_df['num_bathrooms'] = train_df.bathrooms.clip_upper(4)
test_df['num_bathrooms'] = test_df.bathrooms.clip_upper(4)

train_df['num_bedrooms'] = train_df.bedrooms.clip_upper(5)
test_df['num_bedrooms'] = test_df.bedrooms.clip_upper(5)

train_df['num_price'] = train_df.price.clip_upper(10000)
test_df['num_price'] = test_df.price.clip_upper(10000)

bins = train_df.price.quantile(np.arange(0.05, 1, 0.05))
train_df['num_price_q'] = np.digitize(train_df.price, bins)
test_df['num_price_q'] = np.digitize(test_df.price, bins)


new_feature = ['num_bathrooms','num_bedrooms','num_price','num_price_q']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        
print 'Done!'

Done!


In [11]:
# Composite features based on: 
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
train_df['num_priceXroom'] = (train_df.price / (1 + train_df.bedrooms.clip(1, 4) + 0.5*train_df.bathrooms.clip(0, 2))).values
test_df['num_priceXroom'] = (test_df.price / (1 + test_df.bedrooms.clip(1, 4) + 0.5*test_df.bathrooms.clip(0, 2))).values

train_df['num_even_bathrooms'] = ((np.round(train_df.bathrooms) - train_df.bathrooms)==0).astype(float)
test_df['num_even_bathrooms'] = ((np.round(test_df.bathrooms) - test_df.bathrooms)==0).astype(float)

new_feature = ['num_priceXroom','num_even_bathrooms']
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

print 'Done!'

Done!


In [12]:
categorical = ["display_address", "manager_id", "building_id", "street_address",'position']
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            if f not in features_to_use:
                features_to_use.append(f)

In [13]:
dftemp = train_df.copy()
for i in ['latitude', 'longitude']:
    while(1):
        x = dftemp[i].median()
        ix = abs(dftemp[i] - x) > 3*dftemp[i].std()
        if ix.sum()==0:
            break
        dftemp.loc[ix, i] = np.nan
dftemp = dftemp.loc[dftemp[['latitude', 'longitude']].isnull().sum(1) == 0, :]

dfm = DataFrameMapper([(['latitude'], [StandardScaler()]), (['longitude'], [StandardScaler()])])

for i in [6]:
    pipe_location = make_pipeline(dfm, KMeans(n_clusters=i, random_state=1))
    pipe_location.fit(dftemp);
    train_df['location_'+str(i)] = pipe_location.predict(train_df).astype(str)
    test_df['location_'+str(i)] = pipe_location.predict(test_df).astype(str)
for i in train_df.location_6.unique():
    f = 'num_location_6_'+str(i)
    train_df[f] = (train_df.location_6==i).astype(float)
    test_df[f] = (test_df.location_6==i).astype(float)
    if f not in features_to_use:
        features_to_use.append(f)
    
    
train_df['tmp_bathrooms'] = train_df.bathrooms.clip_upper(2)
test_df['tmp_bathrooms'] = test_df.bathrooms.clip_upper(2)
train_df['tmp_bedrooms'] = train_df.bedrooms.clip_upper(4)
test_df['tmp_bedrooms'] = test_df.bedrooms.clip_upper(4)
train_df['roomcal'] = train_df.tmp_bedrooms.astype(str) + '_' + train_df.tmp_bathrooms.astype(str)    
test_df['roomcal'] = test_df.tmp_bedrooms.astype(str) + '_' + test_df.tmp_bathrooms.astype(str)    

room_lb = LabelBinarizer()
room_lb.fit(train_df['roomcal'])
room_col = ['num_room_type_' + str(x) for x in range(len(train_df['roomcal'].unique()))]
for f in room_col:
    if f not in features_to_use:
        features_to_use.append(f)

train_df = train_df.join(pd.DataFrame(room_lb.transform(train_df['roomcal']),columns=room_col,index=train_df.index))
test_df = test_df.join(pd.DataFrame(room_lb.transform(test_df['roomcal']),columns=room_col,index=test_df.index))

tmp = train_df.groupby(['roomcal','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price'})
    
train_df = train_df.merge(tmp,on=['roomcal','location_6'],how='left')
test_df = test_df.merge(tmp,on=['roomcal','location_6'],how='left')

test_df.loc[27462,'num_6_median_price'] =  7200.0

train_df['num_6_price_ratio'] = train_df['num_price'] / train_df['num_6_median_price']
train_df['num_6_price_diff'] = train_df['num_price'] - train_df['num_6_median_price']
test_df['num_6_price_ratio'] = test_df['num_price'] / test_df['num_6_median_price']
test_df['num_6_price_diff'] = test_df['num_price'] - test_df['num_6_median_price']


for f in ['num_6_median_price','num_6_price_ratio','num_6_price_diff']:
    if f not in features_to_use:
        features_to_use.append(f)
        
        
print 'Done!'

Done!


In [14]:
tmp = train_df.groupby(['num_bedrooms','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price_bedroom'})
    
train_df = train_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')
test_df = test_df.merge(tmp,on=['num_bedrooms','location_6'],how='left')

train_df['num_6_price_ratio_bedroom'] = train_df['num_price'] / train_df['num_6_median_price_bedroom']
train_df['num_6_price_diff_bedroom'] = train_df['num_price'] - train_df['num_6_median_price_bedroom']
test_df['num_6_price_ratio_bedroom'] = test_df['num_price'] / test_df['num_6_median_price_bedroom']
test_df['num_6_price_diff_bedroom'] = test_df['num_price'] - test_df['num_6_median_price_bedroom']


for f in ['num_6_median_price_bedroom','num_6_price_ratio_bedroom','num_6_price_diff_bedroom']:
    if f not in features_to_use:
        features_to_use.append(f)

In [15]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')

print train_X_0322.shape
print test_X_0322.shape

(49352, 322)
(74659, 322)


In [16]:
sentiment = [
       'feature_1_month_free', 'feature_24/7_concierge',
       'feature_24/7_doorman', 'feature_24/7_doorman_concierge',
       'feature_actual_apt._photos', 'feature_air_conditioning',
       'feature_all_pets_ok', 'feature_all_utilities_included',
       'feature_assigned-parking-space', 'feature_attended_lobby',
       'feature_backyard', 'feature_balcony', 'feature_basement_storage',
       'feature_basketball_court', 'feature_bike_room',
       'feature_bike_storage', 'feature_billiards_room',
       'feature_billiards_table_and_wet_bar', 'feature_brand_new',
       'feature_breakfast_bar', 'feature_bright', 'feature_brownstone',
       'feature_building-common-outdoor-space', 'feature_business_center',
       'feature_cable/satellite_tv', 'feature_cable_ready',
       'feature_call/text_abraham_caro_@_917-373-0862',
       'feature_cats_allowed', 'feature_central_a/c', 'feature_central_ac',
       'feature_central_air', 'feature_chefs_kitchen',
       "feature_children's_playroom", 'feature_childrens_playroom',
       'feature_cinema_room', 'feature_city_view',
       'feature_close_to_subway', 'feature_closets_galore!',
       'feature_club_sun_deck_has_spectacular_city_and_river_views',
       'feature_cold_storage', 'feature_common_backyard',
       'feature_common_garden', 'feature_common_outdoor_space',
       'feature_common_parking/garage', 'feature_common_roof_deck',
       'feature_common_storage', 'feature_common_terrace',
       'feature_community_recreation_facilities',
       'feature_complimentary_sunday_brunch', 'feature_concierge',
       'feature_concierge_service', 'feature_condo_finishes',
       'feature_courtyard', 'feature_crown_moldings', 'feature_deck',
       'feature_deco_brick_wall', 'feature_decorative_fireplace',
       'feature_dining_room', 'feature_dishwasher', 'feature_dogs_allowed',
       'feature_doorman', 'feature_dry_cleaning_service',
       'feature_dryer_in_unit', 'feature_duplex', 'feature_duplex_lounge',
       'feature_eat-in_kitchen', 'feature_eat_in_kitchen',
       'feature_elegant_glass-enclosed_private_lounge_with_magnificent_river_views',
       'feature_elevator', 'feature_exclusive',
       'feature_exercise/yoga_studio', 'feature_exposed_brick',
       'feature_extra_room', 'feature_fireplace', 'feature_fireplaces',
       'feature_fitness_center', 'feature_fitness_room', 'feature_flex-2',
       'feature_flex-3', 'feature_free_wifi_in_club_lounge',
       'feature_ft_doorman', 'feature_full-time_doorman',
       'feature_full_service_garage',
       'feature_fully-equipped_club_fitness_center',
       'feature_fully__equipped', 'feature_furnished', 'feature_game_room',
       'feature_garage', 'feature_garbage_disposal', 'feature_garden',
       'feature_garden/patio', 'feature_granite_countertops',
       'feature_granite_kitchen', 'feature_green_building',
       'feature_guarantors_accepted', 'feature_gut_renovated',
       'feature_gym', 'feature_gym/fitness', 'feature_gym_in_building',
       'feature_hardwood', 'feature_hardwood_floors',
       'feature_health_club', 'feature_hi_rise',
       'feature_high-speed_internet', 'feature_high_ceiling',
       'feature_high_ceilings', 'feature_high_speed_internet',
       'feature_highrise', 'feature_housekeeping_service',
       'feature_in-unit_washer/dryer', 'feature_indoor_pool',
       'feature_intercom', 'feature_jacuzzi', 'feature_large_living_room',
       'feature_laundry', 'feature_laundry_&_housekeeping',
       'feature_laundry_in_building', 'feature_laundry_in_unit',
       'feature_laundry_on_every_floor', 'feature_laundry_on_floor',
       'feature_laundry_room', 'feature_light', 'feature_live-in_super',
       'feature_live-in_superintendent', 'feature_live/work',
       'feature_live_in_super', 'feature_loft', 'feature_lounge',
       'feature_lounge_room', 'feature_lowrise', 'feature_luxury_building',
       'feature_magnificent_venetian-style', 'feature_mail_room',
       'feature_marble_bath', 'feature_marble_bathroom',
       'feature_media_room', 'feature_media_screening_room',
       'feature_microwave', 'feature_midrise', 'feature_multi-level',
       'feature_new_construction', 'feature_newly_renovated',
       'feature_no_fee', 'feature_no_pets', 'feature_on-site_atm_machine',
       'feature_on-site_attended_garage', 'feature_on-site_garage',
       'feature_on-site_laundry',
#        'feature_on-site_lifestyle_concierge_by_luxury_attach\xc3\xa9',
       'feature_on-site_parking', 'feature_on-site_parking_available',
       'feature_on-site_parking_lot', 'feature_on-site_super',
       'feature_one_month_free', 'feature_outdoor_areas',
       'feature_outdoor_entertainment_space', 'feature_outdoor_pool',
       'feature_outdoor_roof_deck_overlooking_new_york_harbor_and_battery_park',
       'feature_outdoor_space', 'feature_package_room', 'feature_parking',
       'feature_parking_available', 'feature_parking_space',
       'feature_part-time_doorman', 'feature_party_room', 'feature_patio',
       'feature_penthouse', 'feature_pet_friendly', 'feature_pets',
       'feature_pets_allowed', 'feature_pets_on_approval',
       'feature_playroom', 'feature_playroom/nursery', 'feature_pool',
       'feature_post-war', 'feature_post_war', 'feature_pre-war',
       'feature_pre_war', 'feature_prewar', 'feature_private-balcony',
       'feature_private-outdoor-space', 'feature_private_backyard',
       'feature_private_balcony', 'feature_private_deck',
       'feature_private_garden',
       'feature_private_laundry_room_on_every_floor',
       'feature_private_outdoor_space', 'feature_private_parking',
       'feature_private_roof_deck', 'feature_private_roofdeck',
       'feature_private_terrace', 'feature_publicoutdoor',
       'feature_queen_size_bedrooms', 'feature_queen_sized_rooms',
       'feature_reduced_fee', 'feature_renovated',
       'feature_renovated_kitchen', 'feature_residents_garden',
       'feature_residents_lounge', 'feature_roof-deck',
       'feature_roof_access', 'feature_roof_deck',
       'feature_roof_deck_with_grills', 'feature_roofdeck',
       'feature_rooftop_deck', 'feature_rooftop_terrace',
       'feature_s/s_appliances', 'feature_sauna', 'feature_screening_room',
       'feature_separate_kitchen', 'feature_shared_backyard',
       'feature_shared_garden', 'feature_shares_ok',
       'feature_short_term_allowed', 'feature_simplex', 'feature_skylight',
       'feature_skylight_atrium', 'feature_southern_exposure',
       'feature_spa_services', 'feature_ss_appliances',
       'feature_stainless_steel', 'feature_stainless_steel_appliances',
       'feature_state-of-the-art_fitness_center', 'feature_storage',
       'feature_storage_available', 'feature_storage_facilities_available',
       'feature_storage_room', 'feature_sublet', 'feature_subway',
       'feature_sundeck', 'feature_swimming_pool', 'feature_tenant_lounge',
       'feature_terrace', 'feature_terraces_/_balconies',
       'feature_tons_of_natural_light', 'feature_valet',
       'feature_valet_parking', 'feature_valet_service',
       'feature_valet_services',
       'feature_valet_services_including_dry_cleaning',
       'feature_video_intercom', 'feature_view', 'feature_virtual_doorman',
       'feature_virtual_tour', 'feature_walk-in_closet', 'feature_walk-up',
       'feature_walk_in_closet', 'feature_walk_in_closet(s)',
       'feature_washer/dryer', 'feature_washer/dryer_hookup',
       'feature_washer/dryer_in-unit', 'feature_washer/dryer_in_building',
       'feature_washer/dryer_in_unit', 'feature_washer_&_dryer',
       'feature_washer_in_unit', 'feature_wheelchair_access',
       'feature_wheelchair_ramp', 'feature_wifi', 'feature_wifi_access',
       'feature_wood-burning_fireplace', 'feature_yard',
       'feature_yoga_classes','building_id_mean_med',
       'building_id_mean_high', 'manager_id_mean_med',
       'manager_id_mean_high','median_price_bed', 'ratio_bed',
       'compound', 'neg', 'neu', 'pos', 'street',
       'avenue', 'east', 'west', 'north', 'south', 'other_address',
       'top_10_manager', 'top_25_manager', 'top_5_manager',
       'top_50_manager', 'top_1_manager', 'top_2_manager',
       'top_15_manager', 'top_20_manager', 'top_30_manager',
       'Zero_building_id', 'top_10_building', 'top_25_building',
       'top_5_building', 'top_50_building', 'top_1_building',
       'top_2_building', 'top_15_building', 'top_20_building',
       'top_30_building','listing_id'
]

train_df = train_df.merge(train_X_0322[sentiment],on='listing_id', how='left')
test_df = test_df.merge(test_X_0322[sentiment],on='listing_id', how='left')

for f in sentiment:
    if f not in features_to_use:
        features_to_use.append(f)

In [17]:
train_df.isnull().values.any()

False

In [18]:
test_df.isnull().values.any()

False

# CV statistics

In [21]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','price')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  price Done!


In [50]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_ratio_bedroom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)
        

manager_id  vs  num_6_price_ratio_bedroom Done!


In [54]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_priceXroom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_priceXroom Done!


In [41]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff_bedroom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_6_price_diff_bedroom Done!


In [59]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','ratio_bed')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  ratio_bed Done!


In [61]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_ratio')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_6_price_ratio Done!


In [62]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_6_price_diff')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_6_price_diff Done!


In [63]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_created_hour')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_created_hour Done!


In [64]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_photo_count')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_photo_count Done!


In [65]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_dist_from_center')

for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_dist_from_center Done!


In [66]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','latitude')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  latitude Done!


In [67]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerBed')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_pricePerBed Done!


In [68]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerBath')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_pricePerBath Done!


In [69]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_features')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_features Done!


In [70]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_desc_wordcount')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_desc_wordcount Done!


In [71]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pricePerRoom')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_pricePerRoom Done!


In [72]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','longitude')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  longitude Done!


In [73]:
train_df, test_df, new_feature = CV_st(train_df,test_df,'manager_id','num_pos_density')
for f in new_feature:
    if f not in features_to_use:
        features_to_use.append(f)

manager_id  vs  num_pos_density Done!


# Val

In [23]:
train_X = train_df[features_to_use]
test_X = test_df[features_to_use]

train_X.replace(np.inf, np.nan)
test_X.replace(np.inf, np.nan)

train_X.loc[:,'num_nan'] = train_X.isnull().sum(axis=1)
test_X.loc[:,'num_nan'] = test_X.isnull().sum(axis=1)

target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print train_X.shape, test_X.shape 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(49352, 394) (74659, 394)


In [24]:
train_X.head()

Unnamed: 0,latitude,longitude,num_pricePerBed,num_bedBathSum,num_pricePerBath,num_pricePerRoom,num_bedPerBath,num_bedBathDiff,num_bedsPerc,num_photo_count,...,manager_id_price_low_max,manager_id_price_medium_max,manager_id_price_high_max,manager_id_price_low_min,manager_id_price_medium_min,manager_id_price_high_min,manager_id_price_low_std,manager_id_price_medium_std,manager_id_price_high_std,num_nan
0,40.7145,-73.9425,1000.0,4.5,2000.0,666.666667,2.0,1.5,0.666667,5,...,4100.0,4400.0,,1700.0,1800.0,,523.428881,758.927692,,10
1,40.7947,-73.9667,2732.5,3.0,5465.0,1821.666667,2.0,1.0,0.666667,11,...,9800.0,1995.0,,3005.0,1995.0,,1583.222479,,,12
2,40.7388,-74.0018,2850.0,2.0,2850.0,1425.0,1.0,0.0,0.5,8,...,8795.0,7995.0,3895.0,2100.0,1850.0,1650.0,1524.253239,1413.697943,1062.802898,0
3,40.7539,-73.9677,3275.0,2.0,3275.0,1637.5,1.0,0.0,0.5,3,...,11100.0,7495.0,6350.0,1775.0,1995.0,1695.0,1407.45262,1636.311315,1622.957508,0
4,40.8241,-73.9493,837.5,5.0,3350.0,670.0,4.0,3.0,0.8,3,...,5000.0,,,1495.0,,,1102.296157,,,20


# Tune XGBoost

In [27]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 394)
(9871, 394)


In [28]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7,8,9,10,11,12,13,14,15]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.530492 695
4 	0.528765 525
5 	0.529225 350
6 	0.529538 213
7 	0.529026 179
8 	0.532885 144
9 	0.536976 113
10 	0.541965 74
11 	0.546624 62


KeyboardInterrupt: 

In [29]:
# max_depth = train_param
max_depth = 9
print max_depth

9


In [30]:
best_score = 1000

In [31]:
train_param = 1
for x in [2,4,8,12,16,20,24,28,32,40,48,64,80,128]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.536106 109
4 	0.533341 130
8 	0.530499 134
12 	0.528106 140
16 	0.527847 165
20 	0.529206 146
24 	0.527782 138
28 	0.528749 146
32 	0.530082 138
40 	0.529713 208
48 	0.528488 212
64 	0.529307 220
80 	0.529193 230
128 	0.532131 221


In [33]:
min_child_weight = 48
print min_child_weight
best_score = 1000

48


In [34]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.535266 355
0.1 	0.530002 293
0.2 	0.526755 229
0.3 	0.527294 166
0.4 	0.528033 218
0.5 	0.526212 217
0.6 	0.528171 215
0.7 	0.528578 151
0.8 	0.530005 163
0.9 	0.529102 160


In [35]:
colsample_bytree = train_param
print colsample_bytree

0.5


In [36]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.534286 143
0.6 	0.532799 180
0.7 	0.530517 180
0.8 	0.530897 204
0.9 	0.529085 194


In [37]:
subsample = train_param
print subsample

1


In [38]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.527212 180
0.6 	0.52709 234
0.9 	0.526593 222
1.2 	0.528878 238
1.5 	0.526974 193
1.8 	0.527208 188
2.1 	0.526333 193
2.4 	0.52557 207
2.7 	0.526084 240
3.0 	0.52666 224


In [39]:
gamma = train_param
print gamma

2.4


In [48]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, gamma): #, subsample
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = 0.99# max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=1234,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (5,14),
        'min_child_weight': (30,100),
        'colsample_bytree': (0.1,0.8),
#         'subsample': (0.7,1),
        'gamma': (0,4)
    }
)

xgb_BO.maximize(init_points=5, n_iter=30)

[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[428]	train-mlogloss:0.363966+0.00300174	test-mlogloss:0.527457+0.00659426

    1 | 27m24s | [35m  -0.52746[0m | [32m            0.5300[0m | [32m   2.5164[0m | [32m     6.7811[0m | [32m           36.9457[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[188]	train-mlogloss:0.322439+0.00229235	test-mlogloss:0.528592+0.00693657

    2 | 27m15s |   -0.52859 |             0.6644 |    2.0657 |     11.1918 |            52.9145 | 
Multiple eval metrics



[31mBayesian Optimization[0m
[94m-------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[301]	train-mlogloss:0.385243+0.00226347	test-mlogloss:0.52922+0.00642015

    6 | 26m00s |   -0.52922 |             0.6477 |    0.1119 |      7.4093 |            96.2057 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[519]	train-mlogloss:0.369016+0.00197565	test-mlogloss:0.527703+0.00731201



  " state: %s" % convergence_dict)


    7 | 16m01s |   -0.52770 |             0.1049 |    2.4241 |      7.3114 |            37.9278 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[289]	train-mlogloss:0.348944+0.00172091	test-mlogloss:0.525854+0.00681073

    8 | 28m49s | [35m  -0.52585[0m | [32m            0.4091[0m | [32m   3.8706[0m | [32m    12.0160[0m | [32m           32.5874[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[289]	train-mlogloss:0.348944+0.00172091	test-mlogloss:0.525854+0.00681073



  " state: %s" % convergence_dict)


    9 | 34m46s |   -0.52585 |             0.4091 |    3.8706 |     12.0160 |            32.5874 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[152]	train-mlogloss:0.323776+0.00206345	test-mlogloss:0.529308+0.00653546



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   10 | 37m54s |   -0.52931 |             0.7969 |    3.1037 |     13.4687 |            34.6022 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[508]	train-mlogloss:0.349179+0.00116203	test-mlogloss:0.530599+0.00694128

   11 | 27m02s |   -0.53060 |             0.1374 |    2.5442 |     13.4305 |            99.1244 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1439]	train-mlogloss:0.429442+0.00211071	test-mlogloss:0.528189+0.00660848



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   12 | 68m05s |   -0.52819 |             0.3700 |    3.9238 |      5.3241 |            96.5526 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[484]	train-mlogloss:0.397501+0.00177919	test-mlogloss:0.527865+0.0061275

   13 | 16m02s |   -0.52787 |             0.1820 |    0.7637 |      5.2381 |            49.4190 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[789]	train-mlogloss:0.392444+0.00281413	test-mlogloss:0.526668+0.00630868



  " state: %s" % convergence_dict)


   14 | 58m32s |   -0.52667 |             0.5682 |    3.9033 |      6.1013 |            42.5387 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[449]	train-mlogloss:0.348903+0.00124365	test-mlogloss:0.526126+0.00618817



  " state: %s" % convergence_dict)



Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[546]	train-mlogloss:0.405224+0.00253266	test-mlogloss:0.527397+0.00644255

   16 | 30m37s |   -0.52740 |             0.4752 |    2.7278 |      5.8785 |            59.6967 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[449]	train-mlogloss:0.348903+0.00124365	test-mlogloss:0.526126+0.00618817


Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.


KeyboardInterrupt: 

In [50]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
#                                s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
#                                        'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,gamma,score
2,12.016003,32.587412,0.409076,3.87058,-0.525854
3,12.016004,32.587412,0.409076,3.87058,-0.525854
9,7.445485,31.132142,0.322098,2.711777,-0.526126
11,7.445485,31.132142,0.322098,2.711777,-0.526126
8,6.101341,42.538652,0.568188,3.903256,-0.526668
10,5.878451,59.696664,0.475184,2.727763,-0.527397
1,7.311439,37.92779,0.104916,2.424076,-0.527703
7,5.238122,49.419011,0.182047,0.763688,-0.527865
6,5.324136,96.552623,0.370024,3.923847,-0.528189
0,7.409311,96.205662,0.647738,0.111881,-0.52922


In [53]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=1234)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [60]:
estimators = [
#     xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 24,
#                               colsample_bytree = 0.309861 ,
#                               subsample = 0.998132 ,
#                               gamma = 2.211859),
#              xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 19,
#                               colsample_bytree = 0.432358,
#                               subsample = 0.949350,
#                               gamma = 2.976848),
#              xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 23,
#                               colsample_bytree = 0.214791,
#                               subsample = 0.997197,
#                               gamma = 2.163581),         
#              xgb.XGBClassifier(max_depth = 8,
#                               min_child_weight = 23,
#                               colsample_bytree = 0.5,
#                               subsample = 0.988002,
#                               gamma = 3.0),  
             xgb.XGBClassifier(max_depth = 12,
                              min_child_weight = 32,
                              colsample_bytree = 0.4091,
                              subsample = 0.99,
                              gamma = 3.8706)              
             ]

#  	max_depth 	min_child_weight 	colsample_bytree 	gamma 	score
# 2 	12.016003 	32.587412 	0.409076 	3.870580 	-0.525854

(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              5,
                              300)


Blend 1 estimators for 5 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4091,
       gamma=3.8706, learning_rate=0.02, max_delta_step=0, max_depth=12,
       min_child_weight=32, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.99)
Model 1 fold 1
best round 2232
('Score: ', 0.53155544781621866)
Model 1 fold 1 fitting finished in 2715.331s
Model 1 fold 2


KeyboardInterrupt: 

In [None]:
train_blend_x_xgb = pd.DataFrame(train_blend_x_xgb)
train_blend_x_xgb.columns = ["low", "medium", "high"]
train_blend_x_xgb["listing_id"] = train_X.listing_id.values

test_blend_x_xgb_mean = pd.DataFrame(test_blend_x_xgb_mean)
test_blend_x_xgb_mean.columns = ["low", "medium", "high"]
test_blend_x_xgb_mean["listing_id"] = test_X.listing_id.values

test_blend_x_xgb_gmean = pd.DataFrame(test_blend_x_xgb_gmean)
test_blend_x_xgb_gmean.columns = ["low", "medium", "high"]
test_blend_x_xgb_gmean["listing_id"] = test_X.listing_id.values

In [None]:
tmp_train = train_X_0322[['listing_id']].merge(train_blend_x_xgb,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_mean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_gmean = test_X_0322[['listing_id']].merge(test_blend_x_xgb_gmean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values

In [None]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_cv_price_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_cv_price_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_cv_price_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,tmp_test_gmean, delimiter=",")