In [116]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.stats import skew, boxcox
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed =1234

In [129]:
data_path = "../input/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
ntrain = train_df.shape[0]
print train_df.shape
print test_df.shape
print ntrain

(49352, 15)
(74659, 14)
49352


In [118]:
# sc_price
tmp = pd.concat([train_df['price'],test_df['price']])
ulimit = np.percentile(tmp.values, 99)

train_df.loc[:,'sc_price'] = train_df['price'].values.reshape(-1, 1)
test_df.loc[:,'sc_price'] = test_df['price'].values.reshape(-1, 1)

train_df.loc[train_df['sc_price']>ulimit, ['sc_price']] = ulimit
test_df.loc[test_df['sc_price']>ulimit, ['sc_price']] = ulimit

# sc_ba_price
inx_train = train_df['bathrooms'] == 0
inx_test = test_df['bathrooms'] == 0

non0_inx_train = ~inx_train
non0_inx_test = ~inx_test
train_df.loc[non0_inx_train,'sc_ba_price'] = train_df.loc[non0_inx_train,'sc_price']/train_df.loc[non0_inx_train,'bathrooms']
test_df.loc[non0_inx_test,'sc_ba_price'] = test_df.loc[non0_inx_test,'sc_price']/test_df.loc[non0_inx_test,'bathrooms']

train_df.loc[inx_train,'sc_ba_price'] = 0
test_df.loc[inx_test,'sc_ba_price'] = 0

# price per bedrooms

inx_train = train_df['bedrooms'] == 0
inx_test = test_df['bedrooms'] == 0

non0_inx_train = ~inx_train
non0_inx_test = ~inx_test
train_df.loc[non0_inx_train,'sc_be_price'] = train_df.loc[non0_inx_train,'sc_price']/train_df.loc[non0_inx_train,'bedrooms']
test_df.loc[non0_inx_test,'sc_be_price'] = test_df.loc[non0_inx_test,'sc_price']/test_df.loc[non0_inx_test,'bedrooms']

train_df.loc[inx_train,'sc_be_price'] = 0
test_df.loc[inx_test,'sc_be_price'] = 0


# bathrooms

ulimit = 5

train_df['sc_bathrooms']=train_df['bathrooms']
test_df['sc_bathrooms']=test_df['bathrooms']

train_df.loc[train_df['sc_bathrooms']>ulimit,['sc_bathrooms']] = ulimit
test_df.loc[test_df['sc_bathrooms']>ulimit,['sc_bathrooms']] = ulimit

# bedrooms

ulimit = 8

train_df['sc_bedrooms']=train_df['bedrooms']
test_df['sc_bedrooms']=test_df['bedrooms']

train_df.loc[train_df['sc_bedrooms']>ulimit, ['sc_bedrooms']] = ulimit
test_df.loc[test_df['sc_bedrooms']>ulimit,['sc_bedrooms']] = ulimit

# longitude

tmp = pd.concat([train_df['longitude'],test_df['longitude']])
llimit = np.percentile(tmp.values, 0.1)
ulimit = np.percentile(tmp.values, 99.9)

train_df['sc_longitude']=train_df['longitude']
test_df['sc_longitude']=test_df['longitude']

train_df.loc[train_df['sc_longitude']>ulimit, ['sc_longitude']] = ulimit
test_df.loc[test_df['sc_longitude']>ulimit, ['sc_longitude']] = ulimit
train_df.loc[train_df['sc_longitude']<llimit, ['sc_longitude']] = llimit
test_df.loc[test_df['sc_longitude']<llimit, ['sc_longitude']] = llimit

# latitude

tmp = pd.concat([train_df['latitude'],test_df['latitude']])
llimit = np.percentile(tmp.values, 0.1)
ulimit = np.percentile(tmp.values, 99.9)

train_df['sc_latitude']=train_df['latitude']
test_df['sc_latitude']=test_df['latitude']

train_df.loc[train_df['sc_latitude']>ulimit, ['sc_latitude']] = ulimit
test_df.loc[test_df['sc_latitude']>ulimit, ['sc_latitude']] = ulimit
train_df.loc[train_df['sc_latitude']<llimit, ['sc_latitude']] = llimit
test_df.loc[test_df['sc_latitude']<llimit, ['sc_latitude']] = llimit


features_to_use  = ["sc_bathrooms", "sc_bedrooms", "sc_latitude", "sc_longitude", "sc_price", "sc_ba_price", "sc_be_price"]

In [119]:
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list #
features_to_use.extend(["num_photos", "num_features", "num_description_words", "created_month", 
                        "created_day", "created_hour"])


In [120]:
print features_to_use

['sc_bathrooms', 'sc_bedrooms', 'sc_latitude', 'sc_longitude', 'sc_price', 'sc_ba_price', 'sc_be_price', 'num_photos', 'num_features', 'num_description_words', 'created_month', 'created_day', 'created_hour']


In [121]:
full_data=pd.concat([train_df,test_df])

SSL = preprocessing.StandardScaler()
for col in features_to_use:
    full_data[col], lam = boxcox(full_data[col] - full_data[col].min() + 1)
    full_data[col] = SSL.fit_transform(full_data[col].values.reshape(-1,1)) 
    train_df[col] = full_data[:ntrain,col]
    test_df[col] = full_data[ntrain:,col]
    


In [122]:
features_to_use.append("listing_id")
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [124]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [125]:
# with categorical: fold 1 CV = 0.547696
# without categorical: fold 1 CV = 0.566962

# del_features_to_use = list(set(features_to_use) - set(categorical))


train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print train_X.shape, test_X.shape

(49352, 218) (74659, 218)


In [71]:
# with StandardScaler fold 1 cv = 0.543454

X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)


In [74]:
learning_rate = 0.1
for x in [3,4,5,6,7,8,9,10]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=20,
        verbose=False
    )

    print rgr.get_xgb_params()['max_depth'], '\t', rgr.best_score

3   0.548962
4   0.544886
5   0.546504
6   0.545221
7   0.547026
8   0.551261
9   0.55255
10   0.556356


In [75]:
max_depth = 4
# 3   0.548962
# 4   0.544886
# 5   0.546504
# 6   0.545221
# 7   0.547026
# 8   0.551261
# 9   0.55255
# 10   0.556356

# SRK ori
# 3   0.553568
# 4   0.549047
# 5   0.549068
# 6   0.549237
# 7   0.550124
# 8   0.551921
# 9   0.557487
# 10   0.558945

In [76]:
for x in [5,10,20,50,80,120,180,240,300]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=20,
        verbose=False
    )


    print rgr.get_xgb_params()['min_child_weight'], '\t', rgr.best_score

1   0.544886
5   0.544297
10   0.547673
20   0.546779
50   0.550753
80   0.551513
120   0.552604
180   0.556093
240   0.561224
300   0.561104


In [78]:
min_child_weight = 5
# 1   0.544886
# 5   0.544297
# 10   0.547673
# 20   0.546779
# 50   0.550753
# 80   0.551513
# 120   0.552604
# 180   0.556093
# 240   0.561224
# 300   0.561104


# SRK ori
# 1   0.549068
# 5   0.547466
# 10   0.548375
# 20   0.551334
# 50   0.551166
# 80   0.551872
# 120   0.553488
# 180   0.554964
# 240   0.55856
# 300   0.561123

In [79]:
for x in [0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=20,
        verbose=False
    )


    print rgr.get_xgb_params()['colsample_bytree'], '\t', rgr.best_score

0.3 	0.545806
0.4 	0.542401
0.5 	0.544305
0.6 	0.544499
0.7 	0.544259
0.8 	0.543949
0.9 	0.545657
1.0 	0.544297


In [80]:
colsample_bytree = 0.8
# 0.3 	0.545806
# 0.4 	0.542401
# 0.5 	0.544305
# 0.6 	0.544499
# 0.7 	0.544259
# 0.8 	0.543949
# 0.9 	0.545657
# 1.0 	0.544297


# SRK ori
# 0.3   0.547816
# 0.4   0.545827
# 0.5   0.547024
# 0.6   0.54467
# 0.7   0.543454
# 0.8   0.546214
# 0.9   0.547861
# 1.0   0.547466


In [81]:
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=20,
        verbose=False
    )

    print rgr.get_xgb_params()['subsample'], '\t', rgr.best_score

0.5 	0.55036
0.6 	0.544181
0.7 	0.545074
0.8 	0.544391
0.9 	0.541175
1.0 	0.543949


In [82]:
subsample = 1
# 0.5 	0.55036
# 0.6 	0.544181
# 0.7 	0.545074
# 0.8 	0.544391
# 0.9 	0.541175
# 1.0 	0.543949


# SRK ori
# 0.5   0.550979
# 0.6   0.54549
# 0.7   0.545445
# 0.8   0.543629
# 0.9   0.544821
# 1.0   0.543454


In [83]:
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=20,
        verbose=False
    )

    print rgr.get_xgb_params()['gamma'], '\t', rgr.best_score

0.3 	0.542805
0.6 	0.54408
0.9 	0.545023
1.2 	0.543232
1.5 	0.543877
1.8 	0.544225
2.1 	0.544766
2.4 	0.547343
2.7 	0.550244
3.0 	0.553193


In [84]:
gamma = 0.3
# 0.3 	0.542805
# 0.6 	0.54408
# 0.9 	0.545023
# 1.2 	0.543232
# 1.5 	0.543877
# 1.8 	0.544225
# 2.1 	0.544766
# 2.4 	0.547343
# 2.7 	0.550244
# 3.0 	0.553193
# SRK ori
# 0   0.543454
# 0.3   0.545775
# 0.6   0.544581
# 0.9   0.544173
# 1.2   0.545422
# 1.5   0.545404
# 1.8   0.544855
# 2.1   0.544393
# 2.4   0.545129
# 2.7   0.546333
# 3.0   0.549442


In [86]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = max_depth
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = min_child_weight
    param['subsample'] = subsample
    param['colsample_bytree'] = colsample_bytree
    param['gamma'] = gamma
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [88]:
# cv_scores = []
# kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(train_X.shape[0])):
#         dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
#         dev_y, val_y = train_y[dev_index], train_y[val_index]
#         preds, model = runXGB(dev_X, dev_y, val_X, val_y)
#         cv_scores.append(log_loss(val_y, preds))
#         print(cv_scores)
#         break

In [102]:
def xgb_blend(params, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(params)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((N_params))
    
    for j, param in enumerate(params):
        param['objective']='multi:softprob'
        param['eval_metric']='mlogloss',
        param['num_class']=3
        param['silent']= False
        param['eta'] = 0.03
#         param['verbose_eval'] = 10  
        print ("Model %d:" %(j+1))
        
        xgtrain = xgb.DMatrix(train_X, label=train_y)
        cv_result = xgb.cv(param, xgtrain,
                           num_boost_round=10000, nfold=fold,
                           metrics = 'mlogloss',
                           seed=seed,callbacks=[xgb.callback.early_stop(early_stopping_rounds)])    
        best_round = cv_result.shape[0] - 1
        print 'best_round',best_round
        best_rounds[j]=best_round
        
        param.pop('eval_metric')
        all_round = best_round / (1 - 1. / fold)
        est_test_blend = xgb.train(param, xgtrain,num_boost_round=int(all_round))

        test_blend_x[:,(j*N_class):(j+1)*N_class] = est_test_blend.predict(xgb.DMatrix(test_x))

        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]
            
            xgtrain_fold = xgb.DMatrix(train_x_fold, label=train_y_fold)
            est_train_blend = xgb.train(param, xgtrain_fold,num_boost_round=best_round)
            
            val_y_predict_fold = est_train_blend.predict(xgb.DMatrix(val_x_fold))
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score            
            
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            

        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds)

In [103]:

xgb_params = [{'max_depth':4,
               'min_child_weight':5,
               'colsample_bytree':0.8,
               'subsample':1,
               'gamma':0.3},
# #               score -0.530282              

#               {'max_depth':4,
#                'min_child_weight':2,
#                'colsample_bytree':0.828660,
#                'subsample':0.821156,
#                'gamma':2.751725},
# #               score -0.530455
          
#               {'max_depth':5,
#                'min_child_weight':12,
#                'colsample_bytree':0.736776,
#                'subsample':0.947351,
#                'gamma':2.677209},
# #               score -0.530621    
              
#               {'max_depth':5,
#                'min_child_weight':12,
#                'colsample_bytree':0.736769,
#                'subsample':0.947350,
#                'gamma':2.677208},
# #               score -0.530917                 

#               {'max_depth':4,
#                'min_child_weight':7,
#                'colsample_bytree':0.838006,
#                'subsample':0.930783,
#                'gamma':2.668471}
# #               score -0.530937
             ]

(train_blend_x_xgb,
 test_blend_x_xgb,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(xgb_params,
                              train_X,train_y,
                              test_X,
                              5,
                              300)

# print (np.mean(blend_scores_xgb_le,axis=0))
# print (np.mean(best_rounds_xgb_le,axis=0))

Blend 1 estimators for 5 folds
Model 1:
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 300 rounds.
Stopping. Best iteration:
[3087]	train-mlogloss:0.405973+0.00103189	test-mlogloss:0.545781+0.00657891

best_round 3087
Model 1 fold 1
('Score: ', 0.55203147819051201)
Model 1 fold 1 fitting finished in 272.326s
Model 1 fold 2
('Score: ', 0.5420850846688795)
Model 1 fold 2 fitting finished in 270.435s
Model 1 fold 3
('Score: ', 0.54142525511151163)
Model 1 fold 3 fitting finished in 270.289s
Model 1 fold 4
('Score: ', 0.5493672661635377)
Model 1 fold 4 fitting finished in 264.125s
Model 1 fold 5
('Score: ', 0.56116053485582518)
Model 1 fold 5 fitting finished in 262.688s
Score for model 1 is 0.549214
Score for blended models is 0.549214


In [106]:
train_blend_x_xgb[:20]

array([[  3.22046652e-02,   1.47814408e-01,   8.19980919e-01],
       [  7.82494491e-04,   1.47806276e-02,   9.84436870e-01],
       [  5.79654947e-02,   4.18617249e-01,   5.23417234e-01],
       [  1.62354447e-02,   5.85017875e-02,   9.25262749e-01],
       [  1.48545334e-03,   4.72688749e-02,   9.51245606e-01],
       [  3.11728679e-02,   1.84923366e-01,   7.83903718e-01],
       [  1.71255842e-02,   1.19503237e-01,   8.63371134e-01],
       [  1.92419603e-03,   8.05531368e-02,   9.17522669e-01],
       [  1.16785206e-01,   7.12138593e-01,   1.71076208e-01],
       [  3.31162512e-02,   2.65499383e-01,   7.01384366e-01],
       [  6.79918099e-04,   1.50236925e-02,   9.84296381e-01],
       [  2.32769016e-04,   3.40811606e-03,   9.96359050e-01],
       [  3.62770468e-01,   4.48342055e-01,   1.88887507e-01],
       [  3.06826551e-04,   1.26267700e-02,   9.87066448e-01],
       [  1.44303008e-03,   3.46196666e-02,   9.63937223e-01],
       [  4.79623079e-02,   2.70421147e-01,   6.8161654

In [107]:
train_y[:20]

array([1, 2, 0, 2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2])

In [110]:
# preds, model = runXGB(train_X, train_y, test_X, num_rounds=int(3087/.8))
out_df = pd.DataFrame(test_blend_x_xgb)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("../output/xgb_starter_SRK0222_test.csv", index=False)