In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
# plt.rcdefaults()
mpl.style.use('additional')

def print_df_max_colwidth(df, width=200):
    with pd.option_context('display.max_colwidth', width):
        print(df)

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from scipy import sparse
import xgboost as xgb

# Loading & Feature Engineering

In [4]:
train_df = pd.read_json("train.json")
to_test_df = pd.read_json("test.json")

In [5]:
def engineer_features(df):
    price_max = np.percentile(df.price, 99)
    df.loc[df.price > price_max, 'price'] = price_max
    latitude_max = np.percentile(df.latitude, 99)
    latitude_min = np.percentile(df.latitude, 1)
    df.loc[df.latitude > latitude_max, 'latitude'] = latitude_max
    df.loc[df.latitude < latitude_min, 'latitude'] = latitude_min
    longitude_max = np.percentile(df.longitude, 99)
    longitude_min = np.percentile(df.longitude, 1)
    df.loc[df.longitude > longitude_max, 'longitude'] = longitude_max
    df.loc[df.longitude < longitude_min, 'longitude'] = longitude_min
    df["num_photos"] = df.photos.apply(len)
    df["num_features"] = df.features.apply(len)
    df["num_description_words"] = df.description.apply(lambda x: len(x.split(" ")))
    df["created"] = pd.to_datetime(df.created)
    df["created_year"] = df.created.dt.year
    df["created_month"] = df.created.dt.month
    df["created_day"] = df.created.dt.day
    
    df['all_rooms'] = df['bathrooms'] + df['bedrooms']
    df['price_per_bed'] = df['price'] / df['bedrooms']    
    df['price_per_bath'] = df['price'] / df['bathrooms']
    df['price_per_room'] = df['price'] / df['all_rooms']

    df['price_per_bed'] = df['price_per_bed'].replace(np.Inf, 5000)
    df['price_per_bath'] = df['price_per_bath'].replace(np.Inf, 5000)
    df['price_per_room'] = df['price_per_room'].replace(np.Inf, 5000)
    
engineer_features(train_df)
engineer_features(to_test_df)

In [6]:
from itertools import chain

def features_cleanup_star(x):
    return list(chain.from_iterable(re.sub(r'\s*\*+\s*\**\s*', r'*', i).strip('*').split('*') for i in x))

def process_features(df):
    df['features_clean'] = df['features']\
                    .apply(lambda x: ' '.join([re.sub(r'\W', '', i) for i in x]).lower())

    df.loc[df.features_clean.str.contains('\*'), 'features_clean'] = \
            df.loc[df.features_clean.str.contains('\*'), 'features']\
                    .apply(features_cleanup_star)\
                    .apply(lambda x: ' '.join([re.sub(r'\W', '', i) for i in x]).lower())
                
process_features(train_df)
process_features(to_test_df)

In [7]:
countvec_features = CountVectorizer(stop_words='english', max_features=200)
countvec_features.fit(train_df['features_clean'].tolist() + to_test_df['features_clean'].tolist())
train_df_features = countvec_features.transform(train_df['features_clean'])
to_test_df_features = countvec_features.transform(to_test_df['features_clean'])

In [8]:
countvec_desc = CountVectorizer(stop_words='english', max_features=200)
countvec_desc.fit(train_df['description'].tolist() + to_test_df['description'].tolist())
train_df_desc = countvec_desc.transform(train_df['description'])
to_test_df_desc = countvec_desc.transform(to_test_df['description'])

In [9]:
X_vars = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 
          'num_photos', 'num_features', 'num_description_words', 
          'created_year', 'created_month', 'created_day', 
          'all_rooms', 'price_per_bed', 'price_per_bath', 'price_per_room']
# target_num_map = {'high':0, 'medium':1, 'low':2}
# train_df['interest_level_coded'] = train_df.interest_level.map(target_num_map)
train_idx, test_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25).split(train_df[X_vars], train_df.interest_level))

# The Numerical Part

Grid search and validate `XGBClassifier`.

In [116]:
gbc = GridSearchCV(xgb.XGBClassifier(n_estimators=50, objective='multi:softprob'), 
                   {'learning_rate': [0.01, 0.05, 0.1, 0.2], }, scoring='neg_log_loss')\
            .fit(train_df[X_vars].iloc[train_idx], train_df.interest_level.iloc[train_idx])
print(gbc.best_params_)
pd.DataFrame(gbc.cv_results_)

{'learning_rate': 0.2, 'subsample': 0.5}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_subsample,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,2.260482,0.118512,-0.899855,-0.898677,0.01,0.3,"{'learning_rate': 0.01, 'subsample': 0.3}",10,-0.900142,-0.898647,-0.899117,-0.899382,-0.900305,-0.898004,0.059935,0.002005,0.000526,0.000563
1,2.709633,0.118994,-0.900064,-0.898822,0.01,0.5,"{'learning_rate': 0.01, 'subsample': 0.5}",11,-0.900217,-0.898547,-0.899356,-0.89965,-0.900619,-0.898268,0.106461,0.00842,0.000527,0.000597
2,2.566909,0.115984,-0.900192,-0.898854,0.01,0.7,"{'learning_rate': 0.01, 'subsample': 0.7}",12,-0.900647,-0.898755,-0.899245,-0.899484,-0.900683,-0.898323,0.050328,0.003762,0.000669,0.000479
3,2.336265,0.137082,-0.694868,-0.689976,0.05,0.3,"{'learning_rate': 0.05, 'subsample': 0.3}",7,-0.695233,-0.690332,-0.692602,-0.691495,-0.696771,-0.6881,0.162832,0.024859,0.001721,0.001409
4,2.631484,0.121284,-0.696053,-0.690979,0.05,0.5,"{'learning_rate': 0.05, 'subsample': 0.5}",8,-0.696485,-0.690984,-0.693926,-0.692855,-0.697747,-0.689096,0.06845,0.007444,0.00159,0.001535
5,2.609694,0.13245,-0.696976,-0.691845,0.05,0.7,"{'learning_rate': 0.05, 'subsample': 0.7}",9,-0.697574,-0.691859,-0.694614,-0.693323,-0.698742,-0.690353,0.113866,0.021888,0.001737,0.001212
6,2.214132,0.136113,-0.658113,-0.648819,0.1,0.3,"{'learning_rate': 0.1, 'subsample': 0.3}",4,-0.658165,-0.648424,-0.656471,-0.651405,-0.659704,-0.646629,0.046932,0.004086,0.00132,0.00197
7,2.68114,0.13454,-0.659145,-0.6495,0.1,0.5,"{'learning_rate': 0.1, 'subsample': 0.5}",5,-0.659014,-0.649404,-0.65689,-0.651528,-0.661532,-0.647568,0.098771,0.008906,0.001897,0.001618
8,2.60104,0.133662,-0.659808,-0.650022,0.1,0.7,"{'learning_rate': 0.1, 'subsample': 0.7}",6,-0.659581,-0.650009,-0.657237,-0.651553,-0.662606,-0.648504,0.091532,0.004268,0.002198,0.001245
9,2.332261,0.14171,-0.638253,-0.620376,0.2,0.3,"{'learning_rate': 0.2, 'subsample': 0.3}",2,-0.639136,-0.62107,-0.635155,-0.621632,-0.640469,-0.618425,0.099334,0.015309,0.002257,0.001398


In [119]:
gbc = GridSearchCV(xgb.XGBClassifier(n_estimators=200, objective='multi:softprob'), 
                   {'max_depth': [3, 5, 7]}, scoring='neg_log_loss')\
            .fit(train_df[X_vars].iloc[train_idx], train_df.interest_level.iloc[train_idx])
print(gbc.best_params_)
pd.DataFrame(gbc.cv_results_)

{'max_depth': 5}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,8.039676,0.409955,-0.625936,-0.595014,3,{'max_depth': 3},3,-0.625681,-0.594693,-0.62424,-0.596058,-0.627888,-0.594292,0.275498,0.07609,0.0015,0.000756
1,13.030548,0.633976,-0.615243,-0.515378,5,{'max_depth': 5},1,-0.614748,-0.516259,-0.613652,-0.5144,-0.617329,-0.515477,0.054695,0.041242,0.001541,0.000762
2,18.429672,0.981492,-0.618669,-0.40283,7,{'max_depth': 7},2,-0.61888,-0.401774,-0.618492,-0.398189,-0.618636,-0.408529,0.17732,0.019713,0.00016,0.004287


In [117]:
gbc = GridSearchCV(xgb.XGBClassifier(n_estimators=50, objective='multi:softprob'), 
                   {'colsample_bytree': [0.3, 0.5, 0.7], 'subsample': [0.3, 0.5, 0.7]}, scoring='neg_log_loss')\
            .fit(train_df[X_vars].iloc[train_idx], train_df.interest_level.iloc[train_idx])
print(gbc.best_params_)
pd.DataFrame(gbc.cv_results_)

{'colsample_bytree': 0.7, 'subsample': 0.3}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_colsample_bytree,param_subsample,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,1.37388,0.162061,-0.675198,-0.668141,0.3,0.3,"{'colsample_bytree': 0.3, 'subsample': 0.3}",9,-0.675846,-0.667723,-0.671948,-0.66979,-0.677802,-0.666911,0.341337,0.047884,0.002433,0.001212
1,1.255084,0.141151,-0.674692,-0.667616,0.3,0.5,"{'colsample_bytree': 0.3, 'subsample': 0.5}",7,-0.675617,-0.66736,-0.671238,-0.669445,-0.67722,-0.666043,0.047812,0.022392,0.002528,0.001401
2,1.353777,0.157904,-0.675033,-0.667651,0.3,0.7,"{'colsample_bytree': 0.3, 'subsample': 0.7}",8,-0.676761,-0.667423,-0.671026,-0.669067,-0.677313,-0.666465,0.118052,0.01442,0.002843,0.001074
3,1.49873,0.13389,-0.66492,-0.657101,0.5,0.3,"{'colsample_bytree': 0.5, 'subsample': 0.3}",5,-0.666236,-0.657503,-0.660686,-0.657986,-0.667837,-0.655815,0.153886,0.006623,0.003064,0.000931
4,1.626641,0.140055,-0.6649,-0.657003,0.5,0.5,"{'colsample_bytree': 0.5, 'subsample': 0.5}",4,-0.666057,-0.657479,-0.660767,-0.657866,-0.667876,-0.655666,0.050403,0.0151,0.003015,0.000959
5,1.915306,0.177613,-0.665378,-0.657341,0.5,0.7,"{'colsample_bytree': 0.5, 'subsample': 0.7}",6,-0.665846,-0.657158,-0.662325,-0.65871,-0.667963,-0.656154,0.106659,0.022779,0.002325,0.001052
6,1.778154,0.12763,-0.661042,-0.652855,0.7,0.3,"{'colsample_bytree': 0.7, 'subsample': 0.3}",1,-0.662248,-0.653493,-0.657987,-0.654465,-0.662891,-0.650607,0.174717,0.00214,0.002176,0.001638
7,1.923207,0.128062,-0.661776,-0.652725,0.7,0.5,"{'colsample_bytree': 0.7, 'subsample': 0.5}",2,-0.661931,-0.652561,-0.658557,-0.654114,-0.66484,-0.651501,0.011422,0.006554,0.002567,0.001073
8,1.893166,0.141564,-0.662276,-0.653384,0.7,0.7,"{'colsample_bytree': 0.7, 'subsample': 0.7}",3,-0.66259,-0.653385,-0.659611,-0.65539,-0.664628,-0.651378,0.061233,0.007453,0.00206,0.001638


End up using below.

In [10]:
gbc = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=100, objective='multi:softprob', subsample=0.7)\
         .fit(train_df[X_vars].iloc[train_idx], train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], gbc.predict_proba(train_df[X_vars].iloc[test_idx])))

0.613285262066


# The Text Part

Grid Search and validate `MultinomialNB`.

In [12]:
train_df_text_combined = sparse.hstack([train_df_features, train_df_desc]).tocsr()

In [102]:
nb = GridSearchCV(MultinomialNB(), {'alpha': np.logspace(-3, 1, 10)}, scoring='neg_log_loss')\
            .fit(train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx])
print(nb.best_params_)
pd.DataFrame(nb.cv_results_)

{'alpha': 10.0}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.21052,0.079248,-1.136286,-1.090562,0.001,{'alpha': 0.001},10,-1.155309,-1.083001,-1.089628,-1.085074,-1.163922,-1.103611,0.02642,0.012483,0.033179,0.009266
1,0.188928,0.071923,-1.133565,-1.090592,0.002783,{'alpha': 0.00278255940221},9,-1.153012,-1.083014,-1.087132,-1.085128,-1.160552,-1.103634,0.002472,0.003376,0.032977,0.009262
2,0.184816,0.067084,-1.130892,-1.090644,0.007743,{'alpha': 0.00774263682681},8,-1.150825,-1.083039,-1.084635,-1.085226,-1.157217,-1.103667,0.0084,0.003688,0.032813,0.009252
3,0.209198,0.075314,-1.128211,-1.090717,0.021544,{'alpha': 0.0215443469003},7,-1.148508,-1.083076,-1.082192,-1.085375,-1.153934,-1.103698,0.002557,0.010877,0.032616,0.009227
4,0.22747,0.0713,-1.125511,-1.090773,0.059948,{'alpha': 0.0599484250319},6,-1.146064,-1.083099,-1.079795,-1.085539,-1.150675,-1.10368,0.060248,0.002177,0.032381,0.009181
5,0.188014,0.076167,-1.122686,-1.090654,0.16681,{'alpha': 0.16681005372},5,-1.143347,-1.082967,-1.077424,-1.085562,-1.147287,-1.103432,0.011661,0.00503,0.032045,0.009098
6,0.24246,0.083492,-1.11978,-1.090049,0.464159,{'alpha': 0.464158883361},4,-1.140499,-1.0825,-1.075221,-1.085048,-1.143619,-1.102599,0.01089,0.00274,0.031533,0.008935
7,0.209612,0.080055,-1.115859,-1.088276,1.29155,{'alpha': 1.29154966501},3,-1.136823,-1.081222,-1.071901,-1.083062,-1.138854,-1.100543,0.029166,0.003163,0.031094,0.008707
8,0.227095,0.082816,-1.108898,-1.083227,3.593814,{'alpha': 3.5938136638},2,-1.131564,-1.077183,-1.065103,-1.077728,-1.130026,-1.094769,0.034769,0.007598,0.030974,0.008164
9,0.211974,0.071439,-1.096291,-1.072447,10.0,{'alpha': 10.0},1,-1.123255,-1.065362,-1.052198,-1.069263,-1.113418,-1.082716,0.027598,0.005432,0.031436,0.007434


End up using below.

In [100]:
nb = MultinomialNB(alpha=10)\
         .fit(train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], nb.predict_proba(train_df_text_combined[test_idx])))

1.12692032


Grid Search and validate `SGDClassifier`.

In [109]:
sgd = GridSearchCV(SGDClassifier(n_jobs=-1, loss='log'), {'alpha': np.logspace(-4, -1, 10)}, scoring='neg_log_loss')\
            .fit(train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx])
print(sgd.best_params_)
pd.DataFrame(sgd.cv_results_)

{'alpha': 0.001}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.181878,0.07334,-1.836156,-1.763813,0.0001,{'alpha': 0.0001},10,-1.912437,-1.831883,-1.949561,-1.913284,-1.646449,-1.546274,0.012826,0.005392,0.134989,0.157372
1,0.1748,0.103093,-0.974159,-0.944993,0.000215,{'alpha': 0.000215443469003},9,-0.876842,-0.852395,-1.078281,-1.053093,-0.96736,-0.929491,0.008547,0.02862,0.082379,0.082665
2,0.182999,0.079511,-0.812242,-0.789922,0.000464,{'alpha': 0.000464158883361},7,-0.866304,-0.835097,-0.761933,-0.745402,-0.808486,-0.789268,0.013444,0.012168,0.042693,0.036621
3,0.178709,0.077306,-0.751302,-0.734818,0.001,{'alpha': 0.001},1,-0.755007,-0.734498,-0.747953,-0.739454,-0.750945,-0.730502,0.016126,0.004648,0.002891,0.003661
4,0.171885,0.06686,-0.756982,-0.739404,0.002154,{'alpha': 0.00215443469003},2,-0.771129,-0.748521,-0.746961,-0.735934,-0.752855,-0.733757,0.001552,0.006522,0.010289,0.006508
5,0.174942,0.068327,-0.762057,-0.748879,0.004642,{'alpha': 0.00464158883361},3,-0.766359,-0.750671,-0.756635,-0.749018,-0.763176,-0.746947,0.005104,0.005661,0.004048,0.001524
6,0.168781,0.073671,-0.773503,-0.763341,0.01,{'alpha': 0.01},4,-0.774645,-0.761876,-0.773328,-0.766547,-0.772535,-0.761599,0.002393,0.008573,0.00087,0.00227
7,0.169284,0.066431,-0.785213,-0.777624,0.021544,{'alpha': 0.0215443469003},5,-0.785243,-0.775806,-0.785087,-0.778826,-0.785309,-0.778241,0.00165,0.004488,9.3e-05,0.001308
8,0.173103,0.063775,-0.80125,-0.79627,0.046416,{'alpha': 0.0464158883361},6,-0.802262,-0.795773,-0.80148,-0.796235,-0.800008,-0.796803,0.004597,0.001814,0.000934,0.000421
9,0.174201,0.072648,-0.818672,-0.815675,0.1,{'alpha': 0.1},8,-0.819119,-0.814884,-0.820983,-0.816247,-0.815913,-0.815892,0.006905,0.00094,0.002094,0.000577


End up using below.

In [13]:
sgd = SGDClassifier(alpha=1e-3, n_jobs=-1, loss='log')\
         .fit(train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], sgd.predict_proba(train_df_text_combined[test_idx])))

0.745210826559


In [14]:
gbc_text = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=100, objective='multi:softprob', subsample=0.7)\
         .fit(train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], gbc_text.predict_proba(train_df_text_combined[test_idx])))

0.687420803746


`SGDClassifier` is more effective than `MultinomialNB`, but `XGBClassifier` is even better. But let's keep things fast and use `SGDClassifier`.

# Stacking Together

## Three Methodologies

**The first attempt**: use `XGBClassifier` and `SGDClassifier` to get two sets of predicted probs for the training set within `train_df`. Then use `LogisticRegressionCV` to validate against the training set's reponse. Obviously this is not right.

In [135]:
y_pred_train_gbc = gbc.predict_proba(train_df[X_vars].iloc[train_idx])
y_pred_train_sgd = sgd.predict_proba(train_df_text_combined[train_idx])
lrcv = LogisticRegressionCV(Cs=np.logspace(-3, 1, 10), scoring='neg_log_loss', n_jobs=-1)\
         .fit(np.hstack([y_pred_train_gbc, y_pred_train_sgd]), train_df.interest_level.iloc[train_idx])
lrcv.scores_['low'].mean(axis=0)

array([-0.40199251, -0.31352343, -0.25663374, -0.22867747, -0.21817649,
       -0.21525915, -0.21464798, -0.21454989, -0.21453925, -0.21453941])

In [137]:
print(log_loss(train_df.interest_level.iloc[test_idx], lrcv.predict_proba(np.hstack([y_pred_gbc, y_pred_sgd]))))

0.873679860425


**The second attempt**: use `XGBClassifier` and `SGDClassifier` to get two sets of predicted probs for the test set within `train_df`. Then use `LogisticRegressionCV` to validate against the test set's reponse.

In [152]:
lrcv = LogisticRegressionCV(Cs=np.logspace(-3, 1, 10), scoring='neg_log_loss', n_jobs=-1)\
         .fit(np.hstack([y_pred_gbc, y_pred_sgd]), train_df.interest_level.iloc[test_idx])
lrcv.scores_['low'].mean(axis=0)

array([-0.56942653, -0.52803948, -0.48904405, -0.46792262, -0.46059575,
       -0.45866504, -0.45824157, -0.45817531, -0.4581725 , -0.45817535])

In [150]:
print(log_loss(train_df.interest_level.iloc[test_idx], lrcv.predict_proba(np.hstack([y_pred_gbc, y_pred_sgd]))))

0.61368417946


**The third attempt**: use `XGBClassifier` and `SGDClassifier` to get two sets of predicted probs for the training set within `train_df` through cross validation. Then use `LogisticRegressionCV` to validate against the training set's reponse. This is the right way because we won't have test set to validate with like in attempt 2.

In [15]:
y_pred_train_cv_gbc = cross_val_predict(gbc, train_df[X_vars].iloc[train_idx], train_df.interest_level.iloc[train_idx], 
                                        n_jobs=-1, method='predict_proba')
y_pred_train_cv_sgd = cross_val_predict(sgd, train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx], 
                                        n_jobs=-1, method='predict_proba')
y_pred_gbc = gbc.predict_proba(train_df[X_vars].iloc[test_idx])
y_pred_sgd = sgd.predict_proba(train_df_text_combined[test_idx])

In [163]:
lrcv = LogisticRegressionCV(Cs=np.logspace(-3, 1, 10), scoring='neg_log_loss', n_jobs=-1)\
         .fit(np.hstack([y_pred_train_cv_gbc, y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
lrcv.scores_['low'].mean(axis=0)

array([-0.52942097, -0.49304633, -0.47407423, -0.46772584, -0.46620131,
       -0.46594499, -0.46592832, -0.46593575, -0.46594105, -0.46594352])

In [164]:
print(log_loss(train_df.interest_level.iloc[test_idx], lrcv.predict_proba(np.hstack([y_pred_gbc, y_pred_sgd]))))

0.61735239762


## Other Meta-Models

`LogisticRegressionCV` may not be performing too well. Let's experiment on other meta-models.

Try another `SGDClassifier` for the meta-model.

In [166]:
sgd_meta = GridSearchCV(SGDClassifier(n_jobs=-1, loss='log'), {'alpha': np.logspace(-4, -1, 10)}, scoring='neg_log_loss')\
            .fit(np.hstack([y_pred_train_cv_gbc, y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
print(sgd_meta.best_params_)
pd.DataFrame(sgd_meta.cv_results_)

{'alpha': 0.00021544346900318845}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.163967,0.089536,-0.627055,-0.625334,0.0001,{'alpha': 0.0001},3,-0.625406,-0.625072,-0.627195,-0.626385,-0.628565,-0.624547,0.011267,0.018212,0.001293,0.000773
1,0.154598,0.071983,-0.62511,-0.623015,0.000215,{'alpha': 0.000215443469003},1,-0.624096,-0.62315,-0.623562,-0.622762,-0.627673,-0.623133,0.010058,0.00276,0.001825,0.000179
2,0.146369,0.069727,-0.627628,-0.625141,0.000464,{'alpha': 0.000464158883361},4,-0.625585,-0.624177,-0.63198,-0.626854,-0.62532,-0.624393,0.004317,0.008803,0.003079,0.001214
3,0.146177,0.062691,-0.626631,-0.625356,0.001,{'alpha': 0.001},2,-0.625406,-0.624659,-0.625548,-0.625842,-0.628941,-0.625568,0.003861,0.003804,0.001634,0.000505
4,0.151066,0.063591,-0.629674,-0.628703,0.002154,{'alpha': 0.00215443469003},5,-0.629572,-0.628668,-0.630125,-0.629135,-0.629325,-0.628305,0.007471,0.003549,0.000334,0.00034
5,0.147501,0.063312,-0.638691,-0.638461,0.004642,{'alpha': 0.00464158883361},6,-0.639728,-0.63854,-0.638808,-0.639133,-0.637537,-0.63771,0.004216,0.000855,0.000898,0.000584
6,0.152194,0.07896,-0.656795,-0.656472,0.01,{'alpha': 0.01},7,-0.657586,-0.656415,-0.657242,-0.656405,-0.655557,-0.656597,0.010143,0.021691,0.000886,8.8e-05
7,0.156853,0.075454,-0.685676,-0.685544,0.021544,{'alpha': 0.0215443469003},8,-0.686139,-0.685053,-0.686079,-0.685465,-0.68481,-0.686113,0.008088,0.014735,0.000613,0.000436
8,0.154366,0.078454,-0.719098,-0.718993,0.046416,{'alpha': 0.0464158883361},9,-0.7194,-0.718545,-0.719646,-0.719047,-0.718248,-0.719388,0.007012,0.01259,0.000609,0.000346
9,0.15652,0.151367,-0.747878,-0.747832,0.1,{'alpha': 0.1},10,-0.747976,-0.747392,-0.748178,-0.747829,-0.74748,-0.748276,0.003265,0.080113,0.000293,0.000361


In [167]:
sgd_meta = SGDClassifier(alpha=1e-3, n_jobs=-1, loss='log')\
            .fit(np.hstack([y_pred_train_cv_gbc, y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], sgd_meta.predict_proba(np.hstack([y_pred_gbc, y_pred_sgd]))))

0.619235840253


Try fitting the predicted outcome rather than the probs.

In [179]:
y_pred_train_cv_gbc_1 = cross_val_predict(gbc, train_df[X_vars].iloc[train_idx], train_df.interest_level.iloc[train_idx], 
                                        n_jobs=-1, method='predict')
y_pred_train_cv_sgd_1 = cross_val_predict(sgd, train_df_text_combined[train_idx], train_df.interest_level.iloc[train_idx], 
                                        n_jobs=-1, method='predict')
y_pred_train_cv_combined = pd.get_dummies(pd.DataFrame(np.column_stack([y_pred_train_cv_gbc_1, y_pred_train_cv_sgd_1])))
y_pred_gbc_1 = gbc.predict(train_df[X_vars].iloc[test_idx])
y_pred_sgd_1 = sgd.predict(train_df_text_combined[test_idx])
y_pred_combined = pd.get_dummies(pd.DataFrame(np.column_stack([y_pred_gbc_1, y_pred_sgd_1])))

sgd_meta = SGDClassifier(alpha=1e-3, n_jobs=-1, loss='log')\
            .fit(y_pred_train_cv_combined, train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], sgd_meta.predict_proba(y_pred_combined)))

0.691456951676


Try SVM with `rbf` kernel.

In [184]:
svm_meta = GridSearchCV(SVC(kernel='rbf', decision_function_shape='ovr', probability=True), 
                       {'C': np.logspace(-2, 10, 3), 'gamma': np.logspace(-9, 3, 3)})\
            .fit(np.hstack([y_pred_train_cv_gbc, y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
print(svm_meta.best_params_)
pd.DataFrame(svm_meta.cv_results_)

{'C': 10000000000.0, 'gamma': 0.001}


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,57.857401,3.292474,0.694683,0.694683,0.01,0.0,"{'C': 0.01, 'gamma': 1e-09}",3,0.694627,0.694711,0.694683,0.694683,0.694739,0.694655,2.557063,0.148861,4.6e-05,2.3e-05
1,90.937248,3.978159,0.694683,0.694683,0.01,0.001,"{'C': 0.01, 'gamma': 0.001}",3,0.694627,0.694711,0.694683,0.694683,0.694739,0.694655,2.630024,0.119397,4.6e-05,2.3e-05
2,211.1747,5.865651,0.694683,0.694683,0.01,1000.0,"{'C': 0.01, 'gamma': 1000.0}",3,0.694627,0.694711,0.694683,0.694683,0.694739,0.694655,25.83799,0.226458,4.6e-05,2.3e-05
3,51.83242,2.914677,0.694683,0.694683,10000.0,0.0,"{'C': 10000.0, 'gamma': 1e-09}",3,0.694627,0.694711,0.694683,0.694683,0.694739,0.694655,4.915702,0.34366,4.6e-05,2.3e-05
4,118.375814,3.601804,0.725347,0.72555,10000.0,0.001,"{'C': 10000.0, 'gamma': 0.001}",2,0.722587,0.727011,0.725239,0.725361,0.728216,0.724278,3.377007,0.028662,0.002299,0.001124
5,1235.056354,4.788669,0.619063,0.998055,10000.0,1000.0,"{'C': 10000.0, 'gamma': 1000.0}",9,0.623957,0.997812,0.621414,0.998784,0.611818,0.997569,297.74682,0.094532,0.005227,0.000525
6,41.359506,2.662345,0.689712,0.691103,10000000000.0,0.0,"{'C': 10000000000.0, 'gamma': 1e-09}",7,0.693573,0.6892,0.70141,0.690347,0.674151,0.693763,0.312963,0.083005,0.011458,0.001938
7,240.686356,2.846831,0.731021,0.730859,10000000000.0,0.001,"{'C': 10000000000.0, 'gamma': 0.001}",1,0.730124,0.731307,0.730507,0.730953,0.732431,0.730316,5.271223,0.013222,0.001009,0.00041
8,1764.622503,4.450449,0.6276,0.998946,10000000000.0,1000.0,"{'C': 10000000000.0, 'gamma': 1000.0}",8,0.628819,0.998784,0.619306,0.999352,0.634676,0.998703,515.99517,0.157602,0.006334,0.000288


End up using below.

In [181]:
svm_meta = SVC(kernel='rbf', decision_function_shape='ovr', probability=True, C=10000000000.0, gamma=1e-3)\
            .fit(np.hstack([y_pred_train_cv_gbc, y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], svm_meta.predict_proba(np.hstack([y_pred_gbc, y_pred_sgd]))))

0.608465397572


Try `XGBClassifier` again.

In [209]:
gbc_meta = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=100, objective='multi:softprob', subsample=0.7)\
            .fit(np.hstack([y_pred_train_cv_gbc, y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], gbc_meta.predict_proba(np.hstack([y_pred_gbc, y_pred_sgd]))))

0.624539737738


## Semi-Stacked Model

Not too much of an improvement. Try wrapping the probs outcome of `SGDClassifier` for the text portion as extra features for one single `XGBClassifier` model. This is essentially a semi-stacked model.

In [193]:
gbc_semi = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=500, objective='multi:softprob', subsample=0.7)\
         .fit(np.hstack([train_df[X_vars].iloc[train_idx], y_pred_train_cv_sgd]), train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], gbc_semi.predict_proba(np.hstack([train_df[X_vars].iloc[test_idx], y_pred_sgd]))))

0.612064452084


This is not that great. Try adding predicted probs from both models to all features.

In [18]:
train_df_with_text = sparse.hstack([train_df[X_vars], train_df_features, train_df_desc]).tocsr()

In [19]:
gbc_semi = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=100, objective='multi:softprob', subsample=0.7)\
         .fit(sparse.hstack([train_df_with_text[train_idx], y_pred_train_cv_gbc, y_pred_train_cv_sgd]).tocsr(), train_df.interest_level.iloc[train_idx])
print(log_loss(train_df.interest_level.iloc[test_idx], gbc_semi.predict_proba(sparse.hstack([train_df_with_text[test_idx], y_pred_gbc, y_pred_sgd]).tocsr())))

0.579410067619


Hmm. So using one single model is better?

In [21]:
to_test_df_with_text = sparse.hstack([to_test_df[X_vars], to_test_df_features, to_test_df_desc]).tocsr()
to_test_df_text_combined = sparse.hstack([to_test_df_features, to_test_df_desc]).tocsr()
to_test_y_pred_gbc = gbc.predict_proba(to_test_df[X_vars])
to_test_y_pred_sgd = sgd.predict_proba(to_test_df_text_combined)
to_test_y = gbc_semi.predict_proba(sparse.hstack([to_test_df_with_text, to_test_y_pred_gbc, to_test_y_pred_sgd]).tocsr())

In [29]:
pd.DataFrame(to_test_y, index=to_test_df.listing_id, columns=['high', 'low', 'medium'])[['high', 'medium', 'low']].to_csv('submission_xgb_semistack.csv')