In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import model_selection, preprocessing, ensemble
import xgboost as xgb

# Pull in the shared feature-engineering helpers from features.py.
# Add your own feature-engineering functions to features.py and import them here.
# The only rule: each function must take a DataFrame and return a DataFrame (with your new features added).
from features import *
from utils import *

# Seaborn's current colour palette, kept in `color`.
color = sns.color_palette()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the training and test sets from JSON files in the working directory.
df_train = pd.read_json("train.json")
df_test = pd.read_json("test.json")

### PRE-PROCESSING

In [13]:
# Engineer features (from script): run every scrub / feature step on both
# the training and the test frame.
scrub_and_engineer = [scrub, basic_numeric_features, n_log_price, n_expensive]
for func in scrub_and_engineer:
    try:
        # Compute both results before rebinding anything, so a step that fails
        # on the test frame cannot leave train and test with different columns.
        # NOTE(review): this assumes each step returns a new frame rather than
        # mutating its argument in place -- confirm against features.py.
        new_train, new_test = func(df_train), func(df_test)
    except Exception as err:
        # Still best-effort (a failing step is skipped), but report it: a
        # silently skipped step changes the feature set used for training.
        # Catching Exception instead of a bare except lets Ctrl-C through.
        print("skipping %s: %r" % (getattr(func, '__name__', func), err))
        continue
    df_train, df_test = new_train, new_test

In [14]:
# Train on every column that is neither object- nor datetime-typed,
# except for the explicitly excluded ones.
exclude = ['price']
feats_to_train = []
for candidate in df_train.columns.tolist():
    if df_train[candidate].dtype in ['O', '<M8[ns]']:
        continue
    if candidate in exclude:
        continue
    feats_to_train.append(candidate)
feats_to_train

[u'bathrooms',
 u'bedrooms',
 u'latitude',
 u'listing_id',
 u'longitude',
 'num_photos',
 'num_features',
 'num_description_words',
 'created_year',
 'created_month',
 'created_day',
 'n_log_price',
 'n_expensive']

In [56]:
# Integer class ids for the target: high -> 0, medium -> 1, low -> 2.
num_map = {'high': 0, 'medium': 1, 'low': 2}

# Feature matrices for both frames, restricted to the selected columns.
train_X, test_X = (np.array(frame[feats_to_train]) for frame in (df_train, df_test))

# Encoded training labels; no label array is built for the test frame here.
train_y = np.array(df_train['interest_level'].apply(num_map.__getitem__))

### Train XGB

In [16]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=10000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, train and test sets are passed as
        a watch list and training uses ``early_stopping_rounds=20``; when
        omitted, exactly ``num_rounds`` rounds are requested.
    feature_names : optional
        Column names attached to both DMatrix objects. ``None`` (the default)
        leaves the naming to xgboost, as before.
    seed_val
        Value of the ``seed`` training parameter.
    num_rounds
        Number of boosting rounds requested from ``xgb.train``.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the output of ``model.predict`` on the test
        DMatrix (``multi:softprob`` objective, ``num_class=3``) and the
        trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val
    }
    plst = list(param.items())

    # feature_names used to be accepted but ignored; it is now attached to
    # both matrices so the train and test sets always agree on it.
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled test set: monitor it and stop once it stops improving.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds,
                          watchlist, early_stopping_rounds=20)
    else:
        # No labels: nothing to monitor, so run the full number of rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    # NOTE(review): after early stopping, whether predict() uses the best
    # iteration or every trained round depends on the xgboost version --
    # confirm for the installed release.
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [65]:
# Quick validation: score only the first of five shuffled folds.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
cv_scores = []

# Take just the first (dev, val) index pair from the splitter.
dev_index, val_index = next(kf.split(range(train_X.shape[0])))
dev_X, dev_y = train_X[dev_index, :], train_y[dev_index]
val_X, val_y = train_X[val_index, :], train_y[val_index]

# Passing val_y makes runXGB monitor the validation fold while training.
preds, model = runXGB(dev_X, dev_y, val_X, val_y)
cv_scores.append(log_loss(val_y, preds))
print(cv_scores)

[0]	train-mlogloss:1.0458	test-mlogloss:1.04641
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.00059	test-mlogloss:1.00171
[2]	train-mlogloss:0.959909	test-mlogloss:0.961572
[3]	train-mlogloss:0.922966	test-mlogloss:0.925014
[4]	train-mlogloss:0.890925	test-mlogloss:0.893491
[5]	train-mlogloss:0.863205	test-mlogloss:0.866387
[6]	train-mlogloss:0.838811	test-mlogloss:0.842667
[7]	train-mlogloss:0.817922	test-mlogloss:0.822222
[8]	train-mlogloss:0.799245	test-mlogloss:0.804091
[9]	train-mlogloss:0.783691	test-mlogloss:0.789195
[10]	train-mlogloss:0.768711	test-mlogloss:0.774587
[11]	train-mlogloss:0.755515	test-mlogloss:0.761858
[12]	train-mlogloss:0.74432	test-mlogloss:0.751127
[13]	train-mlogloss:0.733878	test-mlogloss:0.741164
[14]	train-mlogloss:0.724564	test-mlogloss:0.732302
[15]	train-mlogloss:0.716234	test-mlogloss:0.724226
[16]	train-mlogloss:0.708845	test

In [64]:
# Fit on the full training set for a fixed 400 rounds and predict the test set.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)

# One column per class, in the same order as the label encoding
# (0 = high, 1 = medium, 2 = low), plus the listing id.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = df_test["listing_id"].values
out_df.to_csv("xgb.csv", index=False)