In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import model_selection, preprocessing, ensemble
import xgboost as xgb

# Pull in the feature-engineering helpers from features.py.
# Add your own feature-engineering functions to features.py and import them here.
# The only rule: each function must take a DataFrame and return a DataFrame (with your new features added).
from features import *
from utils import *

# Seaborn's current colour palette.
# NOTE(review): `color` is not referenced in the cells visible here — confirm it is still needed.
color = sns.color_palette()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the training and test listings from the JSON files in the working directory.
df_train, df_test = (
    pd.read_json(json_path) for json_path in ("train.json", "test.json")
)

### PRE-PROCESSING

In [None]:
# Feature pipeline. The step functions come from features.py; each one takes a
# DataFrame and returns a DataFrame. Steps run in the order listed.
# Comment a line in or out to toggle that step.
scrub_and_engineer = [
    scrub,
    basic_numeric_features,
    n_log_price,
    n_expensive,
    # price_vs_mean_72,
    count_caps,
    laundry,
    # preWar,
    # furnished,
    # dishwash,
    # hardwood,
    # fitness,
    # doorman,
    # no_fee,
]

for func in scrub_and_engineer:
    try:
        # Run the step on copies of BOTH frames before rebinding either one.
        # If the step fails on one frame, neither frame keeps its changes, so
        # train and test always end up with the same set of engineered columns
        # (the later `df_test[feats_to_train]` lookup relies on that).
        new_train = func(df_train.copy())
        new_test = func(df_test.copy())
    except Exception as e:
        # Best-effort pipeline: report the failing step and move on to the next.
        print("Skipping {}: {!r}".format(getattr(func, '__name__', func), e))
        continue
    df_train, df_test = new_train, new_test

In [None]:
# Columns that are never used as model inputs, whatever their dtype.
exclude = ['price']

# Keep every column whose dtype is neither object ('O') nor datetime64[ns]
# ('<M8[ns]') and that is not explicitly excluded above.
feats_to_train = []
for col in df_train.columns.tolist():
    if df_train[col].dtype in ['O', '<M8[ns]']:
        continue
    if col in exclude:
        continue
    feats_to_train.append(col)

# Show the selected feature names as the cell output.
feats_to_train

In [None]:
# Display the test frame's column index for inspection.
df_test.columns

In [None]:
# Integer code for each interest level.
num_map = {'high':0, 'medium':1, 'low':2}


def encode_interest(level):
    """Return the integer code for one interest-level string (KeyError if unknown)."""
    return num_map[level]


# Feature matrices: the same selected columns, in the same order, for both frames.
train_X = np.array(df_train[feats_to_train])
test_X = np.array(df_test[feats_to_train])

# Training labels as integer codes. No label array is built for the test frame here.
interest_codes = df_train['interest_level'].apply(encode_interest)
train_y = np.array(interest_codes)

### Train XGB

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=10000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and integer class labels.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is also used as a
        watch-list evaluation set and training stops early after 20 rounds
        without improvement.
    feature_names : optional
        Feature names attached to both the training and the prediction
        ``DMatrix``. ``None`` leaves the names unset.
    seed_val
        Value for the XGBoost ``seed`` parameter.
    num_rounds
        Number of boosting rounds requested from ``xgb.train``.

    Returns
    -------
    (pred_test_y, model)
        The output of ``model.predict`` on ``test_X`` and the trained model.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val
    }

    plst = list(param.items())
    # Both matrices get the same feature names so that they stay consistent
    # with each other at prediction time.
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled test data: monitor it during training and stop early.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds,
                          watchlist, early_stopping_rounds=20)
    else:
        # No labels: train for exactly the requested number of rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    # NOTE(review): predict() is not restricted to the best iteration found by
    # early stopping; depending on the xgboost version this may include the
    # rounds trained after the best score — confirm against the installed version.
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [None]:
# Quick validation: split the training rows into 5 shuffled folds and score
# the model on a held-out fold using multi-class log loss.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(train_X):
    dev_X, dev_y = train_X[dev_index], train_y[dev_index]
    val_X, val_y = train_X[val_index], train_y[val_index]
    # Passing val_y makes runXGB watch the validation fold and stop early.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    # Only the first fold is evaluated; the loop exits after one iteration.
    break

In [None]:
# Fit on the full training set for a fixed 400 rounds (no labels are passed for
# the test rows, so there is no early stopping) and predict on the test set.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)

# One column per class, in the order of the integer codes 0, 1, 2 used for the labels.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = df_test["listing_id"].values

# Write the predictions to disk without the row index.
out_df.to_csv("xgb.csv", index=False)