In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# pull in the feature engineering helpers from features.py
# add your own feature engineering functions to features.py and import them
# only rule is the function must take a dataframe and return a dataframe (with your new features)
from features import *

color = sns.color_palette()
%matplotlib inline

df_train = pd.read_json("train.json")
df_test = pd.read_json("test.json")

### PRE-PROCESSING

In [None]:
#engineer features (from script)
# Every step takes a dataframe and returns a dataframe with the new features.
# A step is applied to train and test together or not at all: if it fails on
# either frame it is skipped for both, so the two frames keep the same columns.
# NOTE(review): this assumes the steps return a new frame; a step that mutates
# its argument in place could still leave train and test out of sync.
scrub_and_engineer = [scrub, engineer, n_log_price, n_expensive]
skipped_steps = []
for func in scrub_and_engineer:
    step_name = getattr(func, "__name__", repr(func))
    try:
        new_train = func(df_train)
        new_test = func(df_test)
    except Exception as err:
        # Stay best-effort, but say which step was dropped and why instead of
        # hiding the failure behind a bare except.
        skipped_steps.append(step_name)
        print("Skipping feature step %s: %s: %s" % (step_name, type(err).__name__, err))
        continue
    df_train, df_test = new_train, new_test

In [None]:
# Candidate training columns: every column whose dtype is neither object/string
# nor datetime64[ns], minus the columns named in `exclude`.
exclude = ['price']
skipped_dtypes = ['O', '<M8[ns]']
feats_to_train = []
for x in df_train.columns.tolist():
    if df_train[x].dtype in skipped_dtypes or x in exclude:
        continue
    feats_to_train.append(x)
feats_to_train

### Hyperparameters

In [None]:
# Validation split: fraction of the rows handed to train_test_split as test_size
test_size = 0.2

# Random forest: number of trees passed to RandomForestClassifier
n_estimators = 1000

### Baseline training

In [None]:
# Baseline model: a random forest on the hand-picked listing features below.
feats_to_train = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day"]
X = df_train[feats_to_train]
y = df_train["interest_level"]

# Fixed seeds: without them every run draws a different validation split and a
# different forest, so the reported log loss is not reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=0)

clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
# predict_proba columns follow clf.classes_; passing them as labels keeps the
# columns lined up even if some class happens to be missing from y_val.
baseline_logloss = log_loss(y_val, y_val_pred, labels=clf.classes_)

In [None]:
# Validation log loss of the baseline forest fitted in the previous cell
baseline_logloss

### Training the new model

In [None]:
# Train on every usable numeric column rather than re-using the hand-picked
# baseline list (with the selection commented out this cell fitted the very
# same features again). Dropped: object/string and timestamp columns, the
# target itself, and anything df_test lacks -- output() indexes df_test with
# this same list.
# NOTE(review): assumes the remaining numeric columns hold no missing values,
# which RandomForestClassifier may reject -- confirm after feature engineering.
feats_to_train = [x for x in df_train.columns.tolist()
                  if df_train[x].dtype not in ['O', '<M8[ns]']
                  and x != "interest_level"
                  and x in df_test.columns]
X = df_train[feats_to_train]
y = df_train["interest_level"]

# Fixed seeds keep the validation split and the forest reproducible, so a
# change in log loss reflects the features and not the random draw.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=0)

clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
# predict_proba columns follow clf.classes_; pass them so log_loss uses the
# same column order even if a class is absent from y_val.
test_logloss = log_loss(y_val, y_val_pred, labels=clf.classes_)

In [None]:
# Validation log loss of the forest fitted in the previous cell
test_logloss

### Try XGB

In [None]:
import xgboost as xgb

# NOTE(review): these paths look like the agaricus demo files from the xgboost
# repository rather than the listings loaded above -- confirm before relying on it.
train_file = 'demo/data/agaricus.txt.train'
test_file = 'demo/data/agaricus.txt.test'
dtrain = xgb.DMatrix(train_file)
dtest = xgb.DMatrix(test_file)

# specify parameters via map
param = dict(
    max_depth=2,
    eta=1,
    silent=0,                      # prints running messages while training
    objective='binary:logistic',
    booster='gbtree',              # options: gbtree, gblinear or dart
    early_stopping_rounds=10,
)
num_round = 2

In [None]:
# early_stopping_rounds is a keyword argument of xgb.train(), not a booster
# parameter, so it does nothing useful inside the params dict. Take it out of a
# copy of `param` (the original dict is left untouched) and pass it properly.
booster_params = dict(param)
stopping_rounds = booster_params.pop('early_stopping_rounds', None)

train_kwargs = {}
if stopping_rounds is not None:
    # Early stopping needs an evaluation set and watches the last entry of
    # `evals`. dtest is the only other DMatrix in this notebook.
    # NOTE(review): swap in a dedicated validation DMatrix before trusting
    # predictions on dtest, otherwise the test data steers the stopping point.
    train_kwargs = {
        'evals': [(dtrain, 'train'), (dtest, 'eval')],
        'early_stopping_rounds': stopping_rounds,
    }

bst = xgb.train(booster_params, dtrain, num_round, **train_kwargs)
preds = bst.predict(dtest)

### Testing for raw improvement

In [None]:
# A lower validation log loss than the baseline counts as an improvement.
# print() with a single argument behaves the same on Python 2 and Python 3.
# NOTE(review): `clf` holds whichever model was fitted last, so the submission
# cells below do not fall back to the baseline forest on their own.
if test_logloss < baseline_logloss:
    print("Model improved, save and submit")
else:
    print("Use baseline model, did not improve")

### Clean up and output a submission file

In [None]:
def output(df_test, clf, features=None):
    """Build the submission frame of class probabilities for the test listings.

    Parameters
    ----------
    df_test : DataFrame holding a "listing_id" column plus the feature columns.
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``.
    features : optional list of column names to feed to ``clf``. Defaults to
        the notebook-level ``feats_to_train`` so existing calls keep working.

    Returns
    -------
    DataFrame with the columns "listing_id", "high", "medium", "low" (in that
    order), sharing the index of ``df_test``.

    Raises
    ------
    KeyError if a requested column is missing from ``df_test`` or if ``clf``
    does not know one of the three labels.
    """
    if features is None:
        features = feats_to_train
    proba = clf.predict_proba(df_test[features])
    # predict_proba columns are ordered like clf.classes_, not like the
    # submission format, so look each label's column up by name.
    column_of = {label: i for i, label in enumerate(clf.classes_)}
    sub = pd.DataFrame({"listing_id": df_test["listing_id"]})
    for label in ["high", "medium", "low"]:
        sub[label] = proba[:, column_of[label]]
    return sub

In [None]:
# Class probabilities for the test listings, predicted with the current `clf`
sub = output(df_test, clf)

In [None]:
# Write the submission to disk; index=False leaves the dataframe index out of the file
sub.to_csv("submission_rf.csv", index=False)

In [None]:
# Read the file that was just written back in (presumably as a sanity check)
submission = pd.read_csv('submission_rf.csv')

In [None]:
# Number of rows in the saved submission
len(submission)

### DRACE

In [None]:
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()

#list of id's to encode
manager_ids = list(df_train['manager_id'].values)
#grouping column (label encoding into 'manager_id_encoded' is not applied here)
new_var = 'manager_id'
#response var
resp_var = 'interest_level'

# Share of high / low / medium interest listings per manager.
# The dummy columns are renamed and selected by label, so a missing or
# unexpected interest level raises a KeyError instead of silently shifting
# the column names.
frac_cols = ['high_frac', 'low_frac', 'medium_frac']
interest_dummies = pd.get_dummies(df_train[resp_var]).astype(float)
temp = pd.concat([df_train[new_var], interest_dummies], axis=1).groupby(new_var).mean()
temp = temp.rename(columns={'high': 'high_frac', 'low': 'low_frac', 'medium': 'medium_frac'})[frac_cols]

# Rows per manager. size() counts rows; count() on an arbitrary column would
# only count that column's non-null values.
temp['count'] = df_train.groupby(new_var).size()

# compute skill: a high-interest listing weighs twice as much as a medium one
temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

In [None]:
# Managers with a count below 20 are "unranked": their fractions and skill are
# replaced by the column means taken over the ranked managers.
stat_cols = ['high_frac', 'low_frac', 'medium_frac', 'manager_skill']
unranked_managers_ixes = temp['count'] < 20
ranked_managers_ixes = ~unranked_managers_ixes
mean_values = temp.loc[ranked_managers_ixes, stat_cols].mean()
temp.loc[unranked_managers_ixes, stat_cols] = mean_values.values

In [None]:
# Preview the per-manager table instead of dumping every row
temp.head(10)

In [None]:
# Label-encode the manager ids. As before, the raw 'manager_id' column is
# overwritten with the integer codes; the same codes are also stored under
# 'manager_id_encoded' so that features_to_use names a column that exists.
lbl = preprocessing.LabelEncoder()
lbl.fit(list(df_train['manager_id'].values))
df_train['manager_id'] = lbl.transform(list(df_train['manager_id'].values))
df_train['manager_id_encoded'] = df_train['manager_id']

features_to_use = ["latitude",
                   "longitude",
                   "price",
                   "num_photos",
                   "num_features",
                   "num_description_words",
                   "manager_id_encoded"
                   ]

# X_train is built from feats_to_train and need not contain a manager column,
# so the ids of the training rows are looked up in df_train through the row
# index that X_train and y_train share with it.
# NOTE(review): relies on df_train having a unique index -- confirm.
manager_train = df_train.loc[X_train.index, 'manager_id']

# compute fractions and count for each manager (training rows only)
frac_cols = ['high_frac', 'low_frac', 'medium_frac']
skill_cols = frac_cols + ['manager_skill']
interest_dummies = pd.get_dummies(y_train).astype(float)
temp = pd.concat([manager_train, interest_dummies], axis=1).groupby('manager_id').mean()
# Rename and select by label so that a missing interest level raises a
# KeyError instead of silently shifting the column names.
temp = temp.rename(columns={'high': 'high_frac', 'low': 'low_frac', 'medium': 'medium_frac'})[frac_cols]
# value_counts() gives rows per manager; the assignment aligns on the manager id index
temp['count'] = manager_train.value_counts()

# compute skill: a high-interest listing weighs twice as much as a medium one
temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

# managers with fewer training rows than this are treated as unranked
min_listings = 20
# get ixes for unranked managers...
unranked_managers_ixes = temp['count'] < min_listings
# ... and ranked ones
ranked_managers_ixes = ~unranked_managers_ixes

# compute mean values from ranked managers and assign them to unranked ones
mean_values = temp.loc[ranked_managers_ixes, skill_cols].mean()
print(mean_values)
temp.loc[unranked_managers_ixes, skill_cols] = mean_values.values