In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# pull feature engineering notebook
# add your own feature engineering functions to features.py and import them
# only rule is the function must take a dataframe and return a dataframe (with your new features)
from features import engineer, scrub

color = sns.color_palette()
%matplotlib inline

df_train = pd.read_json("train.json")
df_test = pd.read_json("test.json")

### PRE-PROCESSING

In [2]:
#engineer features (from script)
df_train = scrub(df_train)
df_test = scrub(df_test)
df_train = engineer(df_train)
df_test = engineer(df_test)

In [12]:
df_train.columns

Index([            u'bathrooms',              u'bedrooms',
                 u'building_id',               u'created',
                 u'description',       u'display_address',
                    u'features',        u'interest_level',
                    u'latitude',            u'listing_id',
                   u'longitude',            u'manager_id',
                      u'photos',                 u'price',
              u'street_address',            u'num_photos',
                u'num_features', u'num_description_words',
                u'created_year',         u'created_month',
                 u'created_day',   u'n_num_keyfeat_score'],
      dtype='object')

### Baseline training

In [17]:
feats_to_train = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day"]
X = df_train[feats_to_train]
y = df_train["interest_level"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20)

clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
baseline_logloss = log_loss(y_val, y_val_pred)

In [18]:
baseline_logloss

0.62741230333050824

### Training the new model

In [13]:
feats_to_train = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day", 'n_num_keyfeat_score']
X = df_train[feats_to_train]
y = df_train["interest_level"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
test_logloss = log_loss(y_val, y_val_pred)

### Testing for raw improvement

In [17]:
test

0.62493902735508611

In [18]:
baseline_logloss

0.62489148464132027

In [20]:
baseline_logloss - test_logloss

NameError: name 'test_logloss' is not defined

### Clean up and output a submission file

In [7]:
X = df_test[feats_to_train]
y = clf.predict_proba(X)

In [9]:
labels2idx = {label: i for i, label in enumerate(clf.classes_)}

In [10]:
sub = pd.DataFrame()
sub["listing_id"] = df_test["listing_id"]
for label in ["high", "medium", "low"]:
    sub[label] = y[:, labels2idx[label]]

In [None]:
sub.to_csv("submission_rf.csv", index=False)