<a href="https://colab.research.google.com/github/tomzw11/rental-listing-inquiries/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import os

# print(os.getcwd())
df_train = pd.read_json("/content/drive/My Drive/Colab Notebooks/train.json")

# df_train.head()

In [0]:
# distribution of manager_id.
from collections import Counter

manager_id = df_train['manager_id']
Counter(list(manager_id)).most_common(20)


In [0]:
import sklearn as sk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# preprocessing.
labels = np.array(df_train["interest_level"])
le = preprocessing.LabelEncoder()
le.fit(labels)
labels_encoded = le.transform(labels)
# print(labels_encoded[0:10])
# print(labels_encoded.dtype)

# feature engineering.
df_train["num_features"] = df_train["features"].apply(len)
df_train["num_photos"] = df_train["photos"].apply(len)
df_train["num_description_words"] = df_train["description"].apply(lambda x: len(x.split(" ")))
df_train["created"] = pd.to_datetime(df_train["created"])
df_train["created_month"] = df_train["created"].dt.month
df_train["created_day"] = df_train["created"].dt.day

# create features on price and number of rooms.
df_train["bedroom per dollar"] = np.divide(df_train["bedrooms"],df_train["price"])*1000
df_train["bathroom per dollar"] = np.divide(df_train["bathrooms"],df_train["price"])*1000

# extract features to train.
features = np.array(df_train[["bathrooms","bedrooms","bedroom per dollar","bathroom per dollar","latitude","longitude","price","num_features",
             "num_photos", "num_features", "num_description_words", "created_month", "created_day"]])

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size = 0.25)

# print('Training Features Shape:', train_features.shape)
# print('Training Labels Shape:', train_labels.shape)
# print('Testing Features Shape:', test_features.shape)
# print('Testing Labels Shape:', test_labels.shape)

In [4]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(train_features,train_labels)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [5]:
predictions = clf.predict(test_features)

accuracy = len(predictions)
for p in zip(predictions,test_labels):
  if p[0]!=p[1]:
    accuracy = accuracy - 1

print("Prediction Accuracy ",accuracy/len(predictions))


Prediction Accuracy  0.7230507375587616


In [6]:
# compute log loss.
from sklearn.metrics import log_loss
predictions_prob = clf.predict_proba(test_features)
logloss = log_loss(test_labels, predictions_prob)

print("Prediction Log Loss ", logloss)

Prediction Log Loss  0.6963354476272136


In [7]:
# visualize decision tree.

# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = clf.estimators_[5]

feature_list = ["bathrooms","bedrooms","latitude","longitude","price"]

# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')


ValueError: ignored

In [8]:
# figure out variable importances.

feature_list = ["bathrooms","bedrooms","bedroom per dollar","bathroom per dollar","latitude","longitude","price","num_features",
             "num_photos", "num_features", "num_description_words", "created_month", "created_day"]

# Get numerical feature importances
importances = list(clf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: latitude             Importance: 0.12
Variable: longitude            Importance: 0.12
Variable: num_description_words Importance: 0.12
Variable: bedroom per dollar   Importance: 0.1
Variable: price                Importance: 0.1
Variable: created_day          Importance: 0.1
Variable: bathroom per dollar  Importance: 0.09
Variable: num_photos           Importance: 0.08
Variable: num_features         Importance: 0.05
Variable: num_features         Importance: 0.05
Variable: created_month        Importance: 0.04
Variable: bedrooms             Importance: 0.02
Variable: bathrooms            Importance: 0.01


In [0]:
# run on submission test.
df_test = pd.read_json("/content/drive/My Drive/Colab Notebooks/test.json")


In [0]:
# feature engineering.
df_test["num_features"] = df_test["features"].apply(len)
df_test["num_photos"] = df_test["photos"].apply(len)
df_test["num_description_words"] = df_test["description"].apply(lambda x: len(x.split(" ")))
df_test["created"] = pd.to_datetime(df_test["created"])
df_test["created_month"] = df_test["created"].dt.month
df_test["created_day"] = df_test["created"].dt.day

# create features on price and number of rooms.
df_test["bedroom per dollar"] = np.divide(df_test["bedrooms"],df_test["price"])*1000
df_test["bathroom per dollar"] = np.divide(df_test["bathrooms"],df_test["price"])*1000


In [0]:
X = df_test[feature_list]

y = clf.predict_proba(X)


In [0]:
# print(y[0:20])

labels2idx = {'high': 0, 'low': 1, 'medium': 2}

labels2idx

sub = pd.DataFrame()
sub["listing_id"] = df_test["listing_id"]
for label in ["high", "medium", "low"]:
    sub[label] = y[:, labels2idx[label]]
sub.to_csv("submission_rf.csv", index=False)