# Build the model

This notebook fits and tunes gradient-boosting regression models that predict the listing price from a dataset in which the boroughs and room types are one-hot encoded.

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

In [24]:
# Read the listings table (boroughs already one-hot encoded) from disk.
filename = 'listings_borough.csv'
data_borough = pd.read_csv(filepath_or_buffer=filename)

# Print column names, dtypes and non-null counts as a quick sanity check.
data_borough.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50204 entries, 0 to 50203
Data columns (total 14 columns):
Unnamed: 0                           50204 non-null int64
price                                50204 non-null int64
minimum_nights                       50204 non-null int64
number_of_reviews                    50204 non-null int64
reviews_per_month                    50204 non-null float64
availability_365                     50204 non-null int64
neighbourhood_group_Bronx            50204 non-null int64
neighbourhood_group_Brooklyn         50204 non-null int64
neighbourhood_group_Manhattan        50204 non-null int64
neighbourhood_group_Queens           50204 non-null int64
neighbourhood_group_Staten Island    50204 non-null int64
room_type_Entire home/apt            50204 non-null int64
room_type_Private room               50204 non-null int64
room_type_Shared room                50204 non-null int64
dtypes: float64(1), int64(13)
memory usage: 5.4 MB


In [25]:
# Target: the listing price.
y = data_borough['price']

# Build the feature matrix as a NEW frame instead of deleting columns from
# `data_borough` in place, so this cell can be re-run without a KeyError.
# Dropped: the CSV's saved index column, the target itself, and the two
# review-count features that the model does not use.
X = data_borough.drop(
    columns=['Unnamed: 0', 'price', 'number_of_reviews', 'reviews_per_month']
)

# Standardise the two continuous features to zero mean and unit variance.
# A z-score divides by the standard deviation; the previous version divided
# by std**2 (the variance), which left the columns on an arbitrary scale.
# NOTE(review): the mean/std are computed over the full dataset, i.e. before
# the train/test split, so the test rows influence the scaling statistics.
for column in ['availability_365', 'minimum_nights']:
    X[column] = (X[column] - X[column].mean()) / X[column].std()

In [26]:
# Hold out 15% of the rows as a test set. The split is seeded so that the
# reported errors are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

In [27]:
# Fit a gradient-boosting regressor with a fixed, hand-picked configuration.
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=3,
    max_features=0.3,
    loss='huber',
    random_state=0
)
model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
#joblib.dump(model, 'trained_house_classifier_model_year_sq.pkl')

# Mean absolute error on the training set. The metric is MAE, so the variable
# is named accordingly (it used to be called `mse`), and the training and test
# values are kept in separate names so neither overwrites the other.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % train_mae)

# Mean absolute error on the held-out test set.
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % test_mae)

Training Set Mean Absolute Error: 61.5655
Test Set Mean Absolute Error: 59.3221


Run a grid search over the hyperparameters to improve the model

In [28]:
# Create the base estimator. It is seeded so that every candidate in the grid
# is trained with the same randomness and the search is reproducible.
model = ensemble.GradientBoostingRegressor(random_state=0)

# Parameters we want to try.
# NOTE: this grid has 3 * 2 * 3 * 3 * 3 * 3 = 486 combinations, each of which
# is refitted once per cross-validation fold, with up to 2000 trees per fit.
# Expect a very long run time.
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9],
    'learning_rate': [0.1, 0.05, 0.02],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

# Define the grid search we want to run. Run it with six cpus in parallel.
# Candidates are ranked by (negated) mean absolute error so that model
# selection optimises the same metric that is reported below; without an
# explicit `scoring`, the estimator's default score (R^2) would be used.
gs_cv = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_absolute_error',
    n_jobs=6
)

# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

# Mean absolute error on the training set using the best parameters
train_mae = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % train_mae)

# Mean absolute error on the test set using the best parameters
test_mae = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % test_mae)




KeyboardInterrupt: 