# Build the model

This notebook is meant to present some models to fit our dataset with the boroughs one hot encoded

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
import numpy as np

In [3]:
filename = '../files/listings_neighbourhood.csv'
data_borough = pd.read_csv(filename)
data_borough.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5296 entries, 0 to 5295
Data columns (total 13 columns):
Unnamed: 0                           5296 non-null int64
id                                   5296 non-null int64
price                                5296 non-null int64
minimum_nights                       5296 non-null int64
number_of_reviews                    5296 non-null int64
reviews_per_month                    5296 non-null float64
availability_365                     5296 non-null int64
room_type_Entire home/apt            5296 non-null int64
room_type_Private room               5296 non-null int64
room_type_Shared room                5296 non-null int64
neighbourhood_Harlem                 5296 non-null int64
neighbourhood_Morningside Heights    5296 non-null int64
neighbourhood_Upper West Side        5296 non-null int64
dtypes: float64(1), int64(12)
memory usage: 538.0 KB


In [3]:
y = data_borough['price']
del data_borough['Unnamed: 0']
del data_borough['price']
del data_borough['number_of_reviews']
del data_borough['reviews_per_month']
data_borough['availability_365'] = (data_borough['availability_365'] - data_borough['availability_365'].mean())/(data_borough['availability_365'].std())**2
data_borough['minimum_nights'] = (data_borough['minimum_nights'] - data_borough['minimum_nights'].mean())/(data_borough['minimum_nights'].std())**2
X = data_borough

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [5]:
#Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=3,
    max_features=0.3,
    loss='huber',
    random_state=0
)
model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
#joblib.dump(model, 'trained_house_classifier_model_year_sq.pkl')

# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 59.9824
Test Set Mean Absolute Error: 54.0978


Perform the grid search to ameliorate the model

In [11]:
# Create the model
model = ensemble.GradientBoostingRegressor()

# Parameters we want to try
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9],
    'learning_rate': [0.1, 0.05, 0.02],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

# Define the grid search we want to run. Run it with six cpus in parallel.
gs_cv = GridSearchCV(model, param_grid, n_jobs=6)

# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

# Find the error rate on the training set using the best parameters
mse = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set using the best parameters
mse = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)




{'learning_rate': 0.02, 'loss': 'huber', 'max_depth': 4, 'max_features': 0.1, 'min_samples_leaf': 3, 'n_estimators': 500}
Training Set Mean Absolute Error: 62.3416
Test Set Mean Absolute Error: 53.6109


After the grid search, we see that we have a worst training estimation but a test set estimation a bit better. Our model is not very accurate. It is not surprising since we don't have enough explanatory variables and that we have outlier variables (more that 2000 and less than 10).

In [10]:
# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'Neighborhoods_near_Columbia.pkl')

['Neighborhoods_near_Columbia.pkl']

In [6]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
model = RandomForestClassifier()

parameters = {
     'n_estimators':(100, 30, 50), #the number of trees
     'max_depth':(4,8,15),
     'min_samples_split': (2, 4, 8),
     'min_samples_leaf': (4,8,16)
}

model = GridSearchCV(RandomForestClassifier(),parameters,cv=3,iid=False)
model.fit(X_train, y_train)
model.best_score_, model.best_params_



(0.07848099939369152,
 {'max_depth': 4,
  'min_samples_leaf': 4,
  'min_samples_split': 4,
  'n_estimators': 100})

In [7]:
#Fit regression model
model = RandomForestClassifier(n_estimators=100, max_depth=4, min_samples_leaf=4, min_samples_split=4)

model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
#joblib.dump(model, 'trained_house_classifier_model_year_sq.pkl')

# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 64.9576
Test Set Mean Absolute Error: 61.8994


Worst Results than with the previous model 

In [6]:
y_train.describe()

count     4501.000000
mean       154.699400
std        277.012429
min          0.000000
25%         75.000000
50%        109.000000
75%        174.000000
max      10000.000000
Name: price, dtype: float64

In [7]:
y_train_predict = model.predict(X_train)
pd.Series(y_train_predict).describe()

count    4501.000000
mean      129.425902
std        56.379811
min        18.160371
25%        79.034523
50%       122.947265
75%       178.409500
max       514.611432
dtype: float64

In [8]:
y_test.describe()

count     795.000000
mean      140.227673
std       160.582927
min        22.000000
25%        70.500000
50%       101.000000
75%       159.000000
max      2900.000000
Name: price, dtype: float64

In [9]:
y_predict = model.predict(X_test)
pd.Series(y_predict).describe()

count    795.000000
mean     129.255223
std       55.120134
min       37.151722
25%       78.728636
50%      125.126330
75%      178.409500
max      333.021529
dtype: float64

In [12]:
y_predict

array([166.37392534, 155.43575278, 169.25152191,  65.16476195,
       194.10020108,  97.54119922,  78.71838923, 135.21960137,
       178.40950022,  85.61430252,  90.49340857,  82.03072518,
        60.22432809, 176.07907405, 232.17815502,  64.91009751,
       131.13977007,  51.64404324, 135.21960137, 279.63009005,
       152.6181683 , 151.63951069, 172.66818874,  70.12176891,
        67.64445738,  73.22773747, 133.59183098,  60.30312911,
        73.22773747, 194.42866431, 142.46892027, 188.88556754,
       105.28836542, 211.23523252,  56.49375724, 129.48668094,
       139.17232151, 196.13658252,  91.41809558, 162.7271592 ,
       102.4775506 ,  96.79687475, 184.84735878, 189.17539444,
        68.06150405,  98.02399953, 101.40145609,  92.81950663,
        73.04247925, 104.79754522, 185.23916581,  97.80552046,
        99.59923327, 135.21960137, 150.06532023,  72.59554975,
       112.03015824,  44.58193466,  61.25607374, 194.11764293,
       135.85056796, 188.12008239, 191.04479787,  77.72

In [11]:
np.sum(y>500)/len(y)

0.01850453172205438

We see that our model is really bad at estimating the outliers values. In addition these values only represent 1.8% of the values in our dataset. Furthermore, if we have the goal to predict the airbnb our parent should rent for our graduation, they won't take listings that are more than $500 per day. Hence it seems reasonnable to fit our model on values that are less than 500. This is what we do below.

In [4]:
filename = '../files/listings_neibourhood.csv'
data_borough = pd.read_csv(filename)
data_borough.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5296 entries, 0 to 5295
Data columns (total 13 columns):
Unnamed: 0                           5296 non-null int64
id                                   5296 non-null int64
price                                5296 non-null int64
minimum_nights                       5296 non-null int64
number_of_reviews                    5296 non-null int64
reviews_per_month                    5296 non-null float64
availability_365                     5296 non-null int64
room_type_Entire home/apt            5296 non-null int64
room_type_Private room               5296 non-null int64
room_type_Shared room                5296 non-null int64
neighbourhood_Harlem                 5296 non-null int64
neighbourhood_Morningside Heights    5296 non-null int64
neighbourhood_Upper West Side        5296 non-null int64
dtypes: float64(1), int64(12)
memory usage: 538.0 KB


In [55]:
data_borough = data_borough[data_borough['price'] <= 500]
y = data_borough['price']
del data_borough['Unnamed: 0']
del data_borough['price']
del data_borough['number_of_reviews']
del data_borough['reviews_per_month']
data_borough['availability_365'] = (data_borough['availability_365'] - data_borough['availability_365'].mean())/(data_borough['availability_365'].std())**2
data_borough['minimum_nights'] = (data_borough['minimum_nights'] - data_borough['minimum_nights'].mean())/(data_borough['minimum_nights'].std())**2
X = data_borough
print('y:', y.shape)
print('X:', X.shape)

y: (5198,)
X: (5198, 8)


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [57]:
#Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=4,
    min_samples_leaf=3,
    max_features=0.1,
    loss='huber',
    random_state=0
)
model.fit(X_train, y_train)
# Save the trained model to a file so we can use it in other programs
#joblib.dump(model, 'trained_house_classifier_model_year_sq.pkl')

# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 40.2254
Test Set Mean Absolute Error: 40.1195


In [58]:
y_test.describe()

count    780.000000
mean     128.194872
std       79.906222
min       10.000000
25%       70.000000
50%      100.000000
75%      165.000000
max      500.000000
Name: price, dtype: float64

In [59]:
y_predict = model.predict(X_test)
pd.Series(y_predict).describe()

count    780.000000
mean     121.973005
std       47.590107
min       45.437776
25%       78.510950
50%      112.428569
75%      168.016473
max      269.770394
dtype: float64

Delete outlier values and see if the model performs better

In [60]:
list_ind = list(y_test[y_test<=250].index)
len(list_ind)

719

In [61]:
list_row = []
for i in range(len(y_test)):
    if y_test.index[i] in list_ind:
        list_row += [i]
len(list_row)

719

In [62]:
y_test = y_test[list_ind]
y_test.shape

(719,)

In [63]:
y_predict = y_predict[list_row]
y_predict.shape

(719,)

In [64]:
mse = mean_absolute_error(y_test, y_predict)
print("Test Set Mean Absolute Error: %.4f" % mse)

Test Set Mean Absolute Error: 31.4776


In [70]:
y_test.describe()

count    698.000000
mean     113.858166
std       54.545612
min       40.000000
25%       70.000000
50%       99.000000
75%      150.000000
max      250.000000
Name: price, dtype: float64

In [71]:
pd.Series(y_predict).describe()

count    698.000000
mean     118.251413
std       44.758398
min       54.190025
25%       78.450416
50%      106.665159
75%      159.743814
max      269.770394
dtype: float64

In conclusion, we can see that our model is still very bad to predict outlier values (more than 250 dollars) but is fair enough to have a first approximation for the listing that are lesser or equal to 250 dollars (which is approximately the range of price our parents would look for if they want to attend our graduation). $30 of difference with the true value of the listing in average is not that bad since our features are not very explanatory of the price. In addition our model seems to overestimate the price (by 5 dollars in average and by 15 dollars in if we look at the min/max/quartile values)

In [72]:
# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'Neighborhoods_near_Columbia_500.pkl')

['Neighborhoods_near_Columbia_500.pkl']