In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import stat
%matplotlib inline
import matplotlib.gridspec as gridspec
import matplotlib as mpl

In [129]:
train = pd.read_csv('data/train.csv').dropna(axis=0)
test = pd.read_csv('data/test.csv').dropna(axis=0)

In [130]:
x_train = train.drop(columns=['logRent'])
x_test = test.drop(columns=['logRent'])

In [131]:
y_train = train.logRent
y_test = test.logRent

In [132]:
x_train.head()

Unnamed: 0,zipcode,year,nonfamily_households,family_households,median_year_structure_built,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,...,workers_16_and_over,commute_35_44_mins,commute_60_more_mins,commute_less_10_mins,commuters_16_over,hispanic_any_race,pop_5_years_over,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english
0,-1.61987,-1.460364,0.158471,-0.198491,-0.176142,-0.528499,-0.306055,-0.425499,-0.315687,-0.087063,...,-0.084909,-0.371999,-0.301785,0.513761,-0.054378,-0.383211,0.030503,0.222105,-0.327087,-0.320742
1,-1.61987,-0.875045,0.130731,-0.180991,-0.158605,-0.507122,-0.331409,-0.70771,-0.428569,0.015281,...,-0.025237,-0.100543,-0.378111,0.636904,0.008004,-0.363406,0.066625,0.207152,-0.310458,-0.306629
2,-1.61987,-0.289726,0.19432,-0.209984,-0.105996,-0.533843,-0.146213,-0.724711,-0.381535,0.202323,...,-0.055929,-0.159153,-0.352669,0.369386,-0.024836,-0.349607,0.049607,0.192289,-0.312758,-0.31459
3,-1.61987,0.295593,0.134999,-0.188305,-0.105996,-0.710207,-0.091095,-0.70771,-0.29217,-0.055301,...,-0.072527,-0.227017,-0.368688,0.29826,-0.043523,-0.337596,0.04074,0.17089,-0.262162,-0.285642
4,-1.61987,0.880912,0.216513,-0.21129,-0.070923,-0.555221,0.081976,-0.663508,-0.282763,-0.189407,...,-0.051846,-0.373541,-0.404495,0.158132,-0.026622,-0.32418,0.066364,0.181813,-0.269592,-0.285642


### Models

In [100]:
import warnings
warnings.filterwarnings('ignore')

In [101]:
from sklearn.linear_model import Ridge, Lasso, LassoCV, ElasticNet, LinearRegression
import sklearn.model_selection as ms
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score
from sklearn import svm

In [102]:
ridge = Ridge()
lasso = Lasso()
net = ElasticNet()
xgb = XGBRegressor()
lm  = LinearRegression()
gb  = GradientBoostingRegressor()
rforest = RandomForestRegressor()

In [103]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

### Lasso 

In [104]:
lasso = Lasso()

lasso = lasso.set_params(random_state=40)

In [105]:
lasso.fit(x_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=40,
      selection='cyclic', tol=0.0001, warm_start=False)

In [106]:
print("The train set R^2 is: %.5f" % lasso.score(x_train, y_train))
print("The test set R^2 is is: %.5f" % lasso.score(x_test, y_test))

The train set R^2 is: 0.00000
The test set R^2 is is: -0.01552


In [107]:
# Initial prediction
lasso_pred = lasso.predict(x_test)

# Print out MAE, MSE, and RMSE
errors = abs(np.expm1(lasso_pred) - np.expm1(y_test))

print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, lasso_pred))
print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(y_test, lasso_pred)))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / np.expm1(y_test))
print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')

Mean Absolute Error (MAE): $ 455.47
Mean Squared Error (MSE): 0.14644906955627407
Root Mean Square Error (RMSE): 0.38268664669187774
Mean Absolute Percent Error (MAPE): 31.59 %.


### Ridge

In [108]:
ridge = Ridge()

ridge = ridge.set_params(random_state=40)

In [109]:
ridge.fit(x_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=40, solver='auto', tol=0.001)

In [110]:
print("The train set R^2 is: %.5f" % ridge.score(x_train, y_train))
print("The test set R^2 is is: %.5f" % ridge.score(x_test, y_test))

The train set R^2 is: 0.07394
The test set R^2 is is: 0.23820


In [111]:
# Initial prediction
ridge_pred = ridge.predict(x_test)

# Print out MAE, MSE, and RMSE
errors = abs(np.expm1(ridge_pred) - np.expm1(y_test))

print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, ridge_pred))
print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(y_test, ridge_pred)))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / np.expm1(y_test))
print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')

Mean Absolute Error (MAE): $ 393.39
Mean Squared Error (MSE): 0.10985940353270253
Root Mean Square Error (RMSE): 0.33145045411449137
Mean Absolute Percent Error (MAPE): 27.88 %.


### E-Net

In [112]:
net = ElasticNet()

net.set_params(random_state=12, normalize=True)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=True, positive=False, precompute=False,
           random_state=12, selection='cyclic', tol=0.0001, warm_start=False)

In [113]:
net.fit(x_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=True, positive=False, precompute=False,
           random_state=12, selection='cyclic', tol=0.0001, warm_start=False)

In [114]:
print("The train set R^2 is: %.5f" % net.score(x_train, y_train))
print("The test set R^2 is is: %.5f" % net.score(x_test, y_test))

The train set R^2 is: 0.00000
The test set R^2 is is: -0.01552


In [115]:
# Initial prediction
net_pred = net.predict(x_test)

# Print out MAE, MSE, and RMSE
errors = abs(np.expm1(ridge_pred) - np.expm1(y_test))

print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, ridge_pred))
print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(y_test, ridge_pred)))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / np.expm1(y_test))
print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')

Mean Absolute Error (MAE): $ 393.39
Mean Squared Error (MSE): 0.10985940353270253
Root Mean Square Error (RMSE): 0.33145045411449137
Mean Absolute Percent Error (MAPE): 27.88 %.


### Random Forest 

In [116]:
rforest = RandomForestRegressor()

rforest.set_params(random_state=52)

# Train the model on training data
rforest.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=52, verbose=0,
                      warm_start=False)

In [117]:
print("The train set R^2 is: %.5f" % rforest.score(x_train, y_train))
print("The test set R^2 is is: %.5f" % rforest.score(x_test, y_test))

The train set R^2 is: 0.84839
The test set R^2 is is: -0.04346


In [118]:
# Initial prediction
rforest_pred = rforest.predict(x_test)

# Calculate the absolute errors
errors = abs(np.expm1(rforest_pred) - np.expm1(y_test))

# Print out MAE, MSE, and RMSE
print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, rforest_pred))
print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(y_test, rforest_pred)))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / np.expm1(y_test))
print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')

Mean Absolute Error (MAE): $ 463.54
Mean Squared Error (MSE): 0.1504774950852752
Root Mean Square Error (RMSE): 0.3879142883231748
Mean Absolute Percent Error (MAPE): 34.05 %.


### Gredien Boosting 

In [119]:
gbm = GradientBoostingRegressor()

gbm.set_params(random_state=42)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [120]:
# Train model on training data
gbm.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [121]:
print("The train set R^2 is: %.5f" % gbm.score(x_train, y_train))
print("The test set R^2 is is: %.5f" % gbm.score(x_test, y_test))

The train set R^2 is: 0.10248
The test set R^2 is is: 0.27184


In [122]:
# Initial prediction
gbm_pred = gbm.predict(x_test)

# Calculate the absolute errors
errors = abs(np.expm1(gbm_pred) - np.expm1(y_test))

# Print out MAE, MSE, and RMSE
print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, gbm_pred))
print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(y_test, gbm_pred)))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / np.expm1(y_test))
print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')

Mean Absolute Error (MAE): $ 382.82
Mean Squared Error (MSE): 0.1050084746572659
Root Mean Square Error (RMSE): 0.3240501113366047
Mean Absolute Percent Error (MAPE): 26.95 %.


### XGboost

In [123]:
xgb = XGBRegressor()

xgb.set_params(random_state=22)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=22,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [124]:
xgb.fit(x_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=22,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [125]:
print("The train set R^2 is: %.5f" % xgb.score(x_train, y_train))
print("The test set R^2 is is: %.5f" % xgb.score(x_test, y_test))

The train set R^2 is: 0.10134
The test set R^2 is is: 0.27225


In [126]:
# Initial prediction
xgb_pred = xgb.predict(x_test)

# Calculate the absolute errors
errors = abs(np.expm1(xgb_pred) - np.expm1(y_test))

# Print out MAE, MSE, and RMSE
print('Mean Absolute Error (MAE): $', round(np.mean(errors), 2))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, xgb_pred))
print('Root Mean Square Error (RMSE):', np.sqrt(mean_squared_error(y_test, xgb_pred)))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / np.expm1(y_test))
print('Mean Absolute Percent Error (MAPE):', round(np.mean(mape), 2), '%.')

Mean Absolute Error (MAE): $ 382.64
Mean Squared Error (MSE): 0.10494866683939943
Root Mean Square Error (RMSE): 0.3239578164505364
Mean Absolute Percent Error (MAPE): 26.94 %.
