In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import AdaBoostRegressor


In [2]:
# Load the preprocessed training data; the first CSV column becomes the row index
# and the first row supplies the column names.
train = pd.read_csv(
    'data/Deep_housingdata.csv',
    header=0,
    index_col=0,
)

In [3]:
# Load the preprocessed test data with the same index/header handling as `train`.
test = pd.read_csv(
    'data/test_full.csv',
    header=0,
    index_col=0,
)

In [4]:
# Summarise the training frame: row count, column count, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 319 entries, mssubclass__20 to SalePrice
dtypes: float64(3), int64(316)
memory usage: 3.6 MB


In [5]:
# Summarise the test frame for comparison with `train` (it has no SalePrice column).
# NOTE(review): the recorded output lists 9 float64 columns here versus 3 in `train`;
# worth confirming whether that reflects missing values in the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Columns: 318 entries, mssubclass__20 to CentralAirCond
dtypes: float64(9), int64(309)
memory usage: 3.6 MB


In [7]:
# Feature matrix: every training column except the target.
X = train.drop(labels=['SalePrice'], axis='columns')

In [8]:
# Confirm that dropping the target left one fewer column than `train`.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 318 entries, mssubclass__20 to CentralAirCond
dtypes: float64(3), int64(315)
memory usage: 3.6 MB


In [13]:
# Target vector: the SalePrice column of the training frame.
Y = train.loc[:, 'SalePrice']

In [15]:
# Display-only: shows the target as a single-column 2-D array.
# The result is not assigned, so `Y` itself is unchanged.
Y.values.reshape(-1,1)

array([[208500],
       [181500],
       [223500],
       ...,
       [266500],
       [142125],
       [147500]], dtype=int64)

In [16]:
# Display-only: shows the target as a flat 1-D array.
# The result is not assigned, so `Y` itself is unchanged.
Y.ravel()

array([208500, 181500, 223500, ..., 266500, 142125, 147500], dtype=int64)

In [17]:
# Display-only: flattens every value of the feature matrix into one long column.
# The result is not assigned, so `X` keeps its original rows-by-features layout,
# which is what the train/test split cell below receives.
X.values.reshape(-1,1)

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [104]:
# Base learner for the first AdaBoost model: a regression tree limited to depth 10.
dtree = DecisionTreeRegressor(max_depth = 10)

In [105]:
# First AdaBoost model: 1000 boosted copies of `dtree` with a 0.1 learning rate,
# seeded for repeatability.
# The base learner is passed positionally because the keyword for this argument is
# `base_estimator` in older scikit-learn releases and `estimator` in newer ones;
# it is the first parameter in both, so the positional form works with either.
adabst = AdaBoostRegressor(
    dtree,
    n_estimators=1000,
    learning_rate=0.1,
    random_state=42,
)

In [106]:
# Train the first AdaBoost model on the training split.
adabst.fit(X_train, Y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.1, loss='linear', n_estimators=1000,
         random_state=42)

In [107]:
# Raw importance scores as a positional array; the cells that follow pair them
# with the names in X.columns to make them readable.
adabst.feature_importances_

array([3.51143594e-04, 2.59566160e-04, 4.53732131e-06, 2.16716853e-06,
       1.18925156e-04, 1.47562061e-03, 9.28161416e-05, 4.02988907e-05,
       6.19570082e-05, 1.86705019e-06, 2.51208811e-05, 3.74677394e-05,
       3.20025466e-05, 1.10700477e-06, 1.94424571e-05, 0.00000000e+00,
       3.87348644e-05, 6.71226104e-05, 1.12551132e-05, 5.85591601e-04,
       1.12197409e-03, 3.41494471e-06, 1.85014109e-06, 1.39509096e-03,
       4.61227734e-05, 8.84147577e-05, 4.93880229e-04, 2.17550068e-04,
       2.11107243e-04, 1.49569497e-03, 2.13023928e-03, 5.39730720e-04,
       7.83851670e-05, 1.02250596e-03, 2.93763127e-07, 4.42227714e-04,
       1.31485585e-03, 1.01066662e-04, 5.46071429e-05, 1.92801621e-03,
       1.24899487e-03, 1.74734478e-04, 3.20818778e-04, 1.08113214e-05,
       1.02726798e-06, 5.20457754e-06, 5.12122474e-05, 1.77987374e-04,
       2.91223357e-04, 4.93370375e-04, 1.34696781e-03, 4.24113733e-05,
       4.56293717e-05, 2.53360909e-06, 6.34795485e-05, 2.40916121e-04,
      

In [108]:
# One-column frame of feature names; no column name is supplied, so the
# column is labelled 0 (as the preview below shows).
ames_columns = pd.DataFrame(X.columns)

In [109]:
# Preview the first few feature names.
ames_columns.head()

Unnamed: 0,0
0,mssubclass__20
1,mssubclass__30
2,mssubclass__40
3,mssubclass__45
4,mssubclass__50


In [110]:
# Attach the AdaBoost importances positionally, i.e. row i of the name column
# receives element i of the importance array.
ames_columns['Feature Importance'] = adabst.feature_importances_

In [111]:
# Rank features from most to least important (rebinds `ames_columns` to the sorted frame).
ames_columns = ames_columns.sort_values(by = 'Feature Importance', ascending = False)

In [112]:
# Show the 15 highest-ranked features for the first AdaBoost model.
ames_columns.head(15)

Unnamed: 0,0,Feature Importance
292,OverallQual,0.53569
297,GrLivArea,0.104072
296,TotalBsmtSF,0.051204
308,GarageCars,0.023197
291,LotArea,0.021321
315,PoolArea,0.020343
305,TotRmsAbvGrd,0.014137
290,LotFrontage,0.013166
309,GarageArea,0.012741
307,HomeAge,0.011792


In [113]:
# Predict sale prices for the held-out split with the first AdaBoost model.
Y_pred = adabst.predict(X_test)

In [114]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [115]:
# R^2 of the first AdaBoost model on the held-out split (true values first, predictions second).
r2_score(Y_test, Y_pred)

0.894038493417428

In [88]:
# Second AdaBoost configuration: 5000 estimators with a 0.05 learning rate.
# The base learner is constructed here instead of reusing the shared `dtree` object:
# the fitted repr recorded for this cell shows max_depth=3, whereas the only `dtree`
# definition in the notebook uses max_depth=10. Pinning the depth in this cell makes
# a fresh top-to-bottom run train the same model the recorded output describes.
# The estimator is passed positionally because its keyword is `base_estimator` in
# older scikit-learn releases and `estimator` in newer ones.
adabst2 = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3),
    n_estimators=5000,
    learning_rate=0.05,
    random_state=42,
)

In [89]:
# Train the second AdaBoost model on the same training split as the first.
adabst2.fit(X_train, Y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.05, loss='linear', n_estimators=5000,
         random_state=42)

In [90]:
# Raw importance scores of the second AdaBoost model, as a positional array.
adabst2.feature_importances_

array([6.98244305e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.04586960e-03, 4.40526562e-10, 3.84910738e-05,
       8.80086148e-06, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.71383914e-07,
       1.70932901e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.51294130e-07, 0.00000000e+00, 6.25162298e-06, 4.10586824e-04,
       5.34980850e-05, 7.44108436e-04, 1.82276018e-02, 5.67325127e-03,
       7.37931759e-06, 1.41030052e-04, 0.00000000e+00, 1.86569083e-04,
       6.70396244e-03, 0.00000000e+00, 0.00000000e+00, 1.57286836e-03,
       2.83889062e-04, 1.32020283e-06, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.05916798e-04, 1.18240816e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.84464759e-05,
      

In [91]:
# Predict sale prices for the held-out split with the second AdaBoost model.
pred2 = adabst2.predict(X_test)

In [92]:
# R^2 of the second AdaBoost model on the held-out split.
r2_score(Y_test, pred2)

0.8520682083103145

In [46]:
# Reduced feature set, step 1: remove every exterior2nd dummy column listed below.
X2 = X.drop(
    labels=[
        'exterior2nd__AsbShng',
        'exterior2nd__AsphShn',
        'exterior2nd__Brk Cmn',
        'exterior2nd__BrkFace',
        'exterior2nd__CBlock',
        'exterior2nd__CmentBd',
        'exterior2nd__HdBoard',
        'exterior2nd__ImStucc',
        'exterior2nd__MetalSd',
        'exterior2nd__Other',
        'exterior2nd__Plywood',
        'exterior2nd__Stone',
        'exterior2nd__Stucco',
        'exterior2nd__VinylSd',
        'exterior2nd__Wd Sdng',
        'exterior2nd__Wd Shng',
    ],
    axis='columns',
)

In [47]:
# Check the column count after removing the exterior2nd dummies.
X2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 302 entries, mssubclass__20 to CentralAirCond
dtypes: float64(3), int64(299)
memory usage: 3.4 MB


In [48]:
# Reduced feature set, step 2: remove the pool-related columns.
# Note: this rebinds X2 from itself, so running the cell a second time without
# rebuilding X2 first will fail because the columns are already gone.
X2 = X2.drop(
    labels=[
        'poolqc__0',
        'poolqc__Ex',
        'poolqc__Gd',
        'PoolArea',
    ],
    axis='columns',
)

In [49]:
# Check the column count after removing the pool-related columns.
X2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 298 entries, mssubclass__20 to CentralAirCond
dtypes: float64(3), int64(295)
memory usage: 3.4 MB


In [50]:
# Split the reduced feature set with the same test fraction and seed as the first split.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X2,
    Y,
    test_size=0.3,
    random_state=42,
)

In [51]:
# AdaBoost on the reduced feature set: 5000 estimators with a 0.05 learning rate.
# The base learner is constructed here instead of reusing the shared `dtree` object:
# the fitted repr recorded for this cell shows max_depth=1 (decision stumps), whereas
# the only `dtree` definition in the notebook uses max_depth=10. Pinning the depth in
# this cell makes a fresh top-to-bottom run train the model the recorded output describes.
# NOTE(review): because the tree depth differs from the earlier AdaBoost runs, the score
# of this model is probably not a like-for-like measure of the effect of dropping columns.
# The estimator is passed positionally because its keyword is `base_estimator` in
# older scikit-learn releases and `estimator` in newer ones.
adabst3 = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=1),
    n_estimators=5000,
    learning_rate=0.05,
    random_state=42,
)

In [52]:
# Train on the reduced-feature training split.
adabst3.fit(X2_train,Y2_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.05, loss='linear', n_estimators=5000,
         random_state=42)

In [53]:
# Predict sale prices for the reduced-feature held-out split.
pred3 = adabst3.predict(X2_test)

In [54]:
# R^2 of the reduced-feature AdaBoost model on its held-out split.
r2_score(Y2_test, pred3)

0.2574574322656352

## Gradient Boosting
*submitted to Kaggle*

In [58]:
from sklearn.ensemble import GradientBoostingRegressor

In [59]:
# Gradient-boosted regressor: 5000 depth-3 trees with a 0.05 learning rate,
# seeded for repeatability.
# `loss` is left at its default (squared error) rather than spelled 'ls': the default
# is the same loss, and the 'ls' alias is rejected by newer scikit-learn releases.
gbc = GradientBoostingRegressor(
    learning_rate=0.05,
    n_estimators=5000,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=3,
    random_state=42,
)

In [60]:
# Train the gradient-boosting model on the full-feature training split.
gbc.fit(X_train,Y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=5000, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [61]:
# Raw importance scores of the gradient-boosting model, as a positional array.
gbc.feature_importances_

array([1.74254812e-03, 3.15529003e-03, 2.25980157e-06, 6.65167688e-04,
       8.76427378e-04, 4.04686280e-03, 1.00868930e-03, 2.88286344e-06,
       1.50541404e-03, 4.15021529e-04, 1.09070935e-03, 5.46288773e-04,
       4.54777971e-04, 9.79892901e-05, 4.63261953e-05, 0.00000000e+00,
       1.79616514e-03, 1.91620992e-03, 1.73258135e-03, 5.92566292e-04,
       8.82809797e-04, 4.83900501e-04, 4.29221659e-04, 3.58583049e-04,
       2.19913279e-03, 8.80414234e-04, 2.03525866e-03, 1.76412082e-03,
       6.83376374e-04, 1.69741175e-03, 1.05183096e-03, 2.11118833e-03,
       7.18546885e-04, 9.98884171e-04, 2.59056602e-04, 2.69240320e-03,
       1.45382100e-03, 1.74165601e-03, 4.06923635e-03, 2.23842136e-03,
       9.17606128e-04, 5.58589956e-04, 0.00000000e+00, 1.73220626e-03,
       1.87447719e-05, 2.52562825e-04, 3.00637149e-03, 1.51599002e-03,
       9.04650334e-03, 1.78001648e-03, 4.54553303e-03, 5.95900555e-03,
       1.42817362e-03, 6.59314738e-04, 3.12930251e-03, 2.87925976e-03,
      

In [63]:
# Predict sale prices for the held-out split with the gradient-boosting model.
gbc_pred = gbc.predict(X_test)

In [64]:
# R^2 of the gradient-boosting model on the held-out split.
r2_score(Y_test, gbc_pred)

0.9117203100975103

In [65]:
# Start the submission frame from the test set's index values
# (they appear as the `Id` column in the preview further down).
df_gbc = pd.DataFrame(test.index)

In [66]:
# Predict sale prices for the Kaggle test set.
# The test frame is re-selected in the training column order first: the model was fit
# on X's columns, and selecting by X.columns both guarantees the same ordering and
# raises a KeyError if the test data is missing any training feature, instead of
# silently predicting from misaligned columns.
df_gbc['SalePrice'] = gbc.predict(test[X.columns])

In [67]:
# Preview the first few submission rows (Id and predicted SalePrice).
df_gbc.head()

Unnamed: 0,Id,SalePrice
0,1461,124009.638232
1,1462,168202.149924
2,1463,186034.118819
3,1464,194560.855146
4,1465,189290.25625


In [68]:
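# Export of the predictions for the Kaggle submission (currently commented out).
# index=False keeps the default 0..n-1 row index out of the file, leaving only Id and SalePrice.
#df_gbc.to_csv('data/gbc_1.csv', index=False)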

In [71]:
# One-column frame of feature names for the gradient-boosting importances;
# the unnamed column is labelled 0.
df_gb_features = pd.DataFrame(X.columns)

In [72]:
# Attach the gradient-boosting importances positionally next to the feature names.
df_gb_features['Feature Importance'] = gbc.feature_importances_

In [74]:
# Show the 20 most important features; the sorted frame is only displayed, not stored.
df_gb_features.sort_values(by = 'Feature Importance', ascending = False).head(20)

Unnamed: 0,0,Feature Importance
297,GrLivArea,0.072329
291,LotArea,0.067443
295,BsmtUnfSF,0.064318
296,TotalBsmtSF,0.053387
309,GarageArea,0.045023
311,OpenPorchSF,0.034714
310,WoodDeckSF,0.033131
294,MasVnrArea,0.032824
307,HomeAge,0.03249
290,LotFrontage,0.031327


In [76]:
# List every hyperparameter of the fitted gradient-boosting model, including defaults.
gbc.get_params()

{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.05,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'presort': 'auto',
 'random_state': 42,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [135]:
# Second gradient-boosting configuration: identical to `gbc` except for deeper trees
# (max_depth=4 instead of 3).
# `loss` is left at its default (squared error) rather than spelled 'ls': the default
# is the same loss, and the 'ls' alias is rejected by newer scikit-learn releases.
gbc2 = GradientBoostingRegressor(
    learning_rate=0.05,
    n_estimators=5000,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=4,
    random_state=42,
)

In [136]:
# Train the deeper gradient-boosting model on the full-feature training split.
gbc2.fit(X_train,Y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=5000, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [137]:
# One-column frame of feature names for the second gradient-boosting model;
# the unnamed column is labelled 0.
df_f = pd.DataFrame(X.columns)

In [138]:
# Attach the second model's importances positionally next to the feature names.
df_f['Feature Importance'] = gbc2.feature_importances_

In [139]:
# Show the five most important features; the sorted frame is only displayed, not stored.
df_f.sort_values(by = 'Feature Importance', ascending = False).head()

Unnamed: 0,0,Feature Importance
291,LotArea,0.078681
295,BsmtUnfSF,0.069082
297,GrLivArea,0.065018
296,TotalBsmtSF,0.050387
309,GarageArea,0.048785


In [140]:
# R^2 of the deeper gradient-boosting model on the held-out split.
r2_score(Y_test, gbc2.predict(X_test))

0.9028829667004089

In [141]:
# Mean squared error of the deeper model on the held-out split. The value is in
# squared SalePrice units; the recorded result corresponds to a root-mean-squared
# error of roughly 26,000.
mean_squared_error(Y_test, gbc2.predict(X_test))

677691764.1488115