# Intermediate ML - House Prices

In [46]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [3]:
iowa_file_path = '../data/home-data-for-ml-course/train.csv'

In [4]:
df = pd.read_csv(iowa_file_path)

In [5]:
df.shape

(1460, 81)

In [6]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
df.isna().sum().reset_index(name='count').query("count > 0").sort_values(by='count', ascending=False)



Unnamed: 0,index,count
72,PoolQC,1453
74,MiscFeature,1406
6,Alley,1369
73,Fence,1179
57,FireplaceQu,690
3,LotFrontage,259
58,GarageType,81
59,GarageYrBlt,81
60,GarageFinish,81
63,GarageQual,81


In [11]:
cols_with_missing = [col for col in df.columns
                     if df[col].isnull().any()]


In [12]:
cols_with_missing

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [13]:
df.drop(columns=cols_with_missing, inplace=True)

In [14]:
df.shape

(1460, 62)

In [29]:
modeled_columns = df.columns

In [15]:
num_cols = list(df.select_dtypes(exclude=['object']).columns)
num_cols.remove("SalePrice")
num_cols

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [19]:
obj_cols = list(df.select_dtypes(include=['object']).columns)
obj_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [48]:
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    'gradientboostingregressor__n_estimators': [100,200,300,500]
}

In [49]:
# reg = RandomForestRegressor()
# reg = AdaBoostRegressor()
reg = GradientBoostingRegressor()
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(dtype='int', handle_unknown='ignore')
)
ct = make_column_transformer(
    (cat_pipe, obj_cols),
    (SimpleImputer(strategy='constant', fill_value = 0), num_cols),
    remainder='passthrough'
)
model_pipeline = make_pipeline(ct, reg)

In [35]:
y=df['SalePrice']
X = df.drop(columns=['SalePrice'])

In [50]:
search = GridSearchCV(model_pipeline, param_grid, n_jobs=-1, cv=3)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.892):
{'gradientboostingregressor__n_estimators': 200}


In [51]:
# reg = RandomForestRegressor(n_estimators=500)
reg = GradientBoostingRegressor(n_estimators=200)


cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(dtype='int', handle_unknown='ignore')
)
ct = make_column_transformer(
    (cat_pipe, obj_cols),
    (SimpleImputer(strategy='constant', fill_value = 0), num_cols),
    remainder='passthrough'
)
model_pipeline = make_pipeline(ct, reg)

In [52]:
model_pipeline.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(dtype='int',
                                                                                 handle_unknown='ignore'))]),
                                                  ['MSZoning', 'Street',
                                                   'LotShape', 'LandContour',
                                                   'Utilities', 'LotConfig',
                                                   'LandSlope', 'Neighborhood',
                                                   

In [33]:
!pwd

/Users/patryan/Development/KaggleCourses/intermediate_ml


In [28]:
# Read in test submission

In [53]:
# path to file you will use for predictions
test_data_path = '../data/home-data-for-ml-course/test.csv'

# read test data file using pandas
test_data = pd.read_csv(test_data_path)
feature_columns = modeled_columns.tolist()
feature_columns.remove('SalePrice')
test_data = test_data[feature_columns]

In [54]:
# make predictions which we will submit. 
test_preds = model_pipeline.predict(test_data)
print(test_preds)

[121723.88423003 164133.55041008 180287.53343264 ... 156326.71808651
 117818.95948854 242073.83900941]


In [55]:

# The lines below shows how to save predictions in format used for competition scoring
# Just uncomment them.

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('submission_intermediate_v2.csv', index=False)

Best Score

reg = GradientBoostingRegressor(n_estimators=200)


Rank: 2955
Your Best Entry 
Your submission scored 14648.88015, which is an improvement of your previous score of 16314.41116. Great job!
