# 5. Predict Sale Price for Test Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score

%matplotlib inline

In [2]:
ames_df = pd.read_csv('../data/ames_df_dummied_train.csv')
test_df = pd.read_csv('../data/ames_df_dummied_test.csv')
test_index = pd.read_csv('../data/test.csv')

In [3]:
ames_df.head()

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,...,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,property_age
0,0,13517,2,4,1,6,8,1976,2005,289.0,...,0,0,0,0,0,0,0,0,1,34
1,43,11492,2,4,1,7,5,1996,1997,132.0,...,0,0,0,0,0,0,0,0,1,13
2,68,7922,1,4,1,5,7,1953,2007,0.0,...,0,0,0,0,0,0,0,0,1,57
3,73,9802,1,4,1,5,5,2006,2007,0.0,...,0,0,0,0,0,0,0,0,1,4
4,82,14235,2,4,1,6,8,1900,1993,0.0,...,0,0,0,0,0,0,0,0,1,110


In [4]:
test_df.head()

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,...,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD,property_age
0,69,9142,1,4,1,6,8,1910,1950,0.0,...,0,0,0,0,0,0,0,0,1,96
1,0,9662,2,4,1,5,4,1977,1977,0.0,...,0,0,0,0,0,0,0,0,1,29
2,58,17104,2,4,1,7,5,2006,2006,0.0,...,0,0,0,0,0,1,0,0,0,0
3,60,8520,1,4,1,5,6,1923,2006,0.0,...,0,0,0,0,0,0,0,0,1,84
4,0,9500,2,4,1,6,5,1963,1963,247.0,...,0,0,0,0,0,0,0,0,1,46


In [5]:
#find columns that are in ames_df but missing in test_df
list(set(ames_df) - set(test_df))

['exterior_1st_Stone',
 'condition_2_RRAe',
 'heating_Wall',
 'misc_feature_Elev',
 'roof_matl_ClyTile',
 'condition_2_PosN',
 'condition_2_Artery',
 'exterior_1st_CBlock',
 'neighborhood_GrnHill',
 'saleprice',
 'misc_feature_TenC',
 'condition_2_RRAn',
 'ms_subclass_150',
 'condition_2_RRNn',
 'exterior_2nd_Stone',
 'neighborhood_Landmrk',
 'heating_OthW',
 'roof_matl_Membran',
 'exterior_1st_ImStucc']

In [6]:
#drop columns in ames_df that are missing in test_df except 'saleprice'
ames_df.drop(columns=['condition_2_RRAn',
 'condition_2_RRAe',
 'condition_2_PosN',
 'condition_2_Artery',
 'heating_Wall',
 'misc_feature_Elev',
 'neighborhood_GrnHill',
 'ms_subclass_150',
 'exterior_1st_ImStucc',
 'condition_2_RRNn',
 'exterior_1st_Stone',
 'exterior_1st_CBlock',
 'roof_matl_ClyTile',
 'exterior_2nd_Stone',
 'roof_matl_Membran',
 'misc_feature_TenC',
 'neighborhood_Landmrk',
 'heating_OthW'], inplace=True)

## Model Prep: Create features matrix ( X ) and target vector ( y ) 

In [7]:
# Create features matrix (X) and target vector (y)
features = [col for col in ames_df._get_numeric_data().columns if col != 'saleprice']
X = ames_df[features]
y = ames_df['saleprice']

## Model Prep: Train/test split

In [8]:
# perform train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

## Model Prep: Scaling

In [9]:
# perform scaling 
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

## Model 2: Ridge Regression

In [10]:
# instatiate model
ridge = RidgeCV (alphas=np.linspace(.1, 10, 100))

In [11]:
# perform cross validation using ridge regression
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=5)
ridge_scores.mean()

0.81790978426637

In [12]:
# optimal ridge
optimal_ridge = RidgeCV(cv=5)
optimal_ridge.fit(X_train, y_train)

optimal_ridge.alpha_

10.0

In [13]:
# perform cross validation using optimal alpha
ridge = RidgeCV (alphas=optimal_ridge.alpha_)
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=5)

In [14]:
#fit train dataset to ridge model
ridge.fit(X_train, y_train)

RidgeCV(alphas=array(10.))

## Remove columns in test data that are not in train data

In [15]:
#find columns that are in test_df but not in ames_df
list(set(test_df) - set(ames_df))

['sale_type_VWD',
 'roof_matl_Metal',
 'heating_Floor',
 'exterior_2nd_PreCast',
 'exterior_2nd_Other',
 'exterior_1st_PreCast',
 'ms_zoning_C (all)',
 'roof_matl_Roll',
 'mas_vnr_type_CBlock',
 'ms_zoning_I (all)']

In [16]:
#assign column names in test and train to lists
col_list_train = list(ames_df)
col_list_test = list(test_df)

In [17]:
test_df.drop(columns=['sale_type_VWD',
 'roof_matl_Metal',
 'exterior_2nd_Other',
 'heating_Floor',
 'exterior_1st_PreCast',
 'ms_zoning_I (all)',
 'ms_zoning_C (all)',
 'mas_vnr_type_CBlock',
 'roof_matl_Roll',
 'exterior_2nd_PreCast'], inplace=True)

In [18]:
list(set(test_df) - set(ames_df))

[]

In [19]:
np.any(np.isnan(test_df))

True

In [20]:
main_list = np.setdiff1d(test_df,ames_df)

In [21]:
np.all(np.isfinite(test_df))

False

In [22]:
pd.options.display.max_columns = 999

In [23]:
is_NaN = test_df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = test_df[row_has_NaN]
rows_with_NaN

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,exter_qual,exter_cond,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,total_bsmt_sf,heating_qc,electrical,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_val,mo_sold,yr_sold,ms_subclass_20,ms_subclass_30,ms_subclass_40,ms_subclass_45,ms_subclass_50,ms_subclass_60,ms_subclass_70,ms_subclass_75,ms_subclass_80,ms_subclass_85,ms_subclass_90,ms_subclass_120,ms_subclass_160,ms_subclass_180,ms_subclass_190,ms_zoning_FV,ms_zoning_RH,ms_zoning_RL,ms_zoning_RM,street_Grvl,street_Pave,alley_Grvl,alley_None,alley_Pave,land_contour_Bnk,land_contour_HLS,land_contour_Low,land_contour_Lvl,lot_config_Corner,lot_config_CulDSac,lot_config_FR2,lot_config_FR3,lot_config_Inside,neighborhood_Blmngtn,neighborhood_Blueste,neighborhood_BrDale,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_CollgCr,neighborhood_Crawfor,neighborhood_Edwards,neighborhood_Gilbert,neighborhood_Greens,neighborhood_IDOTRR,neighborhood_MeadowV,neighborhood_Mitchel,neighborhood_NAmes,neighborhood_NPkVill,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,condition_1_Artery,condition_1_Feedr,condition_1_Norm,condition_1_PosA,condition_1_PosN,condition_1_RRAe,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,bldg_type_1Fam,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_TwnhsE,bldg_type_TwnhsI,house_style_1.5Fin,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,roof_style_Flat,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,roof_style_Mansard,roof_style_Shed,roof_matl_CompShg,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,exterior_1st_AsbShng,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,exterior_2nd_AsbShng,exterior_2nd_AsphShn,exterior_2nd_Brk Cmn,exterior_2nd_BrkFace,exterior_2nd_CBlock,exterior_2nd_CmentBd,exterior_2nd_HdBoard,exterior_2nd_ImStucc,exterior_2nd_MetalSd,exterior_2nd_Plywood,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkCmn,mas_vnr_type_BrkFace,mas_vnr_type_None,mas_vnr_type_Stone,foundation_BrkTil,foundation_CBlock,foundation_PConc,foundation_Slab,foundation_Stone,foundation_Wood,heating_GasA,heating_GasW,heating_Grav,central_air_N,central_air_Y,garage_type_2Types,garage_type_Attchd,garage_type_Basment,garage_type_BuiltIn,garage_type_CarPort,garage_type_Detchd,garage_type_None,misc_feature_Gar2,misc_feature_None,misc_feature_Othr,misc_feature_Shed,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,property_age
634,73,9735,1,4,1,5,5,2006,2007,0.0,3,3,4,3,1,1,1,384,4,,1394,0,0,2,1,3,1,4,7,8,0,0,3,2,400,3,3,3,100,0,0,0,0,0,0,0,0,5,2008,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2


In [24]:
test_df['electrical'].value_counts(dropna=False)

5.0    813
4.0     48
3.0     15
NaN      1
2.0      1
Name: electrical, dtype: int64

In [25]:
#replace NaN with 1.0
test_df['electrical'] = test_df['electrical'].replace(np.nan, 1.0)

In [26]:
is_NaN = test_df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = test_df[row_has_NaN]
rows_with_NaN

Unnamed: 0,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,exter_qual,exter_cond,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,total_bsmt_sf,heating_qc,electrical,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_val,mo_sold,yr_sold,ms_subclass_20,ms_subclass_30,ms_subclass_40,ms_subclass_45,ms_subclass_50,ms_subclass_60,ms_subclass_70,ms_subclass_75,ms_subclass_80,ms_subclass_85,ms_subclass_90,ms_subclass_120,ms_subclass_160,ms_subclass_180,ms_subclass_190,ms_zoning_FV,ms_zoning_RH,ms_zoning_RL,ms_zoning_RM,street_Grvl,street_Pave,alley_Grvl,alley_None,alley_Pave,land_contour_Bnk,land_contour_HLS,land_contour_Low,land_contour_Lvl,lot_config_Corner,lot_config_CulDSac,lot_config_FR2,lot_config_FR3,lot_config_Inside,neighborhood_Blmngtn,neighborhood_Blueste,neighborhood_BrDale,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_CollgCr,neighborhood_Crawfor,neighborhood_Edwards,neighborhood_Gilbert,neighborhood_Greens,neighborhood_IDOTRR,neighborhood_MeadowV,neighborhood_Mitchel,neighborhood_NAmes,neighborhood_NPkVill,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,condition_1_Artery,condition_1_Feedr,condition_1_Norm,condition_1_PosA,condition_1_PosN,condition_1_RRAe,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,bldg_type_1Fam,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_TwnhsE,bldg_type_TwnhsI,house_style_1.5Fin,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,roof_style_Flat,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,roof_style_Mansard,roof_style_Shed,roof_matl_CompShg,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,exterior_1st_AsbShng,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,exterior_2nd_AsbShng,exterior_2nd_AsphShn,exterior_2nd_Brk Cmn,exterior_2nd_BrkFace,exterior_2nd_CBlock,exterior_2nd_CmentBd,exterior_2nd_HdBoard,exterior_2nd_ImStucc,exterior_2nd_MetalSd,exterior_2nd_Plywood,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkCmn,mas_vnr_type_BrkFace,mas_vnr_type_None,mas_vnr_type_Stone,foundation_BrkTil,foundation_CBlock,foundation_PConc,foundation_Slab,foundation_Stone,foundation_Wood,heating_GasA,heating_GasW,heating_Grav,central_air_N,central_air_Y,garage_type_2Types,garage_type_Attchd,garage_type_Basment,garage_type_BuiltIn,garage_type_CarPort,garage_type_Detchd,garage_type_None,misc_feature_Gar2,misc_feature_None,misc_feature_Othr,misc_feature_Shed,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,property_age


In [27]:
test_df['electrical'].value_counts(dropna=False)

5.0    813
4.0     48
3.0     15
2.0      1
1.0      1
Name: electrical, dtype: int64

In [28]:
test_df.shape

(878, 205)

In [29]:
ames_df.shape

(1996, 206)

In [30]:
list(set(ames_df) - set(test_df))

['saleprice']

In [31]:
ames_df.shape

(1996, 206)

In [32]:
test_df.shape

(878, 205)

In [33]:
list(set(test_df) - set(ames_df))

[]

## Prediction of sale price

In [34]:
# predict sale price from x_test
test_pred = ridge.predict(test_df)

In [35]:
# add predicted sale price to test data as 'SalePrice' column
test_index['SalePrice']=test_pred

In [36]:
test_index.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD,98487800.0
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD,108247700.0
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New,116060100.0
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD,73588150.0
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD,90924700.0


In [37]:
# create new df with 'Id' and 'SalePrice' columns
pred_df = test_index[['Id', 'SalePrice']]

In [38]:
pred_df

Unnamed: 0,Id,SalePrice
0,2658,9.848780e+07
1,2718,1.082477e+08
2,2414,1.160601e+08
3,1989,7.358815e+07
4,625,9.092470e+07
...,...,...
873,1662,9.441102e+07
874,1234,1.218095e+08
875,1373,7.813458e+07
876,1672,7.282767e+07


In [39]:
#save pred df to csv without the index
pred_df.to_csv('../data/test_pred.csv', index=False)