# Kaggle: Intro to Machine Learning

Home Price Competition

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
iowa_file_path = './data/home-data-for-ml-course/train.csv'

In [4]:
df = pd.read_csv(iowa_file_path)

In [7]:
df.shape

(1460, 81)

In [5]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
df.isna().sum().reset_index(name='count').query("count > 0").sort_values(by='count', ascending=False)



Unnamed: 0,index,count
72,PoolQC,1453
74,MiscFeature,1406
6,Alley,1369
73,Fence,1179
57,FireplaceQu,690
3,LotFrontage,259
58,GarageType,81
59,GarageYrBlt,81
60,GarageFinish,81
63,GarageQual,81


## Drop Columns with Missing Value

In [8]:
df = df.dropna(axis=1)

In [9]:
df.shape

(1460, 62)

In [28]:
modeled_columns = df.columns

In [11]:
df.dtypes

Id                int64
MSSubClass        int64
MSZoning         object
LotArea           int64
Street           object
                  ...  
MoSold            int64
YrSold            int64
SaleType         object
SaleCondition    object
SalePrice         int64
Length: 62, dtype: object

In [12]:
obj_cols = list(df.select_dtypes(include=['object']).columns)
obj_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [14]:
df['SaleCondition'].value_counts()

Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: SaleCondition, dtype: int64

In [43]:
num_cols = list(df.select_dtypes(exclude=['object']).columns)
num_cols.remove("SalePrice")
num_cols

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

## Pipelines and DataPrep

In [55]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [61]:
rf = RandomForestRegressor()
ct = make_column_transformer(
    (OneHotEncoder(dtype='int', handle_unknown='ignore'), obj_cols),
    (SimpleImputer(), num_cols),
    remainder='passthrough'
)
model_pipeline = make_pipeline(ct, rf)

In [57]:
X = df.drop(columns=['SalePrice'])
y = df.SalePrice

In [58]:
X.shape

(1460, 61)

In [59]:
y.shape

(1460,)

In [62]:
model_pipeline.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(dtype='int',
                                                                handle_unknown='ignore'),
                                                  ['MSZoning', 'Street',
                                                   'LotShape', 'LandContour',
                                                   'Utilities', 'LotConfig',
                                                   'LandSlope', 'Neighborhood',
                                                   'Condition1', 'Condition2',
                                                   'BldgType', 'HouseStyle',
                                                   'RoofStyle', 'RoofMatl',
                                                   'Exterior1st', 'Exte...
                                                   'B

## Read In test data for submission


In [63]:
# path to file you will use for predictions
test_data_path = './data/home-data-for-ml-course/test.csv'

# read test data file using pandas
test_data = pd.read_csv(test_data_path)
feature_columns = modeled_columns.tolist()
feature_columns.remove('SalePrice')
test_data = test_data[feature_columns]

In [64]:
test_data.isna().sum()

Id               0
MSSubClass       0
MSZoning         4
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 61, dtype: int64

In [65]:
test_data.isna().sum().reset_index(name='count').query("count > 0").sort_values(by='count', ascending=False)



Unnamed: 0,index,count
2,MSZoning,4
7,Utilities,2
37,BsmtFullBath,2
38,BsmtHalfBath,2
45,Functional,2
21,Exterior1st,1
22,Exterior2nd,1
26,BsmtFinSF1,1
27,BsmtFinSF2,1
28,BsmtUnfSF,1


In [66]:
# make predictions which we will submit. 
test_preds = model_pipeline.predict(test_data)
print(test_preds)

[128734.   157740.5  176545.7  ... 153494.46 113487.58 227971.05]


In [67]:





# The lines below shows how to save predictions in format used for competition scoring
# Just uncomment them.

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('submission_v2.csv', index=False)