In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
train_df = pd.read_csv('train.csv')
#test_df = pd.read_csv('test.csv')
print(train_df.shape)
train_df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


<h4>Counting Null Values</h4>

In [3]:
def null_info(data):
    null_df = data.isnull().sum().sort_values(ascending=False).reset_index()
    rows = data.shape[0]
    null_df.columns = ['column_name', 'null_count']
    null_df['null_percentage'] = null_df['null_count']/rows
    return null_df

In [4]:
null_table_train = null_info(train_df)
#null_table_test = null_info(test_df)
null_table_train.head()

Unnamed: 0,column_name,null_count,null_percentage
0,PoolQC,1453,0.995205
1,MiscFeature,1406,0.963014
2,Alley,1369,0.937671
3,Fence,1179,0.807534
4,FireplaceQu,690,0.472603


<h4>Column types and # of unique categorical varibles for categorical columns</h4>

In [5]:
train_df.dtypes.value_counts()

object     43
int64      35
float64     3
dtype: int64

In [6]:
train_object = train_df.select_dtypes('object').apply(pd.Series.nunique, axis=0).sort_index()
# test_object = test_df.select_dtypes('object').apply(pd.Series.nunique, axis=0).sort_index()
# pd.concat([train_object, test_object], axis=1).sort_values(by=0, ascending=False).head(10)
train_object.sort_values(ascending=False).head()

Neighborhood    25
Exterior2nd     16
Exterior1st     15
SaleType         9
Condition1       9
dtype: int64

<h5>Handling Null Values</h5>

In [7]:
cols_to_drop = list(null_table_train[null_table_train['null_percentage'] > 0.15]['column_name'])
train_df2 = train_df.drop(cols_to_drop, axis=1)
#test_df2 = test_df.drop(cols_to_drop, axis=1)
train_df2.shape

(1460, 75)

In [8]:
objs = (train_df2.dtypes == 'object')
obj_columns = list(objs[objs].index)
train_df2[obj_columns].head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


In [9]:
train_df2[obj_columns] = train_df2[obj_columns].fillna('NA', axis=1)

<h5>Splitting to train and test data</h5>

In [10]:
train_features = train_df2.drop(['Id', 'SalePrice'], axis=1)
train_labels = train_df2['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=.8, random_state=0)

In [17]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1111,60,RL,10480,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,...,69,0,0,0,0,0,9,2008,WD,Normal
750,50,RM,8800,Pave,Reg,Lvl,AllPub,Corner,Gtl,OldTown,...,160,0,0,0,0,0,6,2010,WD,Normal
1117,20,RL,9764,Pave,IR1,Lvl,AllPub,FR2,Gtl,Sawyer,...,0,0,0,0,0,0,5,2008,WD,Normal
859,60,RL,11029,Pave,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,...,65,0,0,222,0,0,8,2006,WD,Normal
952,85,RL,7200,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,0,4,2009,WD,Normal


<h5>Using One Hot Encoder for categorical variables</h5>

In [11]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse='False')
OH_train_cols = ohe.fit_transform(X_train[obj_columns])
OH_test_cols = ohe.transform(X_test[obj_columns])

In [16]:
#OH_train_cols.toarray()
#OH_test_cols.toarray()

In [33]:
encoded_train = pd.DataFrame(OH_train_cols.toarray())
encoded_test = pd.DataFrame(OH_test_cols.toarray())

num_train = X_train.drop(obj_columns, axis=1)
num_test = X_test.drop(obj_columns, axis=1)

OH_train = pd.concat([num_train, encoded_train], axis=1)
OH_test = pd.concat([num_test, encoded_test], axis=1)
OH_train.head()
#OH_test.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,200,201,202,203,204,205,206,207,208,209
0,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# encoded indexes may not be matching original df indexes