In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')
#test_df = pd.read_csv('test.csv')
print(df.shape)
df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


<h4>Counting Null Values</h4>

In [3]:
def null_info(data):
    null_df = data.isnull().sum().sort_values(ascending=False).reset_index()
    rows = data.shape[0]
    null_df.columns = ['column_name', 'null_count']
    null_df['null_percentage'] = null_df['null_count']/rows
    return null_df

In [4]:
null_table = null_info(df)
null_table.head()
#list(null_table.column_name)

Unnamed: 0,column_name,null_count,null_percentage
0,PoolQC,1453,0.995205
1,MiscFeature,1406,0.963014
2,Alley,1369,0.937671
3,Fence,1179,0.807534
4,FireplaceQu,690,0.472603


<h4>Column types and # of unique categorical varibles for categorical columns</h4>

In [5]:
df.dtypes.value_counts()

object     43
int64      35
float64     3
dtype: int64

In [6]:
categorical_unique_counts = df.select_dtypes('object').apply(pd.Series.nunique, axis=0).sort_index()
categorical_unique_counts.sort_values(ascending=False).head()

Neighborhood    25
Exterior2nd     16
Exterior1st     15
SaleType         9
Condition1       9
dtype: int64

In [7]:
categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(exclude='object').columns
target = df['SalePrice']

<h5>Normalizing Values<h5>

In [8]:
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()

df_scaled = scalar.fit_transform(df[numerical_columns])
df_scaled = pd.DataFrame(df_scaled, columns=numerical_columns)

In [9]:
df2 = pd.concat([df_scaled, df[categorical_columns]], axis=1)
df2.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,0.0,0.235294,0.150685,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,0.000685,0.0,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.0,0.173281,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,0.001371,0.235294,0.160959,0.046507,0.666667,0.5,0.934783,0.866667,0.10125,0.086109,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,0.002056,0.294118,0.133562,0.038561,0.666667,0.5,0.311594,0.333333,0.0,0.038271,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,0.002742,0.235294,0.215753,0.060576,0.777778,0.5,0.927536,0.833333,0.21875,0.116052,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


<h5>Handling Null Values</h5>

In [10]:
cols_to_drop = list(null_table[null_table['null_percentage'] > 0.15]['column_name'])
df2 = df2.drop(cols_to_drop, axis=1)
#test_df2 = test_df.drop(cols_to_drop, axis=1)
df2.shape

(1460, 75)

In [11]:
categorical_columns2 = df2.select_dtypes(include='object').columns
numerical_columns2 = df2.select_dtypes(exclude='object').columns

In [27]:
#objs = (df2.dtypes == 'object')
#obj_columns = list(objs[objs].index)
#df2[obj_columns].head(1)
#df2[df2.isna().any(axis=1)].columns

In [13]:
# categorical_columns
df2[categorical_columns2] = df2[categorical_columns2].fillna('NA', axis=1)
df2.head(1)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,0.0,0.235294,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


In [14]:
# numerical columns
df2[numerical_columns2] = df2[numerical_columns2].fillna(df2.mean(axis=0))
df2.head(1)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,0.0,0.235294,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


<h5>Splitting to train and test data</h5>

In [15]:
from sklearn.model_selection import train_test_split

train_features = df2.drop(['Id', 'SalePrice'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(train_features, target, test_size=0.25, random_state=0)

In [16]:
X_train.head(1)

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
1292,0.294118,0.024773,0.444444,0.375,0.144928,0.25,0.0,0.0,0.0,0.425514,...,SBrkr,TA,Min2,Attchd,RFn,TA,TA,Y,WD,Normal


<h5>Using One Hot Encoder for categorical variables</h5>

In [18]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', sparse='False')
OH_train_cols = ohe.fit_transform(X_train[categorical_columns2])
OH_test_cols = ohe.transform(X_test[categorical_columns2])

In [20]:
#OH_train_cols.toarray()
#OH_test_cols.toarray()
#pd.DataFrame(OH_train_cols.toarray()).shape
#pd.DataFrame(OH_test_cols.toarray()).shape
#X_train.drop(obj_columns, axis=1).shape
#X_test.drop(obj_columns, axis=1).shape

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
288,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
289,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
290,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
encoded_train = pd.DataFrame(OH_train_cols.toarray())
encoded_test = pd.DataFrame(OH_test_cols.toarray())

encoded_train.index = X_train.index
encoded_test.index = X_test.index

num_train = X_train.drop(categorical_columns2, axis=1)
num_test = X_test.drop(categorical_columns2, axis=1)

OH_train = pd.concat([num_train, encoded_train], axis=1)
OH_test = pd.concat([num_test, encoded_test], axis=1)
OH_train.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,230,231,232,233,234,235,236,237,238,239
1292,0.294118,0.024773,0.444444,0.375,0.144928,0.25,0.0,0.0,0.0,0.425514,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1018,0.352941,0.044329,0.666667,0.5,0.862319,0.7,0.0475,0.0,0.0,0.164384,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1213,0.352941,0.041814,0.333333,1.0,0.673913,0.85,0.0,0.114812,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1430,0.235294,0.096427,0.444444,0.5,0.963768,0.916667,0.0,0.0,0.0,0.313356,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
810,0.0,0.041319,0.555556,0.625,0.73913,0.816667,0.061875,0.11747,0.255767,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
#OH_train[OH_train.head().isnull() == True]
#OH_train[OH_train.isna().any(axis=1)].iloc[0,10:]
#OH_train.columns[OH_train.isna().any()].tolist()
#list(OH_train['MasVnrArea'])

In [38]:
from sklearn.linear_model import SGDRegressor

clf = SGDRegressor()
clf.fit(OH_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [39]:
prediction = clf.predict(OH_test)

In [40]:
from sklearn import metrics
import math

In [41]:
print('Mean Absolute Error Score:')
metrics.mean_absolute_error(y_test, prediction)

Mean Absolute Error Score:


19563.52234014646

In [43]:
print('Root Mean Squared Error Score:')
math.sqrt(metrics.mean_squared_error(y_test, prediction))

Root Mean Squared Error Score:


36924.67647131168