In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
data_dir = "../../data/"

In [3]:
df_train=pd.read_csv(f"{data_dir}train.csv")
df_test=pd.read_csv(f"{data_dir}test.csv")
df = pd.concat([df_train.drop(["SalePrice"],axis=1),df_test])
y_train = df_train["SalePrice"]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallCond  

In [5]:
missing_data_cols = df.isnull().sum()/len(df)

In [6]:
missing_data_cols

Id               0.000000
MSSubClass       0.000000
MSZoning         0.001370
LotFrontage      0.166495
LotArea          0.000000
                   ...   
MiscVal          0.000000
MoSold           0.000000
YrSold           0.000000
SaleType         0.000343
SaleCondition    0.000000
Length: 80, dtype: float64

In [7]:
missing_data_features = missing_data_cols.loc[missing_data_cols >0.5].index

In [8]:
missing_data_features

Index(['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [9]:
df_2=df.drop(missing_data_features,axis=1)

In [10]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [15]:
df_2.drop(columns=["Id","YrSold","MoSold","MiscVal","PoolArea","ScreenPorch","3SsnPorch","EnclosedPorch","KitchenAbvGr","BedroomAbvGr", "HalfBath","BsmtFullBath","BsmtHalfBath","LowQualFinSF","BsmtUnfSF","BsmtFinSF2","MSSubClass","OverallCond", "LotArea"],inplace=True)

In [12]:
df.columns.tolist()

['MSZoning',
 'LotFrontage',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [13]:
bsmt_cols = ["BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2"]
gar_cols=['GarageType', 'GarageFinish', 'GarageQual',"GarageYrBlt","GarageCond"]
gar2_cols=['GarageCars', 'GarageArea']
bsmt2_cols = ["LotFrontage","MasVnrArea","TotalBsmtSF","BsmtFinSF1"]
_cols = ["MasVnrType","MSZoning","Utilities","Exterior1st","Exterior2nd","Functional","KitchenQual","SaleType","Electrical"]

In [17]:
for bsmt in bsmt_cols:
    df[bsmt] = df[bsmt].fillna("No")
for gar in gar_cols:
    df[gar] = df[gar].fillna("No")
for gar in gar2_cols:
    df[gar] = df[gar].fillna(0)
for _ in _cols:
    df[_] = df[_].fillna(df[_].mode()[0])

df["FireplaceQu"].fillna("No",inplace=True)

In [18]:
df_3=pd.get_dummies(df,columns=["MSZoning","Street","LotShape","LandContour","Utilities","LotConfig","LandSlope","Neighborhood","Condition1","Condition2","BldgType","HouseStyle","RoofStyle","RoofMatl","Exterior1st","Exterior2nd","MasVnrType","ExterQual","ExterCond","Foundation","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","Heating","HeatingQC","CentralAir","Electrical","KitchenQual","Functional","FireplaceQu","GarageType","GarageYrBlt","GarageFinish","GarageQual","GarageCond","PavedDrive","SaleType","SaleCondition"],prefix=["MSZoning","Street","LotShape","LandContour","Utilities","LotConfig","LandSlope","Neighborhood","Condition1","Condition2","BldgType","HouseStyle","RoofStyle","RoofMatl","Exterior1st","Exterior2nd","MasVnrType","ExterQual","ExterCond","Foundation","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","Heating","HeatingQC","CentralAir","Electrical","KitchenQual","Functional","FireplaceQu","GarageType","GarageYrBlt","GarageFinish","GarageQual","GarageCond","PavedDrive","SaleType","SaleCondition"],drop_first=True)

In [19]:
df_3.head()

Unnamed: 0,LotFrontage,Alley,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,,7,2003,2003,196.0,706.0,856.0,856,854,...,False,False,False,False,True,False,False,False,True,False
1,80.0,,6,1976,1976,0.0,978.0,1262.0,1262,0,...,False,False,False,False,True,False,False,False,True,False
2,68.0,,7,2001,2002,162.0,486.0,920.0,920,866,...,False,False,False,False,True,False,False,False,True,False
3,60.0,,7,1915,1970,0.0,216.0,756.0,961,756,...,False,False,False,False,True,False,False,False,False,False
4,84.0,,8,2000,2000,350.0,655.0,1145.0,1145,1053,...,False,False,False,False,True,False,False,False,True,False


In [20]:
X_train=df.iloc[:1460,:].values
X_test=df.iloc[1460:,:].values

In [21]:
X_train

array([['RL', 65.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ['RL', 80.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ['RL', 68.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ...,
       ['RL', 66.0, 'Pave', ..., 'Shed', 'WD', 'Normal'],
       ['RL', 68.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ['RL', 75.0, 'Pave', ..., nan, 'WD', 'Normal']], dtype=object)

In [23]:
sc_x = StandardScaler()
# X_train=sc_x.fit_transform(X_train)
# m,n = X_train.shape[0],X_train.shape[1]
# X_train = np.append(np.ones((m,1)), X_train,axis=1)
# theta = np.zeros((n+1,1))

In [25]:
X_train

array([['RL', 65.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ['RL', 80.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ['RL', 68.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ...,
       ['RL', 66.0, 'Pave', ..., 'Shed', 'WD', 'Normal'],
       ['RL', 68.0, 'Pave', ..., nan, 'WD', 'Normal'],
       ['RL', 75.0, 'Pave', ..., nan, 'WD', 'Normal']], dtype=object)