# Housing Prices Classification

## Data Preparation

### Data Cleaning

In [201]:
import pandas as pd
from pandas import DataFrame
import numpy as np

In [202]:
df = pd.read_csv('Ames_Housing_Sales.csv')

In [203]:
df.head(100)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,SalePrice
0,856.0,854.0,0.0,,3,1Fam,TA,No,706.0,0.0,...,0.0,Pave,8,856.0,AllPub,0.0,2003,2003,2008,208500.0
1,1262.0,0.0,0.0,,3,1Fam,TA,Gd,978.0,0.0,...,0.0,Pave,6,1262.0,AllPub,298.0,1976,1976,2007,181500.0
2,920.0,866.0,0.0,,3,1Fam,TA,Mn,486.0,0.0,...,0.0,Pave,6,920.0,AllPub,0.0,2001,2002,2008,223500.0
3,961.0,756.0,0.0,,3,1Fam,Gd,No,216.0,0.0,...,0.0,Pave,7,756.0,AllPub,0.0,1915,1970,2006,140000.0
4,1145.0,1053.0,0.0,,4,1Fam,TA,Av,655.0,0.0,...,0.0,Pave,9,1145.0,AllPub,192.0,2000,2000,2008,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,977.0,755.0,0.0,,3,1Fam,,,0.0,0.0,...,0.0,Pave,7,741.0,AllPub,192.0,1985,1985,2010,178000.0
96,1535.0,0.0,0.0,,4,Duplex,,,0.0,0.0,...,0.0,Pave,8,0.0,AllPub,0.0,1979,1979,2009,118964.0
97,1226.0,0.0,0.0,,3,1Fam,,,0.0,0.0,...,0.0,Pave,6,1226.0,AllPub,0.0,2009,2009,2010,198900.0
98,1226.0,592.0,0.0,,4,1Fam,TA,No,224.0,0.0,...,184.0,Pave,7,1040.0,AllPub,0.0,1931,1950,2007,169500.0


In [204]:
df.isna().sum()

1stFlrSF           0
2ndFlrSF           0
3SsnPorch          0
Alley           1297
BedroomAbvGr       0
                ... 
WoodDeckSF         0
YearBuilt          0
YearRemodAdd       0
YrSold             0
SalePrice          0
Length: 80, dtype: int64

In [205]:
df.shape

(1379, 80)

In [206]:
df.dropna(axis=1, inplace = True)

In [207]:
df.shape

(1379, 69)

In [208]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1374    False
1375    False
1376    False
1377    False
1378    False
Length: 1379, dtype: bool

In [209]:
df.drop_duplicates()
print(df.shape)
df.describe().T

(1379, 69)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1stFlrSF,1379.0,1177.129804,387.014961,438.0,894.0,1098.0,1414.0,4692.0
2ndFlrSF,1379.0,353.424946,439.553171,0.0,0.0,0.0,738.5,2065.0
3SsnPorch,1379.0,3.609862,30.154682,0.0,0.0,0.0,0.0,508.0
BedroomAbvGr,1379.0,2.86512,0.783961,0.0,2.0,3.0,3.0,6.0
BsmtFinSF1,1379.0,455.57868,459.691379,0.0,0.0,400.0,732.0,5644.0
BsmtFinSF2,1379.0,48.102248,164.324665,0.0,0.0,0.0,0.0,1474.0
BsmtFullBath,1379.0,0.430747,0.514052,0.0,0.0,0.0,1.0,2.0
BsmtHalfBath,1379.0,0.058738,0.238285,0.0,0.0,0.0,0.0,2.0
BsmtUnfSF,1379.0,570.765047,443.677845,0.0,228.0,476.0,811.0,2336.0
EnclosedPorch,1379.0,21.039159,60.535107,0.0,0.0,0.0,0.0,552.0


In [210]:
df.nunique()

1stFlrSF        738
2ndFlrSF        401
3SsnPorch        20
BedroomAbvGr      7
BldgType          5
               ... 
WoodDeckSF      267
YearBuilt       109
YearRemodAdd     61
YrSold            5
SalePrice       640
Length: 69, dtype: int64

### Encoding of Categorical Features

In [211]:
cols = df.columns
num_cols = df.select_dtypes(np.number).columns
num_cols

Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
       'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars',
       'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea',
       'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'MiscVal',
       'MoSold', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea',
       'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt',
       'YearRemodAdd', 'YrSold', 'SalePrice'],
      dtype='object')

In [212]:
cat_cols = list(set(cols) - set(num_cols))
cat_cols

['GarageType',
 'LotShape',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'Street',
 'RoofMatl',
 'Exterior2nd',
 'Utilities',
 'SaleCondition',
 'HouseStyle',
 'LandSlope',
 'GarageCond',
 'Exterior1st',
 'Foundation',
 'Condition1',
 'LotConfig',
 'MSZoning',
 'Neighborhood',
 'Heating',
 'Condition2',
 'ExterCond',
 'ExterQual',
 'BldgType',
 'LandContour',
 'Functional',
 'KitchenQual',
 'GarageFinish',
 'GarageQual',
 'SaleType',
 'PavedDrive',
 'RoofStyle']

In [213]:
catdf = df[cat_cols]
catdf.nunique()

GarageType        6
LotShape          4
HeatingQC         5
CentralAir        2
Electrical        5
Street            2
RoofMatl          8
Exterior2nd      16
Utilities         2
SaleCondition     6
HouseStyle        8
LandSlope         3
GarageCond        5
Exterior1st      14
Foundation        6
Condition1        9
LotConfig         5
MSZoning          5
Neighborhood     25
Heating           6
Condition2        8
ExterCond         4
ExterQual         4
BldgType          5
LandContour       4
Functional        7
KitchenQual       4
GarageFinish      3
GarageQual        5
SaleType          9
PavedDrive        3
RoofStyle         6
dtype: int64

In [214]:
from sklearn.preprocessing import LabelEncoder
# Create LabelEncoder object
encoder = LabelEncoder()

# Encode categorical variables as integers
for var in cat_cols:
    df[var] = encoder.fit_transform(df[var])

In [215]:
df.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,SalePrice
0,856.0,854.0,0.0,3,0,706.0,0.0,1,0,150.0,...,0.0,1,8,856.0,0,0.0,2003,2003,2008,208500.0
1,1262.0,0.0,0.0,3,0,978.0,0.0,0,1,284.0,...,0.0,1,6,1262.0,0,298.0,1976,1976,2007,181500.0
2,920.0,866.0,0.0,3,0,486.0,0.0,1,0,434.0,...,0.0,1,6,920.0,0,0.0,2001,2002,2008,223500.0
3,961.0,756.0,0.0,3,0,216.0,0.0,1,0,540.0,...,0.0,1,7,756.0,0,0.0,1915,1970,2006,140000.0
4,1145.0,1053.0,0.0,4,0,655.0,0.0,1,0,490.0,...,0.0,1,9,1145.0,0,192.0,2000,2000,2008,250000.0


### Feature Selection

In [216]:
y = df['SalePrice']
X = df.drop('SalePrice', axis = 1)

In [220]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

#The top 40% of features will be selected using chi2
selector = SelectPercentile(score_func= f_classif, percentile = 50)
selector.fit_transform(X,y)

print(X.shape)
print('Selected Features are \n',selector.get_support())

(1379, 68)
Selected Features are 
 [ True  True False False False  True False False False  True  True False
  True False False False  True False False  True  True  True False  True
  True False  True False  True  True  True  True  True  True False False
  True False False  True False  True  True False False  True  True  True
 False False  True False  True False False False False False False False
  True  True  True False  True  True  True False]


In [221]:
# Get the indices of the selected features
selected_features = selector.get_support(indices=True)

# Drop the non-selected features from the dataframe
X_selected = X.iloc[:, selected_features]

# Print the resulting dataframe with only the selected features
X_selected = DataFrame(X_selected)
X_selected.head(20)

Unnamed: 0,1stFlrSF,2ndFlrSF,BsmtFinSF1,BsmtUnfSF,CentralAir,Condition2,ExterQual,Fireplaces,Foundation,FullBath,...,MasVnrArea,MiscVal,OpenPorchSF,OverallQual,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd
0,856.0,854.0,706.0,150.0,1,2,2,0,2,2,...,196.0,0.0,61.0,7,1,8,856.0,0.0,2003,2003
1,1262.0,0.0,978.0,284.0,1,2,3,1,1,2,...,0.0,0.0,0.0,6,1,6,1262.0,298.0,1976,1976
2,920.0,866.0,486.0,434.0,1,2,2,1,2,2,...,162.0,0.0,42.0,7,1,6,920.0,0.0,2001,2002
3,961.0,756.0,216.0,540.0,1,2,3,1,0,1,...,0.0,0.0,35.0,7,1,7,756.0,0.0,1915,1970
4,1145.0,1053.0,655.0,490.0,1,2,2,1,2,2,...,350.0,0.0,84.0,8,1,9,1145.0,192.0,2000,2000
5,796.0,566.0,732.0,64.0,1,2,3,0,5,1,...,0.0,700.0,30.0,5,1,5,796.0,40.0,1993,1995
6,1694.0,0.0,1369.0,317.0,1,2,2,1,2,2,...,186.0,0.0,57.0,8,1,7,1686.0,255.0,2004,2005
7,1107.0,983.0,859.0,216.0,1,2,3,2,1,2,...,240.0,350.0,204.0,7,1,7,1107.0,235.0,1973,1973
8,1022.0,752.0,0.0,952.0,1,2,3,2,0,2,...,0.0,0.0,0.0,7,1,8,952.0,90.0,1931,1950
9,1077.0,0.0,851.0,140.0,1,0,3,2,0,1,...,0.0,0.0,4.0,5,1,5,991.0,0.0,1939,1950


### Target Discretization

### Feature Scaling

## Classification