# Housing Price prediction using Classification


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
data = pd.read_csv("/content/drive/MyDrive/coding/housing-classification-iter6.csv")

In [None]:
data 

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


# Data Explorartion

In [None]:
data.info()

In [None]:
X= data


(1460, 80)

In [None]:
# making y data frame(target)
y = X.pop("Expensive")

In [None]:
# checking the shape again after pop
X.shape

(1460, 80)

# Splitting Training and Testing data

In [None]:
# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123000)

# Creating a pipeline

Imputatation transformer

Creating pipeline for imputing missing values in categorical and numerical data

In [None]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").copy().columns
X_num_columns = X.select_dtypes(include="number").copy().columns

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))
 
 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown = 'ignore')           # using one hot encoder to convert categorical values in numeric value
)

In [None]:
numeric_pipe

Pipeline(steps=[('simpleimputer', SimpleImputer())])

In [None]:
categoric_pipe

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

# Coloumn Transformer
This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer. https://www.sktime.org/en/stable/api_reference/auto_generated/sktime.transformations.panel.compose.ColumnTransformer.html

In [None]:
# ColumnTransformer applies transformations to columns of an array or pandas DataFrame. 
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns)
        #("scale", StandardScaler())
    ]
)

In [None]:
# checking the dictionary generated in preprocessor, looking for all the steps applied
preprocessor.transformers

In [None]:
# creating a complete pipeline using all the steps in preprocessor and decisiontree classifier
full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier())

In [None]:
# fitting the data in pipline
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1'...
       'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object'))])),
                ('decisiontreeclassifier',

In [None]:
full_pipeline.named_steps.columntransformer

ColumnTransformer(transformers=[('num_pipe',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',...
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object'))])

In [None]:
full_pipeline.predict(X_train)

array([0, 1, 0, ..., 0, 0, 0])

In [None]:
# checking accuracy score for train set w/o best parameter(gridsearch_cv)


train_accuracy_no_grid= accuracy_score(y_true = y_train,
               y_pred = full_pipeline.predict(X_train))
train_accuracy_no_grid
round(train_accuracy_no_grid, 5) 

1.0

In [None]:
y_predict_no_grid = full_pipeline.predict(X_test)

In [None]:
y_test

1398    0
955     0
480     1
1088    0
647     0
       ..
1149    0
399     0
1226    0
1389    0
1307    0
Name: Expensive, Length: 292, dtype: int64

In [None]:
y_predict_no_grid

array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0])

In [None]:
# checking accuracy score for test set w/o best parameter(gridsearch_cv)


test_accuracy_no_grid= accuracy_score(y_true = y_test,
               y_pred = y_predict_no_grid)
test_accuracy_no_grid
round(test_accuracy_no_grid, 5) 


0.91781

# Using gridSearchCV  

In [None]:
from sklearn.model_selection import GridSearchCV

complete_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 25, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 15, 2),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipe',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAd...
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object'))])),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'columntransformer__num_pipe__simpleimputer__strategy': ['mean',
                     

In [None]:
search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 6,
 'decisiontreeclassifier__min_samples_leaf': 7}

In [None]:
search.best_score_

0.9315175525475954

In [None]:
# display pipeline
from sklearn import set_config
set_config(display = 'diagram')

In [None]:
# creating new pipwline by using the best tuning parameters got from GridSearchCV
new_pipeline_grid = make_pipeline(preprocessor, 
                              DecisionTreeClassifier(max_depth=14,
                                 min_samples_leaf=7, criterion = 'entropy'))

In [None]:
new_pipeline_grid.fit(X_train, y_train)

In [None]:
y_train_pred_grid = new_pipeline_grid.predict(X_train)

In [None]:
# predicting accuracy
train_accuracy_grid= accuracy_score(y_true = y_train,
               y_pred = y_train_pred_grid)
train_accuracy_grid
round(train_accuracy_grid, 5) 


0.96147

In [None]:
new_pipeline_grid.fit(X_test, y_test)

In [None]:
y_test_pred_grid = new_pipeline_grid.predict(X_test)

In [None]:
# checking accuracy score for test set best parameter(gridsearch_cv)


test_accuracy_grid = accuracy_score(y_true = y_test,y_pred= y_test_pred_grid)
test_accuracy_grid
round(test_accuracy_grid, 5) 

0.9726

In [None]:
# checking for classification reports
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred_grid))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       245
           1       0.95      0.87      0.91        47

    accuracy                           0.97       292
   macro avg       0.96      0.93      0.95       292
weighted avg       0.97      0.97      0.97       292



In [None]:
# loading new test data to check the model
new_pipeline  = pd.read_csv("/content/drive/MyDrive/coding/test.csv")

In [None]:
test = search.predict(new_pipeline)
test

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
new_pipeline["Expensive"] = test

In [None]:
new_pipeline["Expensive"]

0       0
1       0
2       0
3       0
4       0
       ..
1454    0
1455    0
1456    0
1457    0
1458    0
Name: Expensive, Length: 1459, dtype: int64

In [None]:
Submission = new_pipeline[["Id","Expensive"]]

In [None]:
Submission.to_csv("results.csv", index = False)