In [11]:
import numpy as np
import pandas as pd
import sklearn as skit
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency

import seaborn as sns
import re

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rc('figure', figsize=(12, 8))

In [12]:
hf = pd.read_csv('task_data.csv')
hf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [13]:
# Nominal features : MSZoning and RoofStyle
# Dichotomous feature : CentralAir
# nominal features : OverallQual and HeatingQC
# discrete numeric features : GarageCars, FirePlaces
# continuous numeric features : GarageArea, 1stFlrSF

In [14]:
#Validate domains of all those selected features.
#Convert those features to proper format.

# MSZoning 
if not pd.api.types.is_categorical_dtype(hf['MSZoning']):
    hf['MSZoning'] = hf['MSZoning'].astype('category', ordered=False)

#CentralAir
if not pd.api.types.is_categorical_dtype(hf['CentralAir']):
    hf['CentralAir'] = hf['CentralAir'].astype('category', ordered=False)
    
#RoofStyle    
if not pd.api.types.is_categorical_dtype(hf['RoofStyle']):
    hf['RoofStyle'] = hf['RoofStyle'].astype('category', ordered=False)

#RCentralAir    
if not pd.api.types.is_categorical_dtype(hf['CentralAir']):
    hf['CentralAir'] = hf['CentralAir'].astype('category', ordered=False)

#OverallQual    
if not pd.api.types.is_categorical_dtype(hf['OverallQual']):
    hf['OverallQual'] = hf['OverallQual'].astype('category', ordered=True)

#HeatingQC    
if not pd.api.types.is_categorical_dtype(hf['HeatingQC']):
    hf['HeatingQC'] = hf['HeatingQC'].astype('category', ordered=True, categories=["Po","Fa","TA","Gd","Ex"])

#check if the type is ok (it is)
hf[["GarageCars","Fireplaces","GarageArea","1stFlrSF"]].info()
#hf["GarageCars","Fireplaces","GarageArea","1stFlrSF"] = hf["GarageCars","Fireplaces","GarageArea","1stFlrSF"].apply(pd.to_numeric)

# show results
# display(hf['MSZoning'].describe())
# display(hf['RoofStyle'].describe())
# display(hf['RoofStyle'].unique())
# display(hf['CentralAir'].describe())
# display(hf['CentralAir'].unique())
# display(hf['OverallQual'].describe())
# display(hf['OverallQual'].unique())
#display(hf['HeatingQC'].describe())
#display(hf['HeatingQC'].unique())
#display(hf["GarageCars"].describe())

#We could change the dtype of numeric features in float (to make subsequent prediction for example) 
# but I am not sure if it's  part of the homework or not
#hf[['GarageArea']] = hf[['GarageArea']].astype(float)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 4 columns):
GarageCars    1460 non-null int64
Fireplaces    1460 non-null int64
GarageArea    1460 non-null int64
1stFlrSF      1460 non-null int64
dtypes: int64(4)
memory usage: 45.7 KB


In [15]:
#Try to detect some outliers (hint: use GrLivArea and SalePrice).

#We compute the IQR
Q1 = hf.quantile(0.25)
Q3 = hf.quantile(0.75)
IQR =  Q3 - Q1

#outliers 

display(hf[(hf['GrLivArea'] > Q3['GrLivArea'] + 5*IQR['GrLivArea']) | (hf['GrLivArea'] < Q1['GrLivArea'] - 1.5*IQR['GrLivArea'])])
display(hf[(hf['SalePrice'] > Q3['SalePrice'] + 5*IQR['SalePrice']) | (hf['SalePrice'] < Q1['SalePrice'] - 1.5*IQR['SalePrice'])])


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1298,1299,60,RL,313.0,63887,Pave,,IR3,Bnk,AllPub,...,480,Gd,,,0,1,2008,New,Partial,160000


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
691,692,60,RL,104.0,21535,Pave,,IR1,Lvl,AllPub,...,0,,,,0,1,2007,WD,Normal,755000
1182,1183,60,RL,160.0,15623,Pave,,IR1,Lvl,AllPub,...,555,Ex,MnPrv,,0,7,2007,WD,Abnorml,745000


In [16]:
#* Determine all features with missing data.
display(hf.isnull().any())
#Every "False" columns contains missing datas

Id               False
MSSubClass       False
MSZoning         False
LotFrontage       True
LotArea          False
Street           False
Alley             True
LotShape         False
LandContour      False
Utilities        False
LotConfig        False
LandSlope        False
Neighborhood     False
Condition1       False
Condition2       False
BldgType         False
HouseStyle       False
OverallQual      False
OverallCond      False
YearBuilt        False
YearRemodAdd     False
RoofStyle        False
RoofMatl         False
Exterior1st      False
Exterior2nd      False
MasVnrType        True
MasVnrArea        True
ExterQual        False
ExterCond        False
Foundation       False
                 ...  
BedroomAbvGr     False
KitchenAbvGr     False
KitchenQual      False
TotRmsAbvGrd     False
Functional       False
Fireplaces       False
FireplaceQu       True
GarageType        True
GarageYrBlt       True
GarageFinish      True
GarageCars       False
GarageArea       False
GarageQual 

In [17]:
#* Find a feature that probably should be droped from further analysis.
display(hf.isnull().sum())
#Alley has a lot of missing values so we should drop it from further analysis
#hf.drop('Alley',1,inplace=True)

#(that is also the case of PoolQC but this feature could be interesting regarding 
# the value of the house for houses which have a pool)



Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [10]:
#* Select a feature with reasonable small portion of missing values (but with at least 5) and try to fill them properly.
#MasVnrType and MasVnrArea have 8 missing values, we can replace them respectively by none and 0
hf['MasVnrArea'] = hf['MasVnrArea'].fillna(0)
hf['MasVnrType'] = hf['MasVnrType'].fillna("None")
#We can also treat the features related to the garage which have 81 missing values and consider that there are no
# garages but thay may be false 
hf['GarageType'] = hf['GarageType'].fillna("NA")
hf['GarageFinish'] = hf['GarageFinish'].fillna("NA")
hf['GarageQual'] = hf['GarageQual'].fillna("NA")
hf['GarageCond'] = hf['GarageCond'].fillna("NA")
