In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import gaussian_kde
import numpy as np

plt.style.use('ggplot')

# Data

In [36]:
data = pd.read_csv('train.csv', ',', keep_default_na=False)

data.info

  data = pd.read_csv('train.csv', ',', keep_default_na=False)


<bound method DataFrame.info of         Id  MSSubClass MSZoning LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL          65     8450   Pave    NA      Reg   
1        2          20       RL          80     9600   Pave    NA      Reg   
2        3          60       RL          68    11250   Pave    NA      IR1   
3        4          70       RL          60     9550   Pave    NA      IR1   
4        5          60       RL          84    14260   Pave    NA      IR1   
...    ...         ...      ...         ...      ...    ...   ...      ...   
1455  1456          60       RL          62     7917   Pave    NA      Reg   
1456  1457          20       RL          85    13175   Pave    NA      Reg   
1457  1458          70       RL          66     9042   Pave    NA      Reg   
1458  1459          20       RL          68     9717   Pave    NA      Reg   
1459  1460          20       RL          75     9937   Pave    NA      Reg   

     LandContour Utilities  ...

# Data Cleaning

## Add PriceType

In [37]:
data['PriceType'] = np.where(
                            data['SalePrice'] >= 300000,
                            'HIGH',
                            np.where(
                                data['SalePrice'] <= 150000,
                                'LOW',
                                'MEDIUM'
                                )
                            )

In [38]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,PriceType
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,MEDIUM
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,MEDIUM
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,MEDIUM
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,LOW
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,MEDIUM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62,7917,Pave,,Reg,Lvl,AllPub,...,,,,0,8,2007,WD,Normal,175000,MEDIUM
1456,1457,20,RL,85,13175,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,0,2,2010,WD,Normal,210000,MEDIUM
1457,1458,70,RL,66,9042,Pave,,Reg,Lvl,AllPub,...,,GdPrv,Shed,2500,5,2010,WD,Normal,266500,MEDIUM
1458,1459,20,RL,68,9717,Pave,,Reg,Lvl,AllPub,...,,,,0,4,2010,WD,Normal,142125,LOW


## Rm value in col with real missing values

from all the columns, the following are columns with real missing values:
- LotFrontage
- MasVnrType
- MasVnrArea
- Electrical
- GarageYrBlt

### Apply nan to certain columns

In [39]:
columns = ['LotFrontage', 'MasVnrType', 'MasVnrArea', 'Electrical', 'GarageYrBlt']

for column in columns:
    data[column] = data[column].replace('NA', np.nan) 

In [40]:
data.info

<bound method DataFrame.info of         Id  MSSubClass MSZoning LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL          65     8450   Pave    NA      Reg   
1        2          20       RL          80     9600   Pave    NA      Reg   
2        3          60       RL          68    11250   Pave    NA      IR1   
3        4          70       RL          60     9550   Pave    NA      IR1   
4        5          60       RL          84    14260   Pave    NA      IR1   
...    ...         ...      ...         ...      ...    ...   ...      ...   
1455  1456          60       RL          62     7917   Pave    NA      Reg   
1456  1457          20       RL          85    13175   Pave    NA      Reg   
1457  1458          70       RL          66     9042   Pave    NA      Reg   
1458  1459          20       RL          68     9717   Pave    NA      Reg   
1459  1460          20       RL          75     9937   Pave    NA      Reg   

     LandContour Utilities  ...

### Check which col have true missing value

In [41]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,PriceType
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,MEDIUM
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,MEDIUM
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,MEDIUM
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,LOW
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,MEDIUM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62,7917,Pave,,Reg,Lvl,AllPub,...,,,,0,8,2007,WD,Normal,175000,MEDIUM
1456,1457,20,RL,85,13175,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,0,2,2010,WD,Normal,210000,MEDIUM
1457,1458,70,RL,66,9042,Pave,,Reg,Lvl,AllPub,...,,GdPrv,Shed,2500,5,2010,WD,Normal,266500,MEDIUM
1458,1459,20,RL,68,9717,Pave,,Reg,Lvl,AllPub,...,,,,0,4,2010,WD,Normal,142125,LOW


### Remove the missing lines

In [42]:
data = data.dropna()

data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,PriceType
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,MEDIUM
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,MEDIUM
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,MEDIUM
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,LOW
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,MEDIUM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62,7917,Pave,,Reg,Lvl,AllPub,...,,,,0,8,2007,WD,Normal,175000,MEDIUM
1456,1457,20,RL,85,13175,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,0,2,2010,WD,Normal,210000,MEDIUM
1457,1458,70,RL,66,9042,Pave,,Reg,Lvl,AllPub,...,,GdPrv,Shed,2500,5,2010,WD,Normal,266500,MEDIUM
1458,1459,20,RL,68,9717,Pave,,Reg,Lvl,AllPub,...,,,,0,4,2010,WD,Normal,142125,LOW


# Remove the SalePrice column

In [43]:
data = data.drop(['SalePrice'], axis=1)

In [45]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,PriceType
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,MEDIUM
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,MEDIUM
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,MEDIUM
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,LOW
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,MEDIUM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,MEDIUM
1456,1457,20,RL,85,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,MEDIUM
1457,1458,70,RL,66,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,MEDIUM
1458,1459,20,RL,68,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,LOW


# Save the cleaned data

In [44]:
data.to_csv('clean_data.csv', index=False)