# 1. Importing Datasets/Libraries

In [1]:
# Importing libraries (third-party, alphabetical; sklearn pieces grouped last)
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
#sns.set_style('darkgrid')

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
# mean_squared_error used to be imported on two separate lines; one import is enough.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler

# Show every column and every row when a frame/series is displayed (no truncation).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Change the working directory to the folder holding the datasets, so that the
# relative read of 'test.csv' in the next cell resolves.
# The folder can be overridden with the HOUSING_DATA_DIR environment variable;
# when it is unset, the original hard-coded location is used unchanged.
import os
DATA_DIR = os.environ.get(
    "HOUSING_DATA_DIR",
    "C:/Users/annitan/Desktop/GA/DSIF-SG-7 - Copy/Project_2/datasets",
)
os.chdir(DATA_DIR)

In [3]:
# Load the test split from the current working directory
test = pd.read_csv('test.csv')

# Normalise column names: lowercase, with spaces replaced by underscores
test.columns = [name.lower().replace(' ', '_') for name in test.columns]

# 2. Cleaning of data - test set

## 2.1 Understanding Columns

In [4]:
# Print the (rows, columns) shape of the freshly loaded test set
print(f'shape of test set: {test.shape}')

shape of test set: (878, 80)


In [5]:
# Check datatypes and numbers of non-null values for the test set
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     718 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            58 non-null     object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

In [6]:
# 'id' and 'pid' are not needed, so remove them
test = test.drop(columns=['id', 'pid'])
# Confirm the column count went down by two
print(f'shape of test set: {test.shape}')

shape of test set: (878, 78)


In [7]:
# Store ms_subclass as text instead of integers
# (presumably because it is a category code rather than a quantity - TODO confirm)
test["ms_subclass"] = test["ms_subclass"].astype(str)

## 2.2 Null Values

### 2.2.1 Checking which columns has Null Values

In [8]:
# Null count per column of the test set, largest first
test.isna().sum().sort_values(ascending=False)

pool_qc            874
misc_feature       837
alley              820
fence              706
fireplace_qu       422
lot_frontage       160
garage_yr_blt       45
garage_finish       45
garage_qual         45
garage_cond         45
garage_type         44
bsmtfin_type_2      25
bsmt_cond           25
bsmt_exposure       25
bsmtfin_type_1      25
bsmt_qual           25
electrical           1
mas_vnr_type         1
mas_vnr_area         1
totrms_abvgrd        0
functional           0
kitchen_abvgr        0
bedroom_abvgr        0
half_bath            0
full_bath            0
bsmt_half_bath       0
bsmt_full_bath       0
kitchen_qual         0
ms_subclass          0
fireplaces           0
low_qual_fin_sf      0
garage_cars          0
garage_area          0
paved_drive          0
wood_deck_sf         0
open_porch_sf        0
enclosed_porch       0
3ssn_porch           0
screen_porch         0
pool_area            0
misc_val             0
mo_sold              0
yr_sold              0
gr_liv_area

In [9]:
# Drop the columns that are missing for roughly half of the rows or more
# (between 422 and 874 nulls out of 878 rows in the null-count listing above).
sparse_columns = ['pool_qc', 'misc_feature', 'alley', 'fence', 'fireplace_qu']
# Reassign rather than using inplace=True, and pass only `columns=`
# (the extra axis=1 was redundant once `columns=` is given).
test = test.drop(columns=sparse_columns)
# Check the new shape of the test set
test.shape

(878, 73)

### 2.2.2 Clean up column 'lot_frontage'

In [10]:
# LotFrontage: Linear feet of street connected to property
# `rows` (total number of rows) is reused by the later missing-value summaries.
rows = len(test)
n_missing = test['lot_frontage'].isna().sum()
print(f'Number of missing values in lot_fontage: {n_missing}')
print(f'% of missing values in lot_fontage: {round(n_missing/rows*100,1)}%')

Number of missing values in lot_fontage: 160
% of missing values in lot_fontage: 18.2%


In [11]:
# Frequency of each recorded lot_frontage value
test.lot_frontage.value_counts()

60.0     97
80.0     43
75.0     37
70.0     37
50.0     27
85.0     24
65.0     22
21.0     18
24.0     16
68.0     16
90.0     15
78.0     13
64.0     12
51.0     11
55.0     10
76.0      9
63.0      9
59.0      9
72.0      9
79.0      9
74.0      8
52.0      8
73.0      8
86.0      8
61.0      8
44.0      7
66.0      7
40.0      7
82.0      7
53.0      6
120.0     6
71.0      6
35.0      6
88.0      6
57.0      6
69.0      6
110.0     6
98.0      5
34.0      5
48.0      5
100.0     5
42.0      4
89.0      4
56.0      4
67.0      4
36.0      4
81.0      4
84.0      4
94.0      4
77.0      4
95.0      4
93.0      4
54.0      4
58.0      4
124.0     3
62.0      3
118.0     3
83.0      3
96.0      3
43.0      3
102.0     3
105.0     3
87.0      3
41.0      3
99.0      3
121.0     2
45.0      2
115.0     2
104.0     2
92.0      2
160.0     2
39.0      2
49.0      2
149.0     2
107.0     2
108.0     2
32.0      2
47.0      2
130.0     2
33.0      2
133.0     1
46.0      1
122.0     1
106.

In [12]:
# Summary statistics of lot_frontage (missing values are excluded from the count)
test.lot_frontage.describe()

count    718.000000
mean      69.545961
std       23.533945
min       21.000000
25%       59.000000
50%       68.000000
75%       80.000000
max      182.000000
Name: lot_frontage, dtype: float64

In [13]:
# Mean lot_frontage for each lot configuration
grouped_lots = test.groupby('lot_config')[['lot_frontage']].mean()
grouped_lots
# It might be meaningful to replace missing values with the mean for their lot_config

Unnamed: 0_level_0,lot_frontage
lot_config,Unnamed: 1_level_1
Corner,81.452381
CulDSac,59.727273
FR2,62.625
FR3,48.5
Inside,67.496377


In [14]:
# Fill missing lot_frontage with a fixed value per lot_config.
# NOTE(review): these constants differ from the test-set group means displayed by the
# previous cell (e.g. Corner is 81.45 there vs 83.25 here); presumably they are the
# training-set means, reused so both sets are imputed the same way - TODO confirm.
lot_frontage_fill = {
    'Corner': 83.25,
    'CulDSac': 54.734375,
    'FR2': 60.836735,
    'FR3': 87.0,
    'Inside': 66.759571,
}
# map() yields NaN for any lot_config outside the mapping, so such rows stay unfilled,
# exactly as with the former one-assignment-per-category code.
test['lot_frontage'] = test['lot_frontage'].fillna(test['lot_config'].map(lot_frontage_fill))

In [15]:
# Remaining missing values in lot_frontage (0 means the fill covered every row)
test.lot_frontage.isna().sum()

0

### 2.2.3 Clean up columns related to garage - 'garage_finish', 'garage_qual', 'garage_yr_blt', 'garage_cond', 'garage_type'

GarageFinish: Interior finish of the garage

GarageYrBlt: Year garage was built

GarageCond: Garage condition

GarageQual: Garage quality

GarageType: Garage location

For all the columns above, if data is missing, we assume the value should be NA (which essentially means no garage). 
This is especially plausible because none of these columns has any value recorded as NA. 
This suggests that the field was left blank when there is no garage.

In [16]:
# Count and share of missing garage_finish values
n_missing = test['garage_finish'].isna().sum()
print(f'Number of missing values in garage_finish: {n_missing}')
print(f'% of missing values in garage_finish: {round(n_missing/rows*100,1)}%')

Number of missing values in garage_finish: 45
% of missing values in garage_finish: 5.1%


In [17]:
# Check the value counts
test.garage_finish.value_counts()
# No value is recorded as NA, so we assume missing values mean no garage (NA = No Garage)

Unf    382
RFn    233
Fin    218
Name: garage_finish, dtype: int64

In [18]:
# Label missing garage_finish as 'NA' (no garage)
no_finish = test['garage_finish'].isna()
test.loc[no_finish, 'garage_finish'] = 'NA'
# Check the value counts again
test['garage_finish'].value_counts()
# All 45 missing values are now recorded as NA

Unf    382
RFn    233
Fin    218
NA      45
Name: garage_finish, dtype: int64

In [19]:
# Count and share of missing garage_qual values
n_missing = test['garage_qual'].isna().sum()
print(f'Number of missing values in garage_qual: {n_missing}')
print(f'% of missing values in garage_qual: {round(n_missing/rows*100,1)}%')

Number of missing values in garage_qual: 45
% of missing values in garage_qual: 5.1%


In [20]:
# Check the value counts
test.garage_qual.value_counts()
# No value is recorded as NA, so we assume missing values mean no garage (NA = No Garage)

TA    782
Fa     42
Gd      6
Po      3
Name: garage_qual, dtype: int64

In [21]:
# Label missing garage_qual as 'NA' (no garage)
no_qual = test['garage_qual'].isna()
test.loc[no_qual, 'garage_qual'] = 'NA'
# Check the value counts again
test['garage_qual'].value_counts()
# All 45 missing values are now recorded as NA

TA    782
NA     45
Fa     42
Gd      6
Po      3
Name: garage_qual, dtype: int64

In [22]:
# Count and share of missing garage_yr_blt values
n_missing = test['garage_yr_blt'].isna().sum()
print(f'Number of missing values in garage_yr_blt: {n_missing}')
print(f'% of missing values in garage_yr_blt: {round(n_missing/rows*100,1)}%')

Number of missing values in garage_yr_blt: 45
% of missing values in garage_yr_blt: 5.1%


In [23]:
# Missing values must be filled before the column can be cast to int (to drop the
# decimal point), so use 0.0 as a temporary placeholder for "no year recorded".
no_year = test['garage_yr_blt'].isna()
test.loc[no_year, 'garage_yr_blt'] = 0.0

In [24]:
# Rows still missing garage_yr_blt (expected to be empty after the placeholder fill)
test[test['garage_yr_blt'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [25]:
# Cast the year to integer so it no longer carries a decimal part
test['garage_yr_blt'] = test.garage_yr_blt.astype(int)

In [26]:
# Check the value counts
test.garage_yr_blt.value_counts()
# The formerly missing years now show up under the placeholder value 0

0       45
2005    37
2006    35
2007    30
2004    27
2003    26
1977    20
1950    19
1997    18
2008    17
1968    16
1974    16
1960    16
1993    16
1999    15
1976    15
1980    14
1969    14
1957    14
1994    14
1998    14
2000    14
2001    14
2002    13
1959    13
2009    12
1954    12
1970    12
1963    12
1920    12
1978    11
1961    11
1956    10
1958    10
1966    10
1962    10
1967    10
1972    10
1979    10
1964    10
1955     9
1973     9
1925     9
1996     9
1995     9
1984     8
1930     7
1965     7
1926     7
1991     7
1989     7
1952     6
1971     6
1988     6
1985     6
1992     5
1990     5
1981     5
1951     5
1948     5
1938     5
1910     5
1940     5
1939     5
1924     4
1949     4
1946     4
1941     4
1983     4
1953     4
2010     3
1900     3
1986     3
1982     3
1915     3
1975     3
1922     2
1918     2
1947     2
1932     2
1934     2
1937     2
1987     2
1927     1
1928     1
1931     1
1923     1
1921     1
1935     1
1917     1
1916     1

In [27]:
# Now replace the placeholder 0 with 'NA' (no garage).
no_garage = test['garage_yr_blt'] == 0
# Cast to object first: writing a string into an integer column relies on silent
# upcasting, which newer pandas versions deprecate. The result is the same mixed
# column of integer years and 'NA' strings.
test['garage_yr_blt'] = test['garage_yr_blt'].astype(object)
test.loc[no_garage, 'garage_yr_blt'] = 'NA'
test['garage_yr_blt'].value_counts()
# All 45 missing values are now recorded as NA

NA      45
2005    37
2006    35
2007    30
2004    27
2003    26
1977    20
1950    19
1997    18
2008    17
1968    16
1960    16
1974    16
1993    16
1976    15
1999    15
1957    14
1980    14
1969    14
1994    14
1998    14
2000    14
2001    14
2002    13
1959    13
1954    12
1970    12
1963    12
2009    12
1920    12
1961    11
1978    11
1966    10
1964    10
1962    10
1979    10
1958    10
1956    10
1972    10
1967    10
1995     9
1996     9
1973     9
1925     9
1955     9
1984     8
1965     7
1989     7
1926     7
1930     7
1991     7
1971     6
1988     6
1952     6
1985     6
1940     5
1992     5
1938     5
1939     5
1981     5
1948     5
1951     5
1910     5
1990     5
1924     4
1941     4
1949     4
1983     4
1946     4
1953     4
1975     3
1986     3
1982     3
1915     3
1900     3
2010     3
1922     2
1987     2
1947     2
1932     2
1934     2
1937     2
1918     2
1928     1
1927     1
1943     1
1921     1
1931     1
1923     1
1906     1
1908     1

In [28]:
# Count and share of missing garage_cond values
n_missing = test['garage_cond'].isna().sum()
print(f'Number of missing values in garage_cond: {n_missing}')
print(f'% of missing values in garage_cond: {round(n_missing/rows*100,1)}%')

Number of missing values in garage_cond: 45
% of missing values in garage_cond: 5.1%


In [29]:
# GarageCond: Garage condition
test.garage_cond.value_counts()

TA    796
Fa     27
Po      6
Gd      3
Ex      1
Name: garage_cond, dtype: int64

In [30]:
# Label missing garage_cond as 'NA' (no garage)
no_cond = test['garage_cond'].isna()
test.loc[no_cond, 'garage_cond'] = 'NA'

In [31]:
# Remaining missing values in garage_cond
test.garage_cond.isna().sum()

0

In [32]:
# Count and share of missing garage_type values
n_missing = test['garage_type'].isna().sum()
print(f'Number of missing values in garage_type: {n_missing}')
print(f'% of missing values in garage_type: {round(n_missing/rows*100,1)}%')

Number of missing values in garage_type: 44
% of missing values in garage_type: 5.0%


In [33]:
# Check the value counts
test.garage_type.value_counts()
# No value is recorded as NA, so we assume missing values mean no garage (NA = No Garage)

Attchd     518
Detchd     246
BuiltIn     53
Basment      9
2Types       4
CarPort      4
Name: garage_type, dtype: int64

In [34]:
# Label missing garage_type as 'NA' (no garage)
no_type = test['garage_type'].isna()
test.loc[no_type, 'garage_type'] = 'NA'
# Check value counts again
test['garage_type'].value_counts()
# garage_type had only 44 missing values, one fewer than the 45 in the other garage columns.
# This suggests one row has a garage_type recorded while its other garage columns are missing.

Attchd     518
Detchd     246
BuiltIn     53
NA          44
Basment      9
2Types       4
CarPort      4
Name: garage_type, dtype: int64

In [35]:
# Which row has a garage_type recorded although garage_yr_blt is 'NA'?
test[test['garage_yr_blt'].eq('NA') & test['garage_type'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
764,60,RM,57.0,8094,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2.5Unf,6,8,1910,1983,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,PConc,TA,TA,Mn,Rec,196,Unf,0,1046,1242,GasA,Gd,Y,SBrkr,1242,742,0,1984,0,0,2,0,5,1,TA,8,Typ,0,Detchd,,,1,360,,,Y,64,0,180,0,0,0,1000,9,2008,WD


In [36]:
# Might be a data error, so set garage_type to 'NA' wherever garage_yr_blt is 'NA'
inconsistent = test['garage_yr_blt'].eq('NA') & test['garage_type'].ne('NA')
test.loc[inconsistent, 'garage_type'] = 'NA'
# Check that no such row remains (mask is recomputed after the fix)
test[test['garage_yr_blt'].eq('NA') & test['garage_type'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


### 2.2.4 Clean up columns related to basement - 'bsmt_exposure', 'bsmtfin_type_2', 'bsmtfin_type_1', 'bsmt_cond', 'bsmt_qual'

Bsmt Exposure: Refers to walkout or garden level walls

BsmtFinType2: Quality of second finished area (if present)

BsmtFinType1: Quality of basement finished area

BsmtCond: General condition of the basement            

BsmtQual: Height of the basement

For all the columns above, if data is missing, we assume the value should be NA (which essentially means no basement). 
This is especially plausible because none of these columns has any value recorded as NA. 
This suggests that the field was left blank when there is no basement.

In [37]:
# Count and share of missing bsmt_exposure values
n_missing = test['bsmt_exposure'].isna().sum()
print(f'Number of missing values in bsmt_exposure: {n_missing}')
print(f'% of missing values in bsmt_exposure: {round(n_missing/rows*100,1)}%')

Number of missing values in bsmt_exposure: 25
% of missing values in bsmt_exposure: 2.8%


In [38]:
# Check the value counts
test.bsmt_exposure.value_counts()
# No value is recorded as NA, so we assume missing values mean no basement (NA = No Basement)

No    567
Av    130
Gd     80
Mn     76
Name: bsmt_exposure, dtype: int64

In [39]:
# Label missing bsmt_exposure as 'NA' (no basement)
no_exposure = test['bsmt_exposure'].isna()
test.loc[no_exposure, 'bsmt_exposure'] = 'NA'
# Check the value counts again
test['bsmt_exposure'].value_counts()
# All 25 missing values are now recorded as NA

No    567
Av    130
Gd     80
Mn     76
NA     25
Name: bsmt_exposure, dtype: int64

In [40]:
# Count and share of missing bsmtfin_type_2 values
n_missing = test['bsmtfin_type_2'].isna().sum()
print(f'Number of missing values in bsmtfin_type_2: {n_missing}')
print(f'% of missing values in bsmtfin_type_2: {round(n_missing/rows*100,1)}%')

Number of missing values in bsmtfin_type_2: 25
% of missing values in bsmtfin_type_2: 2.8%


In [41]:
# Check the value counts
test.bsmtfin_type_2.value_counts()
# No value is recorded as NA, so we assume missing values mean no basement (NA = No Basement)

Unf    749
LwQ     29
Rec     26
BLQ     20
ALQ     18
GLQ     11
Name: bsmtfin_type_2, dtype: int64

In [42]:
# Label missing bsmtfin_type_2 as 'NA' (no basement)
no_type_2 = test['bsmtfin_type_2'].isna()
test.loc[no_type_2, 'bsmtfin_type_2'] = 'NA'
# Check the value counts again
test['bsmtfin_type_2'].value_counts()
# All 25 missing values are now recorded as NA - the same count as bsmt_exposure

Unf    749
LwQ     29
Rec     26
NA      25
BLQ     20
ALQ     18
GLQ     11
Name: bsmtfin_type_2, dtype: int64

In [43]:
# Which rows have bsmt_exposure = 'NA' while bsmtfin_type_2 is not 'NA'?
test[test['bsmt_exposure'].eq('NA') & test['bsmtfin_type_2'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [44]:
# Might be an error, so set bsmtfin_type_2 to 'NA' wherever bsmt_exposure is 'NA'
# TODO (original author's note, intent unclear): "Change this code to the one with OR"
mismatch = test['bsmt_exposure'].eq('NA') & test['bsmtfin_type_2'].ne('NA')
test.loc[mismatch, 'bsmtfin_type_2'] = 'NA'

In [45]:
# Check: no row should have bsmt_exposure = 'NA' with a non-'NA' bsmtfin_type_2
test[test['bsmt_exposure'].eq('NA') & test['bsmtfin_type_2'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [46]:
# Count and share of missing bsmtfin_type_1 values
n_missing = test['bsmtfin_type_1'].isna().sum()
print(f'Number of missing values in bsmtfin_type_1: {n_missing}')
print(f'% of missing values in bsmtfin_type_1: {round(n_missing/rows*100,1)}%')

Number of missing values in bsmtfin_type_1: 25
% of missing values in bsmtfin_type_1: 2.8%


In [47]:
test.bsmtfin_type_1.value_counts()
# No value is recorded as NA, so we assume missing values mean no basement (NA = No Basement)

Unf    248
GLQ    243
ALQ    136
Rec    105
BLQ     69
LwQ     52
Name: bsmtfin_type_1, dtype: int64

In [48]:
# Label missing bsmtfin_type_1 as 'NA' (no basement), then re-check the counts
no_type_1 = test['bsmtfin_type_1'].isna()
test.loc[no_type_1, 'bsmtfin_type_1'] = 'NA'
test['bsmtfin_type_1'].value_counts()

Unf    248
GLQ    243
ALQ    136
Rec    105
BLQ     69
LwQ     52
NA      25
Name: bsmtfin_type_1, dtype: int64

In [49]:
#All 25 missing values in bsmtfin_type_1 were replaced with NA.
#bsmt_exposure also has 25 NA rows, so the counts agree for the test set.
#The next cell checks for rows where bsmt_exposure is NA but bsmtfin_type_1 is not NA.

In [50]:
# Which rows have bsmt_exposure = 'NA' while bsmtfin_type_1 is not 'NA'?
test[test['bsmt_exposure'].eq('NA') & test['bsmtfin_type_1'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [51]:
# Might be an error, so set bsmtfin_type_1 to 'NA' wherever bsmt_exposure is 'NA'
# TODO (original author's note, intent unclear): "Change this code to the one with OR"
mismatch = test['bsmt_exposure'].eq('NA') & test['bsmtfin_type_1'].ne('NA')
test.loc[mismatch, 'bsmtfin_type_1'] = 'NA'

In [52]:
# Check the value counts after the consistency fix
test.bsmtfin_type_1.value_counts()

Unf    248
GLQ    243
ALQ    136
Rec    105
BLQ     69
LwQ     52
NA      25
Name: bsmtfin_type_1, dtype: int64

In [53]:
# Count and share of missing bsmt_cond values
n_missing = test['bsmt_cond'].isna().sum()
print(f'Number of missing values in bsmt_cond: {n_missing}')
print(f'% of missing values in bsmt_cond: {round(n_missing/rows*100,1)}%')

Number of missing values in bsmt_cond: 25
% of missing values in bsmt_cond: 2.8%


In [54]:
# BsmtCond: General condition of the basement
test.bsmt_cond.value_counts()

TA    781
Fa     39
Gd     33
Name: bsmt_cond, dtype: int64

In [55]:
# Label missing bsmt_cond as 'NA' (no basement), then re-check the counts
no_cond = test['bsmt_cond'].isna()
test.loc[no_cond, 'bsmt_cond'] = 'NA'
test['bsmt_cond'].value_counts()

TA    781
Fa     39
Gd     33
NA     25
Name: bsmt_cond, dtype: int64

In [56]:
#All 25 missing values in bsmt_cond were replaced with NA.
#bsmt_exposure also has 25 NA rows, so the counts agree for the test set.
#The next cell checks for rows where bsmt_exposure is NA but bsmt_cond is not NA.

In [57]:
# Which rows have bsmt_exposure = 'NA' while bsmt_cond is not 'NA'?
test[test['bsmt_exposure'].eq('NA') & test['bsmt_cond'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [58]:
# Might be an error, so set bsmt_cond to 'NA' wherever bsmt_exposure is 'NA'
# TODO (original author's note, intent unclear): "Change this code to the one with OR"
mismatch = test['bsmt_exposure'].eq('NA') & test['bsmt_cond'].ne('NA')
test.loc[mismatch, 'bsmt_cond'] = 'NA'

In [59]:
# Check the value counts after the consistency fix
test.bsmt_cond.value_counts()

TA    781
Fa     39
Gd     33
NA     25
Name: bsmt_cond, dtype: int64

In [60]:
# Count and share of missing bsmt_qual values
n_missing = test['bsmt_qual'].isna().sum()
print(f'Number of missing values in bsmt_qual: {n_missing}')
print(f'% of missing values in bsmt_qual: {round(n_missing/rows*100,1)}%')

Number of missing values in bsmt_qual: 25
% of missing values in bsmt_qual: 2.8%


In [61]:
# BsmtQual: Height of the basement
test.bsmt_qual.value_counts()

TA    396
Gd    355
Ex     73
Fa     28
Po      1
Name: bsmt_qual, dtype: int64

In [62]:
# Label missing bsmt_qual as 'NA' (no basement), then re-check the counts
no_qual = test['bsmt_qual'].isna()
test.loc[no_qual, 'bsmt_qual'] = 'NA'
test['bsmt_qual'].value_counts()

TA    396
Gd    355
Ex     73
Fa     28
NA     25
Po      1
Name: bsmt_qual, dtype: int64

In [63]:
# Which rows have bsmt_exposure = 'NA' while bsmt_qual is not 'NA'?
test[test['bsmt_exposure'].eq('NA') & test['bsmt_qual'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [64]:
# Might be an error, so set bsmt_qual to 'NA' wherever bsmt_exposure is 'NA'
# TODO (original author's note, intent unclear): "Change this code to the one with OR"
mismatch = test['bsmt_exposure'].eq('NA') & test['bsmt_qual'].ne('NA')
test.loc[mismatch, 'bsmt_qual'] = 'NA'

In [65]:
# Check: no row should have bsmt_exposure = 'NA' with a non-'NA' bsmt_qual
test[test['bsmt_exposure'].eq('NA') & test['bsmt_qual'].ne('NA')]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [66]:
# Count and share of missing mas_vnr_area values
n_missing = test['mas_vnr_area'].isna().sum()
print(f'Number of missing values in mas_vnr_area: {n_missing}')
print(f'% of missing values in mas_vnr_area: {round(n_missing/rows*100,1)}%')

Number of missing values in mas_vnr_area: 1
% of missing values in mas_vnr_area: 0.1%


In [67]:
# MasVnrArea: Masonry veneer area in square feet
test.mas_vnr_area.value_counts()

0.0       532
216.0       7
80.0        5
420.0       5
196.0       5
340.0       4
144.0       4
120.0       4
180.0       3
456.0       3
302.0       3
285.0       3
194.0       3
90.0        3
270.0       3
149.0       3
182.0       3
50.0        3
200.0       3
176.0       3
198.0       3
240.0       3
306.0       3
88.0        3
128.0       3
260.0       2
16.0        2
174.0       2
209.0       2
161.0       2
621.0       2
305.0       2
106.0       2
256.0       2
300.0       2
45.0        2
156.0       2
280.0       2
246.0       2
108.0       2
226.0       2
352.0       2
164.0       2
360.0       2
450.0       2
232.0       2
147.0       2
350.0       2
178.0       2
206.0       2
162.0       2
76.0        2
169.0       2
268.0       2
252.0       2
286.0       2
104.0       2
266.0       2
215.0       2
20.0        2
153.0       2
98.0        2
82.0        2
423.0       2
150.0       2
265.0       2
130.0       2
53.0        2
166.0       2
123.0       2
44.0        2
72.0  

In [68]:
# Count and share of missing mas_vnr_type values
n_missing = test['mas_vnr_type'].isna().sum()
print(f'Number of missing values in mas_vnr_type: {n_missing}')
print(f'% of missing values in mas_vnr_type: {round(n_missing/rows*100,1)}%')

Number of missing values in mas_vnr_type: 1
% of missing values in mas_vnr_type: 0.1%


In [69]:
# MasVnrType: Masonry veneer type
test.mas_vnr_type.value_counts()

None       534
BrkFace    250
Stone       80
BrkCmn      12
CBlock       1
Name: mas_vnr_type, dtype: int64

In [70]:
# Rows where both masonry veneer columns are missing
test[test[['mas_vnr_area', 'mas_vnr_type']].isna().all(axis=1)]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
865,60,RL,70.0,8749,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2002,2002,Gable,CompShg,VinylSd,VinylSd,,,Gd,TA,PConc,Gd,TA,No,Unf,0,Unf,0,840,840,GasA,Ex,Y,SBrkr,840,885,0,1725,0,0,2,1,3,1,Gd,6,Typ,0,Attchd,2002,RFn,2,550,TA,TA,Y,0,48,0,0,0,0,0,11,2009,WD


In [71]:
#Step 1: where mas_vnr_area and mas_vnr_type are both missing, set the area to 0.0
test['mas_vnr_area'] = test['mas_vnr_area'].mask(
    test['mas_vnr_area'].isna() & test['mas_vnr_type'].isna(), 0.0
)
#Step 2: every row with area 0.0 and no type (including the rows fixed in step 1) gets type 'None'
test['mas_vnr_type'] = test['mas_vnr_type'].mask(
    test['mas_vnr_area'].eq(0.0) & test['mas_vnr_type'].isna(), 'None'
)

In [72]:
#Check: no row should still have both veneer fields missing
test.loc[test['mas_vnr_area'].isna() & test['mas_vnr_type'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [73]:
# Report how many bsmt_half_bath entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in bsmt_half_bath: {test.bsmt_half_bath.isna().sum()}')
print(f'% of missing values in bsmt_half_bath: {round(test.bsmt_half_bath.isna().sum()/rows*100,1)}%')

Number of missing values in bsmt_half_bath: 0
% of missing values in bsmt_half_bath: 0.0%


In [74]:
#Look at the rows where bsmt_half_bath is missing
#(expected to be the same rows where bsmt_full_bath is missing, i.e. houses whose
# basement details are all NA; both bath counts are set to 0 in a later cell)

test.loc[test['bsmt_half_bath'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [75]:
# Report how many bsmt_full_bath entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in bsmt_full_bath: {test.bsmt_full_bath.isna().sum()}')
print(f'% of missing values in bsmt_full_bath: {round(test.bsmt_full_bath.isna().sum()/rows*100,1)}%')

Number of missing values in bsmt_full_bath: 0
% of missing values in bsmt_full_bath: 0.0%


In [76]:
# Missing basement bath counts are treated as 0.
# Each column is filled from its OWN missing-value mask. The previous version wrote
# bsmt_full_bath wherever bsmt_half_bath was NaN, which would overwrite a valid
# full-bath count if only the half-bath value were missing, and would leave a
# missing full-bath untouched if the half-bath value were present.
test['bsmt_full_bath'] = test['bsmt_full_bath'].fillna(0)
test['bsmt_half_bath'] = test['bsmt_half_bath'].fillna(0)

In [77]:
# Check: rows still missing bsmt_half_bath after the fill (should be none)
test.loc[test['bsmt_half_bath'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [78]:
# Report how many garage_area entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in garage_area: {test.garage_area.isna().sum()}')
print(f'% of missing values in garage_area: {round(test.garage_area.isna().sum()/rows*100,1)}%')

Number of missing values in garage_area: 0
% of missing values in garage_area: 0.0%


In [79]:
# Rows with a missing garage_area.
# Any such rows are expected to have garage type NA (no garage) -- TODO confirm when rows are present.
test.loc[test['garage_area'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [80]:
# Treat a missing garage_area as 0, then display any rows still missing (should be none)
test['garage_area'] = test['garage_area'].fillna(0)
test.loc[test['garage_area'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [81]:
# Report how many total_bsmt_sf entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in total_bsmt_sf: {test.total_bsmt_sf.isna().sum()}')
print(f'% of missing values in total_bsmt_sf: {round(test.total_bsmt_sf.isna().sum()/rows*100,1)}%')

Number of missing values in total_bsmt_sf: 0
% of missing values in total_bsmt_sf: 0.0%


In [82]:
# Rows with a missing total_bsmt_sf
test.loc[test['total_bsmt_sf'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [83]:
# Treat a missing total_bsmt_sf as 0, then display any rows still missing (should be none)
test['total_bsmt_sf'] = test['total_bsmt_sf'].fillna(0)
test.loc[test['total_bsmt_sf'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [84]:
# Report how many bsmt_unf_sf entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in bsmt_unf_sf: {test.bsmt_unf_sf.isna().sum()}')
print(f'% of missing values in bsmt_unf_sf: {round(test.bsmt_unf_sf.isna().sum()/rows*100,1)}%')

Number of missing values in bsmt_unf_sf: 0
% of missing values in bsmt_unf_sf: 0.0%


In [85]:
# Rows with a missing bsmt_unf_sf
test.loc[test['bsmt_unf_sf'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [86]:
# Treat a missing bsmt_unf_sf as 0, then display any rows still missing (should be none)
test['bsmt_unf_sf'] = test['bsmt_unf_sf'].fillna(0)
test.loc[test['bsmt_unf_sf'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [87]:
# Report how many bsmtfin_sf_2 entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in bsmtfin_sf_2: {test.bsmtfin_sf_2.isna().sum()}')
print(f'% of missing values in bsmtfin_sf_2: {round(test.bsmtfin_sf_2.isna().sum()/rows*100,1)}%')

Number of missing values in bsmtfin_sf_2: 0
% of missing values in bsmtfin_sf_2: 0.0%


In [88]:
# Rows with a missing bsmtfin_sf_2
test.loc[test['bsmtfin_sf_2'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [89]:
# Treat a missing bsmtfin_sf_2 as 0, then display any rows still missing (should be none)
test['bsmtfin_sf_2'] = test['bsmtfin_sf_2'].fillna(0)
test.loc[test['bsmtfin_sf_2'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [90]:
# Report how many bsmtfin_sf_1 entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in bsmtfin_sf_1: {test.bsmtfin_sf_1.isna().sum()}')
print(f'% of missing values in bsmtfin_sf_1: {round(test.bsmtfin_sf_1.isna().sum()/rows*100,1)}%')

Number of missing values in bsmtfin_sf_1: 0
% of missing values in bsmtfin_sf_1: 0.0%


In [91]:
# Rows with a missing bsmtfin_sf_1
test.loc[test['bsmtfin_sf_1'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [92]:
# Treat a missing bsmtfin_sf_1 as 0, then display any rows still missing (should be none)
test['bsmtfin_sf_1'] = test['bsmtfin_sf_1'].fillna(0)
test.loc[test['bsmtfin_sf_1'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [93]:
# Report how many garage_cars entries are NaN, as a count and as a percentage.
# `rows` is defined earlier in the notebook; presumably the row count of test -- confirm.
print(f'Number of missing values in garage_cars: {test.garage_cars.isna().sum()}')
print(f'% of missing values in garage_cars: {round(test.garage_cars.isna().sum()/rows*100,1)}%')

Number of missing values in garage_cars: 0
% of missing values in garage_cars: 0.0%


In [94]:
# Rows with a missing garage_cars
test.loc[test['garage_cars'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [95]:
# Treat a missing garage_cars as 0, then display any rows still missing (should be none)
test['garage_cars'] = test['garage_cars'].fillna(0)
test.loc[test['garage_cars'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [96]:
# Rows with a missing electrical system type
test.loc[test['electrical'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
634,80,RL,73.0,9735,Pave,Reg,Lvl,AllPub,Inside,Gtl,Timber,Norm,Norm,1Fam,SLvl,5,5,2006,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,384,384,GasA,Gd,Y,,754,640,0,1394,0,0,2,1,3,1,Gd,7,Typ,0,BuiltIn,2007,Fin,2,400,TA,TA,Y,100,0,0,0,0,0,0,5,2008,WD


In [97]:
# Frequency of each electrical category, used to pick the value for imputing the missing entry
test.electrical.value_counts()

SBrkr    813
FuseA     48
FuseF     15
FuseP      1
Name: electrical, dtype: int64

In [98]:
# Impute missing electrical with 'SBrkr', the most frequent category in the counts above
test['electrical'] = test['electrical'].fillna('SBrkr')


In [99]:
# Check: rows still missing electrical after imputation (should be none)
test.loc[test['electrical'].isna()]

Unnamed: 0,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type


In [100]:
#Alternative (not used): drop this row instead of imputing electrical, as it's tough to estimate
#test = test.drop(634)
#To check if it's dropped
#test[(test['electrical'].isna())]

In [101]:
#Re-check missing data: NaN count per column, largest first
test.isna().sum().sort_values(ascending=False)

ms_subclass        0
heating            0
functional         0
totrms_abvgrd      0
kitchen_qual       0
kitchen_abvgr      0
bedroom_abvgr      0
half_bath          0
full_bath          0
bsmt_half_bath     0
bsmt_full_bath     0
gr_liv_area        0
low_qual_fin_sf    0
2nd_flr_sf         0
1st_flr_sf         0
electrical         0
central_air        0
fireplaces         0
garage_type        0
garage_yr_blt      0
enclosed_porch     0
yr_sold            0
mo_sold            0
misc_val           0
pool_area          0
screen_porch       0
3ssn_porch         0
open_porch_sf      0
garage_finish      0
wood_deck_sf       0
paved_drive        0
garage_cond        0
garage_qual        0
garage_area        0
garage_cars        0
heating_qc         0
total_bsmt_sf      0
ms_zoning          0
bsmt_unf_sf        0
overall_cond       0
overall_qual       0
house_style        0
bldg_type          0
condition_2        0
condition_1        0
neighborhood       0
land_slope         0
lot_config   

In [102]:
# (rows, columns) of the test set after the missing-value handling above
test.shape

(878, 73)

In [103]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max)
test.describe()

Unnamed: 0,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,totrms_abvgrd,fireplaces,garage_cars,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold
count,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0
mean,69.371768,10307.03303,6.050114,5.566059,1970.492027,1984.417995,106.115034,441.328018,53.896355,539.133257,1034.357631,1145.787016,348.398633,2.728929,1496.914579,0.439636,0.055809,1.539863,0.398633,2.879271,1.047836,6.454442,0.618451,1.742597,470.362187,93.430524,47.070615,24.06492,2.59795,14.830296,1.884966,48.498861,6.202733,2007.82574
std,21.679801,10002.674602,1.369065,1.128903,30.395895,20.450725,187.08653,438.457329,178.632355,426.172161,413.446291,375.264111,432.985566,32.572548,506.468967,0.529948,0.229683,0.55906,0.505927,0.830712,0.223966,1.596632,0.669571,0.750391,212.734075,121.181702,68.180939,73.249463,24.962482,53.003794,29.916672,550.169317,2.642498,1.327861
min,21.0,1477.0,2.0,1.0,1880.0,1950.0,0.0,0.0,0.0,0.0,0.0,407.0,0.0,0.0,407.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,60.0,7297.25,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,216.0,789.0,864.0,0.0,0.0,1114.0,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1.0,322.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,66.759571,9446.0,6.0,5.0,1972.0,1992.0,0.0,373.0,0.0,452.0,975.0,1063.0,0.0,0.0,1436.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,2.0,473.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,80.0,11589.0,7.0,6.0,1999.75,2003.0,171.75,734.75,0.0,780.0,1247.0,1358.0,720.0,0.0,1779.0,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2.0,576.0,170.75,69.75,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,182.0,215245.0,10.0,9.0,2010.0,2010.0,1378.0,2288.0,1526.0,2046.0,2630.0,2674.0,2065.0,481.0,4476.0,2.0,1.0,4.0,2.0,6.0,3.0,12.0,3.0,4.0,1488.0,690.0,742.0,1012.0,360.0,576.0,555.0,15500.0,12.0,2010.0


In [104]:
# Numerical columns removed from the test set before the EDA / encoding steps.
# NOTE(review): the selection rationale is not shown in this notebook; presumably it
# mirrors the columns dropped from the train set -- confirm.
num_col_to_drop = ['yr_sold', '1st_flr_sf', 'totrms_abvgrd', 'garage_cars']

#Drop the columns from test.
#errors='ignore' keeps this cell re-runnable: without it a second run raises
#KeyError because the columns are already gone.
test = test.drop(columns=num_col_to_drop, errors='ignore')

# 3. EDA

## 3.1 Summary Statistics

In [105]:
#Partition the columns by dtype: object columns are treated as categorical,
#everything else as numerical
test_cat = test.select_dtypes(include=['object'])
test_num = test.select_dtypes(exclude=['object'])

In [106]:
# Shapes of the full frame and of its numerical / categorical parts
tuple(frame.shape for frame in (test, test_num, test_cat))

((878, 69), (878, 30), (878, 39))

In [107]:
#Categorical column names grouped by measurement level.

#Ordinal: categories with a natural order (encoded as integers further down)
test_cat_ordinal = [
    'lot_shape',
    'utilities',
    'land_slope',
    'exter_qual',
    'exter_cond',
    'bsmt_qual',
    'bsmt_cond',
    'bsmt_exposure',
    'bsmtfin_type_1',
    'bsmtfin_type_2',
    'heating_qc',
    'electrical',
    'kitchen_qual',
    'functional',
]

#Nominal: categories handled as unordered
# NOTE(review): garage_finish / garage_qual / garage_cond and garage_yr_blt are listed
# as nominal here -- confirm this matches the treatment of the train set.
test_cat_nominal = [
    'ms_zoning',
    'street',
    'land_contour',
    'lot_config',
    'neighborhood',
    'condition_1',
    'condition_2',
    'bldg_type',
    'house_style',
    'roof_style',
    'roof_matl',
    'exterior_1st',
    'exterior_2nd',
    'mas_vnr_type',
    'foundation',
    'heating',
    'central_air',
    'garage_type',
    'garage_yr_blt',
    'garage_finish',
    'garage_qual',
    'garage_cond',
    'paved_drive',
    'sale_type',
]



In [108]:
#For ordinal data, change from categories to numerical
#
# The encodings live in one table and are applied by explicit reassignment
# (test[col] = test[col].replace(...)). Calling replace(..., inplace=True) on
# test[col] is a chained in-place operation on a selected column: it emits a
# warning in pandas 2.x and does not update `test` under Copy-on-Write.
# Values that are not keys of a mapping are left unchanged, as before.

# Shared Ex..Po quality scale; the basement variant additionally maps "NA" to 0
quality_scale = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1}
quality_scale_with_na = {**quality_scale, "NA": 0}
bsmtfin_scale = {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0}

ordinal_encodings = {
    "lot_shape": {"IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4},
    "utilities": {"AllPub": 4, "NoSewr": 3, "NoSeWa": 2, "ELO": 1},
    "land_slope": {"Sev": 3, "Mod": 2, "Gtl": 1},
    "exter_qual": quality_scale,
    "exter_cond": quality_scale,
    "bsmt_qual": quality_scale_with_na,
    "bsmt_cond": quality_scale_with_na,
    "bsmt_exposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
    "bsmtfin_type_1": bsmtfin_scale,
    "bsmtfin_type_2": bsmtfin_scale,
    "heating_qc": quality_scale,
    "electrical": {"SBrkr": 5, "FuseA": 4, "FuseF": 3, "FuseP": 2, "Mix": 1},
    "kitchen_qual": quality_scale,
    "functional": {"Typ": 8, "Min1": 7, "Min2": 6, "Mod": 5, "Maj1": 4, "Maj2": 3, "Sev": 2, "Sal": 1},
}

for ordinal_col, ordinal_map in ordinal_encodings.items():
    test[ordinal_col] = test[ordinal_col].replace(ordinal_map)





In [109]:
# Shapes again after the ordinal encoding.
# test_num / test_cat were built before the encoding cell and are not rebuilt here,
# so their shapes still describe the pre-encoding dtype split.
tuple(frame.shape for frame in (test, test_num, test_cat))

((878, 69), (878, 30), (878, 39))

In [110]:
#save the cleaned test set as test_cleaned.csv in the project data folder
# NOTE(review): os.chdir changes the working directory for every later cell, and the
# absolute path is specific to the original author's machine.
# to_csv is called with default arguments, so the row index is written as the first column.
os.chdir("C:/Users/annitan/Desktop/GA/DSIF-SG-7 - Copy/Project_2/data")
test.to_csv('test_cleaned.csv')