# 1. Data Cleaning

In [1]:
#import packages
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline


In [2]:
# Read the Ames training CSV into a DataFrame and preview the first rows
ames_df = pd.read_csv(filepath_or_buffer='../data/train.csv')
ames_df.head(5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [3]:
# Normalise every column label to lower case
ames_df.columns = ames_df.columns.map(str.lower)

In [4]:
# Swap each space in a column label for an underscore, then preview the frame
ames_df.columns = [label.replace(' ', '_') for label in ames_df.columns]
ames_df.head(5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# Summarise the frame: per-column non-null counts and dtypes
ames_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2051 non-null   int64  
 1   pid              2051 non-null   int64  
 2   ms_subclass      2051 non-null   int64  
 3   ms_zoning        2051 non-null   object 
 4   lot_frontage     1721 non-null   float64
 5   lot_area         2051 non-null   int64  
 6   street           2051 non-null   object 
 7   alley            140 non-null    object 
 8   lot_shape        2051 non-null   object 
 9   land_contour     2051 non-null   object 
 10  utilities        2051 non-null   object 
 11  lot_config       2051 non-null   object 
 12  land_slope       2051 non-null   object 
 13  neighborhood     2051 non-null   object 
 14  condition_1      2051 non-null   object 
 15  condition_2      2051 non-null   object 
 16  bldg_type        2051 non-null   object 
 17  house_style   

In [6]:
# Count missing values per column, largest first.
# display.min_rows is raised to 60 so the truncated Series repr shows 60 rows.
pd.set_option('display.min_rows', 60)
null_counts = ames_df.isna().sum()
null_counts.sort_values(ascending=False)

pool_qc           2042
misc_feature      1986
alley             1911
fence             1651
fireplace_qu      1000
lot_frontage       330
garage_finish      114
garage_qual        114
garage_yr_blt      114
garage_cond        114
garage_type        113
bsmt_exposure       58
bsmtfin_type_2      56
bsmtfin_type_1      55
bsmt_cond           55
bsmt_qual           55
mas_vnr_area        22
mas_vnr_type        22
bsmt_half_bath       2
bsmt_full_bath       2
garage_area          1
total_bsmt_sf        1
bsmt_unf_sf          1
bsmtfin_sf_2         1
bsmtfin_sf_1         1
garage_cars          1
mo_sold              0
sale_type            0
full_bath            0
half_bath            0
                  ... 
ms_zoning            0
lot_area             0
street               0
lot_shape            0
land_contour         0
utilities            0
lot_config           0
land_slope           0
neighborhood         0
condition_1          0
condition_2          0
bldg_type            0
house_style

In [7]:
# Tabulate every distinct 'ms_subclass' value next to its row count.
# Finding: 'ms_subclass' does not show any unusual values.
levels, freqs = np.unique(ames_df['ms_subclass'], return_counts=True)
np.array((levels, freqs)).T

array([[ 20, 770],
       [ 30, 101],
       [ 40,   4],
       [ 45,  11],
       [ 50, 198],
       [ 60, 394],
       [ 70,  90],
       [ 75,  16],
       [ 80,  86],
       [ 85,  28],
       [ 90,  75],
       [120, 132],
       [150,   1],
       [160,  88],
       [180,  11],
       [190,  46]], dtype=int64)

In [8]:
# Tabulate every distinct 'ms_zoning' value next to its row count.
# Finding: 'ms_zoning' does not show any unusual values.
levels, freqs = np.unique(ames_df['ms_zoning'], return_counts=True)
np.array((levels, freqs)).T

array([['A (agr)', 2],
       ['C (all)', 19],
       ['FV', 101],
       ['I (all)', 1],
       ['RH', 14],
       ['RL', 1598],
       ['RM', 316]], dtype=object)

In [9]:
# Missing 'lot_frontage' entries become 0, after which the column is cast to int.
# NOTE(review): 0 is the fill value chosen here; confirm that treating a
# missing frontage as zero is intended.
lot_frontage_filled = ames_df['lot_frontage'].fillna(0)
ames_df['lot_frontage'] = lot_frontage_filled.astype(int)

'lot_frontage' has 330 null values; these are filled with 0 so that the column can be converted to an integer type.

In [10]:
# Preview the first five rows after the 'lot_frontage' conversion
ames_df.head(5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,0,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [11]:
# Tabulate every distinct 'lot_area' value next to its row count.
# Finding: 'lot_area' does not contain any unusual values.
levels, freqs = np.unique(ames_df['lot_area'], return_counts=True)
np.array((levels, freqs)).T

array([[  1300,      1],
       [  1470,      1],
       [  1476,      1],
       ...,
       [ 70761,      1],
       [115149,      1],
       [159000,      1]], dtype=int64)

In [12]:
# Tabulate every distinct 'street' value next to its row count.
# Finding: 'street' does not contain any unusual values.
levels, freqs = np.unique(ames_df['street'], return_counts=True)
np.array((levels, freqs)).T

array([['Grvl', 7],
       ['Pave', 2044]], dtype=object)

In [13]:
# Preview the first five rows (head() defaults to five)
ames_df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,0,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [14]:
# Missing 'alley' entries are recorded as the string 'None'
ames_df['alley'] = ames_df['alley'].fillna('None')

In [15]:
# Preview the first five rows after filling 'alley'
ames_df.head(5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,0,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [16]:
# Confirm the NaN replacement in 'alley' by tabulating its values and counts
levels, freqs = np.unique(ames_df['alley'], return_counts=True)
np.array((levels, freqs)).T

array([['Grvl', 85],
       ['None', 1911],
       ['Pave', 55]], dtype=object)

In [17]:
# Tabulate every distinct 'lot_shape' value next to its row count.
# Finding: no unusual values in 'lot_shape'.
levels, freqs = np.unique(ames_df['lot_shape'], return_counts=True)
np.array((levels, freqs)).T

array([['IR1', 692],
       ['IR2', 55],
       ['IR3', 9],
       ['Reg', 1295]], dtype=object)

In [18]:
# Tabulate every distinct 'land_contour' value next to its row count.
# Finding: no unusual values in 'land_contour'.
levels, freqs = np.unique(ames_df['land_contour'], return_counts=True)
np.array((levels, freqs)).T

array([['Bnk', 80],
       ['HLS', 85],
       ['Low', 43],
       ['Lvl', 1843]], dtype=object)

In [19]:
# Tabulate every distinct 'utilities' value next to its row count.
# Finding: no unusual values in 'utilities'.
levels, freqs = np.unique(ames_df['utilities'], return_counts=True)
np.array((levels, freqs)).T

array([['AllPub', 2049],
       ['NoSeWa', 1],
       ['NoSewr', 1]], dtype=object)

In [20]:
# Tabulate every distinct 'lot_config' value next to its row count.
# Finding: no unusual values in 'lot_config'.
levels, freqs = np.unique(ames_df['lot_config'], return_counts=True)
np.array((levels, freqs)).T

array([['Corner', 348],
       ['CulDSac', 131],
       ['FR2', 60],
       ['FR3', 9],
       ['Inside', 1503]], dtype=object)

In [21]:
# Tabulate every distinct 'land_slope' value next to its row count.
# Finding: no unusual values in 'land_slope'.
levels, freqs = np.unique(ames_df['land_slope'], return_counts=True)
np.array((levels, freqs)).T

array([['Gtl', 1953],
       ['Mod', 88],
       ['Sev', 10]], dtype=object)

In [22]:
# Tabulate every distinct 'neighborhood' value next to its row count.
# Finding: no unusual values in 'neighborhood'.
levels, freqs = np.unique(ames_df['neighborhood'], return_counts=True)
np.array((levels, freqs)).T

array([['Blmngtn', 22],
       ['Blueste', 6],
       ['BrDale', 19],
       ['BrkSide', 76],
       ['ClearCr', 27],
       ['CollgCr', 180],
       ['Crawfor', 71],
       ['Edwards', 143],
       ['Gilbert', 116],
       ['Greens', 3],
       ['GrnHill', 2],
       ['IDOTRR', 69],
       ['Landmrk', 1],
       ['MeadowV', 24],
       ['Mitchel', 82],
       ['NAmes', 310],
       ['NPkVill', 17],
       ['NWAmes', 87],
       ['NoRidge', 48],
       ['NridgHt', 122],
       ['OldTown', 163],
       ['SWISU', 32],
       ['Sawyer', 111],
       ['SawyerW', 87],
       ['Somerst', 130],
       ['StoneBr', 38],
       ['Timber', 48],
       ['Veenker', 17]], dtype=object)

In [23]:
# Tabulate every distinct 'condition_1' value next to its row count.
# Finding: no unusual values in 'condition_1'.
levels, freqs = np.unique(ames_df['condition_1'], return_counts=True)
np.array((levels, freqs)).T

array([['Artery', 70],
       ['Feedr', 109],
       ['Norm', 1767],
       ['PosA', 12],
       ['PosN', 27],
       ['RRAe', 21],
       ['RRAn', 36],
       ['RRNe', 3],
       ['RRNn', 6]], dtype=object)

In [24]:
# Tabulate every distinct 'condition_2' value next to its row count.
# Finding: no unusual values in 'condition_2'.
levels, freqs = np.unique(ames_df['condition_2'], return_counts=True)
np.array((levels, freqs)).T

array([['Artery', 5],
       ['Feedr', 11],
       ['Norm', 2025],
       ['PosA', 3],
       ['PosN', 3],
       ['RRAe', 1],
       ['RRAn', 1],
       ['RRNn', 2]], dtype=object)

In [25]:
# Tabulate every distinct 'bldg_type' value next to its row count.
# Finding: 'Twnhs' should be replaced with 'TwnhsI' in 'bldg_type'.
levels, freqs = np.unique(ames_df['bldg_type'], return_counts=True)
np.array((levels, freqs)).T

array([['1Fam', 1700],
       ['2fmCon', 46],
       ['Duplex', 75],
       ['Twnhs', 69],
       ['TwnhsE', 161]], dtype=object)

In [26]:
# Relabel the 'Twnhs' building type as 'TwnhsI'; other labels are untouched
bldg_type_relabel = {'Twnhs': 'TwnhsI'}
ames_df['bldg_type'] = ames_df['bldg_type'].replace(bldg_type_relabel)

In [27]:
# Re-tabulate 'bldg_type' after the relabel.
# Finding: no unusual values in 'bldg_type'.
levels, freqs = np.unique(ames_df['bldg_type'], return_counts=True)
np.array((levels, freqs)).T

array([['1Fam', 1700],
       ['2fmCon', 46],
       ['Duplex', 75],
       ['TwnhsE', 161],
       ['TwnhsI', 69]], dtype=object)

In [28]:
# Tabulate every distinct 'house_style' value next to its row count.
# Finding: no unusual values in 'house_style'.
levels, freqs = np.unique(ames_df['house_style'], return_counts=True)
np.array((levels, freqs)).T

array([['1.5Fin', 218],
       ['1.5Unf', 12],
       ['1Story', 1059],
       ['2.5Fin', 6],
       ['2.5Unf', 14],
       ['2Story', 598],
       ['SFoyer', 50],
       ['SLvl', 94]], dtype=object)

In [29]:
# Tabulate every distinct 'overall_qual' value next to its row count.
# Finding: no unusual values in 'overall_qual'.
levels, freqs = np.unique(ames_df['overall_qual'], return_counts=True)
np.array((levels, freqs)).T

array([[  1,   4],
       [  2,   9],
       [  3,  29],
       [  4, 159],
       [  5, 563],
       [  6, 506],
       [  7, 431],
       [  8, 250],
       [  9,  77],
       [ 10,  23]], dtype=int64)

In [30]:
# Tabulate every distinct 'overall_cond' value next to its row count.
# Finding: no unusual values in 'overall_cond'.
levels, freqs = np.unique(ames_df['overall_cond'], return_counts=True)
np.array((levels, freqs)).T

array([[   1,    4],
       [   2,    6],
       [   3,   35],
       [   4,   70],
       [   5, 1168],
       [   6,  368],
       [   7,  270],
       [   8,  101],
       [   9,   29]], dtype=int64)

In [31]:
# Tabulate every distinct 'year_built' value next to its row count.
# Finding: no unusual values in 'year_built'.
levels, freqs = np.unique(ames_df['year_built'], return_counts=True)
np.array((levels, freqs)).T

array([[1872,    1],
       [1875,    1],
       [1879,    1],
       [1880,    3],
       [1885,    1],
       [1890,    5],
       [1892,    1],
       [1893,    1],
       [1895,    3],
       [1896,    1],
       [1898,    1],
       [1900,   20],
       [1901,    2],
       [1905,    2],
       [1908,    1],
       [1910,   26],
       [1911,    1],
       [1912,    3],
       [1913,    1],
       [1914,    7],
       [1915,   17],
       [1916,    8],
       [1917,    2],
       [1918,    4],
       [1919,    2],
       [1920,   38],
       [1921,    9],
       [1922,   12],
       [1923,   10],
       [1924,   11],
       [1925,   22],
       [1926,   12],
       [1927,    8],
       [1928,    8],
       [1929,    8],
       [1930,   18],
       [1931,    4],
       [1932,    3],
       [1934,    4],
       [1935,    9],
       [1936,    9],
       [1937,    7],
       [1938,    7],
       [1939,   15],
       [1940,   29],
       [1941,   18],
       [1942,    6],
       [1945,

In [32]:
# Tabulate every distinct 'year_remod/add' value next to its row count.
# Finding: no unusual values in 'year_remod/add'.
levels, freqs = np.unique(ames_df['year_remod/add'], return_counts=True)
np.array((levels, freqs)).T

array([[1950,  262],
       [1951,   10],
       [1952,    9],
       [1953,   18],
       [1954,   17],
       [1955,   18],
       [1956,   22],
       [1957,   15],
       [1958,   28],
       [1959,   21],
       [1960,   20],
       [1961,   17],
       [1962,   20],
       [1963,   19],
       [1964,   17],
       [1965,   18],
       [1966,   19],
       [1967,   22],
       [1968,   28],
       [1969,   16],
       [1970,   31],
       [1971,   18],
       [1972,   21],
       [1973,   13],
       [1974,   10],
       [1975,   26],
       [1976,   32],
       [1977,   31],
       [1978,   30],
       [1979,   16],
       [1980,   20],
       [1981,    6],
       [1982,    3],
       [1983,    7],
       [1984,   12],
       [1985,    9],
       [1986,    9],
       [1987,   12],
       [1988,   12],
       [1989,   14],
       [1990,   23],
       [1991,   19],
       [1992,   19],
       [1993,   33],
       [1994,   41],
       [1995,   38],
       [1996,   42],
       [1997,

In [33]:
# Tabulate every distinct 'roof_style' value next to its row count.
# Finding: no unusual values in 'roof_style'.
levels, freqs = np.unique(ames_df['roof_style'], return_counts=True)
np.array((levels, freqs)).T

array([['Flat', 13],
       ['Gable', 1619],
       ['Gambrel', 12],
       ['Hip', 397],
       ['Mansard', 7],
       ['Shed', 3]], dtype=object)

In [34]:
# Tabulate every distinct 'roof_matl' value next to its row count.
# Finding: no unusual values in 'roof_matl'.
levels, freqs = np.unique(ames_df['roof_matl'], return_counts=True)
np.array((levels, freqs)).T

array([['ClyTile', 1],
       ['CompShg', 2025],
       ['Membran', 1],
       ['Tar&Grv', 15],
       ['WdShake', 4],
       ['WdShngl', 5]], dtype=object)

In [35]:
# Tabulate every distinct 'exterior_1st' value next to its row count.
# Finding: no unusual values in 'exterior_1st'.
levels, freqs = np.unique(ames_df['exterior_1st'], return_counts=True)
np.array((levels, freqs)).T

array([['AsbShng', 33],
       ['AsphShn', 1],
       ['BrkComm', 3],
       ['BrkFace', 64],
       ['CBlock', 2],
       ['CemntBd', 90],
       ['HdBoard', 300],
       ['ImStucc', 1],
       ['MetalSd', 331],
       ['Plywood', 152],
       ['Stone', 2],
       ['Stucco', 27],
       ['VinylSd', 724],
       ['Wd Sdng', 276],
       ['WdShing', 45]], dtype=object)

In [36]:
# Tabulate every distinct 'exterior_2nd' value next to its row count.
# Finding: no unusual values in 'exterior_2nd'.
# NOTE(review): the labels 'Brk Cmn', 'CmentBd' and 'Wd Shng' are spelled
# differently from 'BrkComm', 'CemntBd' and 'WdShing' in 'exterior_1st';
# confirm whether the two columns should share one spelling.
levels, freqs = np.unique(ames_df['exterior_2nd'], return_counts=True)
np.array((levels, freqs)).T

array([['AsbShng', 28],
       ['AsphShn', 3],
       ['Brk Cmn', 17],
       ['BrkFace', 34],
       ['CBlock', 2],
       ['CmentBd', 90],
       ['HdBoard', 275],
       ['ImStucc', 11],
       ['MetalSd', 324],
       ['Plywood', 185],
       ['Stone', 6],
       ['Stucco', 30],
       ['VinylSd', 721],
       ['Wd Sdng', 262],
       ['Wd Shng', 63]], dtype=object)

In [37]:
# Fill missing 'mas_vnr_type' entries with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
ames_df['mas_vnr_type'] = ames_df['mas_vnr_type'].fillna('None')

In [38]:
# Tabulate every distinct 'mas_vnr_type' value next to its row count.
# Finding: no unusual values in 'mas_vnr_type'.
levels, freqs = np.unique(ames_df['mas_vnr_type'], return_counts=True)
np.array((levels, freqs)).T

array([['BrkCmn', 13],
       ['BrkFace', 630],
       ['None', 1240],
       ['Stone', 168]], dtype=object)

In [39]:
# Tabulate every distinct 'mas_vnr_area' value next to its row count
# (the column inspected here is the veneer area, not 'mas_vnr_type').
# Finding: the null-count summary reports 22 missing 'mas_vnr_area' values.
levels, freqs = np.unique(ames_df['mas_vnr_area'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 1.216e+03],
       [1.000e+00, 3.000e+00],
       [3.000e+00, 1.000e+00],
       [1.400e+01, 2.000e+00],
       [1.600e+01, 9.000e+00],
       [1.800e+01, 2.000e+00],
       [2.000e+01, 2.000e+00],
       [2.200e+01, 2.000e+00],
       [2.300e+01, 3.000e+00],
       [2.400e+01, 2.000e+00],
       [2.700e+01, 1.000e+00],
       [2.800e+01, 2.000e+00],
       [3.000e+01, 4.000e+00],
       [3.100e+01, 1.000e+00],
       [3.200e+01, 3.000e+00],
       [3.600e+01, 2.000e+00],
       [3.800e+01, 2.000e+00],
       [3.900e+01, 1.000e+00],
       [4.000e+01, 8.000e+00],
       [4.100e+01, 3.000e+00],
       [4.200e+01, 3.000e+00],
       [4.400e+01, 5.000e+00],
       [4.500e+01, 1.000e+00],
       [4.600e+01, 1.000e+00],
       [4.700e+01, 1.000e+00],
       [5.000e+01, 4.000e+00],
       [5.100e+01, 2.000e+00],
       [5.200e+01, 2.000e+00],
       [5.400e+01, 3.000e+00],
       [5.600e+01, 2.000e+00],
       [5.700e+01, 1.000e+00],
       [5.800e+01, 1.000e+00],
       [

In [40]:
# Fill missing 'mas_vnr_area' entries with 0.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
ames_df['mas_vnr_area'] = ames_df['mas_vnr_area'].fillna(0)

In [41]:
# Tabulate every distinct 'exter_qual' value next to its row count.
# Finding: no unusual values in 'exter_qual'.
levels, freqs = np.unique(ames_df['exter_qual'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 81],
       ['Fa', 26],
       ['Gd', 697],
       ['TA', 1247]], dtype=object)

In [42]:
# Tabulate every distinct 'exter_cond' value next to its row count.
# Finding: no unusual values in 'exter_cond'.
levels, freqs = np.unique(ames_df['exter_cond'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 7],
       ['Fa', 49],
       ['Gd', 215],
       ['Po', 2],
       ['TA', 1778]], dtype=object)

In [43]:
# Tabulate every distinct 'foundation' value next to its row count.
# Finding: no unusual values in 'foundation'.
levels, freqs = np.unique(ames_df['foundation'], return_counts=True)
np.array((levels, freqs)).T

array([['BrkTil', 221],
       ['CBlock', 863],
       ['PConc', 926],
       ['Slab', 34],
       ['Stone', 5],
       ['Wood', 2]], dtype=object)

In [44]:
# Fill missing 'bsmt_qual' entries with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
ames_df['bsmt_qual'] = ames_df['bsmt_qual'].fillna('None')

In [45]:
# Tabulate every distinct 'bsmt_qual' value next to its row count.
# Finding: no unusual values in 'bsmt_qual'.
levels, freqs = np.unique(ames_df['bsmt_qual'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 184],
       ['Fa', 60],
       ['Gd', 864],
       ['None', 55],
       ['Po', 1],
       ['TA', 887]], dtype=object)

In [46]:
# Fill missing 'bsmt_cond' entries with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
ames_df['bsmt_cond'] = ames_df['bsmt_cond'].fillna('None')

In [47]:
# Tabulate every distinct 'bsmt_cond' value next to its row count.
# Finding: no unusual values in 'bsmt_cond'.
levels, freqs = np.unique(ames_df['bsmt_cond'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 3],
       ['Fa', 65],
       ['Gd', 89],
       ['None', 55],
       ['Po', 5],
       ['TA', 1834]], dtype=object)

In [48]:
# Fill missing 'bsmt_exposure' entries with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
# NOTE(review): the null-count summary lists 58 missing values here versus 55
# for 'bsmt_qual'/'bsmt_cond'; confirm that 'None' is the right fill for the
# extra rows.
ames_df['bsmt_exposure'] = ames_df['bsmt_exposure'].fillna('None')

In [49]:
# Tabulate every distinct 'bsmt_exposure' value next to its row count.
# Finding: no unusual values in 'bsmt_exposure'.
levels, freqs = np.unique(ames_df['bsmt_exposure'], return_counts=True)
np.array((levels, freqs)).T

array([['Av', 288],
       ['Gd', 203],
       ['Mn', 163],
       ['No', 1339],
       ['None', 58]], dtype=object)

In [50]:
# Fill missing 'bsmtfin_type_1' entries with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
ames_df['bsmtfin_type_1'] = ames_df['bsmtfin_type_1'].fillna('None')

In [51]:
# Tabulate every distinct 'bsmtfin_type_1' value next to its row count.
# Finding: no unusual values in 'bsmtfin_type_1'.
levels, freqs = np.unique(ames_df['bsmtfin_type_1'], return_counts=True)
np.array((levels, freqs)).T

array([['ALQ', 293],
       ['BLQ', 200],
       ['GLQ', 615],
       ['LwQ', 102],
       ['None', 55],
       ['Rec', 183],
       ['Unf', 603]], dtype=object)

In [52]:
# Missing 'bsmtfin_sf_1' entries are set to 0
ames_df['bsmtfin_sf_1'] = ames_df['bsmtfin_sf_1'].fillna(0)

In [53]:
# Tabulate every distinct 'bsmtfin_sf_1' value next to its row count.
# Finding: no unusual values in 'bsmtfin_sf_1'.
levels, freqs = np.unique(ames_df['bsmtfin_sf_1'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 6.580e+02],
       [2.000e+00, 1.000e+00],
       [1.600e+01, 1.000e+01],
       ...,
       [2.188e+03, 1.000e+00],
       [4.010e+03, 1.000e+00],
       [5.644e+03, 1.000e+00]])

In [54]:
# Fill missing 'bsmtfin_type_2' entries with the string 'None'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected Series: that form is chained
# in-place assignment, which raises a FutureWarning on pandas 2.x and does not
# update ames_df when Copy-on-Write is active.
# NOTE(review): the null-count summary lists 56 missing values here versus 55
# for 'bsmtfin_type_1'; confirm that 'None' is the right fill for the extra row.
ames_df['bsmtfin_type_2'] = ames_df['bsmtfin_type_2'].fillna('None')

In [55]:
# Tabulate every distinct 'bsmtfin_type_2' value next to its row count.
# Finding: no unusual values in 'bsmtfin_type_2'.
levels, freqs = np.unique(ames_df['bsmtfin_type_2'], return_counts=True)
np.array((levels, freqs)).T

array([['ALQ', 35],
       ['BLQ', 48],
       ['GLQ', 23],
       ['LwQ', 60],
       ['None', 56],
       ['Rec', 80],
       ['Unf', 1749]], dtype=object)

In [56]:
# Missing 'bsmtfin_sf_2' entries are set to 0
ames_df['bsmtfin_sf_2'] = ames_df['bsmtfin_sf_2'].fillna(0)

In [57]:
# Tabulate every distinct 'bsmtfin_sf_2' value next to its row count.
# Finding: no unusual values in 'bsmtfin_sf_2'.
levels, freqs = np.unique(ames_df['bsmtfin_sf_2'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 1.804e+03],
       [6.000e+00, 1.000e+00],
       [1.200e+01, 1.000e+00],
       [2.800e+01, 1.000e+00],
       [3.500e+01, 1.000e+00],
       [3.800e+01, 1.000e+00],
       [4.000e+01, 1.000e+00],
       [4.100e+01, 2.000e+00],
       [4.200e+01, 1.000e+00],
       [5.200e+01, 1.000e+00],
       [6.000e+01, 2.000e+00],
       [6.400e+01, 2.000e+00],
       [6.600e+01, 1.000e+00],
       [6.800e+01, 2.000e+00],
       [7.200e+01, 3.000e+00],
       [7.600e+01, 1.000e+00],
       [8.000e+01, 2.000e+00],
       [8.100e+01, 1.000e+00],
       [9.200e+01, 1.000e+00],
       [9.300e+01, 1.000e+00],
       [9.500e+01, 1.000e+00],
       [9.600e+01, 2.000e+00],
       [1.020e+02, 1.000e+00],
       [1.050e+02, 2.000e+00],
       [1.060e+02, 1.000e+00],
       [1.080e+02, 2.000e+00],
       [1.100e+02, 2.000e+00],
       [1.130e+02, 1.000e+00],
       [1.160e+02, 2.000e+00],
       [1.170e+02, 2.000e+00],
       [1.190e+02, 1.000e+00],
       [1.200e+02, 1.000e+00],
       [

In [58]:
# Missing 'bsmt_unf_sf' entries are set to 0
ames_df['bsmt_unf_sf'] = ames_df['bsmt_unf_sf'].fillna(0)

In [59]:
# Tabulate every distinct 'bsmt_unf_sf' value next to its row count.
# Finding: no unusual values in 'bsmt_unf_sf'.
levels, freqs = np.unique(ames_df['bsmt_unf_sf'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 1.660e+02],
       [1.500e+01, 1.000e+00],
       [1.700e+01, 1.000e+00],
       ...,
       [2.140e+03, 1.000e+00],
       [2.153e+03, 1.000e+00],
       [2.336e+03, 1.000e+00]])

In [60]:
# Missing 'total_bsmt_sf' entries are set to 0
ames_df['total_bsmt_sf'] = ames_df['total_bsmt_sf'].fillna(0)

In [61]:
# Tabulate every distinct 'total_bsmt_sf' value next to its row count.
# Finding: no unusual values in 'total_bsmt_sf'.
levels, freqs = np.unique(ames_df['total_bsmt_sf'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 5.500e+01],
       [1.600e+02, 1.000e+00],
       [1.730e+02, 1.000e+00],
       ...,
       [3.206e+03, 1.000e+00],
       [5.095e+03, 1.000e+00],
       [6.110e+03, 1.000e+00]])

In [62]:
# Tabulate every distinct 'heating' value next to its row count
levels, freqs = np.unique(ames_df['heating'], return_counts=True)
np.array((levels, freqs)).T

array([['GasA', 2018],
       ['GasW', 20],
       ['Grav', 5],
       ['OthW', 2],
       ['Wall', 6]], dtype=object)

In [63]:
# Tabulate every distinct 'heating_qc' value next to its row count
levels, freqs = np.unique(ames_df['heating_qc'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 1065],
       ['Fa', 67],
       ['Gd', 319],
       ['Po', 3],
       ['TA', 597]], dtype=object)

In [64]:
# Tabulate every distinct 'central_air' value next to its row count
levels, freqs = np.unique(ames_df['central_air'], return_counts=True)
np.array((levels, freqs)).T

array([['N', 141],
       ['Y', 1910]], dtype=object)

In [65]:
# Tabulate every distinct 'electrical' value next to its row count
levels, freqs = np.unique(ames_df['electrical'], return_counts=True)
np.array((levels, freqs)).T

array([['FuseA', 140],
       ['FuseF', 35],
       ['FuseP', 7],
       ['Mix', 1],
       ['SBrkr', 1868]], dtype=object)

In [66]:
# Tabulate every distinct '1st_flr_sf' value next to its row count
levels, freqs = np.unique(ames_df['1st_flr_sf'], return_counts=True)
np.array((levels, freqs)).T

array([[ 334,    1],
       [ 372,    1],
       [ 438,    1],
       ...,
       [3820,    1],
       [4692,    1],
       [5095,    1]], dtype=int64)

In [67]:
# Tabulate every distinct '2nd_flr_sf' value next to its row count
levels, freqs = np.unique(ames_df['2nd_flr_sf'], return_counts=True)
np.array((levels, freqs)).T

array([[   0, 1191],
       [ 125,    1],
       [ 144,    1],
       ...,
       [1818,    1],
       [1836,    1],
       [1862,    1]], dtype=int64)

In [68]:
# Tabulate every distinct 'low_qual_fin_sf' value next to its row count
levels, freqs = np.unique(ames_df['low_qual_fin_sf'], return_counts=True)
np.array((levels, freqs)).T

array([[   0, 2018],
       [  53,    1],
       [  80,    3],
       [ 108,    1],
       [ 114,    1],
       [ 120,    1],
       [ 140,    1],
       [ 144,    1],
       [ 156,    1],
       [ 205,    2],
       [ 234,    1],
       [ 259,    1],
       [ 312,    1],
       [ 360,    1],
       [ 362,    1],
       [ 371,    1],
       [ 384,    1],
       [ 390,    1],
       [ 397,    1],
       [ 436,    1],
       [ 450,    1],
       [ 473,    1],
       [ 479,    1],
       [ 512,    1],
       [ 513,    1],
       [ 514,    1],
       [ 515,    1],
       [ 528,    1],
       [ 572,    1],
       [ 697,    1],
       [1064,    1]], dtype=int64)

In [69]:
# Tabulate every distinct 'gr_liv_area' value next to its row count
levels, freqs = np.unique(ames_df['gr_liv_area'], return_counts=True)
np.array((levels, freqs)).T

array([[ 334,    1],
       [ 438,    1],
       [ 480,    1],
       ...,
       [3820,    1],
       [5095,    1],
       [5642,    1]], dtype=int64)

In [70]:
# Missing 'bsmt_full_bath' entries are set to 0
ames_df['bsmt_full_bath'] = ames_df['bsmt_full_bath'].fillna(0)

In [71]:
# Tabulate every distinct 'bsmt_full_bath' value next to its row count
levels, freqs = np.unique(ames_df['bsmt_full_bath'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 1.202e+03],
       [1.000e+00, 8.240e+02],
       [2.000e+00, 2.300e+01],
       [3.000e+00, 2.000e+00]])

In [72]:
# Missing 'bsmt_half_bath' entries are set to 0
ames_df['bsmt_half_bath'] = ames_df['bsmt_half_bath'].fillna(0)

In [73]:
# Tabulate every distinct 'bsmt_half_bath' value next to its row count
levels, freqs = np.unique(ames_df['bsmt_half_bath'], return_counts=True)
np.array((levels, freqs)).T

array([[0.000e+00, 1.925e+03],
       [1.000e+00, 1.220e+02],
       [2.000e+00, 4.000e+00]])

In [74]:
# Tabulate every distinct 'full_bath' value next to its row count
levels, freqs = np.unique(ames_df['full_bath'], return_counts=True)
np.array((levels, freqs)).T

array([[   0,    8],
       [   1,  900],
       [   2, 1096],
       [   3,   45],
       [   4,    2]], dtype=int64)

In [75]:
# Tabulate every distinct 'half_bath' value next to its row count
levels, freqs = np.unique(ames_df['half_bath'], return_counts=True)
np.array((levels, freqs)).T

array([[   0, 1308],
       [   1,  725],
       [   2,   18]], dtype=int64)

In [76]:
# Tabulate every distinct 'bedroom_abvgr' value next to its row count
levels, freqs = np.unique(ames_df['bedroom_abvgr'], return_counts=True)
np.array((levels, freqs)).T

array([[   0,    5],
       [   1,   75],
       [   2,  544],
       [   3, 1108],
       [   4,  265],
       [   5,   41],
       [   6,   12],
       [   8,    1]], dtype=int64)

In [77]:
# Tabulate every distinct 'kitchen_abvgr' value next to its row count
levels, freqs = np.unique(ames_df['kitchen_abvgr'], return_counts=True)
np.array((levels, freqs)).T

array([[   0,    2],
       [   1, 1960],
       [   2,   88],
       [   3,    1]], dtype=int64)

In [78]:
# Tabulate every distinct 'kitchen_qual' value next to its row count
levels, freqs = np.unique(ames_df['kitchen_qual'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 151],
       ['Fa', 47],
       ['Gd', 806],
       ['TA', 1047]], dtype=object)

In [79]:
# Tabulate every distinct 'totrms_abvgrd' value next to its row count
levels, freqs = np.unique(ames_df['totrms_abvgrd'], return_counts=True)
np.array((levels, freqs)).T

array([[  2,   1],
       [  3,  12],
       [  4, 146],
       [  5, 407],
       [  6, 597],
       [  7, 475],
       [  8, 228],
       [  9,  98],
       [ 10,  49],
       [ 11,  22],
       [ 12,  13],
       [ 13,   1],
       [ 14,   1],
       [ 15,   1]], dtype=int64)

In [80]:
# Tabulate every distinct 'functional' value next to its row count
levels, freqs = np.unique(ames_df['functional'], return_counts=True)
np.array((levels, freqs)).T

array([['Maj1', 12],
       ['Maj2', 7],
       ['Min1', 42],
       ['Min2', 42],
       ['Mod', 29],
       ['Sal', 2],
       ['Sev', 2],
       ['Typ', 1915]], dtype=object)

In [81]:
# Tabulate every distinct 'fireplaces' value next to its row count
levels, freqs = np.unique(ames_df['fireplaces'], return_counts=True)
np.array((levels, freqs)).T

array([[   0, 1000],
       [   1,  898],
       [   2,  146],
       [   3,    6],
       [   4,    1]], dtype=int64)

In [82]:
# Missing 'fireplace_qu' entries are recorded as the string 'None'
ames_df['fireplace_qu'] = ames_df['fireplace_qu'].fillna('None')

In [83]:
# Tabulate every distinct 'fireplace_qu' value next to its row count
levels, freqs = np.unique(ames_df['fireplace_qu'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 31],
       ['Fa', 59],
       ['Gd', 523],
       ['None', 1000],
       ['Po', 31],
       ['TA', 407]], dtype=object)

In [84]:
# Missing 'garage_type' entries are recorded as the string 'None'
ames_df['garage_type'] = ames_df['garage_type'].fillna('None')

In [85]:
# Tabulate every distinct 'garage_type' value next to its row count
levels, freqs = np.unique(ames_df['garage_type'], return_counts=True)
np.array((levels, freqs)).T

array([['2Types', 19],
       ['Attchd', 1213],
       ['Basment', 27],
       ['BuiltIn', 132],
       ['CarPort', 11],
       ['Detchd', 536],
       ['None', 113]], dtype=object)

In [86]:
# Missing 'garage_yr_blt' entries are set to 0.
# NOTE(review): 0 then sits alongside real years in a numeric column; confirm
# this sentinel is handled downstream.
ames_df['garage_yr_blt'] = ames_df['garage_yr_blt'].fillna(0)

In [87]:
# Cast 'garage_yr_blt' to the integer dtype; every other column is unchanged
ames_df = ames_df.astype({'garage_yr_blt': int})

In [88]:
# Confirm the dtype of 'garage_yr_blt' after the cast
ames_df['garage_yr_blt'].dtype

dtype('int32')

In [89]:
# Missing 'garage_finish' entries are recorded as the string 'None'
ames_df['garage_finish'] = ames_df['garage_finish'].fillna('None')

In [90]:
# Tabulate every distinct 'garage_finish' value next to its row count
levels, freqs = np.unique(ames_df['garage_finish'], return_counts=True)
np.array((levels, freqs)).T

array([['Fin', 509],
       ['None', 114],
       ['RFn', 579],
       ['Unf', 849]], dtype=object)

In [91]:
# Missing 'garage_cars' entries are set to 0
ames_df['garage_cars'] = ames_df['garage_cars'].fillna(0)

In [92]:
# Cast 'garage_cars' to the integer dtype; every other column is unchanged
ames_df = ames_df.astype({'garage_cars': int})

In [93]:
# Tabulate every distinct 'garage_cars' value next to its row count
levels, freqs = np.unique(ames_df['garage_cars'], return_counts=True)
np.array((levels, freqs)).T

array([[   0,  114],
       [   1,  524],
       [   2, 1136],
       [   3,  263],
       [   4,   13],
       [   5,    1]], dtype=int64)

In [94]:
# Confirm the dtype of 'garage_cars' after the cast
ames_df['garage_cars'].dtype

dtype('int32')

In [95]:
# Missing 'garage_area' entries are set to 0
ames_df['garage_area'] = ames_df['garage_area'].fillna(0)

In [96]:
# Cast 'garage_area' to the integer dtype; every other column is unchanged
ames_df = ames_df.astype({'garage_area': int})

In [97]:
# Tabulate every distinct 'garage_area' value next to its row count
levels, freqs = np.unique(ames_df['garage_area'], return_counts=True)
np.array((levels, freqs)).T

array([[   0,  114],
       [ 100,    1],
       [ 160,    2],
       ...,
       [1348,    1],
       [1356,    1],
       [1418,    1]], dtype=int64)

In [98]:
# Confirm the dtype of 'garage_area' after the cast
ames_df['garage_area'].dtype

dtype('int32')

In [99]:
# Missing 'garage_qual' entries are recorded as the string 'None'
ames_df['garage_qual'] = ames_df['garage_qual'].fillna('None')

In [100]:
# Tabulate every distinct 'garage_qual' value next to its row count
levels, freqs = np.unique(ames_df['garage_qual'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 3],
       ['Fa', 82],
       ['Gd', 18],
       ['None', 114],
       ['Po', 2],
       ['TA', 1832]], dtype=object)

In [101]:
# Missing 'garage_cond' entries are recorded as the string 'None'
ames_df['garage_cond'] = ames_df['garage_cond'].fillna('None')

In [102]:
# Tabulate every distinct 'garage_cond' value next to its row count
levels, freqs = np.unique(ames_df['garage_cond'], return_counts=True)
np.array((levels, freqs)).T

array([['Ex', 2],
       ['Fa', 47],
       ['Gd', 12],
       ['None', 114],
       ['Po', 8],
       ['TA', 1868]], dtype=object)

In [103]:
# Tabulate every distinct 'paved_drive' value next to its row count
levels, freqs = np.unique(ames_df['paved_drive'], return_counts=True)
np.array((levels, freqs)).T

array([['N', 151],
       ['P', 39],
       ['Y', 1861]], dtype=object)

In [104]:
#tabulate each distinct 'wood_deck_sf' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['wood_deck_sf'], return_counts=True))

array([[   0, 1075],
       [   4,    1],
       [  12,    2],
       [  14,    1],
       [  16,    1],
       [  22,    1],
       [  24,    5],
       [  26,    1],
       [  28,    3],
       [  30,    2],
       [  32,    2],
       [  33,    1],
       [  35,    1],
       [  36,    5],
       [  38,    2],
       [  40,    2],
       [  42,    2],
       [  45,    1],
       [  48,    8],
       [  49,    2],
       [  50,    1],
       [  51,    1],
       [  52,    3],
       [  53,    1],
       [  54,    2],
       [  56,    1],
       [  58,    2],
       [  60,    2],
       [  63,    2],
       [  64,    6],
       [  66,    2],
       [  68,    2],
       [  70,    1],
       [  72,    1],
       [  73,    1],
       [  74,    2],
       [  75,    1],
       [  78,    1],
       [  80,    5],
       [  81,    4],
       [  84,    4],
       [  85,    2],
       [  86,    1],
       [  87,    1],
       [  88,    5],
       [  90,    1],
       [  92,    3],
       [  94,

In [105]:
#tabulate each distinct 'open_porch_sf' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['open_porch_sf'], return_counts=True))

array([[  0, 912],
       [  4,   1],
       [  8,   1],
       [ 10,   2],
       [ 11,   3],
       [ 12,   3],
       [ 15,   1],
       [ 16,  11],
       [ 17,   2],
       [ 18,   5],
       [ 20,  24],
       [ 21,   8],
       [ 22,   6],
       [ 23,   3],
       [ 24,  23],
       [ 25,   9],
       [ 26,  11],
       [ 27,   7],
       [ 28,  18],
       [ 29,   6],
       [ 30,  23],
       [ 31,   1],
       [ 32,  29],
       [ 33,   9],
       [ 34,   8],
       [ 35,  15],
       [ 36,  38],
       [ 37,   1],
       [ 38,  11],
       [ 39,  15],
       [ 40,  34],
       [ 41,   3],
       [ 42,   8],
       [ 43,   2],
       [ 44,  15],
       [ 45,  20],
       [ 46,   9],
       [ 47,   3],
       [ 48,  40],
       [ 49,   7],
       [ 50,  20],
       [ 51,   6],
       [ 52,  11],
       [ 53,   9],
       [ 54,  13],
       [ 55,  10],
       [ 56,  11],
       [ 57,   5],
       [ 58,   5],
       [ 59,   5],
       [ 60,  23],
       [ 61,   3],
       [ 62,

In [106]:
#tabulate each distinct 'enclosed_porch' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['enclosed_porch'], return_counts=True))

array([[   0, 1724],
       [  16,    1],
       [  18,    1],
       [  19,    1],
       [  20,    1],
       [  23,    1],
       [  24,    2],
       [  25,    1],
       [  26,    1],
       [  30,    3],
       [  32,    2],
       [  34,    3],
       [  35,    2],
       [  36,    5],
       [  37,    1],
       [  39,    2],
       [  40,    6],
       [  42,    2],
       [  43,    1],
       [  44,    1],
       [  45,    2],
       [  48,    2],
       [  50,    1],
       [  52,    2],
       [  54,    1],
       [  55,    1],
       [  56,    4],
       [  57,    1],
       [  60,    4],
       [  64,    3],
       [  66,    1],
       [  67,    1],
       [  68,    2],
       [  70,    4],
       [  72,    1],
       [  75,    1],
       [  77,    4],
       [  78,    2],
       [  80,    2],
       [  81,    2],
       [  84,    6],
       [  87,    1],
       [  88,    1],
       [  90,    3],
       [  92,    1],
       [  94,    1],
       [  96,   10],
       [  98,

In [107]:
#tabulate each distinct '3ssn_porch' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['3ssn_porch'], return_counts=True))

array([[   0, 2025],
       [  86,    1],
       [  96,    1],
       [ 120,    1],
       [ 140,    1],
       [ 144,    2],
       [ 150,    1],
       [ 153,    3],
       [ 162,    1],
       [ 168,    3],
       [ 176,    1],
       [ 180,    1],
       [ 182,    1],
       [ 216,    1],
       [ 224,    1],
       [ 245,    1],
       [ 255,    1],
       [ 290,    1],
       [ 304,    1],
       [ 323,    1],
       [ 407,    1],
       [ 508,    1]], dtype=int64)

In [108]:
#tabulate each distinct 'screen_porch' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['screen_porch'], return_counts=True))

array([[   0, 1870],
       [  53,    1],
       [  64,    1],
       [  84,    1],
       [  88,    1],
       [  90,    2],
       [  92,    1],
       [  94,    1],
       [  95,    2],
       [ 100,    4],
       [ 104,    1],
       [ 108,    1],
       [ 109,    1],
       [ 110,    1],
       [ 111,    1],
       [ 112,    2],
       [ 113,    1],
       [ 115,    1],
       [ 116,    1],
       [ 120,    6],
       [ 122,    1],
       [ 126,    3],
       [ 130,    1],
       [ 135,    1],
       [ 138,    1],
       [ 140,    2],
       [ 141,    1],
       [ 142,    3],
       [ 143,    1],
       [ 144,    9],
       [ 145,    2],
       [ 147,    3],
       [ 148,    1],
       [ 150,    1],
       [ 152,    1],
       [ 153,    2],
       [ 154,    1],
       [ 155,    3],
       [ 156,    2],
       [ 160,    2],
       [ 161,    3],
       [ 162,    1],
       [ 163,    1],
       [ 164,    1],
       [ 165,    2],
       [ 168,    8],
       [ 170,    2],
       [ 171,

In [109]:
#tabulate each distinct 'pool_area' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['pool_area'], return_counts=True))

array([[   0, 2042],
       [ 228,    1],
       [ 368,    1],
       [ 480,    1],
       [ 519,    1],
       [ 561,    1],
       [ 576,    1],
       [ 648,    1],
       [ 738,    1],
       [ 800,    1]], dtype=int64)

In [110]:
#fill the missing 'pool_qc' entries with the string 'None'
ames_df['pool_qc'] = ames_df['pool_qc'].fillna('None')

In [111]:
#list every 'pool_qc' category beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['pool_qc'], return_counts=True))

array([['Ex', 1],
       ['Fa', 2],
       ['Gd', 4],
       ['None', 2042],
       ['TA', 2]], dtype=object)

In [112]:
#fill the missing 'fence' entries with the string 'None'
ames_df['fence'] = ames_df['fence'].fillna('None')

In [113]:
#list every 'fence' category beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['fence'], return_counts=True))

array([['GdPrv', 83],
       ['GdWo', 80],
       ['MnPrv', 227],
       ['MnWw', 10],
       ['None', 1651]], dtype=object)

In [114]:
#fill the missing 'misc_feature' entries with the string 'None'
ames_df['misc_feature'] = ames_df['misc_feature'].fillna('None')

In [115]:
#list every 'misc_feature' category beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['misc_feature'], return_counts=True))

array([['Elev', 1],
       ['Gar2', 4],
       ['None', 1986],
       ['Othr', 3],
       ['Shed', 56],
       ['TenC', 1]], dtype=object)

In [116]:
#tabulate each distinct 'misc_val' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['misc_val'], return_counts=True))

array([[    0,  1986],
       [   54,     1],
       [   80,     1],
       [  300,     1],
       [  400,    12],
       [  450,     5],
       [  455,     1],
       [  460,     1],
       [  480,     1],
       [  500,     8],
       [  600,     6],
       [  650,     1],
       [  700,     4],
       [  800,     1],
       [  900,     1],
       [ 1150,     1],
       [ 1200,     2],
       [ 1300,     1],
       [ 1500,     1],
       [ 2000,     5],
       [ 2500,     2],
       [ 3000,     2],
       [ 3500,     1],
       [ 4500,     2],
       [ 6500,     1],
       [ 8300,     1],
       [12500,     1],
       [17000,     1]], dtype=int64)

In [117]:
#tabulate each distinct 'mo_sold' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['mo_sold'], return_counts=True))

array([[  1,  79],
       [  2, 104],
       [  3, 168],
       [  4, 208],
       [  5, 257],
       [  6, 352],
       [  7, 303],
       [  8, 167],
       [  9, 109],
       [ 10, 123],
       [ 11, 103],
       [ 12,  78]], dtype=int64)

In [118]:
#tabulate each distinct 'yr_sold' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['yr_sold'], return_counts=True))

array([[2006,  438],
       [2007,  498],
       [2008,  435],
       [2009,  446],
       [2010,  234]], dtype=int64)

In [119]:
#list every 'sale_type' category beside its frequency to spot oddities
#NOTE(review): the recorded output shows 'WD ' with a trailing space - presumably padding in the raw data; confirm whether it should be stripped (consistently with any other dataset encoded alongside this one)
np.column_stack(np.unique(ames_df['sale_type'], return_counts=True))

array([['COD', 63],
       ['CWD', 10],
       ['Con', 4],
       ['ConLD', 17],
       ['ConLI', 7],
       ['ConLw', 5],
       ['New', 160],
       ['Oth', 4],
       ['WD ', 1781]], dtype=object)

In [120]:
#tabulate each distinct 'saleprice' value beside its frequency to spot oddities
np.column_stack(np.unique(ames_df['saleprice'], return_counts=True))

array([[ 12789,      1],
       [ 13100,      1],
       [ 34900,      1],
       ...,
       [584500,      1],
       [591587,      1],
       [611657,      1]], dtype=int64)

In [121]:
#check for null values: info() prints the non-null count and dtype of every column,
#so any column whose count is below the number of entries still has missing data
ames_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2051 non-null   int64  
 1   pid              2051 non-null   int64  
 2   ms_subclass      2051 non-null   int64  
 3   ms_zoning        2051 non-null   object 
 4   lot_frontage     2051 non-null   int32  
 5   lot_area         2051 non-null   int64  
 6   street           2051 non-null   object 
 7   alley            2051 non-null   object 
 8   lot_shape        2051 non-null   object 
 9   land_contour     2051 non-null   object 
 10  utilities        2051 non-null   object 
 11  lot_config       2051 non-null   object 
 12  land_slope       2051 non-null   object 
 13  neighborhood     2051 non-null   object 
 14  condition_1      2051 non-null   object 
 15  condition_2      2051 non-null   object 
 16  bldg_type        2051 non-null   object 
 17  house_style   

In [122]:
#write the cleaned training frame to disk, leaving the row index out of the file
ames_df.to_csv(path_or_buf='../data/ames_df_train.csv', index=False)