# Test Data Cleaning Part 1

In [1]:
#import packages
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline


In [2]:
#load dataset
ames_df = pd.read_csv('../data/test.csv')
ames_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [3]:
#convert column names to lower case
ames_df.columns= ames_df.columns.str.lower()

In [4]:
#replace spaces in column names with _
ames_df.columns = ames_df.columns.str.replace(' ', '_')
ames_df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [5]:
#look at the info
ames_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     718 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            58 non-null     object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

In [6]:
#display number of null values for each variable
pd.options.display.min_rows = 60
ames_df.isnull().sum().sort_values(ascending=False)

pool_qc           874
misc_feature      837
alley             820
fence             706
fireplace_qu      422
lot_frontage      160
garage_yr_blt      45
garage_finish      45
garage_qual        45
garage_cond        45
garage_type        44
bsmtfin_type_1     25
bsmt_qual          25
bsmt_cond          25
bsmt_exposure      25
bsmtfin_type_2     25
electrical          1
mas_vnr_type        1
mas_vnr_area        1
kitchen_abvgr       0
totrms_abvgrd       0
bedroom_abvgr       0
half_bath           0
full_bath           0
bsmt_half_bath      0
bsmt_full_bath      0
gr_liv_area         0
kitchen_qual        0
id                  0
functional          0
                 ... 
condition_2         0
condition_1         0
neighborhood        0
land_slope          0
utilities           0
central_air         0
land_contour        0
lot_shape           0
street              0
lot_area            0
ms_zoning           0
ms_subclass         0
overall_qual        0
overall_cond        0
year_built

In [7]:
#view unique values of 'ms_subclass'
np.array(np.unique(ames_df['ms_subclass'], return_counts=True)).T

#'ms_subclass' does not show any unusual values 

array([[ 20, 309],
       [ 30,  38],
       [ 40,   2],
       [ 45,   7],
       [ 50,  89],
       [ 60, 180],
       [ 70,  38],
       [ 75,   7],
       [ 80,  32],
       [ 85,  20],
       [ 90,  34],
       [120,  60],
       [160,  41],
       [180,   6],
       [190,  15]], dtype=int64)

In [8]:
#view unique values of 'ms_zoning'
np.array(np.unique(ames_df['ms_zoning'], return_counts=True)).T

#ms_zoning does not show any unusual values

array([['C (all)', 6],
       ['FV', 38],
       ['I (all)', 1],
       ['RH', 13],
       ['RL', 674],
       ['RM', 146]], dtype=object)

In [9]:
#convert nan values to 0 and convert 'lot_frontage' to int type
ames_df['lot_frontage'] = ames_df['lot_frontage'].fillna(0).astype(int)

Since there are 330 null values in 'lot frontage', the cells are filled with the 0 value.

In [10]:
ames_df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,0,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,0,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [11]:
#checking for unusual values in 'lot_area'
np.array(np.unique(ames_df['lot_area'], return_counts=True)).T

#'lot_area' does not contain any unusual values

array([[  1477,      1],
       [  1488,      1],
       [  1491,      1],
       ...,
       [ 56600,      1],
       [164660,      1],
       [215245,      1]], dtype=int64)

In [12]:
#checking 'street' for unusual values
np.array(np.unique(ames_df['street'], return_counts=True)).T

#'street' does not contain any unusual values

array([['Grvl', 5],
       ['Pave', 873]], dtype=object)

In [13]:
ames_df.head(5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,0,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,0,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [14]:
#replace NaN with None for 'alley'
ames_df['alley'] = ames_df['alley'].replace(np.nan,'None')

In [15]:
ames_df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,0,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,0,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [16]:
#check for replacement of NaN in 'alley'
np.array(np.unique(ames_df['alley'], return_counts=True)).T

array([['Grvl', 35],
       ['None', 820],
       ['Pave', 23]], dtype=object)

In [17]:
#check for unusual values in 'lot_shape'
np.array(np.unique(ames_df['lot_shape'], return_counts=True)).T

#no unusual values in 'lot_shape'

array([['IR1', 286],
       ['IR2', 21],
       ['IR3', 7],
       ['Reg', 564]], dtype=object)

In [18]:
#check for unusual values in 'land_contour'
np.array(np.unique(ames_df['land_contour'], return_counts=True)).T

#no unusual values in 'land contour'

array([['Bnk', 36],
       ['HLS', 35],
       ['Low', 17],
       ['Lvl', 790]], dtype=object)

In [19]:
#check for unusual values in 'utilities'
np.array(np.unique(ames_df['utilities'], return_counts=True)).T

#no unusual values in 'utilities'

array([['AllPub', 877],
       ['NoSewr', 1]], dtype=object)

In [20]:
#check for unusual values in 'lot_config'
np.array(np.unique(ames_df['lot_config'], return_counts=True)).T

#no unusual values in 'lot_config'

array([['Corner', 163],
       ['CulDSac', 49],
       ['FR2', 25],
       ['FR3', 5],
       ['Inside', 636]], dtype=object)

In [21]:
#check for unusual values in 'land slope'
np.array(np.unique(ames_df['land_slope'], return_counts=True)).T

#no unusual values in 'land slope'

array([['Gtl', 835],
       ['Mod', 37],
       ['Sev', 6]], dtype=object)

In [22]:
#check for unusual values in 'neighborhood'
np.array(np.unique(ames_df['neighborhood'], return_counts=True)).T

#no unusual values in 'neighborhood'

array([['Blmngtn', 6],
       ['Blueste', 4],
       ['BrDale', 11],
       ['BrkSide', 32],
       ['ClearCr', 17],
       ['CollgCr', 87],
       ['Crawfor', 32],
       ['Edwards', 50],
       ['Gilbert', 49],
       ['Greens', 5],
       ['IDOTRR', 24],
       ['MeadowV', 13],
       ['Mitchel', 32],
       ['NAmes', 133],
       ['NPkVill', 6],
       ['NWAmes', 44],
       ['NoRidge', 23],
       ['NridgHt', 44],
       ['OldTown', 76],
       ['SWISU', 16],
       ['Sawyer', 40],
       ['SawyerW', 38],
       ['Somerst', 52],
       ['StoneBr', 13],
       ['Timber', 24],
       ['Veenker', 7]], dtype=object)

In [23]:
#check for unusual values in 'condition_1'
np.array(np.unique(ames_df['condition_1'], return_counts=True)).T

#no unusual values in 'condition_1'

array([['Artery', 22],
       ['Feedr', 55],
       ['Norm', 755],
       ['PosA', 8],
       ['PosN', 11],
       ['RRAe', 7],
       ['RRAn', 14],
       ['RRNe', 3],
       ['RRNn', 3]], dtype=object)

In [24]:
#check for unusual values in 'condition_2'
np.array(np.unique(ames_df['condition_2'], return_counts=True)).T

#no unusual values in 'condition_2'

array([['Feedr', 2],
       ['Norm', 875],
       ['PosA', 1]], dtype=object)

In [25]:
#check for unusual values in 'bldg_type'
np.array(np.unique(ames_df['bldg_type'], return_counts=True)).T

#Twnhs should be replaced with TwnhsI in 'bldg_type'

array([['1Fam', 724],
       ['2fmCon', 16],
       ['Duplex', 34],
       ['Twnhs', 32],
       ['TwnhsE', 72]], dtype=object)

In [26]:
#replace 'Twnhs' with 'TwnhsI'
ames_df['bldg_type'] = ames_df['bldg_type'].replace('Twnhs','TwnhsI')

In [27]:
#check for unusual values in 'bldg_type'

np.array(np.unique(ames_df['bldg_type'], return_counts=True)).T

#no unusual values in 'bldg_type'

array([['1Fam', 724],
       ['2fmCon', 16],
       ['Duplex', 34],
       ['TwnhsE', 72],
       ['TwnhsI', 32]], dtype=object)

In [28]:
#check for unusual values in 'house_style'
np.array(np.unique(ames_df['house_style'], return_counts=True)).T

#no unusual values in 'house_style'

array([['1.5Fin', 96],
       ['1.5Unf', 7],
       ['1Story', 422],
       ['2.5Fin', 2],
       ['2.5Unf', 10],
       ['2Story', 274],
       ['SFoyer', 33],
       ['SLvl', 34]], dtype=object)

In [29]:
#check for unusual values in 'overall_qual'
np.array(np.unique(ames_df['overall_qual'], return_counts=True)).T

#no unusual values in 'overall_qual'

array([[  2,   4],
       [  3,  11],
       [  4,  67],
       [  5, 262],
       [  6, 226],
       [  7, 171],
       [  8, 100],
       [  9,  30],
       [ 10,   7]], dtype=int64)

In [30]:
#check for unusual values in 'overall_cond'
np.array(np.unique(ames_df['overall_cond'], return_counts=True)).T

#no unusual values in 'overall_cond'

array([[  1,   3],
       [  2,   4],
       [  3,  15],
       [  4,  31],
       [  5, 485],
       [  6, 165],
       [  7, 120],
       [  8,  43],
       [  9,  12]], dtype=int64)

In [31]:
#check for unusual values in 'year_built'
np.array(np.unique(ames_df['year_built'], return_counts=True)).T

#no unusual values in 'year_built'

array([[1880,    2],
       [1882,    1],
       [1885,    1],
       [1890,    2],
       [1892,    1],
       [1900,    9],
       [1902,    1],
       [1904,    1],
       [1905,    1],
       [1906,    1],
       [1907,    1],
       [1908,    1],
       [1910,   17],
       [1912,    2],
       [1914,    1],
       [1915,    7],
       [1916,    2],
       [1917,    1],
       [1918,    6],
       [1919,    3],
       [1920,   19],
       [1921,    2],
       [1922,    4],
       [1923,    7],
       [1924,    5],
       [1925,   12],
       [1926,    7],
       [1927,    1],
       [1928,    1],
       [1930,    8],
       [1931,    3],
       [1932,    2],
       [1934,    1],
       [1935,    4],
       [1936,    2],
       [1937,    2],
       [1938,    6],
       [1939,    5],
       [1940,    7],
       [1941,    5],
       [1945,    4],
       [1946,    6],
       [1947,    3],
       [1948,    7],
       [1949,    7],
       [1950,   10],
       [1951,    5],
       [1952,

In [32]:
#check for unusual values in 'year_remod/add'
np.array(np.unique(ames_df['year_remod/add'], return_counts=True)).T

#no unusual values in 'year_remod/add'

array([[1950,   99],
       [1951,    4],
       [1952,    6],
       [1953,    2],
       [1954,   11],
       [1955,    7],
       [1956,    8],
       [1957,    5],
       [1958,    6],
       [1959,    9],
       [1960,    9],
       [1961,    7],
       [1962,    6],
       [1963,   11],
       [1964,    9],
       [1965,   10],
       [1966,    8],
       [1967,   12],
       [1968,   11],
       [1969,   10],
       [1970,   13],
       [1971,   13],
       [1972,   14],
       [1973,    8],
       [1974,    9],
       [1975,    4],
       [1976,   16],
       [1977,   15],
       [1978,    8],
       [1979,    8],
       [1980,   10],
       [1981,    7],
       [1982,    6],
       [1983,    4],
       [1984,    7],
       [1985,    5],
       [1986,    4],
       [1987,    4],
       [1988,    3],
       [1989,    4],
       [1990,    6],
       [1991,   10],
       [1992,   13],
       [1993,   10],
       [1994,   13],
       [1995,   18],
       [1996,   17],
       [1997,

In [33]:
#check for unusual values in 'roof_style'
np.array(np.unique(ames_df['roof_style'], return_counts=True)).T

#no unusual values in 'roof_style'

array([['Flat', 7],
       ['Gable', 702],
       ['Gambrel', 10],
       ['Hip', 153],
       ['Mansard', 4],
       ['Shed', 2]], dtype=object)

In [34]:
#check for unusual values in 'roof_matl'
np.array(np.unique(ames_df['roof_matl'], return_counts=True)).T

#no unusual values in 'roof_matl'

array([['CompShg', 861],
       ['Metal', 1],
       ['Roll', 1],
       ['Tar&Grv', 8],
       ['WdShake', 5],
       ['WdShngl', 2]], dtype=object)

In [35]:
#check for unusual values in 'exterior_1st'
np.array(np.unique(ames_df['exterior_1st'], return_counts=True)).T

#no unusual values in 'exterior_1st'

array([['AsbShng', 11],
       ['AsphShn', 1],
       ['BrkComm', 3],
       ['BrkFace', 24],
       ['CemntBd', 35],
       ['HdBoard', 142],
       ['MetalSd', 119],
       ['Plywood', 69],
       ['PreCast', 1],
       ['Stucco', 16],
       ['VinylSd', 302],
       ['Wd Sdng', 144],
       ['WdShing', 11]], dtype=object)

In [36]:
#check for unusual values in 'exterior_2nd'
np.array(np.unique(ames_df['exterior_2nd'], return_counts=True)).T

#no unusual values in 'exterior_2nd'

array([['AsbShng', 10],
       ['AsphShn', 1],
       ['Brk Cmn', 5],
       ['BrkFace', 13],
       ['CBlock', 1],
       ['CmentBd', 35],
       ['HdBoard', 131],
       ['ImStucc', 4],
       ['MetalSd', 123],
       ['Other', 1],
       ['Plywood', 89],
       ['PreCast', 1],
       ['Stucco', 17],
       ['VinylSd', 294],
       ['Wd Sdng', 135],
       ['Wd Shng', 18]], dtype=object)

In [37]:
#fill empty cells in 'mas_vnr_type' column with 'None'
ames_df['mas_vnr_type'].fillna('None', inplace = True)

In [38]:
#check for unusual values in 'mas_vnr_type'
np.array(np.unique(ames_df['mas_vnr_type'], return_counts=True)).T

#no unusual values in 'mas_vnr_type'

array([['BrkCmn', 12],
       ['BrkFace', 250],
       ['CBlock', 1],
       ['None', 535],
       ['Stone', 80]], dtype=object)

In [39]:
#check for unusual values in 'mas_vnr_type'
np.array(np.unique(ames_df['mas_vnr_area'], return_counts=True)).T

#'mas_vnr_type' contains 22 null values.

array([[0.000e+00, 5.320e+02],
       [1.100e+01, 1.000e+00],
       [1.400e+01, 2.000e+00],
       [1.600e+01, 2.000e+00],
       [1.800e+01, 1.000e+00],
       [2.000e+01, 2.000e+00],
       [2.300e+01, 1.000e+00],
       [3.200e+01, 1.000e+00],
       [3.400e+01, 1.000e+00],
       [4.400e+01, 2.000e+00],
       [4.500e+01, 2.000e+00],
       [4.800e+01, 1.000e+00],
       [5.000e+01, 3.000e+00],
       [5.100e+01, 1.000e+00],
       [5.200e+01, 1.000e+00],
       [5.300e+01, 2.000e+00],
       [5.400e+01, 1.000e+00],
       [5.800e+01, 1.000e+00],
       [6.000e+01, 1.000e+00],
       [6.500e+01, 1.000e+00],
       [6.600e+01, 1.000e+00],
       [6.700e+01, 1.000e+00],
       [6.800e+01, 1.000e+00],
       [7.000e+01, 2.000e+00],
       [7.200e+01, 2.000e+00],
       [7.400e+01, 1.000e+00],
       [7.500e+01, 1.000e+00],
       [7.600e+01, 2.000e+00],
       [8.000e+01, 5.000e+00],
       [8.100e+01, 1.000e+00],
       [8.200e+01, 2.000e+00],
       [8.400e+01, 1.000e+00],
       [

In [40]:
#fill empty cells in 'mas_vnr_area' column with 0
ames_df['mas_vnr_area'].fillna(0, inplace = True)

In [41]:
#check for unusual values in 'exter_qual'
np.array(np.unique(ames_df['exter_qual'], return_counts=True)).T

#no unusual values in 'exter_qual'

array([['Ex', 25],
       ['Fa', 9],
       ['Gd', 292],
       ['TA', 552]], dtype=object)

In [42]:
#check for unusual values in 'exter_cond'
np.array(np.unique(ames_df['exter_cond'], return_counts=True)).T

#no unusual values in 'exter_cond'

array([['Ex', 5],
       ['Fa', 18],
       ['Gd', 84],
       ['Po', 1],
       ['TA', 770]], dtype=object)

In [43]:
#check for unusual values in 'foundation'
np.array(np.unique(ames_df['foundation'], return_counts=True)).T

#no unusual values in 'foundation'

array([['BrkTil', 90],
       ['CBlock', 381],
       ['PConc', 383],
       ['Slab', 15],
       ['Stone', 6],
       ['Wood', 3]], dtype=object)

In [44]:
#fill empty cells in 'bsmt_qual' column with 'None'
ames_df['bsmt_qual'].fillna('None', inplace = True)

In [45]:
#check for unusual values in 'bsmt_qual'
np.array(np.unique(ames_df['bsmt_qual'], return_counts=True)).T

#no unusual values in 'bsmt_qual'

array([['Ex', 73],
       ['Fa', 28],
       ['Gd', 355],
       ['None', 25],
       ['Po', 1],
       ['TA', 396]], dtype=object)

In [46]:
#fill empty cells in 'bsmt_cond' column with 'None'
ames_df['bsmt_cond'].fillna('None', inplace = True)

In [47]:
#check for unusual values in 'bsmt_cond'
np.array(np.unique(ames_df['bsmt_cond'], return_counts=True)).T

#no unusual values in 'bsmt_cond'

array([['Fa', 39],
       ['Gd', 33],
       ['None', 25],
       ['TA', 781]], dtype=object)

In [48]:
#fill empty cells in 'bsmt_exposure' column with 'None'
ames_df['bsmt_exposure'].fillna('None', inplace = True)

In [49]:
#check for unusual values in 'bsmt_exposure'
np.array(np.unique(ames_df['bsmt_exposure'], return_counts=True)).T

#no unusual values in 'bsmt_exposure'

array([['Av', 130],
       ['Gd', 80],
       ['Mn', 76],
       ['No', 567],
       ['None', 25]], dtype=object)

In [50]:
#fill empty cells in 'bsmtfin_type_1' column with 'None'
ames_df['bsmtfin_type_1'].fillna('None', inplace = True)

In [51]:
#check for unusual values in 'bsmtfin_type_1'
np.array(np.unique(ames_df['bsmtfin_type_1'], return_counts=True)).T

#no unusual values in 'bsmtfin_type_1'

array([['ALQ', 136],
       ['BLQ', 69],
       ['GLQ', 243],
       ['LwQ', 52],
       ['None', 25],
       ['Rec', 105],
       ['Unf', 248]], dtype=object)

In [52]:
#replace NaN with 0 in 'bsmtfin_sf_1'
ames_df['bsmtfin_sf_1'] = ames_df['bsmtfin_sf_1'].replace(np.nan,0)

In [53]:
#check for unusual values in 'bsmtfin_sf_1'
np.array(np.unique(ames_df['bsmtfin_sf_1'], return_counts=True)).T

#no unusual values in 'bsmtfin_sf_1'

array([[   0,  273],
       [  16,    4],
       [  20,    1],
       [  24,   10],
       [  28,    1],
       [  33,    1],
       [  36,    1],
       [  40,    1],
       [  48,    2],
       [  49,    1],
       [  53,    1],
       [  54,    1],
       [  60,    1],
       [  68,    2],
       [  70,    1],
       [  73,    1],
       [  75,    1],
       [  77,    1],
       [  80,    1],
       [ 104,    1],
       [ 110,    1],
       [ 114,    1],
       [ 116,    1],
       [ 119,    1],
       [ 120,    1],
       [ 121,    1],
       [ 122,    1],
       [ 125,    1],
       [ 144,    2],
       [ 148,    1],
       [ 150,    1],
       [ 152,    1],
       [ 154,    1],
       [ 156,    1],
       [ 165,    1],
       [ 168,    3],
       [ 169,    1],
       [ 173,    1],
       [ 175,    1],
       [ 176,    1],
       [ 180,    2],
       [ 182,    1],
       [ 185,    1],
       [ 187,    1],
       [ 188,    2],
       [ 190,    1],
       [ 192,    3],
       [ 194,

In [54]:
#fill empty cells in 'bsmtfin_type_2' column with 'None'
ames_df['bsmtfin_type_2'].fillna('None', inplace = True)

In [55]:
#check for unusual values in 'bsmtfin_type_2'
np.array(np.unique(ames_df['bsmtfin_type_2'], return_counts=True)).T

#no unusual values in 'bsmtfin_type_2'

array([['ALQ', 18],
       ['BLQ', 20],
       ['GLQ', 11],
       ['LwQ', 29],
       ['None', 25],
       ['Rec', 26],
       ['Unf', 749]], dtype=object)

In [56]:
#replace NaN with 0 in 'bsmtfin_sf_2'
ames_df['bsmtfin_sf_2'] = ames_df['bsmtfin_sf_2'].replace(np.nan,0)

In [57]:
#check for unusual values in 'bsmtfin_sf_2'
np.array(np.unique(ames_df['bsmtfin_sf_2'], return_counts=True)).T

#no unusual values in 'bsmtfin_sf_2'

array([[   0,  774],
       [  32,    1],
       [  40,    1],
       [  42,    1],
       [  46,    1],
       [  48,    1],
       [  63,    1],
       [  78,    1],
       [  93,    1],
       [ 121,    1],
       [ 136,    1],
       [ 139,    1],
       [ 144,    1],
       [ 147,    2],
       [ 153,    1],
       [ 165,    1],
       [ 168,    2],
       [ 174,    1],
       [ 176,    1],
       [ 180,    2],
       [ 182,    2],
       [ 184,    1],
       [ 193,    1],
       [ 210,    1],
       [ 216,    1],
       [ 240,    1],
       [ 243,    1],
       [ 250,    1],
       [ 252,    1],
       [ 264,    1],
       [ 276,    1],
       [ 279,    1],
       [ 287,    2],
       [ 288,    1],
       [ 319,    1],
       [ 336,    1],
       [ 337,    1],
       [ 344,    1],
       [ 350,    1],
       [ 352,    1],
       [ 360,    1],
       [ 362,    1],
       [ 374,    2],
       [ 391,    2],
       [ 393,    1],
       [ 396,    1],
       [ 398,    1],
       [ 400,

In [58]:
#replace NaN with 0
ames_df['bsmt_unf_sf'] = ames_df['bsmt_unf_sf'].replace(np.nan,0)

In [59]:
#check for unusual values in 'bsmt_unf_sf'
np.array(np.unique(ames_df['bsmt_unf_sf'], return_counts=True)).T

#no unusual values in 'bsmt_unf_sf'

array([[   0,   79],
       [  14,    1],
       [  22,    1],
       ...,
       [2002,    1],
       [2042,    1],
       [2046,    1]], dtype=int64)

In [60]:
#replace NaN with 0
ames_df['total_bsmt_sf'] = ames_df['total_bsmt_sf'].replace(np.nan,0)

In [61]:
#check for unusual values in 'total_bsmt_sf'
np.array(np.unique(ames_df['total_bsmt_sf'], return_counts=True)).T

#no unusual values in 'total_bsmt_sf'

array([[   0,   25],
       [ 105,    1],
       [ 240,    1],
       ...,
       [2535,    1],
       [2552,    1],
       [2630,    1]], dtype=int64)

In [62]:
#check for unusual values
np.array(np.unique(ames_df['heating'], return_counts=True)).T

array([['Floor', 1],
       ['GasA', 866],
       ['GasW', 7],
       ['Grav', 4]], dtype=object)

In [63]:
#check for unusual values
np.array(np.unique(ames_df['heating_qc'], return_counts=True)).T

array([['Ex', 429],
       ['Fa', 25],
       ['Gd', 157],
       ['TA', 267]], dtype=object)

In [64]:
#check for unusual values
np.array(np.unique(ames_df['central_air'], return_counts=True)).T

array([['N', 55],
       ['Y', 823]], dtype=object)

In [65]:
ames_df['electrical'].value_counts(dropna=False)

SBrkr    813
FuseA     48
FuseF     15
NaN        1
FuseP      1
Name: electrical, dtype: int64

In [66]:
#replace NaN with 'SBrKr'
ames_df['electrical'] = ames_df['electrical'].replace(np.nan,'SBrKr')

In [67]:
#check for unusual values
np.array(np.unique(ames_df['electrical'], return_counts=True)).T

array([['FuseA', 48],
       ['FuseF', 15],
       ['FuseP', 1],
       ['SBrKr', 1],
       ['SBrkr', 813]], dtype=object)

In [68]:
#check for unusual values
np.array(np.unique(ames_df['1st_flr_sf'], return_counts=True)).T

array([[ 407,    1],
       [ 432,    1],
       [ 442,    1],
       ...,
       [2522,    1],
       [2552,    1],
       [2674,    1]], dtype=int64)

In [69]:
#check for unusual values
np.array(np.unique(ames_df['2nd_flr_sf'], return_counts=True)).T

array([[   0,  487],
       [ 110,    1],
       [ 167,    1],
       [ 180,    1],
       [ 182,    1],
       [ 185,    1],
       [ 192,    1],
       [ 208,    1],
       [ 218,    1],
       [ 224,    1],
       [ 228,    1],
       [ 240,    1],
       [ 308,    1],
       [ 320,    1],
       [ 330,    1],
       [ 336,    1],
       [ 343,    1],
       [ 348,    1],
       [ 349,    1],
       [ 363,    1],
       [ 368,    1],
       [ 375,    1],
       [ 378,    3],
       [ 380,    2],
       [ 384,    1],
       [ 403,    1],
       [ 406,    2],
       [ 412,    1],
       [ 424,    2],
       [ 432,    2],
       [ 434,    1],
       [ 436,    1],
       [ 442,    2],
       [ 456,    1],
       [ 457,    1],
       [ 464,    1],
       [ 468,    1],
       [ 472,    1],
       [ 473,    2],
       [ 496,    1],
       [ 499,    1],
       [ 501,    1],
       [ 504,    3],
       [ 510,    1],
       [ 511,    1],
       [ 512,    1],
       [ 514,    1],
       [ 516,

In [70]:
#check for unusual values
np.array(np.unique(ames_df['low_qual_fin_sf'], return_counts=True)).T

array([[  0, 871],
       [ 80,   1],
       [232,   1],
       [360,   1],
       [392,   1],
       [420,   1],
       [431,   1],
       [481,   1]], dtype=int64)

In [71]:
#check for unusual values
np.array(np.unique(ames_df['gr_liv_area'], return_counts=True)).T

array([[ 407,    1],
       [ 599,    1],
       [ 630,    4],
       ...,
       [3627,    1],
       [4316,    1],
       [4476,    1]], dtype=int64)

In [72]:
#replace NaN with 0
ames_df['bsmt_full_bath'] = ames_df['bsmt_full_bath'].replace(np.nan,0)

In [73]:
#check for unusual values
np.array(np.unique(ames_df['bsmt_full_bath'], return_counts=True)).T

array([[  0, 507],
       [  1, 356],
       [  2,  15]], dtype=int64)

In [74]:
#replace NaN with 0
ames_df['bsmt_half_bath'] = ames_df['bsmt_half_bath'].replace(np.nan,0)

In [75]:
#check for unusual values
np.array(np.unique(ames_df['bsmt_half_bath'], return_counts=True)).T

array([[  0, 829],
       [  1,  49]], dtype=int64)

In [76]:
#check for unusual values
np.array(np.unique(ames_df['full_bath'], return_counts=True)).T

array([[  0,   4],
       [  1, 418],
       [  2, 436],
       [  3,  18],
       [  4,   2]], dtype=int64)

In [77]:
#check for unusual values
np.array(np.unique(ames_df['half_bath'], return_counts=True)).T

array([[  0, 535],
       [  1, 336],
       [  2,   7]], dtype=int64)

In [78]:
#check for unusual values
np.array(np.unique(ames_df['bedroom_abvgr'], return_counts=True)).T

array([[  0,   3],
       [  1,  37],
       [  2, 199],
       [  3, 488],
       [  4, 135],
       [  5,   7],
       [  6,   9]], dtype=int64)

In [79]:
#check for unusual values
np.array(np.unique(ames_df['kitchen_abvgr'], return_counts=True)).T

array([[  0,   1],
       [  1, 835],
       [  2,  41],
       [  3,   1]], dtype=int64)

In [80]:
#check for unusual values
np.array(np.unique(ames_df['kitchen_qual'], return_counts=True)).T

array([['Ex', 53],
       ['Fa', 23],
       ['Gd', 354],
       ['Po', 1],
       ['TA', 447]], dtype=object)

In [81]:
#check for unusual values
np.array(np.unique(ames_df['totrms_abvgrd'], return_counts=True)).T

array([[  3,  14],
       [  4,  57],
       [  5, 179],
       [  6, 247],
       [  7, 174],
       [  8, 119],
       [  9,  45],
       [ 10,  31],
       [ 11,   9],
       [ 12,   3]], dtype=int64)

In [82]:
#check for unusual values
np.array(np.unique(ames_df['functional'], return_counts=True)).T

array([['Maj1', 7],
       ['Maj2', 2],
       ['Min1', 23],
       ['Min2', 28],
       ['Mod', 6],
       ['Typ', 812]], dtype=object)

In [83]:
#check for unusual values
np.array(np.unique(ames_df['fireplaces'], return_counts=True)).T

array([[  0, 422],
       [  1, 375],
       [  2,  75],
       [  3,   6]], dtype=int64)

In [84]:
#replace NaN values with None
ames_df['fireplace_qu'] = ames_df['fireplace_qu'].replace(np.nan,'None')

In [85]:
#check for unusual values
np.array(np.unique(ames_df['fireplace_qu'], return_counts=True)).T

array([['Ex', 12],
       ['Fa', 16],
       ['Gd', 220],
       ['None', 422],
       ['Po', 15],
       ['TA', 193]], dtype=object)

In [86]:
#replace NaN values with None
ames_df['garage_type'] = ames_df['garage_type'].replace(np.nan,'None')

In [87]:
#check for unusual values
np.array(np.unique(ames_df['garage_type'], return_counts=True)).T

array([['2Types', 4],
       ['Attchd', 518],
       ['Basment', 9],
       ['BuiltIn', 53],
       ['CarPort', 4],
       ['Detchd', 246],
       ['None', 44]], dtype=object)

In [88]:
#replace NaN values with 0
ames_df['garage_yr_blt'] = ames_df['garage_yr_blt'].replace(np.nan,0)

In [89]:
#convert 'garage_yr_blt' to int type
ames_df['garage_yr_blt'] = ames_df['garage_yr_blt'].astype(int)

In [90]:
ames_df['garage_yr_blt'].dtypes

dtype('int32')

In [91]:
#replace NaN values with None
ames_df['garage_finish'] = ames_df['garage_finish'].replace(np.nan,'None')

In [92]:
#check for unusual values
np.array(np.unique(ames_df['garage_finish'], return_counts=True)).T

array([['Fin', 218],
       ['None', 45],
       ['RFn', 233],
       ['Unf', 382]], dtype=object)

In [93]:
#replace NaN values with 0
ames_df['garage_cars'] = ames_df['garage_cars'].replace(np.nan,0)

In [94]:
#convert 'garage_cars' to int type
ames_df['garage_cars']=ames_df['garage_cars'].astype(int)

In [95]:
#check for unusual values
np.array(np.unique(ames_df['garage_cars'], return_counts=True)).T

array([[  0,  44],
       [  1, 254],
       [  2, 467],
       [  3, 110],
       [  4,   3]], dtype=int64)

In [96]:
ames_df['garage_cars'].dtypes

dtype('int32')

In [97]:
#replace NaN values with 0
ames_df['garage_area'] = ames_df['garage_area'].replace(np.nan,0)

In [98]:
#convert 'garage_area' to int type
ames_df['garage_area']=ames_df['garage_area'].astype(int)

In [99]:
#check for unusual values
np.array(np.unique(ames_df['garage_area'], return_counts=True)).T

array([[   0,   44],
       [ 160,    1],
       [ 162,    1],
       [ 164,    1],
       [ 180,    2],
       [ 184,    1],
       [ 186,    1],
       [ 189,    1],
       [ 192,    1],
       [ 195,    2],
       [ 200,    4],
       [ 205,    1],
       [ 210,    1],
       [ 215,    1],
       [ 216,    7],
       [ 220,    1],
       [ 224,    1],
       [ 225,    1],
       [ 230,    1],
       [ 234,    1],
       [ 240,   24],
       [ 244,    1],
       [ 250,    2],
       [ 252,    5],
       [ 253,    1],
       [ 256,    1],
       [ 258,    1],
       [ 260,    2],
       [ 264,   18],
       [ 266,    1],
       [ 270,    1],
       [ 272,    1],
       [ 273,    2],
       [ 275,    2],
       [ 276,    1],
       [ 280,    6],
       [ 281,    1],
       [ 286,   11],
       [ 287,    2],
       [ 288,    7],
       [ 290,    1],
       [ 292,    1],
       [ 294,    3],
       [ 297,    3],
       [ 299,    4],
       [ 300,    5],
       [ 301,    2],
       [ 303,

In [100]:
ames_df['garage_area'].dtypes

dtype('int32')

In [101]:
#replace NaN values with 'None' 
ames_df['garage_qual'] = ames_df['garage_qual'].replace(np.nan,'None')

In [102]:
#check for unusual values
np.array(np.unique(ames_df['garage_qual'], return_counts=True)).T

array([['Fa', 42],
       ['Gd', 6],
       ['None', 45],
       ['Po', 3],
       ['TA', 782]], dtype=object)

In [103]:
#replace NaN values with 'None' in 'garage_cond'
ames_df['garage_cond'] = ames_df['garage_cond'].replace(np.nan,'None')

In [104]:
#check for unusual values
np.array(np.unique(ames_df['garage_cond'], return_counts=True)).T

array([['Ex', 1],
       ['Fa', 27],
       ['Gd', 3],
       ['None', 45],
       ['Po', 6],
       ['TA', 796]], dtype=object)

In [105]:
#check for unusual values
np.array(np.unique(ames_df['paved_drive'], return_counts=True)).T

array([['N', 65],
       ['P', 23],
       ['Y', 790]], dtype=object)

In [106]:
#check for unusual values
np.array(np.unique(ames_df['wood_deck_sf'], return_counts=True)).T

array([[  0, 451],
       [ 20,   1],
       [ 23,   1],
       [ 25,   2],
       [ 26,   1],
       [ 27,   1],
       [ 32,   2],
       [ 36,   3],
       [ 38,   1],
       [ 40,   4],
       [ 42,   2],
       [ 44,   1],
       [ 48,   1],
       [ 49,   1],
       [ 50,   1],
       [ 55,   2],
       [ 56,   1],
       [ 57,   1],
       [ 58,   1],
       [ 60,   2],
       [ 63,   1],
       [ 64,   2],
       [ 66,   1],
       [ 68,   1],
       [ 70,   1],
       [ 71,   1],
       [ 72,   3],
       [ 74,   2],
       [ 76,   3],
       [ 77,   1],
       [ 80,   3],
       [ 81,   1],
       [ 84,   1],
       [ 85,   1],
       [ 89,   1],
       [ 90,   2],
       [ 95,   1],
       [ 96,   2],
       [100,  21],
       [104,   1],
       [106,   1],
       [108,   2],
       [110,   2],
       [112,   1],
       [113,   1],
       [115,   1],
       [116,   2],
       [120,  13],
       [121,   1],
       [123,   3],
       [124,   2],
       [125,   2],
       [127,

In [107]:
#check for unusual values
np.array(np.unique(ames_df['open_porch_sf'], return_counts=True)).T

array([[  0, 388],
       [  6,   1],
       [ 12,   2],
       [ 15,   1],
       [ 16,   4],
       [ 18,   2],
       [ 20,   9],
       [ 21,   5],
       [ 22,   3],
       [ 23,   2],
       [ 24,  13],
       [ 25,   2],
       [ 26,   5],
       [ 27,   3],
       [ 28,  17],
       [ 29,   1],
       [ 30,   8],
       [ 32,   9],
       [ 33,   4],
       [ 34,   5],
       [ 35,   8],
       [ 36,  14],
       [ 38,   8],
       [ 39,  10],
       [ 40,  10],
       [ 41,   1],
       [ 42,  10],
       [ 43,   4],
       [ 44,   5],
       [ 45,   8],
       [ 46,   4],
       [ 47,   1],
       [ 48,  12],
       [ 49,   3],
       [ 50,   9],
       [ 51,   1],
       [ 52,   1],
       [ 53,   2],
       [ 54,   5],
       [ 55,   5],
       [ 56,   6],
       [ 57,   2],
       [ 58,   3],
       [ 59,   1],
       [ 60,   7],
       [ 61,   2],
       [ 62,   1],
       [ 63,   7],
       [ 64,   9],
       [ 65,   3],
       [ 66,   1],
       [ 68,   7],
       [ 69,

In [108]:
#check for unusual values
np.array(np.unique(ames_df['enclosed_porch'], return_counts=True)).T

array([[   0,  746],
       [  20,    1],
       [  28,    1],
       [  32,    2],
       [  35,    1],
       [  40,    2],
       [  41,    2],
       [  42,    2],
       [  48,    1],
       [  51,    1],
       [  55,    2],
       [  56,    1],
       [  60,    2],
       [  64,    1],
       [  68,    1],
       [  70,    1],
       [  80,    3],
       [  81,    1],
       [  84,    2],
       [  86,    1],
       [  88,    1],
       [  90,    1],
       [  91,    1],
       [  94,    1],
       [  96,    3],
       [  98,    1],
       [ 100,    1],
       [ 102,    3],
       [ 105,    1],
       [ 108,    2],
       [ 112,    9],
       [ 116,    3],
       [ 120,    3],
       [ 121,    1],
       [ 123,    1],
       [ 126,    3],
       [ 128,    1],
       [ 132,    1],
       [ 137,    1],
       [ 139,    1],
       [ 140,    2],
       [ 144,    5],
       [ 150,    2],
       [ 160,    2],
       [ 164,    1],
       [ 168,    4],
       [ 169,    2],
       [ 170,

In [109]:
#check for unusual values
np.array(np.unique(ames_df['3ssn_porch'], return_counts=True)).T

array([[  0, 867],
       [ 23,   1],
       [130,   1],
       [174,   1],
       [180,   1],
       [196,   1],
       [216,   1],
       [219,   1],
       [225,   1],
       [238,   1],
       [320,   1],
       [360,   1]], dtype=int64)

In [110]:
#check for unusual values
np.array(np.unique(ames_df['screen_porch'], return_counts=True)).T

array([[  0, 803],
       [ 40,   1],
       [ 60,   1],
       [ 63,   1],
       [ 80,   1],
       [ 90,   1],
       [ 92,   1],
       [ 99,   1],
       [100,   1],
       [108,   1],
       [110,   1],
       [112,   1],
       [115,   2],
       [116,   1],
       [117,   1],
       [119,   1],
       [120,   3],
       [121,   1],
       [123,   1],
       [126,   1],
       [128,   1],
       [138,   1],
       [144,   4],
       [153,   2],
       [155,   1],
       [156,   2],
       [160,   4],
       [166,   1],
       [168,   2],
       [175,   1],
       [178,   1],
       [184,   1],
       [185,   1],
       [189,   1],
       [192,   4],
       [195,   1],
       [196,   1],
       [197,   1],
       [198,   1],
       [200,   4],
       [204,   1],
       [216,   3],
       [221,   1],
       [225,   1],
       [227,   1],
       [228,   1],
       [256,   3],
       [259,   1],
       [263,   1],
       [266,   1],
       [273,   1],
       [287,   1],
       [288,

In [111]:
#check for unusual values
np.array(np.unique(ames_df['pool_area'], return_counts=True)).T

array([[  0, 874],
       [144,   1],
       [444,   1],
       [512,   1],
       [555,   1]], dtype=int64)

In [112]:
#convert NaN values to 'NA' in 'pool_qc'
ames_df['pool_qc'] = ames_df['pool_qc'].replace(np.nan,'None')

In [113]:
#check for unusual values
np.array(np.unique(ames_df['pool_qc'], return_counts=True)).T

array([['Ex', 3],
       ['None', 874],
       ['TA', 1]], dtype=object)

In [114]:
#convert NaN values to 'None' in 'fence'
ames_df['fence'] = ames_df['fence'].replace(np.nan,'None')

In [115]:
#check for unusual values
np.array(np.unique(ames_df['fence'], return_counts=True)).T

array([['GdPrv', 35],
       ['GdWo', 32],
       ['MnPrv', 103],
       ['MnWw', 2],
       ['None', 706]], dtype=object)

In [116]:
#convert NaN to 'None' in 'misc_feature'
ames_df['misc_feature'] = ames_df['misc_feature'].replace(np.nan,'None')

In [117]:
#check for unusual values
np.array(np.unique(ames_df['misc_feature'], return_counts=True)).T

array([['Gar2', 1],
       ['None', 837],
       ['Othr', 1],
       ['Shed', 39]], dtype=object)

In [118]:
#check for unusual values
np.array(np.unique(ames_df['misc_val'], return_counts=True)).T

array([[    0,   840],
       [  350,     1],
       [  400,     6],
       [  420,     1],
       [  450,     4],
       [  480,     1],
       [  490,     1],
       [  500,     5],
       [  560,     1],
       [  600,     2],
       [  620,     1],
       [  650,     2],
       [  700,     3],
       [  750,     1],
       [ 1000,     1],
       [ 1200,     1],
       [ 1400,     1],
       [ 1500,     2],
       [ 1512,     1],
       [ 2000,     2],
       [15500,     1]], dtype=int64)

In [119]:
#check for unusual values
np.array(np.unique(ames_df['mo_sold'], return_counts=True)).T

array([[  1,  44],
       [  2,  29],
       [  3,  64],
       [  4,  71],
       [  5, 138],
       [  6, 153],
       [  7, 146],
       [  8,  66],
       [  9,  52],
       [ 10,  49],
       [ 11,  40],
       [ 12,  26]], dtype=int64)

In [120]:
#check for unusual values
np.array(np.unique(ames_df['yr_sold'], return_counts=True)).T

array([[2006,  187],
       [2007,  195],
       [2008,  187],
       [2009,  202],
       [2010,  107]], dtype=int64)

In [121]:
#check for unusual values
np.array(np.unique(ames_df['sale_type'], return_counts=True)).T

array([['COD', 24],
       ['CWD', 2],
       ['Con', 1],
       ['ConLD', 9],
       ['ConLI', 2],
       ['ConLw', 3],
       ['New', 78],
       ['Oth', 3],
       ['VWD', 1],
       ['WD ', 755]], dtype=object)

In [122]:
ames_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     878 non-null    int32  
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            878 non-null    object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

In [123]:
#save train_dataframe to csv
ames_df.to_csv('../data/ames_df_test.csv',index=False)