# 1. Train Data Cleaning

In [1]:
#import packages
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [2]:
#load dataset
ames_df = pd.read_csv('../data/train.csv')
ames_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [3]:
#convert column names to lower case
ames_df.columns= ames_df.columns.str.lower()

In [4]:
#replace spaces in column names with _
ames_df.columns = ames_df.columns.str.replace(' ', '_')
ames_df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
#look at the info
ames_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2051 non-null   int64  
 1   pid              2051 non-null   int64  
 2   ms_subclass      2051 non-null   int64  
 3   ms_zoning        2051 non-null   object 
 4   lot_frontage     1721 non-null   float64
 5   lot_area         2051 non-null   int64  
 6   street           2051 non-null   object 
 7   alley            140 non-null    object 
 8   lot_shape        2051 non-null   object 
 9   land_contour     2051 non-null   object 
 10  utilities        2051 non-null   object 
 11  lot_config       2051 non-null   object 
 12  land_slope       2051 non-null   object 
 13  neighborhood     2051 non-null   object 
 14  condition_1      2051 non-null   object 
 15  condition_2      2051 non-null   object 
 16  bldg_type        2051 non-null   object 
 17  house_style   

In [6]:
#display number of null values for each variable
pd.options.display.min_rows = 30
ames_df.isnull().sum().sort_values(ascending=False)

pool_qc           2042
misc_feature      1986
alley             1911
fence             1651
fireplace_qu      1000
lot_frontage       330
garage_finish      114
garage_qual        114
garage_yr_blt      114
garage_cond        114
garage_type        113
bsmt_exposure       58
bsmtfin_type_2      56
bsmtfin_type_1      55
bsmt_cond           55
                  ... 
year_built           0
year_remod/add       0
roof_style           0
roof_matl            0
exterior_1st         0
exterior_2nd         0
exter_qual           0
exter_cond           0
foundation           0
pid                  0
heating_qc           0
central_air          0
electrical           0
1st_flr_sf           0
saleprice            0
Length: 81, dtype: int64

### Pool QC, Misc Feature, Alley & Fence

Since the 4 variables 'pool_qc', 'misc_feature', 'alley' and 'fence' have more than 1500 null values and they do not appear to be key variables affecting the sale price, we decide to drop all 4 columns.

In [7]:
#drop 'pool_qc', 'misc_feature', 'alley' and 'fence' columns
ames_df.drop(columns=['pool_qc', 'misc_feature', 'alley', 'fence'], inplace=True)

In [8]:
ames_df.shape

(2051, 77)

In [9]:
ames_df.isnull().sum().sort_values(ascending=False)

fireplace_qu      1000
lot_frontage       330
garage_cond        114
garage_yr_blt      114
garage_finish      114
garage_qual        114
garage_type        113
bsmt_exposure       58
bsmtfin_type_2      56
bsmt_qual           55
bsmt_cond           55
bsmtfin_type_1      55
mas_vnr_type        22
mas_vnr_area        22
bsmt_half_bath       2
                  ... 
overall_cond         0
year_built           0
year_remod/add       0
roof_style           0
roof_matl            0
exterior_1st         0
exterior_2nd         0
exter_qual           0
exter_cond           0
foundation           0
pid                  0
heating              0
heating_qc           0
central_air          0
saleprice            0
Length: 77, dtype: int64

### Fireplace

In [10]:
ames_df[['fireplaces','fireplace_qu']].head(15)

Unnamed: 0,fireplaces,fireplace_qu
0,0,
1,1,TA
2,0,
3,0,
4,0,
5,1,Gd
6,0,
7,0,
8,0,
9,2,TA


In [11]:
ames_df['fireplace_qu'].value_counts()

Gd    523
TA    407
Fa     59
Ex     31
Po     31
Name: fireplace_qu, dtype: int64

In [12]:
#replace 'NaN' with 'None' if 'fireplaces' = 0
ames_df.loc[ames_df.fireplaces.isin([0]),'fireplace_qu']='None'

In [13]:
ames_df[['fireplaces','fireplace_qu']].head(15)

Unnamed: 0,fireplaces,fireplace_qu
0,0,
1,1,TA
2,0,
3,0,
4,0,
5,1,Gd
6,0,
7,0,
8,0,
9,2,TA


In [14]:
#check value count for 'fireplace_qu'
ames_df['fireplace_qu'].value_counts()

None    1000
Gd       523
TA       407
Fa        59
Ex        31
Po        31
Name: fireplace_qu, dtype: int64

In [15]:
#check value count for 'fireplaces'
ames_df['fireplaces'].value_counts()

0    1000
1     898
2     146
3       6
4       1
Name: fireplaces, dtype: int64

Both 'fireplaces' and 'fireplace_qu' have corresponding value counts for 0 and 'None'.  

### Lot Frontage

In [16]:
#check values for 'lot_frontage'
ames_df[['lot_frontage','lot_area']].head()

Unnamed: 0,lot_frontage,lot_area
0,,13517
1,43.0,11492
2,68.0,7922
3,73.0,9802
4,82.0,14235


In [17]:
#Replace NaN with ames_df['lot_frontage'].median()
ames_df['lot_frontage'] = ames_df['lot_frontage'].replace(np.nan,ames_df['lot_frontage'].median())

In [18]:
#check values for 'lot_frontage'
ames_df[['lot_frontage','lot_area']].head()

Unnamed: 0,lot_frontage,lot_area
0,68.0,13517
1,43.0,11492
2,68.0,7922
3,73.0,9802
4,82.0,14235


In [19]:
ames_df.isnull().sum().sort_values(ascending=False)

garage_yr_blt     114
garage_finish     114
garage_qual       114
garage_cond       114
garage_type       113
bsmt_exposure      58
bsmtfin_type_2     56
bsmt_cond          55
bsmt_qual          55
bsmtfin_type_1     55
mas_vnr_type       22
mas_vnr_area       22
bsmt_half_bath      2
bsmt_full_bath      2
bsmtfin_sf_1        1
                 ... 
overall_cond        0
year_built          0
year_remod/add      0
roof_style          0
roof_matl           0
exterior_1st        0
exterior_2nd        0
exter_qual          0
exter_cond          0
foundation          0
pid                 0
heating             0
heating_qc          0
central_air         0
saleprice           0
Length: 77, dtype: int64

### Garage

In [20]:
ames_df[['garage_yr_blt','year_built']].head(10)

Unnamed: 0,garage_yr_blt,year_built
0,1976.0,1976
1,1997.0,1996
2,1953.0,1953
3,2007.0,2006
4,1957.0,1900
5,1966.0,1966
6,2005.0,2005
7,1959.0,1959
8,1952.0,1952
9,1969.0,1969


The columns 'garage_yr_blt' and 'year_built' are very similar, therefore it would be better to drop 'garage_year_blt'.

In [21]:
#drop 'garage_yr_blt' 
ames_df.drop(columns=['garage_yr_blt'], inplace=True)

In [22]:
#display NaN values in 'garage_finish','garage_qual', 'garage_cond', 'garage_type'
ames_df.loc[pd.isnull(ames_df.garage_finish), ['garage_finish','garage_qual', 'garage_cond', 'garage_type']].head(10)

Unnamed: 0,garage_finish,garage_qual,garage_cond,garage_type
28,,,,
53,,,,
65,,,,
79,,,,
101,,,,
103,,,,
114,,,,
120,,,,
134,,,,
136,,,,


Since the missing values in the 4 columns, 'garage_finish','garage_qual', 'garage_cond' & 'garage_type' all correspond to each other, we can fill the missing values in 'garage_finish','garage_qual', 'garage_cond' and 'garage_type' with 'None'.

In [23]:
#replace NaN values with None in 'garage_finish'
ames_df['garage_finish'] = ames_df['garage_finish'].replace(np.nan,'None')

#replace NaN values with None in 'garage_qual'
ames_df['garage_qual'] = ames_df['garage_qual'].replace(np.nan,'None')

#replace NaN values with None in 'garage_cond'
ames_df['garage_cond'] = ames_df['garage_cond'].replace(np.nan,'None')

#replace NaN values with None in 'garage_type'
ames_df['garage_type'] = ames_df['garage_type'].replace(np.nan,'None')

In [24]:
#convert string values to numberic values in 'garage_finish','garage_qual', 'garage_cond'
ames_df.garage_finish = ames_df.garage_finish.map({'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3})

ames_df.garage_qual = ames_df.garage_qual.map({'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

ames_df.garage_cond = ames_df.garage_cond.map({'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

In [25]:
ames_df['garage_finish'].value_counts() 

1    849
2    579
3    509
0    114
Name: garage_finish, dtype: int64

In [26]:
ames_df['garage_qual'].value_counts() 

3    1832
0     114
2      82
4      18
5       3
1       2
Name: garage_qual, dtype: int64

In [27]:
ames_df['garage_cond'].value_counts() 

3    1868
0     114
2      47
4      12
1       8
5       2
Name: garage_cond, dtype: int64

In [28]:
ames_df.isnull().sum().sort_values(ascending=False)

bsmt_exposure     58
bsmtfin_type_2    56
bsmt_cond         55
bsmt_qual         55
bsmtfin_type_1    55
mas_vnr_type      22
mas_vnr_area      22
bsmt_full_bath     2
bsmt_half_bath     2
garage_cars        1
garage_area        1
total_bsmt_sf      1
bsmtfin_sf_1       1
bsmtfin_sf_2       1
bsmt_unf_sf        1
                  ..
overall_qual       0
overall_cond       0
year_built         0
year_remod/add     0
roof_style         0
roof_matl          0
exterior_1st       0
exterior_2nd       0
exter_qual         0
exter_cond         0
foundation         0
pid                0
heating            0
heating_qc         0
saleprice          0
Length: 76, dtype: int64

### Basement

In [29]:
#display NaN values in 'bsmt_type_1', 'bsmt_type_2', 'bsmt_exposure','bsmt_cond', ''bsmt_qual'
ames_df.loc[pd.isnull(ames_df.bsmtfin_type_1), ['bsmtfin_type_1', 'bsmtfin_type_2', 'bsmt_exposure', 'bsmt_cond', 'bsmt_qual']].head(10)

Unnamed: 0,bsmtfin_type_1,bsmtfin_type_2,bsmt_exposure,bsmt_cond,bsmt_qual
12,,,,,
93,,,,,
114,,,,,
146,,,,,
183,,,,,
240,,,,,
249,,,,,
256,,,,,
390,,,,,
437,,,,,


Since the missing values in the 5 columns, 'bsmtfin_type_1', 'bsmtfin_type_2', 'bsmt_exposure', 'bsmt_cond' & 'bsmt_qual' all correspond to each other, we can fill the missing values in the 5 columns with 'None' as they are likely to have no basement.

In [30]:
#replace NaN values with None in 'bsmtfin_type_1', 'bsmtfin_type_2', 'bsmt_exposure', 'bsmt_cond' & 'bsmt_qual'
ames_df['bsmtfin_type_1'] = ames_df['bsmtfin_type_1'].replace(np.nan,'None')

ames_df['bsmtfin_type_2'] = ames_df['bsmtfin_type_2'].replace(np.nan,'None')

ames_df['bsmt_exposure'] = ames_df['bsmt_exposure'].replace(np.nan,'None')

ames_df['bsmt_cond'] = ames_df['bsmt_cond'].replace(np.nan,'None')

ames_df['bsmt_qual'] = ames_df['bsmt_qual'].replace(np.nan,'None')

In [31]:
#convert strings to numeric values in 'bsmtfin_type_1', 'bsmtfin_type_2', 'bsmt_exposure', 'bsmt_cond' & 'bsmt_qual'
ames_df.bsmtfin_type_1 = ames_df.bsmtfin_type_1.map({'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6})

ames_df.bsmtfin_type_2 = ames_df.bsmtfin_type_2.map({'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6})

ames_df.bsmt_exposure = ames_df.bsmt_exposure.map({'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4})

ames_df.bsmt_cond = ames_df.bsmt_cond.map({'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})

ames_df.bsmt_qual = ames_df.bsmt_qual.map({'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})


In [32]:
ames_df['bsmtfin_type_1'].value_counts()

6    615
1    603
5    293
4    200
3    183
2    102
0     55
Name: bsmtfin_type_1, dtype: int64

In [33]:
ames_df['bsmtfin_type_2'].value_counts()

1    1749
3      80
2      60
0      56
4      48
5      35
6      23
Name: bsmtfin_type_2, dtype: int64

In [34]:
ames_df['bsmt_exposure'].value_counts()

1    1339
3     288
4     203
2     163
0      58
Name: bsmt_exposure, dtype: int64

In [35]:
ames_df['bsmt_cond'].value_counts()

3    1834
4      89
2      65
0      55
1       5
5       3
Name: bsmt_cond, dtype: int64

In [36]:
ames_df['bsmt_qual'].value_counts()

3    887
4    864
5    184
2     60
0     55
1      1
Name: bsmt_qual, dtype: int64

In [37]:
pd.options.display.min_rows = 20
ames_df.isnull().sum().sort_values(ascending=False)

mas_vnr_type      22
mas_vnr_area      22
bsmt_half_bath     2
bsmt_full_bath     2
garage_cars        1
garage_area        1
total_bsmt_sf      1
bsmtfin_sf_1       1
bsmtfin_sf_2       1
bsmt_unf_sf        1
                  ..
bsmt_qual          0
exter_cond         0
year_built         0
exter_qual         0
exterior_2nd       0
exterior_1st       0
roof_matl          0
roof_style         0
year_remod/add     0
saleprice          0
Length: 76, dtype: int64

### Masonry Veneer

In [38]:
#display NaN values in 'mas_vnr_type' & 'mas_vnr_area' 
ames_df.loc[pd.isnull(ames_df.mas_vnr_type), ['mas_vnr_type', 'mas_vnr_area']].head(10)

Unnamed: 0,mas_vnr_type,mas_vnr_area
22,,
41,,
86,,
212,,
276,,
338,,
431,,
451,,
591,,
844,,


Since the missing values in 'mas_vnr_type' & 'mas_vnr_area' columns correspond to each other, we can fill the missing values in 'mas_vnr_type' as 'None' and 'mas_vnr_area' as 0.

In [39]:
#replace NaN values with None in 'mas_vnr_type'
ames_df['mas_vnr_type'] = ames_df['mas_vnr_type'].replace(np.nan,'None')

#replace NaN values with 0 in 'mas_vnr_area'
ames_df['mas_vnr_area'] = ames_df['mas_vnr_area'].replace(np.nan,0)

In [40]:
ames_df['mas_vnr_type'].value_counts()

None       1240
BrkFace     630
Stone       168
BrkCmn       13
Name: mas_vnr_type, dtype: int64

In [41]:
ames_df['mas_vnr_area'].value_counts()

0.0      1238
120.0      11
176.0      10
200.0      10
210.0       9
180.0       9
72.0        9
16.0        9
108.0       9
132.0       8
         ... 
253.0       1
324.0       1
640.0       1
179.0       1
541.0       1
576.0       1
796.0       1
573.0       1
372.0       1
428.0       1
Name: mas_vnr_area, Length: 373, dtype: int64

In [42]:
#find the entry where ['mas_vnr_type'] == 'None' but ['mas_vnr_area'] != 0

In [43]:
#find the index of entries where ['mas_vnr_type'] == 'None' but ['mas_vnr_area'] != 0
x = ames_df[(ames_df['mas_vnr_type'] == 'None') & (ames_df['mas_vnr_area'] != 0)]

In [44]:
x[['mas_vnr_type','mas_vnr_area']]

Unnamed: 0,mas_vnr_type,mas_vnr_area
765,,1.0
810,,288.0
1148,,1.0
1684,,1.0
1832,,344.0


In [45]:
ames_df.shape

(2051, 76)

In [46]:
#drop entries 810 and 1832 from ames_df as the mas_vnr_area is >0 but the mas_vnr_type is None.
ames_df = ames_df.drop([810, 1832], axis=0)

In [47]:
ames_df.shape

(2049, 76)

In [48]:
ames_df['mas_vnr_type'].value_counts()

None       1238
BrkFace     630
Stone       168
BrkCmn       13
Name: mas_vnr_type, dtype: int64

In [49]:
ames_df['mas_vnr_area'].value_counts()

0.0      1238
120.0      11
176.0      10
200.0      10
210.0       9
180.0       9
72.0        9
16.0        9
108.0       9
132.0       8
         ... 
253.0       1
324.0       1
640.0       1
179.0       1
541.0       1
576.0       1
796.0       1
573.0       1
372.0       1
428.0       1
Name: mas_vnr_area, Length: 373, dtype: int64

### Basement 

In [50]:
pd.options.display.min_rows = 20
ames_df.isnull().sum().sort_values(ascending=False)

bsmt_full_bath    2
bsmt_half_bath    2
bsmtfin_sf_1      1
garage_cars       1
garage_area       1
total_bsmt_sf     1
bsmtfin_sf_2      1
bsmt_unf_sf       1
1st_flr_sf        0
2nd_flr_sf        0
                 ..
foundation        0
exter_qual        0
year_remod/add    0
mas_vnr_area      0
mas_vnr_type      0
exterior_2nd      0
exterior_1st      0
roof_matl         0
roof_style        0
saleprice         0
Length: 76, dtype: int64

In [51]:
#display NaN values in 'mas_vnr_type' & 'mas_vnr_area' 
ames_df.loc[pd.isnull(ames_df.bsmt_full_bath), ['bsmt_full_bath', 'bsmt_half_bath', 'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_qual']].head(10)

Unnamed: 0,bsmt_full_bath,bsmt_half_bath,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,bsmt_qual
616,,,0.0,0.0,0.0,0.0,0
1327,,,,,,,0


Since basement quality is 0 for 616 and 1327, we can replace the null values with 0.

In [52]:
#replace NaN in 'bsmt_full_bath', 'bsmt_half_bath', 'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf' with 0
ames_df['bsmt_full_bath'] = ames_df['bsmt_full_bath'].replace(np.nan,0)

ames_df['bsmt_half_bath'] = ames_df['bsmt_half_bath'].replace(np.nan,0)

ames_df['bsmtfin_sf_1'] = ames_df['bsmtfin_sf_1'].replace(np.nan,0)

ames_df['bsmtfin_sf_2'] = ames_df['bsmtfin_sf_2'].replace(np.nan,0)

ames_df['bsmt_unf_sf'] = ames_df['bsmt_unf_sf'].replace(np.nan,0)

ames_df['total_bsmt_sf'] = ames_df['total_bsmt_sf'].replace(np.nan,0)

In [53]:
pd.options.display.min_rows = 20
ames_df.isnull().sum().sort_values(ascending=False)

garage_cars      1
garage_area      1
id               0
fireplaces       0
totrms_abvgrd    0
kitchen_qual     0
kitchen_abvgr    0
bedroom_abvgr    0
half_bath        0
full_bath        0
                ..
foundation       0
exter_cond       0
exter_qual       0
mas_vnr_area     0
mas_vnr_type     0
exterior_2nd     0
exterior_1st     0
roof_matl        0
roof_style       0
saleprice        0
Length: 76, dtype: int64

In [54]:
ames_df[['bsmtfin_sf_1','bsmtfin_sf_2','bsmt_unf_sf','total_bsmt_sf']].head(10)

Unnamed: 0,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf
0,533.0,0.0,192.0,725.0
1,637.0,0.0,276.0,913.0
2,731.0,0.0,326.0,1057.0
3,0.0,0.0,384.0,384.0
4,0.0,0.0,676.0,676.0
5,247.0,713.0,557.0,1517.0
6,547.0,0.0,0.0,547.0
7,1000.0,0.0,188.0,1188.0
8,292.0,0.0,632.0,924.0
9,650.0,0.0,390.0,1040.0


The sum of 'bsmtfin_sf_1','bsmtfin_sf_2' & 'bsmt_unf_sf' equals to 'total_bsmt_sf', therefore it would be better to drop these three columns and keep 'total_bsmt_sf'.

In [55]:
#drop 'bsmtfin_sf_1','bsmtfin_sf_2','bsmt_unf_sf'
ames_df.drop(columns=['bsmtfin_sf_1','bsmtfin_sf_2','bsmt_unf_sf'], inplace=True)

In [56]:
ames_df.shape

(2049, 73)

### Above Ground Living Area

In [57]:
ames_df.loc[ames_df['low_qual_fin_sf']!= 0,['1st_flr_sf','2nd_flr_sf','low_qual_fin_sf','gr_liv_area']].head()

Unnamed: 0,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area
249,640,0,205,845
269,520,600,80,1200
320,854,0,528,1382
356,1013,0,513,1526
382,929,929,371,2229


Since 'gr_liv_area' is the sum of '1st_flr_sf','2nd_flr_sf' & 'low_qual_fin_sf', we can drop the 3 columns and keep 'gr_liv_area'.

In [58]:
#drop '1st_flr_sf','2nd_flr_sf' & 'low_qual_fin_sf' columns
ames_df.drop(columns=['1st_flr_sf','2nd_flr_sf','low_qual_fin_sf'], inplace=True)

In [59]:
ames_df.shape

(2049, 70)

### ID & PID

'id' & 'pid' columns are dropped as they are identification numbers and are not likely to affect the sale price.

In [60]:
#drop 'id' & 'pid' columns
ames_df.drop(columns=['id','pid'], inplace=True)

In [61]:
ames_df.shape

(2049, 68)

In [62]:
#save train_dataframe to csv
ames_df.to_csv('../data/df_train_cleaned.csv',index=False)