### Imports

In [21]:
import pandas as pd

In [22]:
# Load the raw Ames test set from the datasets folder (relative path).
raw_test_path = '../datasets/test.csv'
ames = pd.read_csv(raw_test_path)

In [23]:
# Preview the first rows to check that the file loaded with the expected columns.
ames.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [24]:
# List every column with its dtype and non-null count to see which ones have missing values.
ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [25]:
# Align the test data set with the preprocessing the regression model expects.
# Only missing entries are filled and helper columns are added; observed values are not altered.

# MS SubClass is a code, not a quantity, so store it as text.
ames['MS SubClass'] = ames['MS SubClass'].astype(str)

# Replacement used for missing entries, per column.
# NOTE(review): Lot Frontage is filled with the mean of this file; confirm that this
# matches how the training data was imputed.
missing_value_fill = {
    'Lot Frontage': ames['Lot Frontage'].mean(),
    'Alley': 'No Alley',
    'Mas Vnr Type': 'None',
    'Mas Vnr Area': 0,
    'Bsmt Qual': 'NA',
    'Bsmt Cond': 'NA',
    'Bsmt Exposure': 'NA',
    'BsmtFin Type 1': 'NA',
    'BsmtFin Type 2': 'NA',
    'Bsmt Unf SF': 0,
    # 'SBrkr' is the data set's code for a standard circuit breaker; the previous
    # spelling 'SBkrk' created a category that exists nowhere else in the data.
    'Electrical': 'SBrkr',
    'Fireplace Qu': 'NA',
    'Garage Type': 'NA',
    'Garage Yr Blt': 0,
    'Garage Finish': 'NA',
    'Garage Qual': 'NA',
    'Garage Cond': 'NA',
    'Pool QC': 'NA',
    'Fence': 'NA',
    'Misc Feature': 'NA',
}
for column, fill in missing_value_fill.items():
    ames[column] = ames[column].fillna(fill)

# 0/1 indicator columns: 1 when the source measurement is greater than zero.
# A missing measurement compares as False and therefore yields 0.
presence_flags = {
    'has_garage': 'Garage Area',
    'has_pool': 'Pool Area',
    'has_fireplace': 'Fireplaces',
    'has_porch': 'Open Porch SF',
    'has_basement': 'Total Bsmt SF',
}
for flag, source in presence_flags.items():
    ames[flag] = (ames[source] > 0).astype('int64')

# Combined bathroom counts; a half bath counts as 0.5.
ames['Bsmt Bath'] = ames['Bsmt Full Bath'] + 0.5 * ames['Bsmt Half Bath']
ames['Total Bath'] = ames['Full Bath'] + 0.5 * ames['Half Bath']

In [26]:
# Convert ordinal variables into numeric variables (only where an order is clearly present).
# For instance, there is a clear ranking between Ex, Gd, TA and Fa, whereas Pool QC says
# little beyond the fact that most houses have no pool, so it is left as text.
#
# Each scale has its own dictionary. 'Gd' and 'Unf' appear on more than one scale with
# different scores; when they shared a single dict literal the later duplicate key
# silently replaced the earlier one, so a 'Gd' quality rating was scored 4 (same as 'Fa')
# and an unfinished basement was scored 1 instead of 0.
# NOTE(review): the training data must be encoded with these same scales, otherwise the
# model sees different codes at fit time and at prediction time.

# General quality rating.
quality_scale = {'NA': 0, 'Po': 2, 'Fa': 4, 'TA': 6, 'Gd': 8, 'Ex': 10}

# Basement exposure.
exposure_scale = {'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

# Basement finish type.
bsmt_finish_scale = {'NA': 0, 'Unf': 0, 'LwQ': 2, 'Rec': 4, 'BLQ': 6, 'ALQ': 8, 'GLQ': 10}

# Garage interior finish.
garage_finish_scale = {'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

# Column -> scale used to encode it.
numeric_scores = {
    'Exter Qual': quality_scale,
    'Bsmt Qual': quality_scale,
    'Bsmt Exposure': exposure_scale,
    'BsmtFin Type 1': bsmt_finish_scale,
    'Heating QC': quality_scale,
    'Kitchen Qual': quality_scale,
    'Garage Finish': garage_finish_scale,
}

# Run once per fresh load: Series.map turns any value that is not a key of the scale
# (including values that were already encoded) into NaN.
for column, scale in numeric_scores.items():
    ames[column] = ames[column].map(scale)

In [27]:
# Re-check dtypes and non-null counts after the cleaning and encoding cells.
ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 87 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    object 
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     878 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            878 non-null    object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [28]:
# Write the polished test data to disk so it can be run through the model.
# NOTE(review): to_csv also writes the row index as an unnamed first column.
# NOTE(review): pd.read_csv treats the string 'NA' (and 'None' in recent pandas) as
# missing by default, so the placeholders filled in above come back as NaN unless the
# file is read with keep_default_na=False -- confirm how the modelling notebook loads it.

polished_csv_path = '../datasets/test_polished_v2.csv'
ames.to_csv(polished_csv_path)