In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
train = pd.read_csv("train.csv").drop("SalePrice", axis=1)
test = pd.read_csv("test.csv")

df = pd.concat([train, test])

In [12]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallCond  

In [14]:
def missing_table(df):
    return pd.DataFrame({"missing":df.isna().sum(), "miss_precnt":np.round(df.isna().sum()/df.shape[0],3)*100})
missing_table(df)

Unnamed: 0,missing,miss_precnt
Id,0,0.0
MSSubClass,0,0.0
MSZoning,4,0.1
LotFrontage,486,16.6
LotArea,0,0.0
...,...,...
MiscVal,0,0.0
MoSold,0,0.0
YrSold,0,0.0
SaleType,1,0.0


In [9]:
dub = df[["YrSold", "MoSold"]]
dub[dub.duplicated()]

Unnamed: 0,YrSold,MoSold
10,2008,2
12,2008,9
13,2007,8
22,2008,9
23,2007,6
...,...,...
1454,2006,6
1455,2006,4
1456,2006,9
1457,2006,7


In [None]:
categorical_mappings = {
    'MSSubClass': {
        20: '1-STORY 1946 & NEWER ALL STYLES',
        30: '1-STORY 1945 & OLDER',
        40: '1-STORY W/FINISHED ATTIC ALL AGES',
        45: '1-1/2 STORY - UNFINISHED ALL AGES',
        50: '1-1/2 STORY FINISHED ALL AGES',
        60: '2-STORY 1946 & NEWER',
        70: '2-STORY 1945 & OLDER',
        75: '2-1/2 STORY ALL AGES',
        80: 'SPLIT OR MULTI-LEVEL',
        85: 'SPLIT FOYER',
        90: 'DUPLEX - ALL STYLES AND AGES',
        120: '1-STORY PUD - 1946 & NEWER',
        150: '1-1/2 STORY PUD - ALL AGES',
        160: '2-STORY PUD - 1946 & NEWER',
        180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
        190: '2 FAMILY CONVERSION - ALL STYLES AND AGES'
    },
    
    'MSZoning': {
        'A': 'Agriculture',
        'C': 'Commercial',
        'FV': 'Floating Village Residential',
        'I': 'Industrial',
        'RH': 'Residential High Density',
        'RL': 'Residential Low Density',
        'RP': 'Residential Low Density Park',
        'RM': 'Residential Medium Density'
    },
    
    'Street': {
        'Grvl': 'Gravel',
        'Pave': 'Paved'
    },
    
    'Alley': {
        'Grvl': 'Gravel',
        'Pave': 'Paved',
        np.nan: 'No alley access'
    },
    
    'LotShape': {
        'Reg': 'Regular',
        'IR1': 'Slightly irregular',
        'IR2': 'Moderately Irregular',
        'IR3': 'Irregular'
    },
    
    'LandContour': {
        'Lvl': 'Near Flat/Level',
        'Bnk': 'Banked - Quick and significant rise',
        'HLS': 'Hillside - Significant slope',
        'Low': 'Depression'
    },
    
    'Utilities': {
        'AllPub': 'All public Utilities',
        'NoSewr': 'Electricity, Gas, and Water',
        'NoSeWa': 'Electricity and Gas Only',
        'ELO': 'Electricity only'
    },
    
    'LotConfig': {
        'Inside': 'Inside lot',
        'Corner': 'Corner lot',
        'CulDSac': 'Cul-de-sac',
        'FR2': 'Frontage on 2 sides',
        'FR3': 'Frontage on 3 sides'
    },
    
    'LandSlope': {
        'Gtl': 'Gentle slope',
        'Mod': 'Moderate Slope',
        'Sev': 'Severe Slope'
    },
    
    'Neighborhood': {
        'Blmngtn': 'Bloomington Heights',
        'Blueste': 'Bluestem',
        'BrDale': 'Briardale',
        'BrkSide': 'Brookside',
        'ClearCr': 'Clear Creek',
        'CollgCr': 'College Creek',
        'Crawfor': 'Crawford',
        'Edwards': 'Edwards',
        'Gilbert': 'Gilbert',
        'IDOTRR': 'Iowa DOT and Rail Road',
        'MeadowV': 'Meadow Village',
        'Mitchel': 'Mitchell',
        'Names': 'North Ames',
        'NoRidge': 'Northridge',
        'NPkVill': 'Northpark Villa',
        'NridgHt': 'Northridge Heights',
        'NWAmes': 'Northwest Ames',
        'OldTown': 'Old Town',
        'SWISU': 'South & West of Iowa State University',
        'Sawyer': 'Sawyer',
        'SawyerW': 'Sawyer West',
        'Somerst': 'Somerset',
        'StoneBr': 'Stone Brook',
        'Timber': 'Timberland',
        'Veenker': 'Veenker'
    },
    
    'BldgType': {
        '1Fam': 'Single-family Detached',
        '2FmCon': 'Two-family Conversion',
        'Duplx': 'Duplex',
        'TwnhsE': 'Townhouse End Unit',
        'TwnhsI': 'Townhouse Inside Unit'
    },
    
    'HouseStyle': {
        '1Story': 'One story',
        '1.5Fin': 'One and one-half story: 2nd level finished',
        '1.5Unf': 'One and one-half story: 2nd level unfinished',
        '2Story': 'Two story',
        '2.5Fin': 'Two and one-half story: 2nd level finished',
        '2.5Unf': 'Two and one-half story: 2nd level unfinished',
        'SFoyer': 'Split Foyer',
        'SLvl': 'Split Level'
    },
    
    'RoofStyle': {
        'Flat': 'Flat',
        'Gable': 'Gable',
        'Gambrel': 'Gabrel (Barn)',
        'Hip': 'Hip',
        'Mansard': 'Mansard',
        'Shed': 'Shed'
    },
    
    'RoofMatl': {
        'ClyTile': 'Clay or Tile',
        'CompShg': 'Standard (Composite) Shingle',
        'Membran': 'Membrane',
        'Metal': 'Metal',
        'Roll': 'Roll',
        'Tar&Grv': 'Gravel & Tar',
        'WdShake': 'Wood Shakes',
        'WdShngl': 'Wood Shingles'
    },
    
    'ExterQual': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Average/Typical',
        'Fa': 'Fair',
        'Po': 'Poor'
    },
    
    'ExterCond': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Average/Typical',
        'Fa': 'Fair',
        'Po': 'Poor'
    },
    
    'Foundation': {
        'BrkTil': 'Brick & Tile',
        'CBlock': 'Cinder Block',
        'PConc': 'Poured Concrete',
        'Slab': 'Slab',
        'Stone': 'Stone',
        'Wood': 'Wood'
    },
    
    'Heating': {
        'Floor': 'Floor Furnace',
        'GasA': 'Gas forced warm air furnace',
        'GasW': 'Gas hot water or steam heat',
        'Grav': 'Gravity furnace',
        'OthW': 'Hot water or steam heat other than gas',
        'Wall': 'Wall furnace'
    },
    
    'CentralAir': {
        'N': 'No',
        'Y': 'Yes'
    },
    
    'Functional': {
        'Typ': 'Typical Functionality',
        'Min1': 'Minor Deductions 1',
        'Min2': 'Minor Deductions 2',
        'Mod': 'Moderate Deductions',
        'Maj1': 'Major Deductions 1',
        'Maj2': 'Major Deductions 2',
        'Sev': 'Severely Damaged',
        'Sal': 'Salvage only'
    },
    
    'GarageType': {
        '2Types': 'More than one type of garage',
        'Attchd': 'Attached to home',
        'Basment': 'Basement Garage',
        'BuiltIn': 'Built-In',
        'CarPort': 'Car Port',
        'Detchd': 'Detached from home',
        np.nan: 'No Garage'
    },
    
    'PavedDrive': {
        'Y': 'Paved',
        'P': 'Partial Pavement',
        'N': 'Dirt/Gravel'
    },
    
    'PoolQC': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Average/Typical',
        'Fa': 'Fair',
        np.nan: 'No Pool'
    },
    
    'Fence': {
        'GdPrv': 'Good Privacy',
        'MnPrv': 'Minimum Privacy',
        'GdWo': 'Good Wood',
        'MnWw': 'Minimum Wood/Wire',
        np.nan: 'No Fence'
    },
    
    'SaleType': {
        'WD': 'Warranty Deed - Conventional',
        'CWD': 'Warranty Deed - Cash',
        'VWD': 'Warranty Deed - VA Loan',
        'New': 'Home just constructed and sold',
        'COD': 'Court Officer Deed/Estate',
        'Con': 'Contract 15% Down payment regular terms',
        'ConLw': 'Contract Low Down payment and low interest',
        'ConLI': 'Contract Low Interest',
        'ConLD': 'Contract Low Down',
        'Oth': 'Other'
    },
    
    'SaleCondition': {
        'Normal': 'Normal Sale',
        'Abnorml': 'Abnormal Sale - trade, foreclosure, short sale',
        'AdjLand': 'Adjoining Land Purchase',
        'Alloca': 'Allocation - two linked properties with separate deeds',
        'Family': 'Sale between family members',
        'Partial': 'Home was not completed when last assessed'
    },

    'Condition1': {
            'Artery': 'Adjacent to arterial street',
            'Feedr': 'Adjacent to feeder street',
            'Norm': 'Normal',
            'RRNn': 'Within 200\' of North-South Railroad',
            'RRAn': 'Adjacent to North-South Railroad',
            'PosN': 'Near positive off-site feature--park, greenbelt, etc.',
            'PosA': 'Adjacent to positive off-site feature',
            'RRNe': 'Within 200\' of East-West Railroad',
            'RRAe': 'Adjacent to East-West Railroad'
        },
    
    'Condition2': {
        'Artery': 'Adjacent to arterial street',
        'Feedr': 'Adjacent to feeder street',
        'Norm': 'Normal',
        'RRNn': 'Within 200\' of North-South Railroad',
        'RRAn': 'Adjacent to North-South Railroad',
        'PosN': 'Near positive off-site feature--park, greenbelt, etc.',
        'PosA': 'Adjacent to positive off-site feature',
        'RRNe': 'Within 200\' of East-West Railroad',
        'RRAe': 'Adjacent to East-West Railroad'
    },

    'Exterior1st': {
        'AsbShng': 'Asbestos Shingles',
        'AsphShn': 'Asphalt Shingles',
        'BrkComm': 'Brick Common',
        'BrkFace': 'Brick Face',
        'CBlock': 'Cinder Block',
        'CemntBd': 'Cement Board',
        'HdBoard': 'Hard Board',
        'ImStucc': 'Imitation Stucco',
        'MetalSd': 'Metal Siding',
        'Other': 'Other',
        'Plywood': 'Plywood',
        'PreCast': 'PreCast',
        'Stone': 'Stone',
        'Stucco': 'Stucco',
        'VinylSd': 'Vinyl Siding',
        'Wd Sdng': 'Wood Siding',
        'WdShing': 'Wood Shingles'
    },

    'Exterior2nd': {
        'AsbShng': 'Asbestos Shingles',
        'AsphShn': 'Asphalt Shingles',
        'BrkComm': 'Brick Common',
        'BrkFace': 'Brick Face',
        'CBlock': 'Cinder Block',
        'CemntBd': 'Cement Board',
        'HdBoard': 'Hard Board',
        'ImStucc': 'Imitation Stucco',
        'MetalSd': 'Metal Siding',
        'Other': 'Other',
        'Plywood': 'Plywood',
        'PreCast': 'PreCast',
        'Stone': 'Stone',
        'Stucco': 'Stucco',
        'VinylSd': 'Vinyl Siding',
        'Wd Sdng': 'Wood Siding',
        'WdShing': 'Wood Shingles'
    },

    'MasVnrType': {
        'BrkCmn': 'Brick Common',
        'BrkFace': 'Brick Face',
        'CBlock': 'Cinder Block',
        np.nan: 'None',
        'Stone': 'Stone'
    },

    'BsmtQual': {
        'Ex': 'Excellent (100+ inches)',
        'Gd': 'Good (90-99 inches)',
        'TA': 'Typical (80-89 inches)',
        'Fa': 'Fair (70-79 inches)',
        'Po': 'Poor (<70 inches)',
        np.nan: 'No Basement'
    },

    'BsmtCond': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Typical - slight dampness allowed',
        'Fa': 'Fair - dampness or some cracking or settling',
        'Po': 'Poor - Severe cracking, settling, or wetness',
        np.nan: 'No Basement'
    },

    'BsmtExposure': {
        'Gd': 'Good Exposure',
        'Av': 'Average Exposure',
        'Mn': 'Minimum Exposure',
        'No': 'No Exposure',
        np.nan: 'No Basement'
    },

    'BsmtFinType1': {
        'GLQ': 'Good Living Quarters',
        'ALQ': 'Average Living Quarters',
        'BLQ': 'Below Average Living Quarters',
        'Rec': 'Average Rec Room',
        'LwQ': 'Low Quality',
        'Unf': 'Unfinished',
        np.nan: 'No Basement'
    },

    'BsmtFinType2': {
        'GLQ': 'Good Living Quarters',
        'ALQ': 'Average Living Quarters',
        'BLQ': 'Below Average Living Quarters',
        'Rec': 'Average Rec Room',
        'LwQ': 'Low Quality',
        'Unf': 'Unfinished',
        np.nan: 'No Basement'
    },

    'HeatingQC': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Average/Typical',
        'Fa': 'Fair',
        'Po': 'Poor'
    },

    'Electrical': {
        'SBrkr': 'Standard Circuit Breakers & Romex',
        'FuseA': 'Fuse Box over 60 AMP and all Romex wiring (Average)',
        'FuseF': '60 AMP Fuse Box and mostly Romex wiring (Fair)',
        'FuseP': '60 AMP Fuse Box and mostly knob & tube wiring (poor)',
        'Mix': 'Mixed'
    },

    'KitchenQual': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Typical/Average',
        'Fa': 'Fair',
        'Po': 'Poor'
    },

    'FireplaceQu': {
        'Ex': 'Excellent - Exceptional Masonry Fireplace',
        'Gd': 'Good - Masonry Fireplace in main level',
        'TA': 'Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement',
        'Fa': 'Fair - Prefabricated Fireplace in basement',
        'Po': 'Poor - Ben Franklin Stove',
        np.nan: 'No Fireplace'
    },

    'GarageFinish': {
        'Fin': 'Finished',
        'RFn': 'Rough Finished',
        'Unf': 'Unfinished',
        np.nan: 'No Garage'
    },

    'GarageQual': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Typical/Average',
        'Fa': 'Fair',
        'Po': 'Poor',
        np.nan: 'No Garage'
    },

    'GarageCond': {
        'Ex': 'Excellent',
        'Gd': 'Good',
        'TA': 'Typical/Average',
        'Fa': 'Fair',
        'Po': 'Poor',
        np.nan: 'No Garage'
    },

    'MiscFeature': {
        'Elev': 'Elevator',
        'Gar2': '2nd Garage (if not described in garage section)',
        'Othr': 'Other',
        'Shed': 'Shed (over 100 SF)',
        'TenC': 'Tennis Court',
        np.nan: 'None'
    }
}


In [32]:
#for col in categorical_mappings
df2 = df.replace(categorical_mappings)
df2.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,2-STORY 1946 & NEWER,Residential Low Density,65.0,8450,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,2,2008,Warranty Deed - Conventional,Normal Sale
1,2,1-STORY 1946 & NEWER ALL STYLES,Residential Low Density,80.0,9600,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,5,2007,Warranty Deed - Conventional,Normal Sale
2,3,2-STORY 1946 & NEWER,Residential Low Density,68.0,11250,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,9,2008,Warranty Deed - Conventional,Normal Sale
3,4,2-STORY 1945 & OLDER,Residential Low Density,60.0,9550,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,2,2006,Warranty Deed - Conventional,"Abnormal Sale - trade, foreclosure, short sale"
4,5,2-STORY 1946 & NEWER,Residential Low Density,84.0,14260,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,12,2008,Warranty Deed - Conventional,Normal Sale


In [46]:
x  = missing_table(df2.bfill())
x[x["missing"] != 0]

Unnamed: 0,missing,miss_precnt


In [45]:
df2.bfill()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,2-STORY 1946 & NEWER,Residential Low Density,65.0,8450,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,2,2008,Warranty Deed - Conventional,Normal Sale
1,2,1-STORY 1946 & NEWER ALL STYLES,Residential Low Density,80.0,9600,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,5,2007,Warranty Deed - Conventional,Normal Sale
2,3,2-STORY 1946 & NEWER,Residential Low Density,68.0,11250,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,9,2008,Warranty Deed - Conventional,Normal Sale
3,4,2-STORY 1945 & OLDER,Residential Low Density,60.0,9550,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,2,2006,Warranty Deed - Conventional,"Abnormal Sale - trade, foreclosure, short sale"
4,5,2-STORY 1946 & NEWER,Residential Low Density,84.0,14260,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,12,2008,Warranty Deed - Conventional,Normal Sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,2-STORY PUD - 1946 & NEWER,Residential Medium Density,21.0,1936,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,6,2006,Warranty Deed - Conventional,Normal Sale
1455,2916,2-STORY PUD - 1946 & NEWER,Residential Medium Density,21.0,1894,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,4,2006,Warranty Deed - Conventional,"Abnormal Sale - trade, foreclosure, short sale"
1456,2917,1-STORY 1946 & NEWER ALL STYLES,Residential Low Density,160.0,20000,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,9,2006,Warranty Deed - Conventional,"Abnormal Sale - trade, foreclosure, short sale"
1457,2918,SPLIT FOYER,Residential Low Density,62.0,10441,Paved,No alley access,Regular,Near Flat/Level,All public Utilities,...,0,0,No Pool,Minimum Privacy,Shed (over 100 SF),700,7,2006,Warranty Deed - Conventional,Normal Sale


In [None]:
df2[df2['LotFrontage'].isna()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
95,1556,1-1/2 STORY FINISHED ALL AGES,Residential Low Density,72.0,10632,Paved,No alley access,Slightly irregular,Near Flat/Level,All public Utilities,...,0,0,No Pool,No Fence,,0,1,2010,Court Officer Deed/Estate,Normal Sale


In [44]:
df2['LotFrontage']

0        65.0
1        80.0
2        68.0
3        60.0
4        84.0
        ...  
1454     21.0
1455     21.0
1456    160.0
1457     62.0
1458     74.0
Name: LotFrontage, Length: 2919, dtype: float64