In [5]:
import pandas as pd
import typing
import numpy as np
ames=pd.read_csv('AmesHousing.csv')
ames.describe()

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
count,2930.0,2930.0,2930.0,2440.0,2930.0,2930.0,2930.0,2930.0,2930.0,2907.0,...,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0
mean,1465.5,714464500.0,57.387372,69.22459,10147.921843,6.094881,5.56314,1971.356314,1984.266553,101.896801,...,93.751877,47.533447,23.011604,2.592491,16.002048,2.243345,50.635154,6.216041,2007.790444,180796.060068
std,845.96247,188730800.0,42.638025,23.365335,7880.017759,1.411026,1.111537,30.245361,20.860286,179.112611,...,126.361562,67.4834,64.139059,25.141331,56.08737,35.597181,566.344288,2.714492,1.316613,79886.692357
min,1.0,526301100.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,12789.0
25%,733.25,528477000.0,20.0,58.0,7440.25,5.0,5.0,1954.0,1965.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129500.0
50%,1465.5,535453600.0,50.0,68.0,9436.5,6.0,5.0,1973.0,1993.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,160000.0
75%,2197.75,907181100.0,70.0,80.0,11555.25,7.0,6.0,2001.0,2004.0,164.0,...,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,213500.0
max,2930.0,1007100000.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,...,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


In [6]:
# - Cleanup
# - remove "magic literals"
# - remove lambdas
# - add types
# - Add docstring



def size_name(df_, name=''):
    print(f'{name} {df_.shape}')
    return df_

def tweak_ames(ames:pd.DataFrame) -> pd.DataFrame:
    """ Clean up raw Ames data ...
    """

    def rating_to_num(a_df:pd.DataFrame,col:str) -> pd.Series:
        return a_df[col].replace({'Ex':5, 'Gd':4, 'TA':3,'Fa':2,'Po':1})

    def create_rating_fn(col:str) -> typing.Callable:
        def wrapper(a_df):
            return rating_to_num(a_df,col)
        return wrapper

    def clean_col_name(col:str) -> str:
        return col.lower().replace(' ','_')

    def extract_story_num(a_df: pd.DataFrame) -> pd.Series:
        return a_df.house_style.str.extract(r'(\d\.?\d?)').astype(float).fillna(0)
    
    def has_basement(a_df: pd.DataFrame) ->pd.Series:
        return ~a_df.bsmt_qual.isna()

    def has_garage(a_df: pd.DataFrame) ->pd.Series:
        return ~a_df.garage_qual.isna()

    def fix_garage_year(a_df: pd.DataFrame) ->pd.Series:
        return a_df.garage_yr_blt.fillna(a_df.year_built).astype('uint8')

    def create_category_fillna_fn(col:str) ->typing.Callable:
        def wrapper(a_df: pd.DataFrame) ->pd.Series:
            return a_df[col].fillna('Other').astype('category')
        return wrapper
    
    def create_float_fillna_fn(col:str) -> typing.Callable:
        def wrapper(a_df: pd.DataFrame) -> pd.Series:
            return a_df[col].fillna(0).astype(float)
        return wrapper
    
    rating_cols=['exter_qual'  ,'exter_cond'   ,'bsmt_qual',
                  'bsmt_cond'  , 'kitchen_qual', 'garage_qual',
                  'garage_cond', 'heating_qc'  , 'fireplace_qu'
                ]

    rating_mapping = {k:create_rating_fn(k)
                        for k in rating_cols}

    category_cols = ['ms_zoning'   , 'street'        , 'alley'           , 'lot_shape'    ,   'land_contour',
                    'utilities'    , 'lot_config'    , 'land_slope'      , 'neighborhood' ,   'condition_1',
                    'condition_2'  ,  'bldg_type'    , 'house_style'     , 'roof_style'   ,   'roof_matl',
                    'exterior_1st' , 'exterior_2nd'  , 'mas_vnr_type'    , 'foundation'   ,
                    'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2'  , 'heating'      , 
                    'electrical'   , 'functional'    , 'garage_type'     , 'garage_finish',
                    'paved_drive'  , 'fence'         ,  'misc_feature'   , 'sale_type'    ,    'sale_condition']

    category_mapping = {k:create_category_fillna_fn(k) for k in category_cols}

    float_cols = ['lot_frontage' ,'mas_vnr_area'   ,'bsmtfin_sf_1'   ,'bsmtfin_sf_2'    , 
                  'bsmt_unf_sf'  ,'total_bsmt_sf'  ,'bsmt_full_bath' ,'bsmt_half_bath' ,
                  'garage_cars'  ,'garage_area']
    
    float_mapping ={k:create_float_fillna_fn(k) for k in float_cols}

    uint8_cols = ['ms_subclass'  ,'overall_qual'   ,'overall_cond' ,
                  'full_bath'    ,'half_bath'      ,'bedroom_abvgr',
                  'kitchen_abvgr', 'totrms_abvgrd' ,'fireplaces'   ,
                  'mo_sold']
    uint16_cols = ['order'          ,'year_built'   ,'year_remod/add' ,'1st_flr_sf'   ,'2nd_flr_sf' ,
                   'low_qual_fin_sf','gr_liv_area'  ,'wood_deck_sf'   ,'open_porch_sf',
                   'enclosed_porch' ,'3ssn_porch'   ,'screen_porch'   ,'pool_area'    ,'misc_val'   ,
                   'yr_sold']
    
    uint32_cols =['lot_area', 'saleprice']

    return (ames
    .pipe(size_name,'orig')
    .rename(columns=clean_col_name)
    .pipe(size_name,'rename')
    .assign(stories=extract_story_num,
            has_bsmt=has_basement,
            has_garage=has_garage,
            **rating_mapping,
            garage_yr_blt=fix_garage_year,
            **category_mapping,
            **float_mapping
             )
        .pipe(size_name,'assign')
        .astype({'central_air':bool,
            **{k:'uint8' for k in uint8_cols},
            **{k:'uint16' for k in uint16_cols},
            **{k:'uint32' for k in uint32_cols},
                })
        .pipe(size_name,'astype')
        .drop(columns=['pid'])
        .pipe(size_name,'drop')
           )


a1=tweak_ames(ames)


orig (2930, 82)
rename (2930, 82)
assign (2930, 85)
astype (2930, 85)
drop (2930, 84)


In [8]:
(a1
 .assign(**pd.DataFrame(res,columns=[f'PC{i+1}' fpr o om ramge)res.shape]))
)

Unnamed: 0,order,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice,stories,has_bsmt,has_garage
0,1,20,RL,141.0,31770,Pave,Other,IR1,Lvl,AllPub,...,Other,0,5,2010,WD,Normal,215000,1.0,True,True
1,2,20,RH,80.0,11622,Pave,Other,Reg,Lvl,AllPub,...,Other,0,6,2010,WD,Normal,105000,1.0,True,True
2,3,20,RL,81.0,14267,Pave,Other,IR1,Lvl,AllPub,...,Gar2,12500,6,2010,WD,Normal,172000,1.0,True,True
3,4,20,RL,93.0,11160,Pave,Other,Reg,Lvl,AllPub,...,Other,0,4,2010,WD,Normal,244000,1.0,True,True
4,5,60,RL,74.0,13830,Pave,Other,IR1,Lvl,AllPub,...,Other,0,3,2010,WD,Normal,189900,2.0,True,True
