In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
%matplotlib inline

## Import Train/Test

In [2]:
# Raw training table; the SalePrice column is read back from it near the end of the notebook.
train_raw = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [3]:
# Load the previously pickled train/test frames (presumably written by an
# earlier preprocessing step -- TODO confirm). Only unpickle files you trust.
train_df, test_df = (
    pd.read_pickle(pkl_name) for pkl_name in ('train.pkl', 'test.pkl')
)

In [4]:
# Set the target aside while features are engineered; it is re-attached from train_raw later.
train_df = train_df.drop(columns=['SalePrice'])

In [5]:
# Column labels of the raw training table.
cols_raw = train_raw.columns
# Engineered columns whose label contains no underscore
# (dummy columns are assumed to carry one -- TODO confirm).
non_dummies = [label for label in train_df.columns if '_' not in label]

## Remove Features

In [6]:
# train_df = train_df.drop(
#     'PoolArea', axis=1) # mostly 0 and collinear with 3SsnPorch, and there is already a binary for pool quality

## Binary for 3SsnPorch

In [7]:
train_df['3SsnPorch'] = train_df['3SsnPorch'].isna().apply(lambda x: int(not x))

In [8]:
test_df['3SsnPorch'] = test_df['3SsnPorch'].isna().apply(lambda x: int(not x))

# Dates

In [9]:
def add_dummies_remove_modes(dummy_list, df_raw, df):
    """ 
    add_dummies_remove_modes (dummy_list, df_raw, df)
    takes a list of column names `dummy_list` to dummify then drop 
    after dummification, a reference dataframe `df_raw` to search for 
    the dominant value of each varable in `dummy_list` then drop the 
    dominant dummy variable and a dataframe `df` you wish to concat 
    dummified variables to.
    """

    dummy_modes = list(df_raw[dummy_list].mode().iloc[0,:].items())

    dummy_modes = [(col, (float(mode))) 
                   if type(mode) == int else (col, mode) for col, mode in dummy_modes]

    drop_modes = list(map(lambda x: str(x[0]) + '_' + str(x[1]), dummy_modes))

    dummy_cols = pd.get_dummies(
        df[dummy_list].astype(object), dummy_na=True).drop(drop_modes, axis=1)

    return pd.concat([
       df.drop(dummy_list, axis=1),
       dummy_cols], axis=1)


In [10]:
# dates = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']
# train_df[dates].describe()

## Change remodeled year to years since remodel

In [14]:
# Overwrite the remodel year with the number of years between remodel and sale.
# Not idempotent: re-running this cell subtracts the already-converted value again.
train_df['YearRemodAdd'] = train_df['YrSold'].sub(train_df['YearRemodAdd'])

In [15]:
# Overwrite the remodel year with the number of years between remodel and sale.
# Not idempotent: re-running this cell subtracts the already-converted value again.
test_df['YearRemodAdd'] = test_df['YrSold'].sub(test_df['YearRemodAdd'])

## Bin years, then dummify

In [18]:
# Yb = train_df['YearBuilt']

# range_bins = [(1800,1900), (1900,1910), (1910,1920), (1920,1930), (1930,1940), (1940,1950), (1950,1960), (1960,1970), (1970,1980), (1980,1990), (1990,2000), (2000,2011)]

# bins = {r:x for (x,y) in range_bins for r in range(x,y)}

# train_df = pd.concat([train_df.drop('YearBuilt', axis=1), Yb.map(bins)], axis=1)

In [19]:
# Yb = test_df['YearBuilt']
# test_df = pd.concat([test_df.drop('YearBuilt', axis=1), Yb.map(bins)], axis=1)

## Convert Year to Age

In [20]:
train_df[['Age']] = train_df[['YearBuilt']].apply(lambda x: x.max() - x)

In [21]:
train_df = train_df.drop('YearBuilt', axis=1)

In [22]:
test_df[['Age']] = test_df[['YearBuilt']].apply(lambda x: x.max() - x)

In [23]:
test_df = test_df.drop('YearBuilt', axis=1)

## Dummify MoSold and YrSold

In [356]:
# train_df = add_dummies_remove_modes(['MoSold', 'YrSold'], train_df, train_df)

## Standardize Continuous

In [22]:
# cont = train_df.T[np.array(train_df.nunique() > 50)].T.columns.delete(-1)
# scale = StandardScaler()
# train_df[cont] = scale.fit_transform(train_df[cont])

In [21]:
# cont = test_df.T[np.array(test_df.nunique() > 50)].T.columns.delete(-1)
# scale = StandardScaler()
# test_df[cont] = scale.fit_transform(test_df[cont])

## Reorder

In [24]:
# Put the test columns in the same order as the training columns
# (raises KeyError if the test frame lacks any training column).
test_df = test_df.loc[:, train_df.columns]

# Add SalePrice

In [25]:
# Re-attach the target from the raw table; rows are matched on the index.
# A log transform was tried and is currently disabled: .apply(lambda x: np.log(x))
train_df = train_df.assign(SalePrice=train_raw['SalePrice'])

# Pickle

In [26]:
# Persist the engineered training frame (features plus SalePrice).
train_df.to_pickle(path='train_engineered.pkl')

In [27]:
# Persist the engineered test frame, column-aligned with the training features.
test_df.to_pickle(path='test_engineered.pkl')