# Import

In [131]:
import pandas as pd
import numpy as np

from IPython.display import display
# Raise pandas' display limits so wide and long frames are rendered in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 2000)

%matplotlib inline

In [132]:
# Untouched copy of the training CSV; the cells below work on `train_df` instead.
train_raw = pd.read_csv('data/train.csv')

In [133]:
# Build the working frame from the already-loaded `train_raw` rather than
# reading the same CSV a second time. `drop` returns a new frame, so
# `train_raw` itself is left unchanged. The 'Id' column is removed here.
train_df = train_raw.drop(columns='Id')

In [134]:
# Load the test set as-is; it is only transformed after the training features are built.
test_df = pd.read_csv('data/test.csv')

In [135]:
# Split off the target column; what remains in `train_df` are the predictors.
y = train_df.loc[:, 'SalePrice']
train_df = train_df.drop(columns=['SalePrice'])

# Divide variables by type

In [136]:
# Columns that carry calendar information (years / month).
date_vars = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']

# Every float64/int64 column that is not one of the date columns above.
cont_vars = [
    name
    for name, dtype in train_df.dtypes.items()
    if (dtype == 'float64' or dtype == 'int64') and name not in date_vars
]

# Store all continuous columns uniformly as float64.
train_df[cont_vars] = train_df[cont_vars].astype('float64')

# Date columns are kept apart in their own object-typed frame.
dates = train_df.loc[:, date_vars].astype(object)

In [137]:
# Number of distinct values observed in each continuous column.
cont_freq = dict(zip(cont_vars, (train_df[name].nunique() for name in cont_vars)))
# The same counts as a one-column frame ('freq'), indexed by column name.
cont_freq_df = pd.DataFrame({'freq': pd.Series(cont_freq)})
# Continuous columns that nevertheless take only a few distinct values.
low_vol_cont = ['LowQualFinSF', '3SsnPorch', 'PoolArea']
# Columns whose most common value is NaN (per the original author's note).
mode_na = ['Alley', 'PoolQC', 'Fence','MiscFeature', 'FireplaceQu']


In [138]:
# Continuous columns with at least 50 distinct values are handled as numeric ...
numeric_names = cont_freq_df.loc[cont_freq_df['freq'] >= 50].index.tolist()
nums = train_df[numeric_names].astype(object)

# ... and those with fewer than 50 as categories that happen to be numbers.
num_cat_names = cont_freq_df.loc[cont_freq_df['freq'] < 50].index.tolist()
num_cats = train_df[num_cat_names].astype(object)

# Everything already assigned to a group (numeric, date, numeric-categorical, NaN-mode).
non_cat = [*nums.columns, *dates.columns, *num_cats.columns, *mode_na]
# Whatever is left over is treated as an ordinary categorical column.
cats = train_df.drop(columns=non_cat)

# Categoricals whose mode is NaN get their own frame.
cats_na = train_df[mode_na]

# The low-cardinality continuous columns belong with the numeric group:
# copy them into `nums`, then remove them from `num_cats`.
nums[low_vol_cont] = num_cats[low_vol_cont]
num_cats = num_cats.drop(columns=low_vol_cont)

# Fill missing numerical values with 0

In [139]:
# Missing numeric entries are replaced by zero.
nums = nums.fillna(value=0)

# Add dummies; remove the dominant (mode) level and the original categorical columns

In [140]:
def add_dummies_remove_modes(dummy_list, house_raw, house_df):
    """One-hot encode columns and drop each column's modal level.

    Parameters
    ----------
    dummy_list : list
        Names of the columns to encode.
    house_raw : pandas.DataFrame
        Frame from which the mode of every column in ``dummy_list`` is taken
        (first mode when there are ties).
    house_df : pandas.DataFrame
        Frame whose ``dummy_list`` columns are replaced by dummy columns.

    Returns
    -------
    pandas.DataFrame
        ``house_df`` without the ``dummy_list`` columns, followed by the dummy
        columns (including a ``<col>_nan`` column per variable) minus the
        ``<col>_<mode>`` column of each variable.

    Raises
    ------
    KeyError
        If a ``<col>_<mode>`` column is not among the generated dummies.
    """
    first_modes = house_raw[dummy_list].mode().iloc[0, :]

    # Name of the dummy column that corresponds to each variable's mode.
    mode_columns = []
    for col, mode in first_modes.items():
        if type(mode) == int:
            # Plain ints are rendered as floats ('5' -> '5.0') in the column name.
            mode = float(mode)
        mode_columns.append(str(col) + '_' + str(mode))

    encoded = pd.get_dummies(house_df[dummy_list].astype(object), dummy_na=True)
    encoded = encoded.drop(mode_columns, axis=1)

    untouched = house_df.drop(dummy_list, axis=1)
    return pd.concat([untouched, encoded], axis=1)


# Numerical Categorical to Dummies

In [141]:
# Impute missing numeric-categorical values with each column's mode.
# `DataFrame.mode()` returns a frame (one row per tied mode); passing that frame
# to `fillna` aligns on the row index, so only the first row(s) would be filled.
# Taking `.iloc[0]` gives a per-column Series that `fillna` applies to every row.
column_modes = num_cats.mode()
if not column_modes.empty:
    num_cats = num_cats.fillna(column_modes.iloc[0])

# One-hot encode the numeric categoricals, dropping each column's modal level.
num_cats = add_dummies_remove_modes(list(num_cats.columns), train_df, num_cats)

# Drop dummy columns that are all 0.
num_cats = num_cats.drop(columns=num_cats.columns[num_cats.sum() == 0])


# Categorical to Dummies

In [142]:
# One-hot encode the ordinary categoricals, dropping each column's modal level.
cats = add_dummies_remove_modes(list(cats.columns), train_df, cats)

# Discard dummy columns that are never set.
empty_cat_dummies = cats.columns[cats.sum() == 0]
cats = cats.drop(columns=empty_cat_dummies)

# Dummify columns with NA as mode

In [143]:
# One-hot encode the NaN-dominated categoricals. No NaN indicator is requested,
# so a missing value is represented by zeros across that variable's dummies.
cats_na = pd.get_dummies(cats_na)

# Discard dummy columns that are never set.
empty_na_dummies = cats_na.columns[cats_na.sum() == 0]
cats_na = cats_na.drop(columns=empty_na_dummies)

# Change GarageYrBlt to binary variable

In [144]:
# 1 where a garage build year is recorded, 0 where it is missing.
# The flag is computed from `train_df` (the source of `dates`), so re-running
# this cell gives the same result instead of turning every row into 1, and the
# vectorised cast replaces the per-row `sum([x])` bool-to-int conversion.
dates['GarageYrBlt'] = train_df['GarageYrBlt'].notna().astype('int64')

# Concat final data frame

In [145]:
# Assemble the training matrix column-wise from the individual feature groups.
feature_blocks = [dates, nums, num_cats, cats, cats_na]
df_train_final = pd.concat(feature_blocks, axis='columns')

# Quality Check

In [146]:
# (rows, columns) of the assembled training frame, before duplicate columns are removed.
df_train_final.shape

(1460, 337)

In [147]:
# Which columns are exact copies of an earlier column?
cols_before_drop = set(df_train_final.columns)
# Transposing turns columns into rows, so `duplicated` flags repeated columns.
is_repeat = df_train_final.T.duplicated(keep='first').to_numpy()
cols_after_drop = set(df_train_final.columns[~is_repeat])
# Columns that would be removed.
cols_before_drop - cols_after_drop

{'BldgType_Duplex',
 'BsmtCond_nan',
 'BsmtFinType1_nan',
 'Condition2_RRAe',
 'Exterior2nd_CBlock',
 'GarageCond_nan',
 'GarageFinish_nan',
 'GarageQual_nan',
 'GarageType_nan',
 'TotRmsAbvGrd_14.0'}

In [148]:
# Sanity check: de-duplication must not introduce any column that was not there before.
cols_after_drop - cols_before_drop

set()

In [149]:
# Keep the first of each group of identical columns. Selecting with a boolean
# mask, rather than transposing the frame there and back, leaves the dtype of
# every surviving column as it was; the round-trip through `.T` on a
# mixed-dtype frame would turn all columns into object dtype.
is_duplicate_col = df_train_final.T.duplicated(keep='first').to_numpy()
df_train_final = df_train_final.loc[:, ~is_duplicate_col]

In [150]:
# (rows, columns) of the training frame after duplicate columns were removed.
df_train_final.shape

(1460, 327)

# Transform Dummies on Test

In [151]:
# Column layout of the encoded training features. These frames already hold
# only dummy columns, so no further `get_dummies` pass is needed.
dummies_frame = pd.concat([num_cats, cats, cats_na], axis=1)

# Source columns behind each dummy group, recovered from the objects built
# while partitioning the training columns.
num_cat_cols = [col for col in cont_freq_df[cont_freq_df.freq < 50].index
                if col not in low_vol_cont]
cat_cols = [col for col in train_df.columns if col not in non_cat]

# Encode the test set's categorical columns. Reindexing the raw test frame
# directly onto the dummy column names would only create all-zero columns,
# because none of those names exist in the raw data.
test_encoded = pd.concat(
    [
        # Cast to float first so level names match the training dummies (e.g. '20.0').
        pd.get_dummies(
            test_df[num_cat_cols].astype('float64').astype(object), dummy_na=True),
        pd.get_dummies(test_df[cat_cols].astype(object), dummy_na=True),
        pd.get_dummies(test_df[mode_na].astype(object)),
    ],
    axis=1,
)

# Align with the training layout: levels unseen in training are dropped and
# training levels absent from the test set become all-zero columns.
test_dummies = test_encoded.reindex(columns=dummies_frame.columns, fill_value=0)

In [152]:
# Place the dummy columns in front of the original test columns.
# Note: re-running this cell appends the dummy columns a second time.
test_df = pd.concat((test_dummies, test_df), axis='columns')

In [153]:
# Column names present in each frame.
train_cols = set(df_train_final.columns)
test_cols = set(test_df.columns)

# Test columns with no counterpart in the training features.
test_drop = list(test_cols.difference(train_cols))

# Quality Check on Test

In [154]:
# Training columns that are missing from the test frame; expected to be empty.
list(train_cols - test_cols)

[]

In [155]:
# Preview the shape the test frame will have once the extra columns are removed.
test_df.drop(columns=test_drop).shape

(1459, 327)

# Drop Cols not in Train

In [156]:
# Keep exactly the training features, in the training column order, so both
# frames line up position by position. Dropping `test_drop` alone would leave
# the test columns in a different order from the training frame. Selecting by
# name also raises a KeyError if a training column is missing from the test
# frame, and re-running this cell is harmless.
test_df = test_df[list(df_train_final.columns)]

# Impute missing with 0

In [157]:
# Columns of the test frame that contain at least one missing value ...
has_missing = test_df.isna().any()
test_na_cols = test_df.columns[has_missing]
# ... get their gaps filled with zero.
# NOTE(review): in the training frame 'GarageYrBlt' was turned into a 0/1
# indicator, whereas here it keeps its raw values with missing entries set
# to 0. Confirm whether this mismatch is intended.
test_df[test_na_cols] = test_df[test_na_cols].fillna(0)

# Pickle

In [158]:
# df_train_final.to_pickle('train.pkl')
# test_df.to_pickle('test.pkl')