In [1]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.special import boxcox1p
from scipy.stats import skew
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import power_transform

pd.options.display.max_columns = 500

## Import data set

In [2]:
# Load the train and test CSVs into *_raw frames; none of the visible cells
# modify these, so they serve as untouched references.
train_raw, test_raw = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Working frames that the preprocessing cells below modify. They are deep
# copies of the raw frames loaded in the previous cell, which avoids parsing
# the same CSV files a second time and leaves train_raw / test_raw untouched.
train = train_raw.copy()
test = test_raw.copy()

## Outliers

In [4]:
# remove outliers
# Remove outliers: rows whose feature value exceeds a limit while the sale
# price stays below a ceiling. Each rule is (feature, feature limit, price ceiling).
outlier_rules = [
    ('GrLivArea', 4000, 300000),
    ('MasVnrArea', 1400, 300000),
    ('LotFrontage', 300, 300000),
    ('LotArea', 200000, 500000),
    ('OpenPorchSF', 500, 100000),
]
# Disabled rule, kept for reference (note the reversed comparisons):
# train = train[~((train['YearBuilt'] < 1900) & (train['SalePrice'] > 400000))]

for feature, feature_limit, price_ceiling in outlier_rules:
    # Comparisons against NaN evaluate to False, so rows with a missing
    # feature value are never flagged and are kept.
    is_outlier = (train[feature] > feature_limit) & (train['SalePrice'] < price_ceiling)
    train = train[~is_outlier]


## Remove SalePrice and Id

In [5]:
# Target: log(1 + SalePrice), taken after outlier removal so its index matches
# the remaining training rows.
response_variable = np.log1p(train['SalePrice'])

# Keep the test Ids before the Id column is sliced away
# (presumably needed later to label predictions -- not used in the visible cells).
id_ = test['Id']

# Keep only the columns from MSSubClass through SaleCondition (label slices are
# inclusive), which drops Id and SalePrice. The explicit .copy() makes both
# frames independent objects, so the in-place drops and column assignments in
# later cells do not operate on a slice of another frame (which can raise
# SettingWithCopyWarning on pandas versions without copy-on-write).
train = train.loc[:, 'MSSubClass':'SaleCondition'].copy()
test = test.loc[:, 'MSSubClass':'SaleCondition'].copy()

## Multicollinearity

TODO: add justification for the features dropped below.

In [6]:
# drop some features to avoid multicollinearity

# The same three columns are removed from both frames so their schemas stay aligned.
collinear_feats = ['1stFlrSF', 'GarageArea', 'TotRmsAbvGrd']

train = train.drop(columns=collinear_feats)
test = test.drop(columns=collinear_feats)

## Skewness

In [7]:
# Tunable parameters for the skew correction.
SKEW_THRESHOLD = 0.65   # features with sample skewness above this are transformed
BOXCOX_LAMBDA = 0.1     # lambda passed to the Box-Cox transform of 1 + x

# Select numeric columns by dtype family rather than "anything that is not
# object": pandas' dedicated string dtype is not `object`, so the old test
# would let text columns through and skew() would fail on them.
numeric_feats = train.select_dtypes(include='number').columns

# Skewness is measured on the training data only, ignoring missing values.
skewed_feats = train[numeric_feats].apply(lambda col: skew(col.dropna()))
skewed_feats = skewed_feats[skewed_feats > SKEW_THRESHOLD]

skewed_index = skewed_feats.index

# Apply the identical transform to both frames so train and test stay comparable.
train[skewed_index] = boxcox1p(train[skewed_index], BOXCOX_LAMBDA)
test[skewed_index] = boxcox1p(test[skewed_index], BOXCOX_LAMBDA)

scale = StandardScaler()

# Fit the scaler on train only, then reuse those statistics for test.
# Only the skew-corrected columns are standardized here.
train[skewed_index] = scale.fit_transform(train[skewed_index])
test[skewed_index] = scale.transform(test[skewed_index])

## Imputation

In [8]:
# Column means of the numeric training features. numeric_only=True is required
# because the frames still contain text columns at this point; without it,
# DataFrame.mean() raises TypeError on pandas >= 2.0.
train_means = train.mean(numeric_only=True)

# Fill missing numeric values in BOTH frames with the training means, so test
# is imputed with the same statistics as train (consistent with the scaler
# above, which is also fitted on train only). Non-numeric columns have no
# entry in train_means and keep their NaNs.
train = train.fillna(train_means)
test = test.fillna(train_means)

# Ordinal

In [9]:
# ords = []
# for col in train:
#     if 'Ex' in set(train[col]):
#         ords.append(col)

In [10]:
# ord_values = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1}

In [11]:
# for ordn in ords:
#     train[ordn] = train[ordn].map(ord_values)

In [12]:
# train[ords] = train[ords].fillna(0)

## Dummification

In [13]:
# Encode each frame with pd.get_dummies independently. The two results can end
# up with different column sets; the following cells reconcile them.
train, test = (pd.get_dummies(frame) for frame in (train, test))

In [14]:
# Column-name sets of the two dummified frames.
train_cols, test_cols = set(train.columns), set(test.columns)

# Columns that exist only in test (no counterpart in train).
test_drop = list(test_cols.difference(train_cols))
# Columns that exist only in train; test needs placeholder columns for these.
# (Set iteration order is arbitrary, so the order of these lists is too.)
test_unkonwn_dummies = list(train_cols.difference(test_cols))

In [15]:
# Build a frame on test's index holding only the train-only columns. None of
# them exist in test, so every cell takes the fill value 0.
test_dummies = test.reindex(test_unkonwn_dummies, axis='columns', fill_value=0)

In [16]:
# Append the placeholder columns to the right of the test frame (aligned on index).
test = pd.concat((test, test_dummies), axis='columns')

In [17]:
# Keep exactly train's columns, in train's order; columns present only in test
# are discarded by this selection.
test = test.loc[:, train.columns]

# Feature Engineering

## Age

In [18]:
# range_bins = [(1800,1900), (1900,1910), (1910,1920), (1920,1930), (1930,1940), (1940,1950), (1950,1960), (1960,1970), (1970,1980), (1980,1990), (1990,2000), (2000,2011)]

# bins = {r:x for (x,y) in range_bins for r in range(x,y)}

# train['YearBuilt'] = train['YearBuilt'].map(bins)

# test['YearBuilt'] = test['YearBuilt'].map(bins)

In [19]:
# train['YearBuilt'] = train['YrSold'] - train['YearBuilt']
# test['YearBuilt'] = test['YrSold'] - test['YearBuilt']

In [20]:
# train[['Age']] = train[['YearBuilt']].apply(lambda x: x.max() - x)
# # train = train.drop('YearBuilt', axis=1)
# test[['Age']] = test[['YearBuilt']].apply(lambda x: x.max() - x)
# test = test.drop('YearBuilt', axis=1)

## Garage Age

In [21]:
# test['GarageYrBlt'] = test['GarageYrBlt'].isna().apply(lambda x: int(not x))

In [22]:
# train['GarageYrBlt'] = train['GarageYrBlt'].isna().apply(lambda x: int(not x))

In [23]:
# train['GarageYrBlt'] = train['YrSold'] - train['GarageYrBlt']
# test['GarageYrBlt'] = test['YrSold'] - test['GarageYrBlt']

## SF

In [24]:
# train.drop('TotalBsmtSF', axis= 1, inplace=True)
# test.drop('TotalBsmtSF', axis= 1, inplace=True)


# train['TwoStory'] = train['2ndFlrSF'].isna().apply(lambda x: int(not x))
# test['TwoStory'] = test['2ndFlrSF'].isna().apply(lambda x: int(not x))

# train['SF'] = train['1stFlrSF'] + train['2ndFlrSF']
# # train.drop(['1stFlrSF','2ndFlrSF'], axis= 1, inplace=True)

# test['SF'] = test['1stFlrSF'] + test['2ndFlrSF']
# test.drop(['1stFlrSF','2ndFlrSF'], axis= 1, inplace=True)

## YearRemodAdd

In [25]:
# train['YearRemodAdd']  = (train['YrSold'] - train['YearRemodAdd'])
# test['YearRemodAdd']  = (test['YrSold'] - test['YearRemodAdd'])

# Pickle

In [27]:
# Persist the processed features and the log-transformed target.
# to_pickle does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
dump_dir = Path('dump')
dump_dir.mkdir(parents=True, exist_ok=True)

train.to_pickle(dump_dir / 'train.pkl')
test.to_pickle(dump_dir / 'test.pkl')
response_variable.to_pickle(dump_dir / 'y_train.pkl')