In [9]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import pylab
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_squared_error
def remse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [10]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Get all features from train & test set, take out the target variable
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']), ignore_index=True)


In [11]:
x = all_data.loc[all_data["LotFrontage"].notnull(), "LotArea"]
y = all_data.loc[all_data["LotFrontage"].notnull(), "LotFrontage"]
outlier_removed = (x <= 25000) & (y <= 150)
slope_intercept = np.polyfit(x[outlier_removed], y[outlier_removed], 1)

# Fill the missing values in LotFrontage using linear approximation to from LotArea
all_data.loc[all_data['LotFrontage'].isnull(), 'LotFrontage'] = \
    np.polyval(slope_intercept, all_data.loc[all_data['LotFrontage'].isnull(), 'LotArea'])

# _ = plt.scatter(all_data['LotArea'], all_data['LotFrontage'])
# _ = plt.xlabel("LotArea")
# _ = plt.ylabel("LotFrontage")

In [12]:
temp = all_data.Alley.value_counts()

print temp.index[0]
print all_data['MasVnrType'].value_counts()

def fill_null_with_most_common(data, col_name):
    something = data[col_name].isnull().sum()
    print something
fill_null_with_most_common(all_data, 'MasVnrType')

Grvl
None       1742
BrkFace     879
Stone       249
BrkCmn       25
Name: MasVnrType, dtype: int64
24


In [13]:
def feature_process(process_func='None', most_frequent_func=False, median_func=False, col_name=[]):
    for column in col_name:
        if median_func:
            process_func = all_data[column].median()
        if most_frequent_func:
            process_func = all_data[column].value_counts().index.values[0]
        all_data.loc[all_data[column].isnull(), column] = process_func

In [14]:
# Replace missing values with 'None'
feature_process(col_name=['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', \
                          'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MiscFeature', \
                         'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', \
                         'GarageCond', 'PoolQC', 'Fence'])
# Replace missing values with '0'
feature_process(process_func=0, col_name=['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', \
                'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', ])
# Replace with 'median'
feature_process(median_func=True, col_name=['BsmtFinSF1', 'BsmtUnfSF'])
# Replace with most frequent value in the column
feature_process(most_frequent_func=True, col_name=['KitchenQual', 'MSZoning', 'Utilities', \
                                                   'Exterior1st', 'Exterior2nd', 'Functional', \
                                                   'SaleCondition', 'SaleType', 'Electrical'])

In [15]:
# Replace with mean() & median() from detached type garage
all_data.loc[all_data['GarageArea'].isnull(), 'GarageArea'] = \
                    all_data.loc[all_data['GarageType']=='Detchd', 'GarageArea'].mean()
all_data.loc[all_data['GarageCars'].isnull(), 'GarageCars'] = \
                    all_data.loc[all_data['GarageType']=='Detchd', 'GarageCars'].median()

In [16]:
# Make a new feature, newer_dwelling, and weight 20, 60, 120 more important by giving them 1, while others 0
MSSubClass_vals = np.sort(all_data['MSSubClass'].value_counts().index.values)
newer_dwelling = all_data['MSSubClass']

for val in MSSubClass_vals:
    if val==20 or val==60 or val==120:
        newer_dwelling = newer_dwelling.replace({val: 1})
    else:
        newer_dwelling = newer_dwelling.replace({val: 0})

newer_dwelling.name = 'newer_dwelling'
print all_data['MSSubClass'].head(4)
print newer_dwelling.head(4)

0    60
1    20
2    60
3    70
Name: MSSubClass, dtype: int64
0    1
1    1
2    1
3    0
Name: newer_dwelling, dtype: int64


In [40]:
overall_poor_qu = all_data.OverallQual.copy()
overall_poor_qu = 5 - overall_poor_qu
overall_poor_qu[overall_poor_qu<0] = 0
overall_poor_qu.name = 'overall_poor_qu'
print overall_poor_qu.tail()

overall_good_qu = all_data.OverallQual.copy()
overall_good_qu = overall_good_qu - 5
overall_good_qu[overall_good_qu<0] = 0
overall_good_qu.name = 'overall_good_qu'
print overall_good_qu.tail()

2914    1
2915    1
2916    0
2917    0
2918    0
Name: overall_poor_qu, dtype: int64
2914    0
2915    0
2916    0
2917    0
2918    2
Name: overall_good_qu, dtype: int64


In [46]:
def qualify_features(norm_val, col_name):
    copy = all_data[col_name].copy()
    good = copy - norm_val
    good[good  < 0] = 0
    good.name = col_name + '_good'

    poor = norm_val - copy
    poor[poor < 0] = 0
    poor.name = col_name + '_poor'
    
    return good, poor

In [45]:
OverallQual_good, OverallQual_poor = qualify_features(5, 'OverallQual')
OverallCond_good, OverallCond_poor = qualify_features(5, 'OverallCond')

ExterQual_good, ExterQual_poor = qualify_features(3, 'ExterQual')
ExterCond_good, ExterCond_poor = qualify_features(3, 'ExterCond')

BsmtCond_good, BsmtCond_poor = qualify_features(3, 'BsmtCond')

GarageQual_good, GarageQual_poor = qualify_features(5, 'GarageQual')
GarageCond_good, GarageCond_poor = qualify_features(5, 'GarageCond')

KitchenQual_good, KitchenQual_poor = qualify_features(5, 'KitchenQual')

qu_list = pd.concat((OverallQual_good, OverallQual_poor, ExterQual_good, ExterQual_poor, \
                    BsmtCond_good, BsmtCond_poor, GarageQual_good, GarageQual_poor, \
                    GarageCond_good, GarageCond_poor, KitchenQual_good, KitchenQual_poor), axis=1)

TypeError: unsupported operand type(s) for -: 'str' and 'int'

In [170]:
# Good quality -> rise price. Poor quality -> reduce price
qu_list = pd.concat((overall_poor_qu, overall_good_qu, overall_poor_cond, overall_good_cond, exter_poor_qu,
                     exter_good_qu, exter_poor_cond, exter_good_cond, bsmt_poor_cond, bsmt_good_cond, garage_poor_qu,
                     garage_good_qu, garage_poor_cond, garage_good_cond, kitchen_poor_qu, kitchen_good_qu), axis=1)

In [190]:
bad_heating = all_data.HeatingQC.replace({'Ex': 0, 
                                          'Gd': 0, 
                                          'TA': 0, 
                                          'Fa': 1,
                                          'Po': 1})
bad_heating.name = 'bad_heating'
                                          
MasVnrType_Any = all_data.MasVnrType.replace({'BrkCmn': 1,
                                              'BrkFace': 1,
                                              'CBlock': 1,
                                              'Stone': 1,
                                              'None': 0})
MasVnrType_Any.name = 'MasVnrType_Any'

SaleCondition_PriceDown = all_data.SaleCondition.replace({'Abnorml': 1,
                                                          'Alloca': 1,
                                                          'AdjLand': 1,
                                                          'Family': 1,
                                                          'Normal': 0,
                                                          'Partial': 0})
SaleCondition_PriceDown.name = 'SaleCondition_PriceDown'

Neighborhood_Good = pd.DataFrame(np.zeros((all_data.shape[0],1)), columns=['Neighborhood_Good'])
good = ['NridgHt', 'Crawfor', 'StoneBr', 'Somerst', 'NoRidge']
for item in good:
    Neighborhood_Good[all_data.Neighborhood==item] = 1

    Neighborhood_Good
0                 0.0
1                 0.0
2                 0.0
3                 1.0
4                 1.0
5                 0.0
6                 1.0
7                 0.0
8                 0.0
9                 0.0
10                0.0
11                1.0
12                0.0
13                0.0
14                0.0
15                0.0
16                0.0
17                0.0
18                0.0
19                0.0
