# House Prices Analysis: A Kaggle Competition

In [49]:
import pandas as pd
from sklearn.preprocessing import Imputer, LabelEncoder

## Data Processing

In [50]:
# load data
train = pd.read_csv('/Users/Tomas/Desktop/Kaggle-House-Prices-Challenge/data/train.csv')
print(train.shape)

(1460, 81)


In [51]:
# columns with missing data
def summarize_missing(df):
    return df.isnull().sum()[df.isnull().any()]

summarize_missing(train)

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Examining these columns, some features are missing much more data than others. Alley, PoolQC, and MiscFeature are missing for almost the entirety of the dataset. It makes sense to drop these features, as we would be imputing too much of the data. We use a 20% cutoff to determine which columns to impute and which to discard. 

In [52]:
def percentage_missing(df):
    return df.isnull().sum().divide(df.shape[0]).multiply(100)[train.isnull().any()]

percentage_missing(train)

LotFrontage     17.739726
Alley           93.767123
MasVnrType       0.547945
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
PoolQC          99.520548
Fence           80.753425
MiscFeature     96.301370
dtype: float64

Alley, FireplaceQu, PoolQC, Fence, MiscFeature all are missing more than 20% of their values, so these columns are dropped from the training set. NOTE: Remember to drop these during processing of test data for model building

In [53]:
train = train.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
train.shape


(1460, 76)

In [54]:
# Majority of missing values very little of data. Drop all columns with missing values except LotFrontage, impute those
train = train.dropna(axis=0,how='any', subset=['MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                                              'Electrical', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond'])
summarize_missing(train)

LotFrontage    244
dtype: int64

In [55]:
train.shape

(1338, 76)

We see that the majority of the observations were not removed, so we continue

We decide to impute the remaining data with the mean value for each column

In [56]:
# Impute missing data
imputer = Imputer(missing_values= 'NaN', strategy = 'mean', axis=0)
train['LotFrontage'] = imputer.fit_transform(train['LotFrontage'].values.reshape(-1,1))

In [57]:
# Check to make sure no more missing data
summarize_missing(train)

Series([], dtype: int64)

In [58]:
# Save data to file for later use
train.to_csv('/Users/Tomas/Desktop/Kaggle-House-Prices-Challenge/data/train_modified.csv')

## Encode and Standardize

In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
import numpy as np
import Utilities

In [60]:
# train = train.set_index('Id')
prices = train["SalePrice"]
prices = np.log(prices)

train = Utilities.encodeAndStandardize(train)
print("Train shape : {}".format(train.shape))
train['SalePrice'] = prices
train.to_csv('/Users/Tomas/Desktop/Kaggle-House-Prices-Challenge/data/final_data.csv')

Dummized shape : (1338, 229)
Normalized shape : (1338, 37)
Combined shape : (1338, 266)
Train shape : (1338, 266)


In [19]:
# train.to_csv('/Users/Tomas/Desktop/Kaggle-House-Prices-Challenge/data/final_data.csv')

## Model

In [20]:
from sklearn.model_selection import train_test_split
train = pd.read_csv('/Users/Tomas/Desktop/Kaggle-House-Prices-Challenge/data/final_data.csv', index_col=0)
X_train, X_test, y_train, y_test = train_test_split(train.drop(['SalePrice'], axis=1), train['SalePrice'], test_size=0.3, random_state=42)

In [22]:
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
import math

def rmse(predictions, truths):
    return math.sqrt(mean_squared_error(truths, predictions))
#     return (mean_squared_error(truths, predictions))





In [23]:
def linearRegression(X_train, X_test, y_train, y_test):
    LoS = Pipeline([('pca', PCA(.95)), 
                     ('regr', linear_model.LinearRegression())])
    LoS.fit(X_train, y_train)
    predictions = LoS.predict(X_test)
    print("LoS Regression:")
    print("Test MSE : {}".format(rmse(predictions, y_test)))
    print()
    
    return LoS
    
linearRegression(X_train, X_test, y_train, y_test)

LoS Regression:
Test MSE : 0.22037018727342347



  linalg.lstsq(X, y)


Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [24]:
def ridgeRegression(X_train, X_test, y_train, y_test):
    ridge = Pipeline([('pca', PCA(.95)), 
                     ('regr', linear_model.Ridge())])
    
    alphas = [.1, .01, .001, .0001, .00001]
    grid_params = [{'regr__alpha' : alphas}]
        
    gridSearch = GridSearchCV(estimator = ridge, param_grid=grid_params, scoring='neg_mean_squared_error', cv = 10, n_jobs=1)
    gridSearch.fit(X_train, y_train)
    
    print("Ridge Regresion:")
    print("Best CV MSE : {}".format(gridSearch.best_score_))
    print("Best params: {}".format(gridSearch.best_params_))
    predictions = gridSearch.predict(X_test)
    print("Test MSE : {}".format(rmse(predictions, y_test)))
    print()
    
    return gridSearch

ridgeRegression(X_train, X_test, y_train, y_test)

Ridge Regresion:
Best CV MSE : -0.01286108254909304
Best params: {'regr__alpha': 0.1}
Test MSE : 0.22035886403880112



GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regr', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'regr__alpha': [0.1, 0.01, 0.001, 0.0001, 1e-05]}],
       pre_dispatch='2*n_jobs', refit=True,
       scoring='neg_mean_squared_error', verbose=0)

In [25]:
def lassoRegression(X_train, X_test, y_train, y_test):
    lasso = Pipeline([('pca', PCA(.95)), 
                     ('regr', linear_model.Lasso())])
    
    alphas = [.1, .01, .001, .005, .002, .0001, .00001]
    grid_params = [{'regr__alpha' : alphas}]
        
    gridSearch = GridSearchCV(estimator = lasso, param_grid=grid_params, scoring='neg_mean_squared_error', cv = 10, n_jobs=1)
    gridSearch.fit(X_train, y_train)
    
    print("Lasso Regresion:")
    print("Best CV MSE : {}".format(gridSearch.best_score_))
    print("Best params: {}".format(gridSearch.best_params_))
    predictions = gridSearch.predict(X_test)
    print("Test MSE : {}".format(rmse(predictions, y_test)))
    print()
    
    return gridSearch
    
lassoRegression(X_train, X_test, y_train, y_test)

Lasso Regresion:
Best CV MSE : -0.01252737458404693
Best params: {'regr__alpha': 0.001}
Test MSE : 0.21906061851816397



GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regr', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'regr__alpha': [0.1, 0.01, 0.001, 0.005, 0.002, 0.0001, 1e-05]}],
       pre_dispatch='2*n_jobs', refit=True,
       scoring='neg_mean_squared_error', verbose=0)

In [26]:
def elasticNetRegression(X_train, X_test, y_train, y_test):
    elasticNet = Pipeline([('pca', PCA(.95)), 
                     ('regr', linear_model.ElasticNet())])
    
    l1 = [.01, .001, .001, .0001, .1, .5, 1]
    iterations = [1, 5, 10, 20, 50]
    grid_params = [{'regr__l1_ratio' : l1, 'regr__max_iter' : iterations}]
        
    gridSearch = GridSearchCV(estimator = elasticNet, param_grid=grid_params, scoring='neg_mean_squared_error', cv = 10, n_jobs=-11)
    gridSearch.fit(X_train, y_train)
    
    print("ElasticNet Regresion:")
    print("Best CV MSE : {}".format(gridSearch.best_score_))
    print("Best params: {}".format(gridSearch.best_params_))
    predictions = gridSearch.predict(X_test)
    print("Test MSE : {}".format(rmse(predictions, y_test)))
    print()
    
    return gridSearch
    
model = elasticNetRegression(X_train, X_test, y_train, y_test)

ElasticNet Regresion:
Best CV MSE : -0.017170193948874358
Best params: {'regr__l1_ratio': 0.0001, 'regr__max_iter': 1}
Test MSE : 0.19751604155133265



## Test Set Evaluation

In [61]:
import Utilities
test = pd.read_csv('/Users/Tomas/Desktop/Kaggle-House-Prices-Challenge/data/test.csv')
test.shape

(1459, 80)

In [29]:
print(test.shape)
Utilities.summarize_missing(test)

(1459, 80)


MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64

In [30]:
test = Utilities.drop_unecessary_columns(test)
print(test.shape)

(1459, 75)


In [31]:
missing_cols = list(Utilities.summarize_missing(test).index.values)
missing_cols
col_info = {}
for col in missing_cols:
    print("{} : {}".format(col, test[col].dtype))
    col_info[col] = str(test[col].dtype)

MSZoning : object
LotFrontage : float64
Utilities : object
Exterior1st : object
Exterior2nd : object
MasVnrType : object
MasVnrArea : float64
BsmtQual : object
BsmtCond : object
BsmtExposure : object
BsmtFinType1 : object
BsmtFinSF1 : float64
BsmtFinType2 : object
BsmtFinSF2 : float64
BsmtUnfSF : float64
TotalBsmtSF : float64
BsmtFullBath : float64
BsmtHalfBath : float64
KitchenQual : object
Functional : object
GarageType : object
GarageYrBlt : float64
GarageFinish : object
GarageCars : float64
GarageArea : float64
GarageQual : object
GarageCond : object
SaleType : object


In [32]:
col_info

{'BsmtCond': 'object',
 'BsmtExposure': 'object',
 'BsmtFinSF1': 'float64',
 'BsmtFinSF2': 'float64',
 'BsmtFinType1': 'object',
 'BsmtFinType2': 'object',
 'BsmtFullBath': 'float64',
 'BsmtHalfBath': 'float64',
 'BsmtQual': 'object',
 'BsmtUnfSF': 'float64',
 'Exterior1st': 'object',
 'Exterior2nd': 'object',
 'Functional': 'object',
 'GarageArea': 'float64',
 'GarageCars': 'float64',
 'GarageCond': 'object',
 'GarageFinish': 'object',
 'GarageQual': 'object',
 'GarageType': 'object',
 'GarageYrBlt': 'float64',
 'KitchenQual': 'object',
 'LotFrontage': 'float64',
 'MSZoning': 'object',
 'MasVnrArea': 'float64',
 'MasVnrType': 'object',
 'SaleType': 'object',
 'TotalBsmtSF': 'float64',
 'Utilities': 'object'}

In [33]:
# impute missing data, whether categorical or numeric
# from sklearn_pandas import CategoricalImputer
# imputer = CategoricalImputer()
# x = test
# x['MSZoning'] = imputer.fit_transform(test['MSZoning'])
# Utilities.summarize_missing(x)
for col in col_info:
    if col_info[col] == "object":
        test = Utilities.impute_categorical(test, col)        
    else:
        test = Utilities.impute_missing(test, col)
print(test.shape)
Utilities.summarize_missing(test)

(1459, 75)


Series([], dtype: int64)

In [34]:
test = Utilities.encodeAndStandardize(test)
print(test.shape)

Dummized shape : (1459, 218)
Normalized shape : (1459, 36)
Combined shape : (1459, 254)
(1459, 254)


In [None]:
predictions = model.predict(test)