In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
# Raise the display limits so long/wide frames are not truncated, and render
# floats with no decimal places (so fractional values show up rounded).
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', '{:.0f}'.format)

# Import Data

In [2]:
# Read both input files; the rows loaded from test.csv are kept under the name `holdout`.
train, holdout = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Stack train above holdout under a fresh 0..n-1 index so that both sets go
# through the same cleaning and encoding steps.
houses = pd.concat(objs=(train, holdout), axis=0, ignore_index=True)
houses.shape

(2919, 81)

# Clean the Data
## Get rid of Missing Values

In [4]:
# Replace the null SalePrice of the holdout rows with a 0 placeholder, so the
# column is not treated as a mostly-missing column by the cleaning steps below.
# The concat used ignore_index=True, so train rows hold labels 0..len(train)-1
# and the holdout rows start at label len(train).
houses.loc[len(train):, 'SalePrice'] = 0

In [5]:
# If a column in the houses dataframe has fewer than 200 missing values, fill
# the missing values with the most frequent value (the mode) of that column.
missing_less_than_200 = houses.loc[:,houses.isnull().sum()<200]

for col in missing_less_than_200.columns:
    is_missing = houses[col].isnull()
    if not is_missing.any():
        # Nothing to fill in this column.
        continue
    # idxmax() returns the *label* with the highest count, i.e. the modal value.
    # argmax() returns an integer position on current pandas, which would fill
    # the gaps with a position number rather than a real value of the column.
    most_frequent = houses[col].value_counts().idxmax()
    houses.loc[is_missing, col] = most_frequent

In [6]:
# Remove every column of houses that has 200 or more missing values.
missing_more_than_200 = houses.loc[:, houses.isnull().sum().ge(200)]

houses.drop(columns=missing_more_than_200.columns, inplace=True)

## Normalize Continuous Columns and One Hot Encode Categorical Columns

In [7]:
# Partition the columns into categorical and continuous groups.
# Categorical: every object-dtype column plus the numeric columns named below,
# which get one-hot encoded instead of scaled.
categorical_cols = houses.columns[(houses.dtypes == 'object').values].tolist() + [
    'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MoSold', 'YrSold', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
]

# Continuous: whatever is left, excluding the Id and SalePrice columns.
continuous_cols = [
    name for name in houses.columns
    if name not in categorical_cols and name not in ('Id', 'SalePrice')
]

In [8]:
# Normalize continuous data and categorize categorical data (one hot encoding)
def normalize_continuous(df,cols):
    """Min-max scale the given columns of ``df`` to the range [0, 1].

    Args:
        df: DataFrame holding the columns to scale. It is modified in place.
        cols: Iterable of column labels to rescale.

    Returns:
        The same ``df`` object, with each listed column replaced by
        ``(value - min) / (max - min)``. A column whose max equals its min is
        set to 0.0 (missing entries stay missing) instead of turning into NaN
        through a division by zero.
    """
    for col in cols:
        low = df[col].min()
        span = df[col].max() - low
        if span == 0:
            # Constant column: there is no spread to scale by.
            df[col] = (df[col] - low).astype(float)
        else:
            # Assigning the whole column lets an integer column become float
            # rather than writing floats into the existing integer column.
            df[col] = (df[col] - low) / span
    return df
def dummy_categorical(df,cols):
    """One-hot encode ``cols`` and return a new frame without the original columns.

    Args:
        df: Source DataFrame; it is not modified.
        cols: List of column labels to encode.

    Returns:
        A new DataFrame made of the columns of ``df`` that are not in ``cols``
        followed by the indicator columns produced by ``pd.get_dummies``.
    """
    indicators = pd.get_dummies(df[cols], columns=cols)
    # Append the indicator columns, then discard the raw columns they replace.
    return pd.concat([df, indicators], axis=1).drop(cols, axis=1)

# Scale the continuous columns first, then one-hot encode the categorical ones.
houses = (
    houses
    .pipe(normalize_continuous, continuous_cols)
    .pipe(dummy_categorical, categorical_cols)
)


# Train and Test

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Feature matrix and target for the rows labelled 0..1459; Id and SalePrice
# are not used as features.
X = houses.loc[:1459].drop(columns=['SalePrice', 'Id'])
y = houses.loc[:1459,'SalePrice']

# Hold out 25% of those rows for a quick error estimate. The fixed seeds make
# both the split and the forest repeatable, so the reported error can be
# reproduced after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [10]:
def rmsle(predicted, real):
    """Root mean squared logarithmic error between two equal-length sequences.

    Computes ``sqrt(mean((log(predicted + 1) - log(real + 1)) ** 2))``.

    Args:
        predicted: Array-like of predicted values.
        real: Array-like of true values with the same shape as ``predicted``.

    Returns:
        The RMSLE as a floating-point scalar.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    # Converting to plain arrays pairs the values by position, so a pandas
    # Series with an arbitrary index is handled the same way as a list/array.
    predicted = np.asarray(predicted, dtype=float)
    real = np.asarray(real, dtype=float)
    if predicted.shape != real.shape:
        raise ValueError(
            'predicted and real must have the same shape, got {} and {}'.format(
                predicted.shape, real.shape))
    if predicted.size == 0:
        raise ValueError('rmsle is undefined for empty input')
    log_diff = np.log1p(predicted) - np.log1p(real)
    return np.sqrt(np.mean(log_diff ** 2))

In [11]:
# RMSLE of the baseline model on the held-out 25% split.
error = rmsle(predicted=y_pred, real=y_test.values)
print(error)

0.163278459214


# Generate Baseline Submission with Random Forest Regressor

In [12]:
# Refit the baseline model on all of X / y, predict the rows from label 1460
# onward, and write the Id / SalePrice pairs to submission.csv.
model.fit(X, y)
X_holdout = houses.loc[1460:].drop(columns=['SalePrice', 'Id'])
final_predictions = model.predict(X_holdout)
submission = pd.DataFrame({
    'Id': houses.loc[1460:, 'Id'],
    'SalePrice': final_predictions,
})
submission.to_csv('submission.csv', index=False)

# Improve with Gridsearch and Recursive Feature Elimination

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

In [14]:
from sklearn.feature_selection import RFE

def select_features(model):
    """Run recursive feature elimination with ``model`` on the notebook-level ``X`` and ``y``.

    Args:
        model: Estimator handed to ``RFE`` to rank the features.

    Returns:
        List of the ``X`` column names that the fitted selector marks as kept.
    """
    # step=10 is passed straight through to RFE.
    rfe = RFE(model, step=10).fit(X, y)
    kept = X.columns[rfe.support_]
    return list(kept)

# Share of the feature columns that RFE keeps when ranking with gradient boosting.
# NOTE(review): best_features is not referenced by any later cell visible here.
best_features = select_features(GradientBoostingRegressor())
print(len(best_features) / len(X.columns))

0.5


In [15]:
# GridSearch
import warnings
warnings.filterwarnings('ignore')  # suppress all warnings from this point on

# One entry per candidate model: a display name, an unfitted estimator, and the
# hyperparameter grid to search for it.
# NOTE(review): 'mse' (criterion) and 'auto' (max_features) match the recorded
# output below; newer scikit-learn releases may no longer accept these
# spellings -- confirm against the installed version.
grids = [
    {
        'name': 'KNeighborsRegressor',
        'estimator': KNeighborsRegressor(),
        'hyperparameters': {
            'n_neighbors': range(3, 12, 2),
            'weights': ['distance', 'uniform'],
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'p': [1],
        },
    },
    {
        'name': 'SVR',
        'estimator': SVR(),
        'hyperparameters': {
            'C': [0.1, 1],
            'kernel': ['linear', 'poly', 'rbf'],
        },
    },
    {
        'name': 'RandomForestRegressor',
        'estimator': RandomForestRegressor(),
        'hyperparameters': {
            'n_estimators': [50],
            'criterion': ['mse'],
            'max_depth': [10],
            'max_features': ['auto', 'log2', 'sqrt'],
            'min_samples_leaf': [5],
            'min_samples_split': [5],
        },
    },
    {
        'name': 'GradientBoostingRegressor',
        'estimator': GradientBoostingRegressor(),
        'hyperparameters': {
            'n_estimators': [20, 50, 100, 500],
            'max_depth': [1, 3, 5, 10],
        },
    },
    {
        'name': 'AdaBoostRegressor',
        'estimator': AdaBoostRegressor(),
        'hyperparameters': {
            'n_estimators': [25, 50, 100, 200],
        },
    },
]

# Feature matrix and target for the rows labelled 0..1459.
X = houses.loc[:1459].drop(columns=['SalePrice', 'Id'])
y = houses.loc[:1459, 'SalePrice']

# Run a 5-fold grid search for each candidate and store the winning
# parameters, score and estimator back on the candidate's own dict.
for grid in grids:
    heading = grid['name']
    print(heading)
    print('-'*len(heading))

    models = GridSearchCV(grid['estimator'], param_grid=grid['hyperparameters'], cv=5)
    models.fit(X, y)

    grid.update(
        best_params=models.best_params_,
        best_score=models.best_score_,
        best_estimator=models.best_estimator_,
    )

    print("Best Score: {}".format(grid["best_score"]))
    print("Best Parameters: {}\n".format(grid["best_params"]))


KNeighborsRegressor
-------------------
Best Score: 0.7983630076135435
Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

SVR
---
Best Score: -0.006796654218184319
Best Parameters: {'C': 1, 'kernel': 'linear'}

RandomForestRegressor
---------------------
Best Score: 0.8434559509528226
Best Parameters: {'criterion': 'mse', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 50}

GradientBoostingRegressor
-------------------------
Best Score: 0.88504448779941
Best Parameters: {'max_depth': 3, 'n_estimators': 500}

AdaBoostRegressor
-----------------
Best Score: 0.8003601974853036
Best Parameters: {'n_estimators': 50}



In [25]:
from sklearn.model_selection import KFold

# 10-fold cross-validated RMSLE for the best gradient boosting estimator.
X = houses.loc[:1459].drop(columns=['SalePrice', 'Id'])
y = houses.loc[:1459,'SalePrice']

# grids[3] is the GradientBoostingRegressor entry of the grid-search list.
model = grids[3]['best_estimator']

kf = KFold(n_splits=10)

error_list = []
for train_indices, test_indices in kf.split(X):
    # KFold.split yields positional row numbers, so select with .iloc; .loc
    # would read them as index labels and is only right on a 0..n-1 index.
    model.fit(X.iloc[train_indices], y.iloc[train_indices])
    y_pred = model.predict(X.iloc[test_indices])
    error = rmsle(y_pred, y.iloc[test_indices].values)
    error_list.append(error)
    print('error: {}'.format(error))
# Mean error across the folds.
print(np.mean(error_list))


error: 0.12773564991858485
error: 0.10872278144423453
error: 0.13457443161200375
error: 0.17342189289587803
error: 0.1562274668435016
error: 0.11409542115856539
error: 0.12788668004755568
error: 0.11927416292088763
error: 0.11721042282621044
error: 0.13854631496539294
0.131769522463


In [26]:
# Refit the current model on all of X / y, predict the rows from label 1460
# onward, and write the Id / SalePrice pairs to submission1.csv.
model.fit(X, y)
X_holdout = houses.loc[1460:].drop(columns=['SalePrice', 'Id'])
final_predictions = model.predict(X_holdout)
submission = pd.DataFrame({
    'Id': houses.loc[1460:, 'Id'],
    'SalePrice': final_predictions,
})
submission.to_csv('submission1.csv', index=False)

1
2
3
4


# Improve with Bagging

In [27]:
from itertools import product
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import KFold

# Bag the best gradient boosting estimator: each member is trained on half of
# the rows and half of the features. The fixed seed makes the random row and
# feature draws repeatable, so the fold errors can be reproduced.
model = BaggingRegressor(grids[3]['best_estimator'], max_samples=0.5, max_features=0.5,
                         random_state=42)

kf = KFold(n_splits=10)

error_list = []
for train_indices, test_indices in kf.split(X):
    # KFold.split yields positional row numbers, so select with .iloc; .loc
    # would read them as index labels and is only right on a 0..n-1 index.
    model.fit(X.iloc[train_indices], y.iloc[train_indices])
    y_pred = model.predict(X.iloc[test_indices])
    error = rmsle(y_pred, y.iloc[test_indices].values)
    error_list.append(error)
    print('error: {}'.format(error))
# Mean error across the folds.
print(np.mean(error_list))

error: 0.11963607295629392
error: 0.11248151721644267
error: 0.1298732987834333
error: 0.17680627795079598
error: 0.1550774552002972
error: 0.11191496955528427
error: 0.1314443026997464
error: 0.11868269497315996
error: 0.14865912192730088
error: 0.13815697223537102
0.13427326835


In [28]:
# Refit the current model on all of X / y, predict the rows from label 1460
# onward, and write the Id / SalePrice pairs to submission2.csv.
model.fit(X, y)
X_holdout = houses.loc[1460:].drop(columns=['SalePrice', 'Id'])
final_predictions = model.predict(X_holdout)
submission = pd.DataFrame({
    'Id': houses.loc[1460:, 'Id'],
    'SalePrice': final_predictions,
})
submission.to_csv('submission2.csv', index=False)