In [465]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
# Notebook display settings: show up to 1000 rows and 1000 columns, and render
# floats with no decimal places.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', '{:.0f}'.format)

# Import Data

In [466]:
# Read the training file and the holdout (test) file from the working directory.
train, holdout = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [467]:
# Stack the training rows on top of the holdout rows under a fresh 0..n-1 index
# so that both sets go through the same preprocessing.
houses = pd.concat(objs=[train, holdout], axis=0, ignore_index=True)
houses.shape

(2919, 81)

# Clean the Data
## Get rid of Missing Values

In [468]:
# Replace the null SalePrice values of the holdout rows with zeros.
# `houses` was built with ignore_index=True from [train, holdout], so the
# holdout rows start at label len(train); deriving the boundary from the data
# avoids hard-coding the training-set size.
houses.loc[len(train):, 'SalePrice'] = 0

In [469]:
# If a column in the houses dataframe has fewer than 200 missing values, fill
# the missing values with the column's most frequent value (its mode).
missing_counts = houses.isnull().sum()
missing_less_than_200 = houses.loc[:, missing_counts < 200]

for col in missing_less_than_200:
    if missing_counts[col] == 0:
        # Nothing to fill in this column.
        continue
    # idxmax() returns the most frequent *value*. argmax() would return its
    # integer position in the value_counts result on pandas >= 1.0, which is
    # always 0 because value_counts sorts by descending frequency.
    most_common = houses[col].value_counts().idxmax()
    houses.loc[houses[col].isnull(), col] = most_common

In [470]:
# Columns with 200 or more missing values are removed from houses entirely.
missing_more_than_200 = houses.loc[:, houses.isnull().sum().ge(200)]

houses.drop(columns=list(missing_more_than_200.columns), inplace=True)

## Normalize Continuous Columns and One-Hot Encode Categorical Columns

In [471]:
# Partition the columns by type. Categorical columns are every object-dtype
# column plus the numeric columns listed explicitly below; everything else,
# apart from 'Id' and 'SalePrice', is treated as continuous.
categorical_cols = houses.columns[houses.dtypes == 'object'].tolist() + [
    'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MoSold', 'YrSold', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
]

continuous_cols = [
    name for name in houses.columns
    if name not in categorical_cols and name not in ('Id', 'SalePrice')
]

In [472]:
# Min-max normalize the continuous columns and one-hot encode the categorical columns
def normalize_continuous(df,cols):
    """Min-max scale the given columns of ``df`` into the range [0, 1].

    Each column is replaced by ``(value - min) / (max - min)``. A column whose
    values are all equal (zero range) is mapped to 0.0 instead of being
    divided by zero, which would turn it into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale. It is modified in place.
    cols : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    for col in cols:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        shifted = df[col] - col_min
        # Assign through df[col] so the column is replaced outright; writing
        # floats into an integer column via df.loc[:, col] relies on silent
        # upcasting, which pandas has deprecated.
        if col_range != 0:
            df[col] = shifted / col_range
        else:
            df[col] = shifted.astype(float)
    return df
def dummy_categorical(df,cols):
    """Return a new frame in which the columns ``cols`` are one-hot encoded.

    The indicator columns produced by ``pd.get_dummies`` are appended to the
    right of the frame and the original ``cols`` are removed. The frame passed
    in is left untouched.
    """
    indicator_cols = pd.get_dummies(df[cols], columns=cols)
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(cols, axis=1)

# Scale the continuous columns first, then one-hot encode the categorical ones.
houses = dummy_categorical(
    normalize_continuous(houses, continuous_cols),
    categorical_cols,
)


# Train and Test

In [473]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Rows labelled 0..1459 are used for training; the features are every column
# except the target ('SalePrice') and the row identifier ('Id').
X = houses.loc[:1459, [x for x in houses.columns if x not in ('SalePrice', 'Id')]]
y = houses.loc[:1459, 'SalePrice']

# Hold back 25% of those rows to score the model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Fit a random forest with default hyperparameters and predict the held-back rows.
model = RandomForestRegressor(random_state=1).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [474]:
def rmsle(predicted, real):
    """Return the root mean squared logarithmic error of ``predicted`` vs ``real``.

    Computes ``sqrt(mean((log(predicted + 1) - log(real + 1)) ** 2))``.

    Both arguments are converted to NumPy arrays and compared position by
    position, so lists, arrays and pandas Series (whatever their index) are
    all accepted.

    Parameters
    ----------
    predicted : array-like of numbers
        Predicted values.
    real : array-like of numbers
        Observed values, with the same shape as ``predicted``.

    Returns
    -------
    numpy.float64
        The RMSLE over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    predicted = np.asarray(predicted, dtype=float)
    real = np.asarray(real, dtype=float)
    if predicted.shape != real.shape:
        raise ValueError(
            'predicted and real must have the same shape, got {} and {}'.format(
                predicted.shape, real.shape))
    if predicted.size == 0:
        raise ValueError('rmsle is undefined for empty input')
    # log1p(x) == log(x + 1), evaluated element-wise over the whole array.
    log_diff = np.log1p(predicted) - np.log1p(real)
    return np.sqrt(np.mean(log_diff ** 2))

In [475]:
# RMSLE of the predictions on the held-back split.
error = rmsle(predicted=y_pred, real=y_test.values)
print(error)

0.16497476532


# Generate Submission

In [476]:
# Refit the current model on X and y, then predict the holdout rows
# (labels 1460 onward).
model.fit(X, y)
X_holdout = houses.loc[1460:].drop(columns=['SalePrice', 'Id'])
final_predictions = model.predict(X_holdout)

# Pair each holdout Id with its predicted price and write the file without the index.
submission = houses.loc[1460:, ['Id']].assign(SalePrice=final_predictions)
submission.to_csv('submission.csv', index=False)

# Improve with GridSearch

In [498]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC

In [493]:
# GridSearch
# Candidate estimators for the grid search. Each entry is a dict with:
#   'name'            - label printed in the search report
#   'estimator'       - unfitted scikit-learn estimator
#   'hyperparameters' - parameter grid handed to GridSearchCV
# Entries that are commented out are candidates that are currently disabled.
grids = [
#     {'name':'LogisticRegression',
#     'estimator':LogisticRegression(),
#     'hyperparameters':{
#         'solver':['newton-cg','lbfgs','liblinear'],
#         'C':[1,0.1]}
#     }#,
#     {'name':'KNeighborsRegressor',
#     'estimator':KNeighborsRegressor(),
#     'hyperparameters':{
#         'n_neighbors': range(1,12,2),
#         'weights': ['distance','uniform'],
#         'algorithm': ['ball_tree','kd_tree','brute'],
#         'p': [1]}
#     }#,
#     {'name':'RandomForestRegressor',
#     'estimator':RandomForestRegressor(),
#     'hyperparameters':{
#         'n_estimators': [50],
#         'criterion': ['mse'],
#         'max_depth': [10],
#         'max_features': ['auto','log2','sqrt'],
#         'min_samples_leaf': [5],
#         'min_samples_split': [5]}
#     }#,
    # `loss` is a LinearSVC hyperparameter; SVC does not accept it, so pairing
    # this grid with SVC() makes GridSearchCV fail with an invalid-parameter
    # error. LinearSVC is the estimator that matches this grid.
    # NOTE(review): LinearSVC is a classifier, so it treats each distinct
    # SalePrice as a class label - TODO confirm whether a regressor was intended.
    {'name':'LinearSVC',
    'estimator':LinearSVC(),
    'hyperparameters':{
        'C': [1,0.1,0.01],
        'loss': ['hinge','squared_hinge']}
    }#,
#     {'name':'GaussianNB',
#     'estimator':GaussianNB(),
#     'hyperparameters':{
#         }
#     }
]

# Training features (everything except the target and the Id) and target for
# the rows labelled 0..1459.
X = houses.loc[:1459, [x for x in houses.columns if x not in ('SalePrice', 'Id')]]
y = houses.loc[:1459, 'SalePrice']

# Run a 5-fold grid search for every configured estimator, store the winning
# configuration back on its grid entry, and print a short report.
for grid in grids:
    print(grid['name'], '-' * len(grid['name']), sep='\n')

    models = GridSearchCV(
        estimator=grid['estimator'],
        param_grid=grid['hyperparameters'],
        cv=5,
    )
    models.fit(X, y)

    grid.update(
        best_params=models.best_params_,
        best_score=models.best_score_,
        best_estimator=models.best_estimator_,
    )

    print("Best Score: {}".format(grid["best_score"]))
    print("Best Parameters: {}\n".format(grid["best_params"]))


LinearSVC
---------




Best Score: 0.015753424657534248
Best Parameters: {'C': 0.01, 'loss': 'hinge'}



In [497]:
from sklearn.model_selection import KFold

# Training features (everything except the target and the Id) and target for
# the rows labelled 0..1459.
X = houses.loc[:1459,[x for x in houses.columns if (x != 'SalePrice') and (x != 'Id')]]
y = houses.loc[:1459,'SalePrice']

model = LinearSVC(C=0.01,loss='hinge')
# model = KNeighborsRegressor(n_neighbors=5,weights='distance',algorithm='ball_tree',p=1)
# model = RandomForestRegressor(criterion='mse',max_depth=10,max_features='auto',min_samples_leaf=5,min_samples_split=5,
#                              n_estimators=50)
kf = KFold(n_splits=10)

# 10-fold cross-validation: fit on nine folds, score RMSLE on the remaining
# one, then report each fold's error and the mean across folds.
error_list = []
for train_indicies, test_indicies in kf.split(X):
    # kf.split yields integer *positions*, so select rows with .iloc. Using
    # .loc only works while the index labels happen to equal the positions.
    model.fit(X.iloc[train_indicies],y.iloc[train_indicies])
    y_pred = model.predict(X.iloc[test_indicies])
    error = rmsle(y_pred,y.iloc[test_indicies].values)
    error_list.append(error)
    print('error: {}'.format(error))
print(np.mean(error_list))



error: 0.24148654419705518
error: 0.27456766948471945
error: 0.2218598181704498
error: 0.29613643496724323
error: 0.279605008654048
error: 0.2322699379454781
error: 0.29509404668702877
error: 0.2469767478238832
error: 0.24566464893898796
error: 0.24558800523499869
0.25792488621


# Generate Improved Submission (Improved via GridSearch)

In [486]:
# Refit the current model on X and y, then predict the holdout rows
# (labels 1460 onward).
model.fit(X, y)
X_holdout = houses.loc[1460:].drop(columns=['SalePrice', 'Id'])
final_predictions = model.predict(X_holdout)

# Pair each holdout Id with its predicted price and write the file without the index.
submission = houses.loc[1460:, ['Id']].assign(SalePrice=final_predictions)
submission.to_csv('submission.csv', index=False)