In [37]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from scipy.stats import skew

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# metrics & utils
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression

In [30]:
df_train = pd.read_csv('train.csv')
df_submit = pd.read_csv('test.csv')
all_data = pd.concat((df_train.loc[:,'MSSubClass':'SaleCondition'],
                     df_submit.loc[:,'MSSubClass':'SaleCondition']))

In [31]:
#log transform the target:
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])
#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index


In [44]:
skewed_feats = df_train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [46]:
# Convert all categorical data to binary representation
all_data = pd.get_dummies(all_data)

In [49]:
# Fill all the NaN values with the mean value of that col
all_data = all_data.fillna(all_data.mean())

In [3]:
# set mean price for each neighborhood
mean_price = df_train[['Neighborhood', 'SalePrice']].groupby(['Neighborhood'], 
as_index=False).mean().sort_values(by='SalePrice', ascending=False)

df_train = df_train.merge(mean_price, left_on='Neighborhood', right_on='Neighborhood', how='left')
df_train = df_train.rename(index=str, columns={'SalePrice_x':'SalePrice','SalePrice_y':'NeighMean' })
df_submit = df_submit.merge(mean_price, left_on='Neighborhood', right_on='Neighborhood', how='left')
df_submit = df_submit.rename(index=str, columns={'SalePrice':'NeighMean'})

In [4]:
df_train.NeighMean = df_train.NeighMean.transform(lambda x: x / 1000)
df_submit.NeighMean = df_submit.NeighMean.transform(lambda x: x / 1000)

In [5]:
# Creating a new is PoolPresent feature.
df_train.loc[ df_train['PoolArea'] <= 0, 'PoolPresent'] = 0
df_train.loc[ df_train['PoolArea'] > 0, 'PoolPresent'] = 1
df_submit.loc[ df_train['PoolArea'] <= 0, 'PoolPresent'] = 0
df_submit.loc[ df_train['PoolArea'] > 0, 'PoolPresent'] = 1

In [6]:
# Creating a new is GarageBand feature, using qcut for 6 clusters.
df_train.loc[ df_train['GarageArea'] <= 281, 'GarageBand'] = 0
df_train.loc[(df_train['GarageArea'] > 281) & (df_train['GarageArea'] <= 400), 'GarageBand'] = 1
df_train.loc[(df_train['GarageArea'] > 400) & (df_train['GarageArea'] <= 480), 'GarageBand'] = 2
df_train.loc[(df_train['GarageArea'] > 480) & (df_train['GarageArea'] <= 540), 'GarageBand'] = 3
df_train.loc[(df_train['GarageArea'] > 540) & (df_train['GarageArea'] <= 659.3), 'GarageBand'] = 4
df_train.loc[ df_train['GarageArea'] > 659.3, 'GarageBand'] = 5
df_submit.loc[ df_submit['GarageArea'] <= 281, 'GarageBand'] = 0
df_submit.loc[(df_submit['GarageArea'] > 281) & (df_submit['GarageArea'] <= 400), 'GarageBand'] = 1
df_submit.loc[(df_submit['GarageArea'] > 400) & (df_submit['GarageArea'] <= 480), 'GarageBand'] = 2
df_submit.loc[(df_submit['GarageArea'] > 480) & (df_submit['GarageArea'] <= 540), 'GarageBand'] = 3
df_submit.loc[(df_submit['GarageArea'] > 540) & (df_submit['GarageArea'] <= 659.3), 'GarageBand'] = 4
df_submit.loc[ df_submit['GarageArea'] > 659.3, 'GarageBand'] = 5

In [7]:
# Convert categorical SaleType feature to binary, add to df
train_SaleType_dummies = pd.get_dummies(df_train.SaleType)
submit_SaleType_dummies = pd.get_dummies(df_submit.SaleType)
df_train = df_train.join(train_SaleType_dummies)
df_submit = df_submit.join(submit_SaleType_dummies)

In [8]:
# Convert categorical SaleCondition feature to binary, add to df
train_SaleCondition_dummies = pd.get_dummies(df_train.SaleCondition)
submit_SaleCondition_dummies = pd.get_dummies(df_submit.SaleCondition)
df_train = df_train.join(train_SaleCondition_dummies)
df_submit = df_submit.join(submit_SaleCondition_dummies)

In [9]:
# Convert categorical ExterQual feature to binary, add to df
train_ExterQual_dummies = pd.get_dummies(df_train.ExterQual)
submit_ExterQual_dummies = pd.get_dummies(df_submit.ExterQual)
df_train = df_train.join(train_ExterQual_dummies)
df_submit = df_submit.join(submit_ExterQual_dummies)

In [10]:
# # Convert categorical ExterCond feature to binary, add to df --- NEED to sort out col name clashes.????
# train_ExterCond_dummies = pd.get_dummies(df_train.ExterCond)
# submit_ExterCond_dummies = pd.get_dummies(df_submit.ExterCond)
# df_train = df_train.join(train_ExterCond_dummies)
# df_submit = df_submit.join(submit_ExterCond_dummies)

In [11]:
# Convert categorical ExterQual feature to binary, add to df
train_MSZoning_dummies = pd.get_dummies(df_train.MSZoning)
submit_MSZoning_dummies = pd.get_dummies(df_submit.MSZoning)
df_train = df_train.join(train_MSZoning_dummies)
df_submit = df_submit.join(submit_MSZoning_dummies)

In [12]:
train_columns = ['LotArea', 'GarageArea', 'TotRmsAbvGrd', 'GarageBand', '1stFlrSF',
                'OverallQual','OverallCond', 'LotFrontage','NeighMean', 'PoolPresent',
                'COD', 'CWD','Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD',
                'Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial',
                'Ex', 'Gd', 'TA', 'Fa',
                'C (all)', 'FV','RH', 'RL','RM',
                'SalePrice']
submit_columns = ['LotArea', 'GarageArea', 'TotRmsAbvGrd', 'GarageBand', '1stFlrSF',
                'OverallQual','OverallCond', 'LotFrontage','NeighMean', 'PoolPresent',
                'COD', 'CWD','Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD',
                'Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial',
                'Ex', 'Gd', 'TA', 'Fa',
                'C (all)', 'FV','RH', 'RL','RM']

In [13]:
df_train_simple = df_train.loc[:,train_columns]
df_submit_simple = df_submit.loc[:,submit_columns]

In [14]:
# this method needs to be improved - use mean across a groupby or ration of front to area
LotFrontage_mean = df_train['LotFrontage'].mean()
df_submit_simple['LotFrontage'].fillna(value=LotFrontage_mean, inplace=True)
df_train_simple['LotFrontage'].fillna(value=LotFrontage_mean, inplace=True)

In [15]:
train, test = train_test_split(df_train_simple, test_size=0.1)
X_train = train.drop('SalePrice', axis=1)
Y_train = train['SalePrice']
X_test  = test.drop('SalePrice', axis=1)
Y_test = test['SalePrice']
X_train.shape, Y_train.shape, X_test.shape, X_test.shape

((1314, 34), (1314,), (146, 34), (146, 34))

In [16]:
df_train_simple.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 0 to 1459
Data columns (total 35 columns):
LotArea         1460 non-null int64
GarageArea      1460 non-null int64
TotRmsAbvGrd    1460 non-null int64
GarageBand      1460 non-null float64
1stFlrSF        1460 non-null int64
OverallQual     1460 non-null int64
OverallCond     1460 non-null int64
LotFrontage     1460 non-null float64
NeighMean       1460 non-null float64
PoolPresent     1460 non-null float64
COD             1460 non-null uint8
CWD             1460 non-null uint8
Con             1460 non-null uint8
ConLD           1460 non-null uint8
ConLI           1460 non-null uint8
ConLw           1460 non-null uint8
New             1460 non-null uint8
Oth             1460 non-null uint8
WD              1460 non-null uint8
Normal          1460 non-null uint8
Abnorml         1460 non-null uint8
AdjLand         1460 non-null uint8
Alloca          1460 non-null uint8
Family          1460 non-null uint8
Partial         1460 non-n

In [17]:
df_submit_simple['GarageArea'].fillna(value=400, inplace=True)
df_submit_simple['GarageBand'].fillna(value=3, inplace=True)
df_submit_simple.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 0 to 1458
Data columns (total 34 columns):
LotArea         1459 non-null int64
GarageArea      1459 non-null float64
TotRmsAbvGrd    1459 non-null int64
GarageBand      1459 non-null float64
1stFlrSF        1459 non-null int64
OverallQual     1459 non-null int64
OverallCond     1459 non-null int64
LotFrontage     1459 non-null float64
NeighMean       1459 non-null float64
PoolPresent     1459 non-null float64
COD             1459 non-null uint8
CWD             1459 non-null uint8
Con             1459 non-null uint8
ConLD           1459 non-null uint8
ConLI           1459 non-null uint8
ConLw           1459 non-null uint8
New             1459 non-null uint8
Oth             1459 non-null uint8
WD              1459 non-null uint8
Normal          1459 non-null uint8
Abnorml         1459 non-null uint8
AdjLand         1459 non-null uint8
Alloca          1459 non-null uint8
Family          1459 non-null uint8
Partial         1459 non

In [18]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_test_pred = logreg.predict(X_test)

In [19]:
Y_pred_test = logreg.predict(X_test)
Y_pred_submit = logreg.predict(df_submit_simple)

In [20]:
print('SQRT Mean Squared Error :',
      np.round(np.sqrt(metrics.mean_squared_error(Y_test, Y_test_pred)))) # 57370.0
print('Mean Absolute Error :',
      metrics.mean_absolute_error(Y_test, Y_test_pred)) # 57370.0
print('R2 Score :',
      metrics.r2_score(Y_test, Y_test_pred)) # 57370.0
print('Explained Variance Score :',
      metrics.explained_variance_score(Y_test, Y_pred_test))
print('Accuracy :',
      logreg.score(X_train, Y_train)) # 0.0707762557078


SQRT Mean Squared Error : 56747.0
Mean Absolute Error : 37856.0958904
R2 Score : 0.342791905862
Explained Variance Score : 0.345761891585
Accuracy : 0.287671232877


In [21]:
# LAST RESULTS STORED HERE!
# SQRT Mean Squared Error : 54941.0
# Mean Absolute Error : 38560.239726
# R2 Score : 0.485073813664
# Explained Variance Score : 0.501997672907
# Accuracy : 0.133181126332
    
# (array([   98.82456079,   815.43915806,   511.06510154,  2156.13969875,
#             7.90427961,   171.68709881,  1490.8653491 ]),
#  array([  1.68349643e-022,   6.93264986e-140,   7.88090840e-096,
#           3.21467312e-279,   5.00518683e-003,   5.92565385e-037,
#           1.64896651e-218]))

In [22]:
f = f_regression(X_train, Y_train)
f

(array([  1.06141400e+02,   8.37768515e+02,   5.24699774e+02,
          7.55444093e+02,   7.59034639e+02,   2.20613433e+03,
          1.08586854e+01,   1.82326309e+02,   1.60123872e+03,
          1.25158889e+01,   8.86862775e+00,   3.20596995e-01,
          2.41410723e+00,   1.55764114e+00,   2.80661219e-01,
          1.86087156e+00,   1.96227843e+02,   1.76099209e+00,
          8.37361882e+01,   3.58358942e+01,   1.53770380e+01,
          3.71005344e+00,   5.27049206e-01,   2.64324453e+00,
          1.88763886e+02,   3.45590231e+02,   3.15645158e+02,
          6.75380445e+02,   1.61563288e+01,   1.64045454e+01,
          9.18567892e+00,   6.24008126e+00,   9.52616092e+01,
          1.32304124e+02]),
 array([  5.47513146e-024,   7.28869943e-143,   5.88418233e-098,
          1.00224476e-131,   3.20604517e-132,   2.67722189e-283,
          1.00959922e-003,   5.31774063e-039,   1.60868669e-229,
          4.17583508e-004,   2.95439325e-003,   5.71345927e-001,
          1.20487680e-001,   2

In [23]:
# titles = df_train_simple.columns.values
# np.concatenate(titles, f)

In [24]:
rf = RandomForestRegressor()
rf.fit(X_train, Y_train)
rf.score(X_test, Y_test)

0.80698888421184556

In [25]:
# dir(rf)
rf.feature_importances_

array([  4.95209277e-02,   4.63291824e-02,   4.87859965e-02,
         5.10197104e-03,   1.07508765e-01,   5.47845420e-01,
         1.42172266e-02,   2.28211550e-02,   1.34371825e-01,
         6.22459016e-04,   6.63335651e-04,   1.03363801e-05,
         6.54783442e-07,   1.20291198e-04,   1.79395556e-05,
         9.61177593e-06,   4.32698115e-04,   6.13732366e-05,
         2.33718798e-03,   1.00236661e-03,   1.05753586e-03,
         4.03976681e-07,   7.19810858e-04,   1.12248873e-03,
         2.11945429e-03,   5.39150837e-03,   2.48141498e-03,
         2.01815282e-03,   1.34710942e-04,   1.58117976e-04,
         2.33747483e-04,   1.07461384e-04,   8.90934321e-04,
         1.78353471e-03])

In [26]:
submission = pd.DataFrame({
        'Id': df_submit['Id'],
        'SalePrice': Y_pred_submit
    })
submission.to_csv('submission_logReg.csv', index=False)

In [27]:
Y_pred_submit_rf = rf.predict(df_submit_simple)

In [28]:
submission = pd.DataFrame({
        'Id': df_submit['Id'],
        'SalePrice': Y_pred_submit_rf
    })
submission.to_csv('submission_randomForest.csv', index=False)