In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from scipy.stats import skew

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor

# metrics & utils
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_regression

In [5]:
# Read the labelled training file and the unlabelled submission file.
df_train = pd.read_csv('data/train.csv')
df_submit = pd.read_csv('data/test.csv')
# Stack the feature columns (MSSubClass through SaleCondition) of both
# frames so that later preprocessing is applied to them together.
all_data = pd.concat([frame.loc[:, 'MSSubClass':'SaleCondition']
                      for frame in (df_train, df_submit)])

In [6]:
# Test dropping some features to improve RMSE
# all_data = all_data.drop(['PoolArea', 'PoolQC','Alley', 'MiscFeature', 'MiscVal', 'Fence'], axis=1)

In [7]:
# Put the target on a log scale: SalePrice -> log(1 + SalePrice).
# NOTE: the column is overwritten in place, so running this cell a second
# time would apply the transform twice.
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

# Numeric features are the columns whose dtype is not "object".
numeric_feats = all_data.columns[(all_data.dtypes != "object").values]

# Skewness is measured on the training rows only (NaNs dropped per column);
# features with skewness above 0.75 are kept as "skewed".
skewed_feats = (
    df_train[numeric_feats]
    .apply(lambda col: skew(col.dropna()))
    .loc[lambda skewness: skewness > 0.75]
    .index
)

# Apply log1p to the skewed features of the combined train + submission frame.
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [8]:
# One-hot encode the categorical columns, then replace every remaining NaN
# with the mean of its column (means are computed on the encoded frame,
# i.e. over the train and submission rows together).
all_data = pd.get_dummies(all_data).pipe(
    lambda encoded: encoded.fillna(encoded.mean())
)

In [9]:
# Split the combined frame back apart by position: the first len(df_train)
# rows came from the training file, the remaining rows from the submission file.
X_train = all_data.iloc[:len(df_train)]
X_test = all_data.iloc[len(df_train):]
# Training target column.
y = df_train["SalePrice"]

In [10]:
# # Create a test, train and submission X and y
# X = df_train.loc[:,'MSSubClass':'SaleCondition']
# y = df_train.SalePrice
# X_submit = df_submit
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [11]:
# Boruta
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
# X = pd.read_csv('examples/test_X.csv', index_col=0).values
# y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
# data = pd.read_csv('data/train.csv', index_col=0)
# y = data[data.columns[-1:]]
# y = y.values
# X = data[data.columns[:80]]
# X = X.values
# y = y.ravel()

# BorutaPy is fed plain numpy arrays, so convert the pandas objects here.
# np.asarray (unlike `.values`) also accepts an ndarray, so re-running this
# cell is harmless instead of raising AttributeError. dtype=float forces a
# numeric matrix even if the frame mixes bool dummy columns with float
# columns (which would otherwise presumably come out as object dtype on
# recent pandas -- TODO confirm for the installed version).
# NOTE: after this cell X_train no longer carries column labels; take
# feature names from all_data.columns where they are needed.
X_train = np.asarray(X_train, dtype=float)
y = np.asarray(y, dtype=float)

In [14]:
# ravel() returns the flattened array rather than modifying y in place, so
# the result has to be assigned for the call to have any effect.
y = y.ravel()
y

array([ 12.24769912,  12.10901644,  12.31717117, ...,  12.49313327,
        11.86446927,  11.90159023])

In [18]:
# Base estimator for Boruta: a random forest *regressor* with default
# settings, since the target passed to fit() below is continuous.
rf = RandomForestRegressor()

# Boruta feature selection wrapped around the forest. n_estimators='auto'
# leaves the number of trees to Boruta; verbose=2 prints the per-iteration
# Confirmed / Tentative / Rejected counts; random_state=1 seeds the run.
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# Run the selection on the training matrix and target.
feat_selector.fit(X_train, y)

# After fitting, feat_selector.support_ and feat_selector.ranking_ hold the
# selection mask and the feature ranking; inspect them in their own cell,
# because a bare expression in the middle of a cell displays nothing.

# Reduce X_train to the features Boruta selected.
X_filtered = feat_selector.transform(X_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	263


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	9 / 100
Confirmed: 	16
Tentative: 	9
Rejected: 	263


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	10 / 100
Confirmed: 	16
Tentative: 	9
Rejected: 	263


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	11 / 100
Confirmed: 	16
Tentative: 	9
Rejected: 	263


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	12 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	13 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	14 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	15 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	16 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	17 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	18 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	19 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	20 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	21 / 100
Confirmed: 	16
Tentative: 	7
Rejected: 	265


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	22 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	23 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	24 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	25 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	26 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	27 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	28 / 100
Confirmed: 	16
Tentative: 	6
Rejected: 	266


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	29 / 100
Confirmed: 	16
Tentative: 	5
Rejected: 	267


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	30 / 100
Confirmed: 	16
Tentative: 	5
Rejected: 	267


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	31 / 100
Confirmed: 	16
Tentative: 	5
Rejected: 	267


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	32 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	33 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	34 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	35 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	36 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	37 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	38 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	39 / 100
Confirmed: 	17
Tentative: 	3
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	40 / 100
Confirmed: 	18
Tentative: 	2
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	41 / 100
Confirmed: 	18
Tentative: 	2
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	42 / 100
Confirmed: 	18
Tentative: 	2
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	43 / 100
Confirmed: 	18
Tentative: 	2
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	44 / 100
Confirmed: 	18
Tentative: 	2
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	45 / 100
Confirmed: 	18
Tentative: 	2
Rejected: 	268


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	46 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	47 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	48 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	49 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	50 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	51 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	52 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	53 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	54 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	55 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	56 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	57 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	58 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	59 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	60 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	61 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	62 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	63 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	64 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	65 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	66 / 100
Confirmed: 	18
Tentative: 	1
Rejected: 	269
Iteration: 	67 / 100
Confirmed: 	18
Tentative: 	0
Rejected: 	270


BorutaPy finished running.

Iteration: 	68 / 100
Confirmed: 	18
Tentative: 	0
Rejected: 	270


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


In [22]:
# Reduce X_train to the features selected by the fitted Boruta selector;
# the bare name on the last line displays the resulting array.
X_filtered = feat_selector.transform(X_train)
X_filtered

array([[ 4.18965474,  9.04204006,  7.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 4.39444915,  9.16962254,  6.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 4.2341065 ,  9.32821229,  7.        , ...,  0.        ,
         0.        ,  1.        ],
       ..., 
       [ 4.20469262,  9.10974626,  7.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 4.2341065 ,  9.18173511,  5.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 4.33073334,  9.20412107,  5.        , ...,  0.        ,
         0.        ,  1.        ]])

In [None]:
def rmse_cv(model):
    """Return the cross-validated root mean squared error of ``model``.

    The model is scored with 5-fold cross-validation on the notebook-level
    ``X_train`` and ``y``. The scorer reports the *negated* mean squared
    error, so the sign is flipped before taking the square root.

    Returns:
        Array with one RMSE value per fold.
    """
    neg_mse = cross_val_score(model, X_train, y, cv=5,
                              scoring='neg_mean_squared_error')
    return np.sqrt(-neg_mse)

In [None]:
# Baseline: ordinary least squares on the full feature matrix, followed by
# its mean cross-validated RMSE (fit() returns the fitted estimator itself).
linreg = LinearRegression().fit(X_train, y)
np.mean(rmse_cv(linreg))

In [None]:
# Lasso with the regularisation strength chosen from a fixed grid of
# candidate alphas, followed by its mean cross-validated RMSE.
lasso = LassoCV(
    alphas=[1, 0.1, 0.002, 0.001, 0.0008, 0.0005],
    verbose=True,
).fit(X_train, y)
np.mean(rmse_cv(lasso))

In [None]:
# The model was fit on the log1p-transformed target, so its predictions are
# mapped back to the original price scale with expm1 (the inverse of log1p).
y_pred_test = np.expm1(lasso.predict(X_test))

In [None]:
# Two-column submission table: the Id column of the submission file next to
# the predicted SalePrice, written to CSV without the index.
submission = df_submit[['Id']].assign(SalePrice=y_pred_test)
submission.to_csv('submissions/submission_lasso.csv', index=False)

In [None]:
# Label every Lasso coefficient with its feature name. The labels are taken
# from all_data rather than X_train: X_train is a row slice of all_data (so
# the column order is identical), but it may have been rebound to a plain
# ndarray for Boruta, and an ndarray has no .columns attribute.
lasso_coef = pd.Series(lasso.coef_, index=all_data.columns)
print("Lasso picked " + str((lasso_coef != 0).sum()) + " variables and eliminated the other " +  str((lasso_coef == 0).sum()) + " variables")

In [None]:
# Sort the coefficients once, then keep the 20 smallest and the 20 largest
# for a horizontal bar chart.
ranked_coef = lasso_coef.sort_values()
imp_coef = pd.concat([ranked_coef.head(20), ranked_coef.tail(20)])
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot.barh()
plt.title("Coefficients in the Lasso Model")

In [None]:
# Residual check on the training data: in-sample predictions vs. the target,
# with residual = true - predicted, shown as a scatter plot.
plt.rcParams['figure.figsize'] = (6.0, 6.0)

preds = pd.DataFrame({"preds": lasso.predict(X_train), "true": y}).assign(
    residuals=lambda frame: frame["true"] - frame["preds"]
)
preds.plot.scatter(x="preds", y="residuals")

In [None]:
# Boruta
from boruta import BorutaPy

# Load X and y for a Boruta run straight from the raw training file, using
# the first CSV column as the index.
# NOTE: this rebinds the notebook-level `y` to the untransformed values of
# the last column of train.csv.
data = pd.read_csv('data/train.csv', index_col=0)

# Target: the last column, as a 1-D array.
y = data[data.columns[-1]].values

# Features: every column except the target. Slicing with [:-1] (rather than
# a hard-coded [:80]) guarantees the target cannot end up in X -- with the
# first column used as index, [:80] would include it if the file has 79
# feature columns plus the target (TODO confirm against the data).
# The raw file has object-typed and missing values, so the features are
# one-hot encoded and mean-imputed before being handed over as a float
# numpy array, which is what BorutaPy and the forest expect.
X = pd.get_dummies(data[data.columns[:-1]])
X = X.fillna(X.mean()).astype(float).values

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Base estimator for Boruta. The target is a continuous price, so a random
# forest *regressor* is used, matching the earlier Boruta run in this
# notebook; a classifier would treat every distinct price as its own class,
# and class_weight='auto' is not accepted by current scikit-learn.
# n_jobs=-1 uses all cores; max_depth=5 keeps the trees shallow.
rf = RandomForestRegressor(n_jobs=-1, max_depth=5)

# Boruta feature selection wrapped around the forest. n_estimators='auto'
# leaves the number of trees to Boruta; random_state=1 seeds the run.
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# Run the selection on the feature matrix and target.
feat_selector.fit(X, y)

# After fitting, feat_selector.support_ and feat_selector.ranking_ hold the
# selection mask and the feature ranking; inspect them in their own cell,
# because a bare expression in the middle of a cell displays nothing.

# Reduce X to the features Boruta selected.
X_filtered = feat_selector.transform(X)