# Feature Selection

#### Data preprocessing

In [211]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn import linear_model
from mutual_info import mutual_information_2d, mutual_information
import operator
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import matplotlib
from sklearn.metrics import mean_squared_error

In [150]:
# Load the train/test CSVs, using the first column as the row index.
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv(..., index_col=0)` is the documented replacement.
training_data = pd.read_csv('train.csv', index_col=0)
test_data = pd.read_csv('test.csv', index_col=0)
target = 'SalePrice'
# Keep the training target on its own for the scatter plots below.
saleprice = training_data[target]
# Train rows first, then test rows; later cells split them again with
# len(training_data).
# NOTE(review): pandas >= 1.0 no longer sorts non-aligned columns when
# concatenating; pass sort=True if the alphabetical column order matters.
data = pd.concat([training_data, test_data])

Check each feature's data type

In [151]:
def _treated_as_categorical(col):
    """Return True for object-typed columns and for columns whose name mentions a year."""
    lowered = col.lower()
    return data[col].dtype == object or 'yr' in lowered or 'year' in lowered


# Object/year columns go to the categorical list; everything else except the
# target goes to the numerical list.
categorical_variables = [col for col in data.columns if _treated_as_categorical(col)]
numerical_variables = [
    col for col in data.columns
    if not _treated_as_categorical(col) and col != target
]

In [152]:
# Scatter every numerical feature against the sale price, four panels per row.
# The row count is derived from the number of features (ceiling division) so
# that add_subplot never receives an index beyond the grid size.
n_cols = 4
n_rows = max(1, -(-len(numerical_variables) // n_cols))
fig = plt.figure(1)
for idx, col_name in enumerate(numerical_variables):
    fig.add_subplot(n_rows, n_cols, idx+1)
    plt.scatter(training_data[col_name], saleprice)
    plt.title(col_name)
plt.show()

In [153]:
# Columns handled as continuous in the cells below (imputed with the training
# mean and ranked by correlation with the target).
continuous_variables = [
    'LotFrontage',
    'LotArea',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    'ScreenPorch',
    'MiscVal',
]

In [154]:
# Every remaining column except the target is handled as categorical.
_not_categorical = set(continuous_variables) | {target}
categorical_variables = [
    var for var in data.columns.values.tolist()
    if var not in _not_categorical
]

In [155]:
def _impute_and_categorise(frame, continuous_means):
    """Fill missing values in place and cast the categorical columns.

    Continuous columns are filled with ``continuous_means``; categorical
    columns are filled with the literal string 'NA' and then converted to the
    pandas ``category`` dtype.
    """
    frame[continuous_variables] = frame[continuous_variables].fillna(continuous_means)
    frame[categorical_variables] = frame[categorical_variables].fillna('NA')
    for name in categorical_variables:
        frame[name] = frame[name].astype('category')


# Both frames are imputed with means computed from the training rows only.
_impute_and_categorise(training_data, training_data[continuous_variables].mean())

_impute_and_categorise(data, training_data[continuous_variables].mean())

Categorical feature selection

In [156]:
# Preview the first rows of the columns treated as categorical.
training_data[categorical_variables].head()

Unnamed: 0_level_0,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,...,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,TotRmsAbvGrd,Utilities,YearBuilt,YearRemodAdd,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,,3,1Fam,TA,No,GLQ,Unf,1,0,...,CompShg,Gable,Normal,WD,Pave,8,AllPub,2003,2003,2008
2,0,,3,1Fam,TA,Gd,ALQ,Unf,0,1,...,CompShg,Gable,Normal,WD,Pave,6,AllPub,1976,1976,2007
3,0,,3,1Fam,TA,Mn,GLQ,Unf,1,0,...,CompShg,Gable,Normal,WD,Pave,6,AllPub,2001,2002,2008
4,0,,3,1Fam,Gd,No,ALQ,Unf,1,0,...,CompShg,Gable,Abnorml,WD,Pave,7,AllPub,1915,1970,2006
5,0,,4,1Fam,TA,Av,GLQ,Unf,1,0,...,CompShg,Gable,Normal,WD,Pave,9,AllPub,2000,2000,2008


In [157]:
# One-way ANOVA of SalePrice across the levels of each categorical column.
# Columns with p < 0.05 are kept in `p_vals`; the others are printed.
p_vals = {}
for col_name in categorical_variables:
    price_groups = [prices for level, prices in training_data.groupby(col_name).SalePrice]
    anova_result = stats.f_oneway(*price_groups)
    p_val = anova_result[1]
    if not p_val < 0.05:
        print(col_name)
        continue
    p_vals[col_name] = p_val

3SsnPorch
BsmtHalfBath
LandSlope
LowQualFinSF
MoSold
Street
Utilities
YrSold


In [158]:
# (column, p-value) pairs ordered from the smallest p-value to the largest.
anova_features = sorted(p_vals.items(), key=lambda item: item[1])

In [159]:
# Box plot of the target for each ExterQual level.
training_data.boxplot(
    column=target,
    by='ExterQual',
    figsize=(8, 8),
)
plt.show()

In [160]:
# Names of the ten columns with the smallest ANOVA p-values.
selected_categorical_features = [pair[0] for pair in anova_features[:10]]

In [161]:
# Display the selected categorical column names.
selected_categorical_features

['OverallQual',
 'Neighborhood',
 'GarageCars',
 'ExterQual',
 'BsmtQual',
 'KitchenQual',
 'FullBath',
 'GarageFinish',
 'FireplaceQu',
 'YearBuilt']

Continuous variables

In [162]:
# Pearson correlation of each continuous column with the target, taken from
# the off-diagonal entry of the 2x2 correlation matrix.
cvar_cor_rankings = {
    name: np.corrcoef(training_data[name], training_data[target])[0, 1]
    for name in continuous_variables
}

In [163]:
# Order by signed correlation, largest first (negative correlations sort last).
sorted_cvar_cor_rankings = sorted(
    cvar_cor_rankings.items(),
    key=lambda item: item[1],
    reverse=True,
)

In [164]:
# Display the (column, correlation) pairs in ranked order.
sorted_cvar_cor_rankings

[('GrLivArea', 0.70862447761265224),
 ('GarageArea', 0.62343143891836172),
 ('TotalBsmtSF', 0.61358055155919544),
 ('1stFlrSF', 0.60585218469191482),
 ('MasVnrArea', 0.47524131677366821),
 ('BsmtFinSF1', 0.38641980624215316),
 ('LotFrontage', 0.33490085154055105),
 ('WoodDeckSF', 0.32441344456812948),
 ('2ndFlrSF', 0.31933380283206764),
 ('OpenPorchSF', 0.31585622711605543),
 ('LotArea', 0.26384335387140573),
 ('BsmtUnfSF', 0.21447910554696892),
 ('ScreenPorch', 0.11144657114291109),
 ('BsmtFinSF2', -0.011378121450215136),
 ('MiscVal', -0.021189579640303248),
 ('EnclosedPorch', -0.1285779579259565)]

In [165]:
# Scatter each continuous feature against the sale price, in correlation order.
# Subplot dimensions must be integers: `len(...)/4` is a float in Python 3 and
# is rejected by recent matplotlib, so use integer ceiling division instead.
n_cols = 4
n_rows = max(1, -(-len(sorted_cvar_cor_rankings) // n_cols))
fig = plt.figure(1)
for idx, col_tuple in enumerate(sorted_cvar_cor_rankings):
    fig.add_subplot(n_rows, n_cols, idx+1)
    plt.scatter(training_data[col_tuple[0]], saleprice)
    plt.title(col_tuple[0])
plt.show()

In [166]:
# Names of the twelve highest-ranked continuous columns.
selected_continuous_features = [pair[0] for pair in sorted_cvar_cor_rankings[:12]]

Get data ready

In [377]:
# One-hot encode the selected categorical columns (with an extra NaN indicator
# per column) and place them next to the selected continuous columns and the
# target.
categorical_block = data[selected_categorical_features]
dummy_df = pd.get_dummies(categorical_block, dummy_na=True)
continuous_block = data[selected_continuous_features + [target]]
actual_data = pd.concat([continuous_block, dummy_df], axis=1)

In [358]:
#actual_data[selected_continuous_features+[target]] = (actual_data[selected_continuous_features+[target]] - actual_data[:len(training_data)][selected_continuous_features+[target]].mean()) / actual_data[:len(training_data)][selected_continuous_features+[target]].std(ddof=0)

In [379]:
# Model inputs: every column of actual_data except the target.
features = actual_data.columns.tolist()
features.remove('SalePrice')

In [380]:
# The first len(training_data) rows of actual_data are the training rows.
training_rows = actual_data[:len(training_data)]
X_train = training_rows[features]
y = training_rows[target]

In [383]:
# Sweep the L1 penalty and record the mean 5-fold cross-validated RMSE.
# Despite its name, `coefs` holds (penalty, rmse) pairs.
# NOTE(review): the `normalize` argument was removed from scikit-learn linear
# models in 1.2 — confirm the pinned version before re-running.
coefs = []
candidate_penalties = [1, 3, 5, 10, 15, 30, 50, 70, 100, 150, 250]
for l1_penalty in candidate_penalties:
    print('Training model with l1_penalty {}'.format(l1_penalty))
    candidate_model = linear_model.Lasso(alpha=l1_penalty, max_iter=1000000, normalize=True)
    fold_scores = cross_val_score(candidate_model, X_train, y, scoring="neg_mean_squared_error", cv=5)
    error = np.sqrt(-fold_scores).mean()
    coefs.append((l1_penalty, error))

Training model with l1_penalty 1
Training model with l1_penalty 3
Training model with l1_penalty 5
Training model with l1_penalty 10
Training model with l1_penalty 15
Training model with l1_penalty 30
Training model with l1_penalty 50
Training model with l1_penalty 70
Training model with l1_penalty 100
Training model with l1_penalty 150
Training model with l1_penalty 250


In [384]:
# Split the (penalty, rmse) pairs into two parallel lists.
lasso_alphas = [alpha for alpha, rmse in coefs]
ridge_errors = [rmse for alpha, rmse in coefs]

In [385]:
# Cross-validated RMSE indexed by the L1 penalty that produced it.
errors = pd.Series(data=ridge_errors, index=lasso_alphas)
errors.plot(
    title='Validation error with different l1 penalty',
)
plt.xlabel('l1 penalty')
plt.ylabel('rmse')

<matplotlib.text.Text at 0x1840cc39c50>

In [386]:
# Display all open figures.
plt.show()

In [387]:
# The penalty with the lowest validation RMSE is the first label after an
# ascending sort.
errors_ascending = errors.sort_values()
best_l1_penalty = errors_ascending.index[0]

In [388]:
# Refit on all training rows with the selected penalty.
lasso_model = linear_model.Lasso(
    alpha=best_l1_penalty,
    max_iter=1000000,
    normalize=True,
)
lasso_model.fit(X_train, y)

Lasso(alpha=30, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [389]:
# Fitted coefficients labelled with their feature names.
lasso_coef = pd.Series(data=lasso_model.coef_, index=features)

In [390]:
# Count the non-zero and zero coefficients and report both.
n_kept = sum(lasso_coef != 0)
n_dropped = sum(lasso_coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_dropped) + " variables")

Lasso picked 63 variables and eliminated the other 148 variables


In [395]:
# The ten smallest and the ten largest coefficients, in ascending order.
coef_ascending = lasso_coef.sort_values()
imp_coef = pd.concat([coef_ascending.head(10), coef_ascending.tail(10)])

In [396]:
# Feature names of the ten largest coefficients, largest first.
coef_descending = imp_coef.sort_values(ascending=False)
lasso_significant_pos10 = coef_descending.index[:10].tolist()

In [397]:
# Horizontal bar chart of the selected coefficients on a tall figure.
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(
    kind="barh",
)
plt.title("Coefficients in the Lasso Model")

<matplotlib.text.Text at 0x18409c98128>

In [398]:
# Display all open figures.
plt.show()

In [399]:
# Display the feature names with the ten largest coefficients.
lasso_significant_pos10

['OverallQual_10.0',
 'OverallQual_9.0',
 'FullBath_3.0',
 'Neighborhood_NoRidge',
 'Neighborhood_StoneBr',
 'OverallQual_8.0',
 'BsmtQual_Ex',
 'GarageCars_3.0',
 'YearBuilt_1989.0',
 'YearBuilt_2009.0']

In [400]:
# Root-mean-squared error of the fitted model on the training rows.
training_mse = mean_squared_error(y, lasso_model.predict(X_train))
lasso_training_error = training_mse ** 0.5

In [401]:
# Display the training RMSE.
lasso_training_error

29732.268303308832

In [402]:
# Rows after the first len(training_data) are the test rows.
test_rows = actual_data[len(training_data):]
X_test = test_rows[features]

In [409]:
# Predict the target for the test rows with the fitted Lasso model.
predictions = lasso_model.predict(X=X_test)

In [410]:
# Label each prediction with the index value of its test row.
test_index_labels = X_test.index.tolist()
predictions = pd.Series(predictions, index=test_index_labels)

In [412]:
# Write the indexed predictions to Lasso_2.csv in the working directory.
predictions.to_csv('Lasso_2.csv')

In [6]:
# NOTE(review): this rebinds `categorical_variables` from raw column names to
# the one-hot dummy column names; the execution count of this cell is lower
# than that of the cells above, so it appears to come from an earlier session.
categorical_variables = dummy_df.columns.tolist()

In [7]:
# Every training column except the target.
features = training_data.columns.tolist()
features.remove(target)

In [8]:
# Fill remaining gaps in the feature columns with the per-column training mean.
column_means = training_data.mean()
training_data[features] = training_data[features].fillna(column_means)

Rank continuous features

In [9]:
# Target values as a float column vector of shape (len(training_data), 1).
target_values = np.array(training_data[target], dtype=float)
SalePrices = target_values.reshape(len(training_data), 1)

In [10]:
# Maps a continuous column name to its mutual-information score with the target.
continous_rankings = dict()

In [111]:
# Score each continuous column by its mutual information with the target
# (k=20 is passed through to the `mutual_information` helper).
for col in continuous_variables:
    column_values = np.array(training_data[col], dtype=float)
    feature = column_values.reshape(len(training_data), 1)
    continous_rankings[col] = mutual_information((feature, SalePrices), k=20)

In [112]:
# (column, score) pairs ordered from the lowest score to the highest.
sorted_continous = sorted(continous_rankings.items(), key=lambda item: item[1])

In [113]:
# Display the (column, score) pairs in ascending score order.
sorted_continous

[('MoSold', -39.354483361565329),
 ('BsmtHalfBath', -39.250420169817716),
 ('KitchenAbvGr', -39.225732500544183),
 ('Fireplaces', -39.189652359233214),
 ('OverallCond', -39.184526617254562),
 ('GarageCars', -39.183955255063935),
 ('FullBath', -39.077608538800774),
 ('HalfBath', -39.003546819617),
 ('BedroomAbvGr', -38.974633021765044),
 ('BsmtFullBath', -38.90432189448434),
 ('OverallQual', -38.752249346192926),
 ('MSSubClass', -38.350979170543987),
 ('TotRmsAbvGrd', -38.148842311509526),
 ('PoolArea', -38.11166120931972),
 ('MiscVal', -37.96844193074125),
 ('3SsnPorch', -37.644428892897963),
 ('LowQualFinSF', -37.581578050926822),
 ('ScreenPorch', -36.306444798507641),
 ('BsmtFinSF2', -34.911597605974364),
 ('EnclosedPorch', -33.970478515267793),
 ('WoodDeckSF', -25.643959965084886),
 ('MasVnrArea', -23.793829671457615),
 ('LotFrontage', -23.103014296630242),
 ('2ndFlrSF', -22.917872090786446),
 ('OpenPorchSF', -20.911634286151063),
 ('BsmtFinSF1', -13.432084967869486),
 ('GarageArea'

Rank categorical variables

In [26]:
# Maps a categorical column name to its relevance-minus-redundancy score.
categorical_rankings = dict()

In [106]:
# Score each column as (mutual information with the target) minus (summed
# mutual information with every other column, divided by the column count).
#
# Every column is converted to a float column vector once, up front. The
# previous version redid this conversion inside the inner loop, i.e. once per
# pair of columns.
n_samples = len(training_data)
column_arrays = {
    col: np.array(training_data[col], dtype=float).reshape(n_samples, 1)
    for col in categorical_variables
}
for idx, col in enumerate(categorical_variables):
    interested_col = column_arrays[col]
    relevance = mutual_information((interested_col, SalePrices), k=7)
    redundancy = 0
    for col2 in categorical_variables:
        if col2 != col:
            redundancy += mutual_information((column_arrays[col2], interested_col), k=7)
    # NOTE(review): the sum covers the other columns only, but the divisor is
    # the full column count — confirm this is intended.
    redundancy = redundancy / len(categorical_variables)
    categorical_rankings[col] = relevance - redundancy
    # Progress marker every fifth column.
    if idx % 5 == 0:
        print(idx)

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
305
310
315
320
325
330
335
340
345
350
355
360
365
370
375
380
385
390
395
400
405
410
415
420
425
430
435
440
445
450
455
460
465
470
475
480
485
490
495
500
505
510
515
520
525
530
535
540
545
550
555
560
565
570


In [107]:
# (column, score) pairs ordered from the lowest score to the highest.
sorted_categotical = sorted(categorical_rankings.items(), key=lambda item: item[1])

In [110]:
# Display the (column, score) pairs in ascending score order.
sorted_categotical

[('MasVnrType_None', -43.625993440055311),
 ('HouseStyle_1Story', -42.677642095155399),
 ('Foundation_CBlock', -42.58448319160857),
 ('HeatingQC_Ex', -42.583193976544841),
 ('LotShape_Reg', -42.332957243165772),
 ('MasVnrType_BrkFace', -42.143972667765716),
 ('KitchenQual_TA', -41.884115176223084),
 ('GarageFinish_RFn', -41.603780778416379),
 ('GarageFinish_Unf', -41.590355326351585),
 ('Foundation_PConc', -41.55535141235238),
 ('KitchenQual_Gd', -41.540125961048346),
 ('Exterior1st_VinylSd', -41.360605870491298),
 ('HeatingQC_TA', -41.343308856579476),
 ('BsmtExposure_No', -41.34168177892213),
 ('LotShape_IR1', -41.199829954943517),
 ('GarageType_Attchd', -41.145121893816153),
 ('Fence_nan', -41.142196650911302),
 ('Exterior2nd_VinylSd', -41.016959984404288),
 ('FireplaceQu_nan', -40.943315385490152),
 ('BsmtQual_TA', -40.855925631387358),
 ('LotConfig_Inside', -40.801131593103975),
 ('BsmtFinType1_Unf', -40.76044465987929),
 ('FireplaceQu_TA', -40.553486894365847),
 ('MSZoning_RL', -

In [114]:
# Number of training rows.
len(training_data)

1460

In [120]:
# Number of entries currently in categorical_variables.
len(categorical_variables)

47

In [15]:
# Number of entries in continuous_variables.
len(continuous_variables)

16

#### Handle continuous variables

In [19]:
import numpy as np

In [83]:
# Order by signed correlation, largest first (negative correlations sort last).
sorted_cvar_cor_rankings = sorted(
    cvar_cor_rankings.items(),
    key=lambda item: item[1],
    reverse=True,
)

In [84]:
# Display the (column, correlation) pairs in ranked order.
sorted_cvar_cor_rankings

[('GrLivArea', 0.70862447761265224),
 ('GarageArea', 0.62343143891836172),
 ('TotalBsmtSF', 0.61358055155919544),
 ('1stFlrSF', 0.60585218469191482),
 ('MasVnrArea', 0.47524131677366821),
 ('BsmtFinSF1', 0.38641980624215316),
 ('LotFrontage', 0.33490085154055105),
 ('WoodDeckSF', 0.32441344456812948),
 ('2ndFlrSF', 0.31933380283206764),
 ('OpenPorchSF', 0.31585622711605543),
 ('LotArea', 0.26384335387140573),
 ('BsmtUnfSF', 0.21447910554696892),
 ('ScreenPorch', 0.11144657114291109),
 ('BsmtFinSF2', -0.011378121450215136),
 ('MiscVal', -0.021189579640303248),
 ('EnclosedPorch', -0.1285779579259565)]

In [150]:
# Mutual information between each continuous column and the target
# (k=20 is passed through to the `mutual_information` helper).
cvar_mi_rankings = {}
for col_name in continuous_variables:
    column_values = np.array(training_data[col_name], dtype=float)
    feature = column_values.reshape(len(training_data), 1)
    cvar_mi_rankings[col_name] = mutual_information((feature, SalePrices), k=20)

In [151]:
# (column, score) pairs ordered from the highest score to the lowest.
sorted_cvar_mi_rankings = sorted(
    cvar_mi_rankings.items(),
    key=lambda item: item[1],
    reverse=True,
)

In [152]:
# Display the (column, score) pairs in descending score order.
sorted_cvar_mi_rankings

[('GrLivArea', -1.7100681342097879),
 ('LotArea', -1.9459710364725673),
 ('1stFlrSF', -2.0276254703596202),
 ('TotalBsmtSF', -3.2408740479361953),
 ('BsmtUnfSF', -4.454803322259874),
 ('GarageArea', -12.222131816751792),
 ('BsmtFinSF1', -13.432084967869486),
 ('OpenPorchSF', -20.911634286151063),
 ('2ndFlrSF', -22.917872090786446),
 ('LotFrontage', -23.103014296630242),
 ('MasVnrArea', -23.793829671457615),
 ('WoodDeckSF', -25.643959965084886),
 ('EnclosedPorch', -33.970478515267793),
 ('BsmtFinSF2', -34.911597605974364),
 ('ScreenPorch', -36.306444798507641),
 ('MiscVal', -37.96844193074125)]

In [153]:
# Scatter each continuous feature against the sale price, in mutual-information
# order. Subplot dimensions must be integers: `len(...)/4` is a float in
# Python 3 and is rejected by recent matplotlib, so use integer ceiling
# division instead.
n_cols = 4
n_rows = max(1, -(-len(sorted_cvar_mi_rankings) // n_cols))
fig = plt.figure(1)
for idx, col_tuple in enumerate(sorted_cvar_mi_rankings):
    fig.add_subplot(n_rows, n_cols, idx+1)
    plt.scatter(training_data[col_tuple[0]], saleprice)
    plt.title(col_tuple[0])
plt.show()

In [219]:
# Box plots of SalePrice per level for twelve categorical columns (positions
# 24-35 of categorical_variables), laid out on a 4x3 grid.
#
# The per-column groups are kept in `grouped_prices` rather than `data`, so
# the combined train/test DataFrame named `data` is no longer overwritten by
# this cell.
fig = plt.figure(1)
for idx, col_name in enumerate(categorical_variables[24:36]):
    ax = fig.add_subplot(4, 3, idx+1)
    grouped_prices = [
        (name, np.array(group))
        for name, group in training_data.groupby(col_name).SalePrice
    ]
    names = [pair[0] for pair in grouped_prices]
    data_to_plot = [pair[1] for pair in grouped_prices]
    # One box per level, labelled with the level name.
    ax.boxplot(data_to_plot)
    ax.set_xticklabels(names)
    ax.set_title(col_name)
plt.show()

In [220]:
# TODO
# 1. use ANOVA test to select categorical features
# 2. rank both continuous variables and categorical variables
# 3. combine and train model

# Learn ANOVA, F distribution, t-test
# Learn MI low priority https://pdfs.semanticscholar.org/2bde/35a8b8cb62a906e0233372b7ddeb7c6b5099.pdf