# Feautre Selection

#### Data preprocess

In [153]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn import linear_model
from mutual_info import mutual_information_2d, mutual_information
import operator
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import matplotlib
from sklearn.metrics import mean_squared_error
from scipy.stats import skew

In [154]:
training_data = pd.DataFrame.from_csv('train.csv')
test_data = pd.DataFrame.from_csv('test.csv')
training_data_idx = len(training_data)
data = pd.concat([training_data, test_data])
target = 'SalePrice'
saleprice = training_data[target]

Check each feature's data type

In [155]:
categorical_variables = []
numerical_variables = []
for col in data.columns:
    if data[col].dtype == object or 'yr' in col.lower() or 'year' in col.lower():
        categorical_variables.append(col)
    elif col != target:
        numerical_variables.append(col)

numerical variables can still be categorical

In [156]:
fig = plt.figure(1)
for idx, col_name in enumerate(numerical_variables):
    fig.add_subplot(8, 4, idx+1)
    plt.scatter(training_data[col_name], saleprice)
    plt.title(col_name)
plt.show()

In [157]:
continuous_variables = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 
                       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
                       'MiscVal']

In [158]:
categorical_variables = [var for var in data.columns.values.tolist() if var not in continuous_variables and var != target]

In [159]:
data[continuous_variables] = data[continuous_variables].fillna(data[:training_data_idx][continuous_variables].mean())
data[categorical_variables] = data[categorical_variables].fillna('NA')
for col in categorical_variables: data[col] = data[col].astype('category')

Categorical feature selection

In [160]:
training_data[categorical_variables].head()

Unnamed: 0_level_0,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,...,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,TotRmsAbvGrd,Utilities,YearBuilt,YearRemodAdd,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,,3,1Fam,TA,No,GLQ,Unf,1,0,...,CompShg,Gable,Normal,WD,Pave,8,AllPub,2003,2003,2008
2,0,,3,1Fam,TA,Gd,ALQ,Unf,0,1,...,CompShg,Gable,Normal,WD,Pave,6,AllPub,1976,1976,2007
3,0,,3,1Fam,TA,Mn,GLQ,Unf,1,0,...,CompShg,Gable,Normal,WD,Pave,6,AllPub,2001,2002,2008
4,0,,3,1Fam,Gd,No,ALQ,Unf,1,0,...,CompShg,Gable,Abnorml,WD,Pave,7,AllPub,1915,1970,2006
5,0,,4,1Fam,TA,Av,GLQ,Unf,1,0,...,CompShg,Gable,Normal,WD,Pave,9,AllPub,2000,2000,2008


In [161]:
p_vals = {}
for col_name in categorical_variables:
    df = training_data.groupby(col_name).SalePrice
    f_val, p_val = stats.f_oneway(*[tup[1] for tup in df])
    p_vals[col_name] = p_val

In [162]:
anova_features = sorted(p_vals.items(), key=operator.itemgetter(1))

In [163]:
anova_features[:5]

[('OverallQual', 0.0),
 ('Neighborhood', 1.5586002827707996e-225),
 ('GarageCars', 1.1211287650614395e-211),
 ('ExterQual', 1.4395510967787893e-204),
 ('KitchenQual', 3.0322127528402335e-192)]

In [164]:
training_data.boxplot(column=target, by='Alley', figsize=(8,8))
plt.show()

In [165]:
selected_categorical_features = [feature[0] for feature in anova_features if feature[1] < 0.05]

In [166]:
selected_categorical_features[:5]

['OverallQual', 'Neighborhood', 'GarageCars', 'ExterQual', 'KitchenQual']

Continuous variables

In [167]:
cvar_cor_rankings = {}
for col_name in continuous_variables:
    cvar_cor_rankings[col_name] = np.corrcoef(data[:training_data_idx][col_name], saleprice)[0,1]

In [168]:
sorted_cvar_cor_rankings = sorted(cvar_cor_rankings.items(), key=operator.itemgetter(1), reverse=True)

In [169]:
selected_continuous_features = [feature[0] for feature in sorted_cvar_cor_rankings]

In [170]:
fig = plt.figure(1)
for idx, col_tuple in enumerate(sorted_cvar_cor_rankings):
    fig.add_subplot(len(sorted_cvar_cor_rankings)/4, 4, idx+1)
    plt.scatter(training_data[col_tuple[0]], saleprice)
    plt.title(col_tuple[0])
plt.show()

Check continuous data skewness

In [171]:
fig = plt.figure(1)
for idx, col_name in enumerate(selected_continuous_features):
    fig.add_subplot(len(selected_continuous_features)/4, 4, idx+1)
    plt.hist(data[:training_data_idx][col_name])
    plt.title(col_name)
plt.show()

In [172]:
for col_name in selected_continuous_features:
    if skew(data[:training_data_idx][col_name]) > 0.75:
        print('Log transform skewed numeric features {}'.format(col_name))
        data[col_name] = np.log1p(data[col_name])
saleprice = np.log1p(saleprice)

Log transform skewed numeric features GrLivArea
Log transform skewed numeric features TotalBsmtSF
Log transform skewed numeric features 1stFlrSF
Log transform skewed numeric features MasVnrArea
Log transform skewed numeric features BsmtFinSF1
Log transform skewed numeric features LotFrontage
Log transform skewed numeric features WoodDeckSF
Log transform skewed numeric features 2ndFlrSF
Log transform skewed numeric features OpenPorchSF
Log transform skewed numeric features LotArea
Log transform skewed numeric features BsmtUnfSF
Log transform skewed numeric features ScreenPorch
Log transform skewed numeric features BsmtFinSF2
Log transform skewed numeric features MiscVal
Log transform skewed numeric features EnclosedPorch


Get data ready

In [173]:
dummy_df = pd.get_dummies(data[selected_categorical_features], dummy_na=True)
data = pd.concat([data[selected_continuous_features], dummy_df], axis=1)

In [176]:
features = data.columns.values.tolist()

In [178]:
selected_continuous_features

['GrLivArea',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'MasVnrArea',
 'BsmtFinSF1',
 'LotFrontage',
 'WoodDeckSF',
 '2ndFlrSF',
 'OpenPorchSF',
 'LotArea',
 'BsmtUnfSF',
 'ScreenPorch',
 'BsmtFinSF2',
 'MiscVal',
 'EnclosedPorch']

In [179]:
def calculate_validation_error_and_penalty(x, y):
    penalties = [1, 3, 5, 10, 15, 30, 50, 70,100,150,250]
    best_penalty = None
    best_error = None
    for penalty in penalties:
        error = np.sqrt(-cross_val_score(linear_model.Lasso(alpha=penalty, max_iter=1000000), x, y, scoring='neg_mean_squared_error', cv=5)).mean()
        if best_error is None or error < best_error:
            best_error = error
            best_penalty = penalty
    return best_error, best_penalty

In [180]:
feature_tuples = [([], None, None)]
selected_continuous_features_copy = selected_continuous_features.copy()
selected_categorical_features_copy = selected_categorical_features.copy()
while len(selected_continuous_features_copy) or len(selected_categorical_features_copy):
    print('{} continuous features have not been checked'.format(len(selected_continuous_features_copy)))
    print('{} categorical features have not been checked'.format(len(selected_categorical_features_copy)))
    if not len(selected_continuous_features_copy):
        try_categorical_feature = selected_categorical_features_copy[0]
        try_categorical_features = feature_tuples[-1][0] + [f for f in features if f.startswith(try_categorical_feature)]
        categorical_error, categorical_penalty = calculate_validation_error_and_penalty(data[:training_data_idx][try_categorical_features], saleprice)
        feature_tuples.append((try_categorical_features, categorical_error, categorical_penalty))
        selected_categorical_features_copy.pop(0)
        print('Only categorical features left, added {} to feature list'.format(try_categorical_feature))
    elif not len(selected_categorical_features_copy):
        selected_continuous_feature = selected_continuous_features_copy[0]
        try_continuous_features = feature_tuples[-1][0] + [selected_continuous_feature]
        continuous_error, continuous_penalty = calculate_validation_error_and_penalty(data[:training_data_idx][try_continuous_features], saleprice)
        feature_tuples.append((try_continuous_features, continuous_error, continuous_penalty))
        selected_continuous_features_copy.pop(0)
        print('Only continuous features left, added {} to feature list'.format(selected_continuous_feature))
    else:
        selected_continuous_feature = selected_continuous_features_copy[0]
        try_continuous_features = feature_tuples[-1][0] + [selected_continuous_feature]
        try_categorical_feature = selected_categorical_features_copy[0]
        try_categorical_features = feature_tuples[-1][0] + [f for f in features if f.startswith(try_categorical_feature)]

        continuous_error, continuous_penalty = calculate_validation_error_and_penalty(data[:training_data_idx][try_continuous_features], saleprice)
        categorical_error, categorical_penalty = calculate_validation_error_and_penalty(data[:training_data_idx][try_categorical_features], saleprice)
        if continuous_error < categorical_error:
            feature_tuples.append((try_continuous_features, continuous_error, continuous_penalty))
            selected_continuous_features_copy.pop(0)
            print('Continuous feature {} vs Categorical feature {}. Winner is {}'.format(
                selected_continuous_feature,
                try_categorical_feature,
                selected_continuous_feature))
        else:
            feature_tuples.append((try_categorical_features, categorical_error, categorical_penalty))
            selected_categorical_features_copy.pop(0)
            print('Continuous feature {} vs Categorical feature {}. Winner is {}'.format(
                selected_continuous_feature,
                try_categorical_feature,
                try_categorical_feature))

16 continuous features have not been checked
53 categorical features have not been checked
Continuous feature GrLivArea vs Categorical feature OverallQual. Winner is OverallQual
16 continuous features have not been checked
52 categorical features have not been checked
Continuous feature GrLivArea vs Categorical feature Neighborhood. Winner is Neighborhood
16 continuous features have not been checked
51 categorical features have not been checked
Continuous feature GrLivArea vs Categorical feature GarageCars. Winner is GarageCars
16 continuous features have not been checked
50 categorical features have not been checked
Continuous feature GrLivArea vs Categorical feature ExterQual. Winner is ExterQual
16 continuous features have not been checked
49 categorical features have not been checked
Continuous feature GrLivArea vs Categorical feature KitchenQual. Winner is KitchenQual
16 continuous features have not been checked
48 categorical features have not been checked
Continuous feature GrLi

In [181]:
feature_tuples_copy = feature_tuples[1:].copy()

In [182]:
feature_tuples_copy = sorted(feature_tuples_copy, key=lambda x: x[1])

In [184]:
best_lasso_model = linear_model.Lasso(alpha=feature_tuples_copy[0][2])
best_lasso_model.fit(data[:training_data_idx][feature_tuples_copy[0][0]], saleprice)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [188]:
predictions = pd.Series(np.exp(best_lasso_model.predict(data[training_data_idx:][feature_tuples_copy[0][0]]))-1, index=data[training_data_idx:].index)

In [189]:
predictions.to_csv('Lasso_prediction.csv')

In [190]:
predictions

Id
1461    226607.490664
1462    137559.908263
1463    168522.167955
1464    166124.473741
1465    173421.873840
1466    160278.425524
1467    156495.804321
1468    151530.479619
1469    173421.873840
1470    177401.635365
1471    174460.440737
1472    129896.713395
1473    138880.358515
1474    160278.425524
1475    136904.398570
1476    232362.065228
1477    267204.302672
1478    226607.490664
1479    212963.977632
1480    297521.927239
1481    233753.603624
1482    168925.134729
1483    158375.822461
1484    159705.257107
1485    152802.453610
1486    158944.219681
1487    298946.481455
1488    271060.885252
1489    212455.957898
1490    178038.314161
            ...      
2890    131143.604466
2891    149910.630347
2892     94772.539343
2893     94772.539343
2894     94772.539343
2895    171158.737530
2896    177401.635365
2897    188541.578022
2898    197766.101320
2899    170139.825633
2900    178038.314161
2901    175924.891660
2902    166124.473741
2903    222318.898427
2904   

In [106]:
for idx, col in enumerate(categorical_variables):
    interested_col = np.array(training_data[col], dtype=float).reshape(len(training_data), 1)
    relevance = mutual_information((interested_col, SalePrices), k=7)
    redundancy = 0
    for col2 in categorical_variables:
        if col2 != col:
            current_col = np.array(training_data[col2], dtype=float).reshape(len(training_data), 1)
            redundancy += mutual_information((current_col, interested_col), k=7)
    redundancy = redundancy / len(categorical_variables)
    categorical_rankings[col] = relevance - redundancy
    if idx % 5 == 0:
        print(idx)

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
305
310
315
320
325
330
335
340
345
350
355
360
365
370
375
380
385
390
395
400
405
410
415
420
425
430
435
440
445
450
455
460
465
470
475
480
485
490
495
500
505
510
515
520
525
530
535
540
545
550
555
560
565
570
