# FeatureSelection

In [1]:

#importing libraries
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [2]:
# Load the source CSV. The raw-string prefix keeps the Windows-style backslashes literal;
# without it "\.", "\P" and "\H" are invalid escape sequences, which Python warns about
# and plans to reject in a future version. The path value itself is unchanged.
df = pd.read_csv(r"..\..\Project_DATA\HealthCareCat_flat_1.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,State_County_CD,AGE,SEX,race,CCS101,CCS104,CCS105,CCS106,CCS108,CCS257,...,HCC154,HCC155,HCC157,HCC158,HCC161,HCC164,HCC174,HCC176,HCC177,Spending_Cat
0,22070,67,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,22150,67,1,1,4,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,22160,53,1,1,14,0,16,2,0,1,...,0,0,0,0,0,0,0,0,0,1
3,22040,67,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,22090,67,1,1,0,0,0,36,3,1,...,0,0,0,0,0,0,0,0,0,1


#### The dataframe I have is very imbalanced :
#### df[df.Spending_Cat == 1] -- 125309
#### df[df.Spending_Cat == 2] -- 4719  --   x 25
#### df[df.Spending_Cat == 3] -- 1075  --   x 115
#### df[df.Spending_Cat == 4] -- 170   --   x 736
#### I will split it into train, dev and test sets now, before starting the feature-selection process. This way the test set will not be changed by over-sampling.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw data as the test set before any oversampling happens.
# - stratify: keeps the Spending_Cat proportions the same in both parts, so the rare
#   categories are not lost from the test set by chance.
# - random_state: makes the split (and the saved test file) reproducible across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Spending_Cat"],
    random_state=0,
)

# Raw string: the backslashes are path separators, not escape sequences.
df_test.to_csv(r"..\..\Project_DATA\HealthCareCat_Test.csv", index=False)


In [5]:
# Separate the predictors from the target. `columns=` replaces the positional axis
# argument (`drop("Spending_Cat", 1)`), which newer pandas versions no longer accept.
X = df_train.drop(columns="Spending_Cat")   #Feature Matrix
y = df_train["Spending_Cat"]                #Target Variable

#### Use Synthetic Minority Oversampling Technique to make dataset balanced

In [6]:

from imblearn.over_sampling import SMOTE

# Balance the classes by resampling the training portion only.
# random_state pins the resampler's randomness so the resulting data set is
# the same on every run.
# NOTE(review): this cell rebinds X and y to the resampled data, so it should be run
# exactly once per kernel session, directly after the cell that defines X and y.
oversample = SMOTE(random_state=0)
X, y = oversample.fit_resample(X, y)


In [7]:
### Oversampled dataset split to train and dev and save as csv files
# Build a separate frame that carries the target instead of adding the column to X in
# place: X must stay a pure feature matrix, otherwise re-running the oversampling cell
# would treat Spending_Cat as a feature, and X.columns would no longer match the
# per-feature outputs of the models fitted later.
oversampled = X.assign(Spending_Cat=y)

# random_state makes the train/dev partition (and the saved files) reproducible.
# NOTE(review): the dev set is carved out after oversampling, so it contains resampled
# rows derived from the same data as the training rows - confirm this is intended.
df_train, df_dev = train_test_split(oversampled, test_size=0.2, random_state=0)

# Raw strings: the backslashes are path separators, not escape sequences.
df_dev.to_csv(r"..\..\Project_DATA\HealthCareCat_Dev.csv", index=False)
df_train.to_csv(r"..\..\Project_DATA\HealthCareCat_Train.csv", index=False)

In [7]:
### Read the previously saved results, if they exist

### featuresVoting = pd.read_csv("..\..\Project_DATA\FeatureSelection_results.csv")

In [8]:
# Reload the saved train/dev splits so the notebook can be resumed from this cell.
# Raw strings: the backslashes are path separators, not escape sequences.
df_train = pd.read_csv(r"..\..\Project_DATA\HealthCareCat_Train.csv")
df_dev = pd.read_csv(r"..\..\Project_DATA\HealthCareCat_Dev.csv")

# `columns=` replaces the positional axis argument, which newer pandas versions reject.
X_train = df_train.drop(columns="Spending_Cat")   #Feature Matrix
y_train = df_train["Spending_Cat"]                #Target Variable

X_dev = df_dev.drop(columns="Spending_Cat")   #Feature Matrix
y_dev = df_dev["Spending_Cat"]                #Target Variable

### Univariate Analysis: Correlation of the outcome variable with all other variables:

In [23]:
### Voting table for feature selection: one row per training feature.
### Each selection method below adds its own 0/1 column to this frame.

featuresVoting = pd.DataFrame(data=X_train.columns)

In [12]:
### Univariate summary of the training data with tableone,
### grouped by the target category and including p-values.

from tableone import TableOne

groupby = 'Spending_Cat'
mytable = TableOne(data=df_train, groupby=groupby, pval=True)


In [13]:
# Display the TableOne summary built in the previous cell.
mytable

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Spending_Cat,Grouped by Spending_Cat,Grouped by Spending_Cat,Grouped by Spending_Cat,Grouped by Spending_Cat,Grouped by Spending_Cat,Grouped by Spending_Cat
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,1,2,3,4,P-Value
n,,,320748,80285,79984,80216,80263,
"State_County_CD, n (%)",22000,0,12749 (4.0),5935 (7.4),3155 (3.9),1588 (2.0),2071 (2.6),<0.001
"State_County_CD, n (%)",22001,,701 (0.2),,308 (0.4),180 (0.2),213 (0.3),
"State_County_CD, n (%)",22002,,686 (0.2),,315 (0.4),184 (0.2),187 (0.2),
"State_County_CD, n (%)",22003,,678 (0.2),,275 (0.3),198 (0.2),205 (0.3),
...,...,...,...,...,...,...,...,...
"HCC174, n (%)",1,,341 (0.1),175 (0.2),65 (0.1),98 (0.1),3 (0.0),
"HCC176, n (%)",0,0,316325 (98.6),79385 (98.9),78861 (98.6),78626 (98.0),79453 (99.0),<0.001
"HCC176, n (%)",1,,4423 (1.4),900 (1.1),1123 (1.4),1590 (2.0),810 (1.0),
"HCC177, n (%)",0,0,320113 (99.8),80165 (99.9),79923 (99.9),80130 (99.9),79895 (99.5),<0.001


In [25]:
### TableOne reports a significant p-value (< 0.05) for almost every feature, but it also
### emitted a warning about possibly invalid measurements, so every feature is re-tested
### here with its own chi-square test of independence against the target.

from scipy import stats

featuresVoting["Univariable"] = 0

rows, cols = X_train.shape
features = list(X_train.columns)

# One flag per feature, in X_train column order:
# 1 when the chi-square p-value is below 0.05, otherwise 0.
pvalues = [
    1 if stats.chi2_contingency(pd.crosstab(y_train, X_train[name]))[1] < 0.05 else 0
    for name in features
]
featuresVoting["Univariable"] = pvalues


In [26]:
# Show the voting table, which now includes the "Univariable" column.
featuresVoting

Unnamed: 0,0,Univariable
0,State_County_CD,1
1,AGE,1
2,SEX,1
3,race,1
4,CCS101,1
...,...,...
350,HCC161,1
351,HCC164,1
352,HCC174,1
353,HCC176,1


In [27]:
# Checkpoint the voting table. Raw string: the backslashes are path separators, and
# "\.", "\P", "\F" are invalid escape sequences in a normal string literal.
featuresVoting.to_csv(r"..\..\Project_DATA\FeatureSelection_results.csv", index=False)

  featuresVoting.to_csv("..\..\Project_DATA\FeatureSelection_results.csv", index=False)


#### I got mostly the same p-values that way 

### Multivariate analysis: 


#### Lasso

In [29]:
# Fit a cross-validated Lasso on the training data, then report the chosen alpha and
# the model's score on that same training data.
reg = LassoCV()
reg.fit(X_train, y_train)

best_alpha = reg.alpha_
train_score = reg.score(X_train, y_train)
print("Best alpha using built-in LassoCV: %f" % best_alpha)
print("Best score using built-in LassoCV: %f" % train_score)

# Fitted coefficients labelled by feature name.
coef = pd.Series(data=reg.coef_, index=X_train.columns)

Best alpha using built-in LassoCV: 0.025719
Best score using built-in LassoCV: 0.373614


In [30]:
# Lasso vote: 1 for every feature whose coefficient is non-zero, 0 otherwise.
# The flags are stored with the smallest integer dtype (int8).
lasso_kept = coef.values != 0
featuresVoting['Lasso'] = lasso_kept.astype('int8')
featuresVoting.head()

Unnamed: 0,0,Univariable,Lasso
0,State_County_CD,1,1
1,AGE,1,1
2,SEX,1,1
3,race,1,0
4,CCS101,1,1


In [31]:
# Report how many coefficients Lasso kept (non-zero) and how many it shrank to zero.
n_kept = sum(coef != 0)
n_dropped = sum(coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_dropped) + " variables")

Lasso picked 144 variables and eliminated the other 211 variables


In [32]:
# Checkpoint the voting table after the Lasso vote. Raw string: the backslashes are
# path separators, and "\.", "\P", "\F" are invalid escape sequences otherwise.
featuresVoting.to_csv(r"..\..\Project_DATA\FeatureSelection_results.csv", index=False)

  featuresVoting.to_csv("..\..\Project_DATA\FeatureSelection_results.csv", index=False)


#### Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Select features through a random forest wrapped in SelectFromModel.
# random_state pins the forest's randomness: without it the importances - and
# therefore the set of selected features - can differ on every run.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0))
sel.fit(X_train, y_train)


SelectFromModel(estimator=RandomForestClassifier())

In [34]:
# Boolean mask over the training columns: True where the selector kept the feature.
rf_sup = sel.get_support(indices=False)

In [35]:
# Random-forest vote: convert the boolean selection mask into 0/1 integers.
featuresVoting['RF'] = rf_sup.astype(int)

featuresVoting.head()

Unnamed: 0,0,Univariable,Lasso,RF
0,State_County_CD,1,1,1
1,AGE,1,1,1
2,SEX,1,1,1
3,race,1,0,1
4,CCS101,1,1,1


In [36]:
# Checkpoint the voting table after the random-forest vote. Raw string: the backslashes
# are path separators, and "\.", "\P", "\F" are invalid escape sequences otherwise.
featuresVoting.to_csv(r"..\..\Project_DATA\FeatureSelection_results.csv", index=False)

  featuresVoting.to_csv("..\..\Project_DATA\FeatureSelection_results.csv", index=False)


#### Gradient Boost

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting classifier fitted on the training split; its feature importances
# are read in the next cell. The seed is fixed so the fit is repeatable.
gbc = GradientBoostingClassifier(
    learning_rate=0.3,
    max_depth=10,
    n_estimators=100,
    random_state=0,
)
gbc.fit(X_train, y_train)


GradientBoostingClassifier(learning_rate=0.3, max_depth=10, random_state=0)

In [38]:
# Feature importances of the fitted gradient-boosting model (gbc was fitted on X_train,
# so the values follow the X_train column order).
gb_imp = gbc.feature_importances_

In [39]:
# Gradient-boosting vote: 1 for every feature with a non-zero importance, 0 otherwise.
# The flags are stored with the smallest integer dtype (int8).
gb_used = gb_imp != 0
featuresVoting['GradientBoost'] = gb_used.astype('int8')
featuresVoting.head()

Unnamed: 0,0,Univariable,Lasso,RF,GradientBoost
0,State_County_CD,1,1,1,1
1,AGE,1,1,1,1
2,SEX,1,1,1,1
3,race,1,0,1,1
4,CCS101,1,1,1,1


In [40]:
# Checkpoint the voting table after the gradient-boosting vote. Raw string: the
# backslashes are path separators, and "\.", "\P", "\F" are invalid escapes otherwise.
featuresVoting.to_csv(r"..\..\Project_DATA\FeatureSelection_results.csv", index=False)

  featuresVoting.to_csv("..\..\Project_DATA\FeatureSelection_results.csv", index=False)


#### SVM

In [21]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# L1-penalised linear SVM fitted on the training split; the primal formulation
# (dual=False) is used and the iteration cap is raised to 10000.
svm_model = LinearSVC(penalty="l1", C=0.01, dual=False, max_iter=10000)
svm = svm_model.fit(X_train, y_train)


In [22]:
# The 100 largest absolute coefficients from the first row of the SVM's coef_,
# labelled by feature name.
pd.Series(svm.coef_[0], index=X_train.columns).abs().nlargest(100)

HCC18      1.325324
HCC17      1.024352
HCC16      0.994149
HCC1       0.912624
HCC27      0.882052
             ...   
CCS2617    0.029521
CCS54      0.029068
CCS129     0.028966
CCS116     0.028850
CCS20      0.028809
Length: 100, dtype: float64

In [23]:
# SVM vote: 1 when a feature's absolute coefficient exceeds 0.5, otherwise 0.
# The coefficients are labelled with X_train.columns - the frame the SVM was fitted on.
# The earlier `X` must not be used here: it is not defined when the notebook is resumed
# from the saved CSV files, and its columns are not guaranteed to match the model's.
# NOTE(review): coef_[0] is only the first row of coef_. With more than two target
# classes LinearSVC stores one row per class, so this vote reflects a single class -
# confirm that is intended.
featuresVoting['SVM1'] = pd.Series(abs(svm.coef_[0]), index=X_train.columns).values

featuresVoting.loc[featuresVoting['SVM1'] > 0.5, 'SVM'] = 1
featuresVoting.loc[featuresVoting['SVM1'] <= 0.5, 'SVM'] = 0
featuresVoting.SVM = pd.to_numeric(featuresVoting.SVM, downcast='integer')
# Drop the helper column; only the 0/1 vote is kept.
featuresVoting = featuresVoting.drop(['SVM1'], axis=1)
featuresVoting.head()



Unnamed: 0,0,Univariable,Lasso,RF,GradientBoost,SVM
0,State_County_CD,1,1,1,1,0
1,AGE,1,1,1,1,0
2,SEX,1,1,1,1,1
3,race,1,0,1,1,0
4,CCS101,1,1,1,1,0


In [24]:
# Checkpoint the voting table after the SVM vote. Raw string: the backslashes are
# path separators, and "\.", "\P", "\F" are invalid escape sequences otherwise.
featuresVoting.to_csv(r"..\..\Project_DATA\FeatureSelection_results.csv", index=False)

#### Selection based on voting:


In [25]:
# Total number of methods that voted for each feature, listed from most to fewest votes.
vote_columns = ["Univariable", "Lasso", "RF", "GradientBoost", "SVM"]
featuresVoting["Sum"] = featuresVoting[vote_columns].sum(axis=1, skipna=False)
featuresVoting.sort_values(by='Sum', ascending=False)

Unnamed: 0,0,Univariable,Lasso,RF,GradientBoost,SVM,Sum
2,SEX,1,1,1,1,1,5
293,HCC10,1,1,1,1,1,5
298,HCC19,1,1,1,1,1,5
0,State_County_CD,1,1,1,1,0,4
81,CCS237,1,1,1,1,0,4
...,...,...,...,...,...,...,...
248,CCS189,1,0,0,0,0,1
266,CCS180,0,0,0,0,0,0
277,CCS194,0,0,0,0,0,0
285,CCS218,0,0,0,0,0,0


In [26]:
# Keep the features that received more than three votes.
has_enough_votes = featuresVoting.Sum > 3
featuresInfluent = featuresVoting[has_enough_votes]
featuresInfluent

Unnamed: 0,0,Univariable,Lasso,RF,GradientBoost,SVM,Sum
0,State_County_CD,1,1,1,1,0,4
1,AGE,1,1,1,1,0,4
2,SEX,1,1,1,1,1,5
4,CCS101,1,1,1,1,0,4
5,CCS104,1,1,1,1,0,4
...,...,...,...,...,...,...,...
185,CCS231,1,1,1,1,0,4
200,CCS45,1,1,1,1,0,4
207,CCS39,1,1,1,1,0,4
293,HCC10,1,1,1,1,1,5


In [27]:
# Save the selected features. Raw string: the backslashes are path separators, and
# "\.", "\P", "\S" are invalid escape sequences in a normal string literal.
featuresInfluent.to_csv(r"..\..\Project_DATA\SelectedFeatures.csv", index=False)