# FeatureSelection

In [None]:

#importing libraries
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [None]:
# Load the flattened health-care data set (path is relative to the notebook's
# working directory). Forward slashes are used because "\.", "\P" and "\H" are
# invalid escape sequences in a normal string literal, and "/" is accepted on
# Windows as well as on POSIX systems.
df = pd.read_csv("../../Project_DATA/HealthCarePred_flat_1.csv")
# Cast Spending2012 to integer because some classifiers cannot work with
# continuous data.
df["Spending2012"] = df["Spending2012"].astype(int)


In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
### Read previously saved voting results, if they exist.
# The file is only present after an earlier run has saved it, so the read is
# guarded: a fresh checkout must not fail here. The univariate cell below
# rebuilds `featuresVoting` from scratch, so this load only matters when that
# cell is skipped.
import os

saved_results_path = "../../Project_DATA/FeatureSelection_results.csv"
if os.path.exists(saved_results_path):
    featuresVoting = pd.read_csv(saved_results_path)
else:
    print("No saved results found at " + saved_results_path)

### Univariate Analysis: Correlation of the outcome variable with all other variables:

In [None]:
# Univariate vote: a feature gets 1 when its correlation with the target exceeds
# the threshold, otherwise 0 (a NaN correlation also yields 0).
target_col = "Spending2012"
corr_threshold = 0.2

rows, cols = df.shape
features = list(df.columns)

# Pick the feature columns by name instead of assuming the target is the last
# column. This keeps the rows in the same order as df.drop(columns=target_col),
# which is what the later model-based votes are aligned against.
feature_cols = [col for col in features if col != target_col]

# NOTE(review): the comparison is signed, so a strongly negative correlation
# votes 0 - confirm whether abs() was intended.
featuresVotingList = [
    {
        'Feature': col,
        'Univariable': 1 if df[target_col].corr(df[col]) > corr_threshold else 0,
    }
    for col in feature_cols
]
featuresVoting = pd.DataFrame(featuresVotingList)

In [None]:
# Preview the first five rows of the voting table.
featuresVoting.head(n=5)

#featuresVoting[featuresVoting.Univariable > 0.2].sort_values(by='Univariable', ascending=False)

### Multivariate analysis: 


In [None]:
# Feature matrix: every column except the target. The `columns=` keyword is used
# because the positional axis argument of DataFrame.drop was removed in
# pandas 2.0 and raises a TypeError there.
X = df.drop(columns="Spending2012")
# Target variable.
y = df["Spending2012"]

#### Lasso

In [None]:
# Fit a cross-validated Lasso on the full feature matrix (fit returns the
# estimator itself, so construction and fitting are chained).
reg = LassoCV().fit(X, y)

# Report the selected regularisation strength and the score on the same data
# the model was fitted on.
best_alpha = reg.alpha_
training_score = reg.score(X, y)
print("Best alpha using built-in LassoCV: %f" % best_alpha)
print("Best score using built-in LassoCV: %f" % training_score)

# Coefficients labelled by feature name, for use in the following cells.
coef = pd.Series(data=reg.coef_, index=X.columns)

In [None]:
# Lasso vote: 1 when the feature kept a non-zero coefficient, 0 when Lasso
# shrank it to zero.
# The coefficient is looked up by feature name rather than pasted in by position,
# so the vote stays correct even if the row order of featuresVoting differs from
# X.columns. Rows whose feature has no coefficient (not a column of X) get 0.
lasso_coef = featuresVoting['Feature'].map(coef).fillna(0)
featuresVoting['Lasso'] = lasso_coef.ne(0).astype('int8')
featuresVoting.head()

In [None]:
# Summarise how many coefficients Lasso kept versus set to zero.
n_picked = sum(coef != 0)
n_eliminated = sum(coef == 0)
print("Lasso picked " + str(n_picked) + " variables and eliminated the other " +  str(n_eliminated) + " variables")

#### Random Forest

In [None]:
### Spending2012 was already cast to int right after the data was loaded, so no further cast is needed before fitting the random forest below.
#df.Spending2012 = df.Spending2012.astype(int)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Select features through a random-forest regressor wrapped in SelectFromModel
# (default selection threshold). The forest is seeded so the selected feature
# set is the same on every run, matching the fixed random_state used for the
# gradient-boosting model further down.
sel = SelectFromModel(RandomForestRegressor(n_estimators = 100, random_state=0))
sel.fit(X, y)


In [None]:
# Boolean mask over the columns of X: True where the selector kept the feature.
rf_sup = sel.get_support(indices=False)

In [None]:
# Random-forest vote: 1 when SelectFromModel kept the feature, otherwise 0.
# The mask is labelled with X's column names and looked up by feature name, so
# the vote does not depend on featuresVoting having the same row order as
# X.columns. Rows whose feature is not a column of X get 0.
rf_votes = pd.Series(rf_sup.astype(int), index=X.columns)
featuresVoting['RF'] = featuresVoting['Feature'].map(rf_votes).fillna(0).astype(int)

featuresVoting.head()

In [None]:
# Persist the voting table. Forward slashes are used because "\.", "\P" and "\F"
# are invalid escape sequences in a normal string literal, and "/" is accepted
# on Windows as well as on POSIX systems.
# NOTE(review): on a top-to-bottom run this executes before the Gradient Boost
# vote is added further down, so the saved file will not contain that column -
# confirm whether the save should happen at the end instead.
featuresVoting.to_csv("../../Project_DATA/FeatureSelection_results.csv", index=False)

#### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters gathered in one place; random_state is fixed so the fit is
# repeatable.
gb_params = dict(
    n_estimators=100,
    learning_rate=0.3,
    max_depth=3,
    random_state=0,
)

# Note: despite the name `gbc`, this is a regressor.
gbc = GradientBoostingRegressor(**gb_params)
gbc.fit(X, y)


In [None]:
# Per-feature importances of the fitted gradient-boosting model; the next cell
# turns them into a 0/1 vote.
gb_imp = gbc.feature_importances_

In [None]:
# Gradient-boosting vote: 1 when the feature has a non-zero importance,
# otherwise 0.
# Importances are labelled with X's column names and looked up by feature name,
# so the vote does not depend on featuresVoting having the same row order as
# X.columns. Rows whose feature is not a column of X get 0.
gb_importance = featuresVoting['Feature'].map(pd.Series(gb_imp, index=X.columns)).fillna(0)
featuresVoting['GradientBoost'] = gb_importance.ne(0).astype('int8')
featuresVoting.head()

#### SVM

In [None]:
from sklearn.svm import SVC

# Import the class directly instead of `from sklearn import svm`: the variable
# below is also called `svm`, and rebinding the module name to the fitted model
# made this cell fail with AttributeError when it was run a second time.
# NOTE(review): SVC is a classifier, whereas the other models in this notebook
# are regressors - confirm that classification on the integer-cast target is
# intended here.
svm = SVC(kernel='linear')
svm.fit(X, y)

In [None]:

# Display the coefficients of the linear-kernel SVC fitted in the previous cell.
svm.coef_