# Feature Selection

We'll use the $\chi^2$ test to perform feature selection on the Titanic dataset.  There are a few ways to use it in scikit-learn, so we'll look at some of the different options available to us.

In [None]:
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('ggplot')

In [None]:
from scipy import stats

# Plot the chi-squared density and shade the upper tail beyond a chosen test
# statistic `x`; the shaded area is the probability printed at the end.
dof = 1   # degrees of freedom of the chi-squared distribution being drawn
x = 7     # test statistic whose upper-tail probability is illustrated

plt.figure( figsize=( 7, 3 ) )

# Density curve over [0, 10].
xs = np.linspace( 0, 10, 100 )
ys = stats.chi2.pdf( xs, dof )
plt.plot( xs, ys )
plt.ylim( [-.1,1] )
plt.xlabel( "test statistic" )
plt.ylabel( "density" )
plt.title( "Chi-squared density and upper tail" )

# Mark the statistic on the axis and fill the area under the curve to its right.
plt.plot( x, 0.0, "ro" )
xs = np.linspace( x, 10, 100 )
ys = stats.chi2.pdf( xs, dof )
plt.fill_between( xs, ys, color="green" )

# sf() is the survival function (1 - CDF); calling it directly avoids the
# precision loss of subtracting a CDF value that is very close to 1.
print( "Percent for %f is %.3f%%" % ( x, stats.chi2.sf( x, dof ) * 100 ) )

In [None]:
# Read the passenger records into a DataFrame.
titanic = pd.read_csv( "../Week5/decisionTrees/titanic.csv" )

## Fix the data...

# Missing ages are replaced by the mean of the ages that were recorded.
mean_age = titanic["Age"].mean()
titanic[ "Age" ] = titanic["Age"].fillna( mean_age )

def sex_to_numeric(x):
    """Encode a sex label as an integer: 'male' -> 0, 'female' -> 1.

    Any other value is returned unchanged.
    """
    for label, code in ( ('male', 0), ('female', 1) ):
        if x == label:
            return code
    return x

# Replace the text labels in the "Sex" column with their numeric codes, one value at a time.
titanic["Sex"] = titanic["Sex"].map(sex_to_numeric)

# Safe to apply more than once: values that are already codes pass through.
def embarked_to_numeric(x):
    """Encode an embarkation port letter as an integer code.

    "S" -> 0, "C" -> 1, "Q" -> 2. Any other value, including a missing (NaN)
    entry, becomes 3. A value that is already one of the codes 0-3 is returned
    as that code, so re-applying the function to an encoded column does not
    collapse everything to 3.
    """
    port_codes = {"S": 0, "C": 1, "Q": 2}

    if isinstance(x, str):
        # Unrecognised strings fall into the catch-all code.
        return port_codes.get(x, 3)

    # Already encoded (NaN never compares equal, so it is not matched here).
    if x in (0, 1, 2, 3):
        return int(x)

    return 3
    
# Replace the port letters in "Embarked" with their numeric codes.
titanic["Embarked"] = titanic["Embarked"].apply(embarked_to_numeric)

# Keep only the label ("Survived") and the columns considered as predictors.
features = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
titanic = titanic[features]

# Survival among passengers whose fare was at least each threshold.
for fare in [0, 10, 50, 100, 500] :
    survived_at_fare = titanic.loc[ titanic.Fare >= fare, "Survived" ]  # filter once per threshold
    total = survived_at_fare.count()
    surv  = survived_at_fare.sum()
    # Guard against a threshold that no passenger reaches.
    pct   = surv / total * 100 if total else float( "nan" )
    print( "Total paying at least %3d: %3d.  Survived: %3d.  Percent: %5.1f%%" % ( fare,
                       total, surv, pct ) )

# Share of female passengers (Sex == 1) who survived; the denominator is taken
# from the data rather than a hard-coded head count.
female_survived = titanic.loc[ titanic.Sex == 1, "Survived" ]
print( "Percent female survivors: %.1f%%" % ( female_survived.mean() * 100 ) )

# Last expression of the cell so the preview is actually rendered.
titanic.head( 10 )

In [None]:
def splitData( features, data=None, test_size=0.5, random_state=1 ):
    """Split the selected predictor columns and the "Survived" label into train/test sets.

    Parameters
    ----------
    features : list-like of column names
        Columns of `data` to use as predictors.
    data : DataFrame, optional
        Frame holding the predictors and a "Survived" column. Defaults to the
        notebook-level `titanic` frame.
    test_size : float or int, optional
        Passed to `train_test_split`; defaults to 0.5.
    random_state : int, optional
        Seed passed to `train_test_split` so the split is repeatable; defaults to 1.

    Returns
    -------
    XTrain, XTest, yTrain, yTest
        Predictor and label arrays for the training and test portions.
    """
    if data is None:
        data = titanic

    labels = data["Survived"].values
    predictors = data[features].values

    # Split into training and test sets
    XTrain, XTest, yTrain, yTest = train_test_split( predictors, labels,
                                                     random_state=random_state,
                                                     test_size=test_size )
    return XTrain, XTest, yTrain, yTest

In [None]:
# Candidate predictors: every column after the first one ("Survived", the label).
predictor_features = titanic.columns[1:]

XTrain, XTest, yTrain, yTest = splitData( predictor_features )

# Fit a chi-squared based selector on the training portion.
selector = SelectPercentile( chi2 )
X_new = selector.fit_transform( XTrain, yTrain )

print( "Predictor Features:" , predictor_features.values )

print( "P-Values:" )
# zip() walks the three sequences in lock-step, yielding one
# (feature name, p-value, score) triple per predictor.
for name, p_value, score in zip( predictor_features, selector.pvalues_, selector.scores_ ):
    print( "   %8s: %.3f%%  (Score: %7.2f)" % (name, p_value * 100.0, score) )

# stats.chi2.sf( 1.0, 1 ) # 1 - CDF()

In [None]:
# Ask for the best 3 features according to the chi-squared test.
selector = SelectKBest( chi2, k=3 )
X_new = selector.fit_transform(XTrain, yTrain)

# Boolean mask over the predictors: True where the feature was kept.
support_mask = selector.get_support()

print( predictor_features.values )
print( support_mask )
print( "Best features to use:", predictor_features[ support_mask ].values )

In [None]:
# Number of passengers per embarkation code.
titanic.Embarked.value_counts()

In [None]:
print( "Survived based on embarkation location:" )
# Embarkation codes of the passengers with Survived == 1, tallied per code.
titanic.loc[ titanic["Survived"] == 1, "Embarked" ].value_counts()

In [None]:
print( "Died based on embarkation location:" )
# Embarkation codes of the passengers with Survived == 0, tallied per code.
titanic.loc[ titanic["Survived"] == 0, "Embarked" ].value_counts()