Exploring the concept of recursive feature elimination and understanding how a particular feature plays and important role

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [7]:
from sklearn.datasets import load_breast_cancer

In [9]:
#Exploring brease_cancer_data_set
bc = load_breast_cancer()

In [11]:
print bc.DESCR

Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        

In [12]:
data_df = pd.DataFrame(data=bc.data,columns=bc.feature_names)

In [13]:
data_df['target']=bc.target

In [15]:
data_df.head(10)
data_df.tail(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
559,11.51,23.93,74.52,403.5,0.09261,0.1021,0.1112,0.04105,0.1388,0.0657,...,37.16,82.28,474.2,0.1298,0.2517,0.363,0.09653,0.2112,0.08732,1
560,14.05,27.15,91.38,600.4,0.09929,0.1126,0.04462,0.04304,0.1537,0.06171,...,33.17,100.2,706.7,0.1241,0.2264,0.1326,0.1048,0.225,0.08321,1
561,11.2,29.37,70.67,386.0,0.07449,0.03558,0.0,0.0,0.106,0.05502,...,38.3,75.19,439.6,0.09267,0.05494,0.0,0.0,0.1566,0.05905,1
562,15.22,30.62,103.4,716.9,0.1048,0.2087,0.255,0.09429,0.2128,0.07152,...,42.79,128.7,915.0,0.1417,0.7917,1.17,0.2356,0.4089,0.1409,0
563,20.92,25.09,143.0,1347.0,0.1099,0.2236,0.3174,0.1474,0.2149,0.06879,...,29.41,179.1,1819.0,0.1407,0.4186,0.6599,0.2542,0.2929,0.09873,0
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [16]:
bc.target_names

array(['malignant', 'benign'],
      dtype='|S9')

In [18]:
bc.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='|S23')

In [19]:
#A NaiveBayes classification

In [20]:
from sklearn.naive_bayes import GaussianNB

In [21]:
from sklearn.cross_validation import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(data_df.drop(['target'],axis=1), data_df['target'], test_size=0.33, random_state=42)

In [23]:
gnb = GaussianNB()

In [24]:
gnb.fit(X_train,y_train)

GaussianNB(priors=None)

In [25]:
pred = gnb.predict(X_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
print classification_report(y_test,pred)

             precision    recall  f1-score   support

          0       0.92      0.91      0.92        67
          1       0.95      0.96      0.95       121

avg / total       0.94      0.94      0.94       188



In [28]:
from sklearn.ensemble import AdaBoostClassifier

In [29]:
ada = AdaBoostClassifier()

In [30]:
ada.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [31]:
ada_pred = ada.predict(X_test)

In [32]:
print classification_report(y_test,ada_pred)

             precision    recall  f1-score   support

          0       0.91      0.96      0.93        67
          1       0.97      0.95      0.96       121

avg / total       0.95      0.95      0.95       188



In [33]:
from sklearn.feature_selection import RFE

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rforest = RandomForestClassifier()

In [48]:
rfe= RFE(rforest,n_features_to_select=2)

In [49]:
rfe.fit(X_train,y_train)

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
  n_features_to_select=2, step=1, verbose=0)

In [50]:
rfe_pred=rfe.predict(X_test)

In [51]:
print classification_report(y_test,rfe_pred)

             precision    recall  f1-score   support

          0       0.81      0.88      0.84        67
          1       0.93      0.88      0.91       121

avg / total       0.89      0.88      0.88       188



In [82]:
#Feature ranking

In [55]:
rfe.ranking_

array([14,  5, 12, 26, 23, 24,  8,  1, 21, 20, 13, 18,  7, 10, 16, 25, 15,
       19, 28, 29,  4,  9,  2,  3, 17, 27,  6,  1, 11, 22])

In [57]:
rfe.support_

array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False], dtype=bool)

In [58]:
data_df.columns

Index([u'mean radius', u'mean texture', u'mean perimeter', u'mean area',
       u'mean smoothness', u'mean compactness', u'mean concavity',
       u'mean concave points', u'mean symmetry', u'mean fractal dimension',
       u'radius error', u'texture error', u'perimeter error', u'area error',
       u'smoothness error', u'compactness error', u'concavity error',
       u'concave points error', u'symmetry error', u'fractal dimension error',
       u'worst radius', u'worst texture', u'worst perimeter', u'worst area',
       u'worst smoothness', u'worst compactness', u'worst concavity',
       u'worst concave points', u'worst symmetry', u'worst fractal dimension',
       u'target'],
      dtype='object')

In [60]:
feature_rank = pd.DataFrame(rfe.ranking_,columns=['ranking'])

In [65]:
feature_rank['feature']=data_df.drop(['target'],axis=1).columns

In [68]:
feature_rank.sort_values('ranking')

Unnamed: 0,ranking,feature
27,1,worst concave points
7,1,mean concave points
22,2,worst perimeter
23,3,worst area
20,4,worst radius
1,5,mean texture
26,6,worst concavity
12,7,perimeter error
6,8,mean concavity
21,9,worst texture


In [69]:
#considering worst features for classification
#i.e symmetry error and fractal dimension error

In [71]:
gnb_worst = GaussianNB()
gnb_best = GaussianNB()

In [72]:
data_worst = data_df[['symmetry error','fractal dimension error']]
data_best = data_df[['worst concave points','mean concave points']]

In [73]:
X_train, X_test, y_train, y_test = train_test_split(data_worst, data_df['target'], test_size=0.33, random_state=42)

In [74]:
gnb_worst.fit(X_train,y_train)

GaussianNB(priors=None)

In [75]:
gnb_worst_pred = gnb_worst.predict(X_test)

In [76]:
print classification_report(y_test,gnb_worst_pred)

             precision    recall  f1-score   support

          0       0.32      0.10      0.16        67
          1       0.64      0.88      0.74       121

avg / total       0.52      0.60      0.53       188



In [77]:
X_train, X_test, y_train, y_test = train_test_split(data_best, data_df['target'], test_size=0.33, random_state=42)

In [78]:
gnb_best.fit(X_train,y_train)

GaussianNB(priors=None)

In [80]:
gnb_best_pred = gnb_best.predict(X_test)

In [81]:
print classification_report(y_test,gnb_best_pred)

             precision    recall  f1-score   support

          0       0.82      0.90      0.86        67
          1       0.94      0.89      0.92       121

avg / total       0.90      0.89      0.89       188

