In [1]:
# report which features were selected by RFE

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

import pandas as pd

In [2]:
path = './feature_selection/credit_cards_dataset.csv'
data = pd.read_csv(path)

In [3]:
data = data.rename(columns={'default.payment.next.month':'DEFAULT_NEXT_MONTH'})

In [4]:
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_NEXT_MONTH
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [5]:
y = 'DEFAULT_NEXT_MONTH'
X = [name for name in data.columns if name not in [y, 'ID']]

In [6]:
X

['LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

In [7]:
X=data[X]
X.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2,2,2,26,-1,2,0,0,0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
2,90000.0,2,2,2,34,0,0,0,0,0,...,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2,2,1,37,0,0,0,0,0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0


In [9]:
y=data[y]
y.head()

0    1
1    1
2    0
3    0
4    0
Name: DEFAULT_NEXT_MONTH, dtype: int64

In [19]:
# define RFE
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10)

In [20]:
# fit RFE
rfe.fit(X, y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10)

In [21]:
# summarize all features
for i in range(X.shape[1]):
  print('Column: %d, Selected=%s, Rank: %d' % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected=False, Rank: 2
Column: 1, Selected=False, Rank: 11
Column: 2, Selected=False, Rank: 8
Column: 3, Selected=False, Rank: 12
Column: 4, Selected=True, Rank: 1
Column: 5, Selected=True, Rank: 1
Column: 6, Selected=False, Rank: 7
Column: 7, Selected=False, Rank: 9
Column: 8, Selected=False, Rank: 14
Column: 9, Selected=False, Rank: 13
Column: 10, Selected=False, Rank: 10
Column: 11, Selected=True, Rank: 1
Column: 12, Selected=True, Rank: 1
Column: 13, Selected=True, Rank: 1
Column: 14, Selected=False, Rank: 6
Column: 15, Selected=True, Rank: 1
Column: 16, Selected=True, Rank: 1
Column: 17, Selected=False, Rank: 3
Column: 18, Selected=True, Rank: 1
Column: 19, Selected=True, Rank: 1
Column: 20, Selected=False, Rank: 5
Column: 21, Selected=False, Rank: 4
Column: 22, Selected=True, Rank: 1


In [22]:
x = [name for name in data.columns if name not in ['DEFAULT_NEXT_MONTH', 'ID']]
for i in range(X.shape[1]):
    if rfe.support_[i]:
        print(f'{x[i]}, Rank: {rfe.ranking_[i]}')

AGE, Rank: 1
PAY_0, Rank: 1
BILL_AMT1, Rank: 1
BILL_AMT2, Rank: 1
BILL_AMT3, Rank: 1
BILL_AMT5, Rank: 1
BILL_AMT6, Rank: 1
PAY_AMT2, Rank: 1
PAY_AMT3, Rank: 1
PAY_AMT6, Rank: 1
