# Feature Extraction

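Which features are important? This notebook compares two feature-selection approaches, univariate chi-squared scoring (`SelectKBest`) and recursive feature elimination (`RFE`), and checks each selection with a logistic-regression accuracy score.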

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Column labels are supplied by hand and handed to read_csv through `names=`.
names = ['preg', 'plas', 'pres', 'skin', 'test','mass', 'pedi', 'age', 'class']
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
df_cla = pd.read_csv(url, names=names)

# Split the raw value matrix: the first eight columns are the inputs,
# the ninth column ('class') is the target.
array = df_cla.values
x_cla, y_cla = array[:, :8], array[:, 8]
df_cla

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the k=2 features with the highest chi-squared score against the class label.
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(x_cla, y_cla)
print(fit.scores_)

# Reduce the input matrix to the selected columns and preview the first five rows.
features = fit.transform(x_cla)
print(features[:5, :])

[ 111.51969064 1411.88704064   17.60537322   53.10803984 2175.56527292
  127.66934333    5.39268155  181.30368904]
[[148.   0.]
 [ 85.   0.]
 [183.   0.]
 [ 89.  94.]
 [137. 168.]]


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out a third of the rows; the fixed seed keeps the split repeatable.
test_size, seed = 0.33, 1
x_train, x_test, y_train, y_test = train_test_split(
    features,
    y_cla,
    test_size=test_size,
    random_state=seed,
)

# Fit a logistic regression on the selected features and score it on the held-out rows.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(x_train, y_train)
results = model.score(x_test, y_test)
print(results * 100)  # accuracy as a percentage

76.77165354330708


In [13]:
# Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# RFE wrapped around a logistic regression, asked to keep four features.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=4)
fit = rfe.fit(x_cla, y_cla)

# NOTE(review): the column list printed here includes the target 'class', so it
# has one more entry than the ranking array printed after it.
print(df_cla.columns.tolist())
print(f"Feature_ranking:{fit.ranking_}")

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
Feature_ranking:[1 1 3 5 4 1 1 2]


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size, seed = 0.33, 1

# Train only on the columns whose RFE ranking was 1 or 2.
x_train, x_test, y_train, y_test = train_test_split(
    df_cla.loc[:, ['preg', 'plas', 'mass', 'pedi', 'age']],
    y_cla,
    test_size=test_size,
    random_state=seed,
)

model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(x_train, y_train)
results = model.score(x_test, y_test)
print(results * 100)  # accuracy as a percentage

77.16535433070865


In [15]:
# Second dataset. This rebinds names, url, df_cla, x_cla and y_cla, which held
# the Pima data until this point.
names = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiant', 'class',
]
url = 'breast-cancer.csv'
df_cla = pd.read_csv(url, names=names)

# Every column except the last is an input; the last column is the target.
dataset = df_cla.values
x_cla, y_cla = dataset[:, :-1], dataset[:, -1]
df_cla

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant,class
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
...,...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no','no-recurrence-events'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes','no-recurrence-events'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no','no-recurrence-events'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no','no-recurrence-events'


In [18]:
# Count missing values per column of the loaded frame.
df_cla.isna().sum()

age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiant      0
class          0
dtype: int64

In [19]:
# Cast every feature value to str.
# NOTE(review): NaN entries presumably become the literal string 'nan' here and
# therefore stop registering as missing; confirm that this is intended.
x_cla = x_cla.astype(str)
x_cla

array([["'40-49'", "'premeno'", "'15-19'", ..., "'right'", "'left_up'",
        "'no'"],
       ["'50-59'", "'ge40'", "'15-19'", ..., "'right'", "'central'",
        "'no'"],
       ["'50-59'", "'ge40'", "'35-39'", ..., "'left'", "'left_low'",
        "'no'"],
       ...,
       ["'30-39'", "'premeno'", "'30-34'", ..., "'right'", "'right_up'",
        "'no'"],
       ["'50-59'", "'premeno'", "'15-19'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'50-59'", "'ge40'", "'40-44'", ..., "'left'", "'right_up'",
        "'no'"]], dtype='<U11')

In [22]:
# Re-count missing values per column on the string-typed feature array.
pd.DataFrame(x_cla).isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

In [24]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

In [25]:
# Hold out 33% of the rows; random_state=1 keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_cla,
    y_cla,
    test_size=0.33,
    random_state=1,
)

In [26]:
# prepare input data
from sklearn.preprocessing import OrdinalEncoder

def prepare_inputs(x_train, x_test):
    """Ordinal-encode categorical feature matrices.

    The encoder learns its category-to-integer mapping from ``x_train`` only
    and then applies that same mapping to both splits.

    Parameters
    ----------
    x_train : array-like of shape (n_train_samples, n_features)
        Training features; the categories are learned from this array.
    x_test : array-like of shape (n_test_samples, n_features)
        Held-out features; encoded with the mapping learned from ``x_train``.

    Returns
    -------
    tuple
        ``(x_train_enc, x_test_enc)``, the encoded versions of the two inputs.

    Raises
    ------
    ValueError
        Propagated from the encoder when ``x_test`` holds a category that
        never occurs in ``x_train`` (the encoder is built with its default
        settings, which reject unknown categories).
    """
    oe = OrdinalEncoder()
    # fit() has the signature (X, y=None) and ignores y. Passing x_test as the
    # second argument therefore never had any effect; it is dropped so the call
    # no longer suggests that the test categories are seen during fitting.
    oe.fit(x_train)
    x_train_enc = oe.transform(x_train)
    x_test_enc = oe.transform(x_test)
    return x_train_enc, x_test_enc

In [27]:
# Ordinal-encode the train and test feature matrices.
x_train_enc, x_test_enc = prepare_inputs(x_train, x_test)
# Display the encoded training features.
x_train_enc

array([[ 3.,  0.,  4., ...,  0.,  3.,  0.],
       [ 1.,  2.,  9., ...,  0.,  3.,  0.],
       [ 3.,  2., 10., ...,  1.,  2.,  1.],
       ...,
       [ 4.,  0.,  1., ...,  1.,  1.,  0.],
       [ 4.,  0.,  7., ...,  1.,  1.,  0.],
       [ 4.,  0.,  8., ...,  0.,  0.,  0.]])

In [30]:
# Encode the targets with the feature helper; each label array is reshaped
# into a single-column 2-D array before being passed in.
y_train_enc, y_test_enc = prepare_inputs(y_train.reshape(-1,1), y_test.reshape(-1,1))
y_train_enc

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

In [35]:
# prepare target
from sklearn.preprocessing import LabelEncoder

def prepare_target(y_train, y_test):
    """Label-encode the class targets as integers.

    A single encoder is fitted on ``y_train`` and used for both splits, so
    train and test labels share one mapping.

    Parameters
    ----------
    y_train : array-like of shape (n_train_samples,)
        Training labels; the label set is learned from this array.
    y_test : array-like of shape (n_test_samples,)
        Held-out labels; encoded with the mapping learned from ``y_train``.

    Returns
    -------
    tuple
        ``(y_train_enc, y_test_enc)``, the integer-encoded label arrays.

    Raises
    ------
    ValueError
        Propagated from the encoder when ``y_test`` holds a label that never
        occurs in ``y_train``.
    """
    # The original built a second encoder named ``le_test`` but fitted it on
    # y_train as well, which made it an exact duplicate of the first one.
    # One encoder gives the same result and states the intent directly.
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [37]:
# Encode the class labels as integers.
y_train_enc, y_test_enc = prepare_target(y_train, y_test)
# Display the encoded training labels.
y_train_enc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size, seed = 0.33, 1

# NOTE(review): this splits the already-encoded *training* portion a second time
# and rebinds x_train/x_test/y_train/y_test; x_test_enc and y_test_enc are not
# used for scoring here.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_enc,
    y_train_enc,
    test_size=test_size,
    random_state=seed,
)

# Baseline: logistic regression on every encoded feature.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(x_train, y_train)
results = model.score(x_test, y_test)
print(results * 100)  # accuracy as a percentage

75.0


In [50]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the k=4 features with the highest chi-squared score against the encoded class label.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x_train_enc, y_train_enc)
print(fit.scores_)

# Reduce the encoded inputs to the selected columns.
features = fit.transform(x_train_enc)
# `names` still contains the target 'class' as its last entry.
print(names)
print(features)

[4.72552966e-01 2.91926930e-02 2.13765782e+00 2.93810590e+01
 8.22260110e+00 8.10018314e+00 1.27382179e+00 9.50681603e-01
 3.69998905e+00]
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiant', 'class']
[[0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 1. 1.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 1.]
 [0. 0. 2. 0.]
 [0. 0. 1. 0.]
 [4. 1. 1. 1.]
 [0. 0. 0. 0.]
 [0. 0. 2. 0.]
 [2. 1. 2. 0.]
 [2. 1. 2. 0.]
 [0. 1. 2. 0.]
 [0. 0. 2. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [6. 0. 1. 1.]
 [4. 0. 1. 1.]
 [4. 1. 2. 0.]
 [4. 0. 0. 0.]
 [4. 0. 2. 0.]
 [0. 0. 1. 0.]
 [0. 0. 2. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 2. 1.]
 [0. 0. 0. 0.]
 [5. 0. 1. 1.]
 [4. 0. 2. 1.]
 [0. 0. 2. 0.]
 [0. 0. 2. 0.]
 [4. 0. 1. 1.]
 [0. 0. 1. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 1.]
 [5. 1. 2. 0.]
 [0. 0. 2. 1.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 1.]
 [0. 0. 1. 0.]
 [0. 1. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 2. 0.]
 [0. 0. 0. 0.]
 [0. 

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size, seed = 0.33, 1

# Split the chi2-selected feature columns and the encoded labels.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    y_train_enc,
    test_size=test_size,
    random_state=seed,
)

model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(x_train, y_train)
results = model.score(x_test, y_test)
print(results * 100)  # accuracy as a percentage

79.6875


In [52]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Same chi-squared selection as before, now keeping only the top k=3 features.
test = SelectKBest(score_func=chi2, k=3)
fit = test.fit(x_train_enc, y_train_enc)
print(fit.scores_)

# Reduce the encoded inputs to the selected columns.
features = fit.transform(x_train_enc)
# `names` still contains the target 'class' as its last entry.
print(names)
print(features)


[4.72552966e-01 2.91926930e-02 2.13765782e+00 2.93810590e+01
 8.22260110e+00 8.10018314e+00 1.27382179e+00 9.50681603e-01
 3.69998905e+00]
['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiant', 'class']
[[0. 0. 0.]
 [0. 0. 1.]
 [0. 1. 1.]
 [0. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 2.]
 [0. 0. 1.]
 [4. 1. 1.]
 [0. 0. 0.]
 [0. 0. 2.]
 [2. 1. 2.]
 [2. 1. 2.]
 [0. 1. 2.]
 [0. 0. 2.]
 [0. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [6. 0. 1.]
 [4. 0. 1.]
 [4. 1. 2.]
 [4. 0. 0.]
 [4. 0. 2.]
 [0. 0. 1.]
 [0. 0. 2.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 2.]
 [0. 0. 0.]
 [5. 0. 1.]
 [4. 0. 2.]
 [0. 0. 2.]
 [0. 0. 2.]
 [4. 0. 1.]
 [0. 0. 1.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [5. 1. 2.]
 [0. 0. 2.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 1.]
 [0. 0. 1.]
 [0. 0. 2.]
 [0. 0. 0.]
 [0. 0. 2.]
 [0. 0. 1.]
 [6. 1. 1.]
 [0. 0. 1.]
 [0. 0. 2.]
 [0. 0. 2.]
 [0. 0. 1.]
 [0. 2. 1.]
 [0. 0. 1.]
 [5. 0. 1.]
 [4. 0. 1.]
 [4. 1. 2.]
 [0. 0. 

In [54]:
# Dimensions of the array produced by the selector's transform.
features.shape

(191, 3)

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size, seed = 0.33, 1

# Split the current `features` array together with the encoded labels.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    y_train_enc,
    test_size=test_size,
    random_state=seed,
)

model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(x_train, y_train)
results = model.score(x_test, y_test)
print(results * 100)  # accuracy as a percentage

81.25


In [56]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# RFE wrapped around a logistic regression, asked to keep three features.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=3)
fit = rfe.fit(x_train_enc, y_train_enc)

# NOTE(review): the column list printed here includes the target 'class', so it
# has one more entry than the ranking array printed after it.
print(df_cla.columns.tolist())
print(f"Feature_ranking:{fit.ranking_}")

['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiant', 'class']
Feature_ranking:[3 7 6 5 1 1 1 4 2]


In [57]:
# The three features whose RFE ranking is 1 (positions 4-6 of the ranking array).
['node-caps', 'deg-malig', 'breast']

['node-caps', 'deg-malig', 'breast']

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size, seed = 0.33, 1

# Columns 4, 5 and 6 of the encoded matrix are passed on as the inputs.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_enc[:, 4:7],
    y_train_enc,
    test_size=test_size,
    random_state=seed,
)

model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(x_train, y_train)
results = model.score(x_test, y_test)
print(results * 100)  # accuracy as a percentage

76.5625
