# Feature Selection - Diabetes dataset


## Libraries

In [68]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Importing dataset

In [69]:
# Load the diabetes data from a CSV file located in the working directory.
df = pd.read_csv('diabetes.csv')
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## 1) Missing Values Ratio

In [70]:
# Missing-value ratio filter: report the share of null entries per column and
# list the columns whose share exceeds the 30% cut-off (drop candidates).
print("\nMissing Value Percentage")
missing = df.isnull().mean()  # fraction of nulls per column, in [0, 1]
print(missing*100)

# `missing` is a fraction, so it must be scaled to a percentage before being
# compared with the 30 (%) cut-off; comparing the raw fraction could never match.
drop_features = missing[missing * 100 > 30]
print(drop_features)

\Missing Value Percentage
Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64
Series([], dtype: float64)


-> The Diabetes dataset contains no null values, so no feature is dropped by the missing-value ratio filter.

In [71]:
# Baseline split: every column except the target is a predictor, and 20% of
# the rows are held out for testing (fixed seed for repeatability).
X = df.drop(columns='Outcome')
y = df.loc[:, 'Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Fit a logistic-regression classifier on all features and score it on the
# held-out rows.
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of the Model after Missing Value Ratio :", accuracy)

Accuracy of the Model after Missing Value Ratio : 0.7467532467532467


## 3) High Correlation filter

In [73]:
# High-correlation filter: list every pair of columns whose absolute Pearson
# correlation exceeds 0.8.
corr_matrix = df.corr()

# Keep only the strict upper triangle (k=1): this removes the diagonal
# (self-correlation) and the mirrored duplicate of each pair. Filtering on
# `!= 1` instead would also hide distinct columns that are perfectly
# correlated, which are the strongest candidates for removal.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
high_corr_pairs = (
    corr_matrix.abs()
    .where(upper_triangle)
    .stack()
    .sort_values(kind="quicksort", ascending=False)
)

# Filter pairs with correlation greater than 0.8 (masked NaN entries, if any
# survive stacking, compare False and are dropped here as well).
high_corr_pairs = high_corr_pairs[high_corr_pairs > 0.8]

print(high_corr_pairs)

Series([], dtype: float64)


In [74]:
# No pair exceeded the correlation cut-off, so all predictors are kept.
X_corr = df.drop(columns='Outcome')
y_corr = df.loc[:, 'Outcome']
X_train_corr, X_test_corr, y_train_corr, y_test_corr = train_test_split(
    X_corr,
    y_corr,
    test_size=0.2,
    random_state=42,
)

# Train and score a logistic-regression model on the retained features.
corr_model = LogisticRegression(max_iter=1000)
y_pred_corr = corr_model.fit(X_train_corr, y_train_corr).predict(X_test_corr)
accuracy_corr = accuracy_score(y_true=y_test_corr, y_pred=y_pred_corr)
print("Accuracy of the Model after High Correlation Filter :", accuracy_corr)

Accuracy of the Model after High Correlation Filter : 0.7467532467532467


## 5) Low Variance filter

In [75]:
# Low-variance filter: compute the variance of each predictor (target
# excluded) and flag those below the cut-off.
# NOTE(review): the variances are taken on the raw, unscaled columns, so they
# depend on each feature's units; confirm a single cut-off is intended.
low_var_df = df.drop(columns='Outcome')
var = low_var_df.var()
print(var)

low_variance_threshold = 0.1  # features with variance below this are flagged
low_variance_features = var.loc[var.lt(low_variance_threshold)]

print(low_variance_features)


Pregnancies                    11.354056
Glucose                      1022.248314
BloodPressure                 374.647271
SkinThickness                 254.473245
Insulin                     13281.180078
BMI                            62.159984
DiabetesPedigreeFunction        0.109779
Age                           138.303046
dtype: float64
Series([], dtype: float64)


In [76]:
# No predictor fell below the variance cut-off, so all of them are kept.
X_low = df.drop(columns='Outcome')
y_low = df.loc[:, 'Outcome']
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(
    X_low,
    y_low,
    test_size=0.2,
    random_state=42,
)

# Train and score a logistic-regression model on the retained features.
low_var_model = LogisticRegression(max_iter=1000)
y_pred_low = low_var_model.fit(X_train_low, y_train_low).predict(X_test_low)
accuracy_low = accuracy_score(y_true=y_test_low, y_pred=y_pred_low)
print("Accuracy of the Model after Low Variance Filter :", accuracy_low)

Accuracy of the Model after Low Variance Filter : 0.7467532467532467


## 7) Forward Feature Selection

In [77]:
# Forward feature selection with a logistic-regression estimator, followed by
# a refit on the selected columns and an accuracy check on the held-out rows.
X = df.drop(columns='Outcome')
y = df['Outcome']
# NOTE(review): this split holds out 30% of the rows, whereas the earlier
# sections hold out 20%; accuracies are therefore not directly comparable
# with the earlier ones. Confirm whether the different test size is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LogisticRegression(max_iter=1000)

# Perform forward feature selection using SequentialFeatureSelector.
# With n_features_to_select='auto' and the default tol=None, scikit-learn keeps
# half of the features; the count is a fixed rule, not a tuned optimum.
forward_selector = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward')
forward_selector.fit(X_train, y_train)

# Boolean support mask -> names of the selected columns.
selected_features_forward = X_train.columns[forward_selector.get_support()]
model.fit(X_train[selected_features_forward], y_train)

# predictions
y_pred_forward = model.predict(X_test[selected_features_forward])
accuracy_forward = accuracy_score(y_test, y_pred_forward)

print(selected_features_forward)
# Label corrected: the count is not an optimum (see the note on 'auto' above).
print("Number of selected features : ", len(selected_features_forward))
print("Accuracy of the Model after Forward Feature Selection :", accuracy_forward)

Index(['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')
Optimal number of features :  4
Accuracy of the Model after Forward Feature Selection : 0.7359307359307359


## 9) Backward Feature Elimination

In [78]:
# Backward elimination via recursive feature elimination (RFE) with a decision
# tree: one feature is removed per step until five remain. Uses the
# X_train / X_test / y_train / y_test split already present in the kernel.
decision_tree = DecisionTreeClassifier(random_state=42)

backward_selector = RFE(decision_tree, n_features_to_select=5, step=1)
backward_selector.fit(X_train, y_train)

# RFE gives rank 1 to every selected feature; larger ranks belong to features
# that were eliminated. The ascending sort therefore lists the kept features
# first, so the heading must read most -> least important.
ranking_backward = pd.Series(backward_selector.ranking_, index=X_train.columns).sort_values()

print("Feature Ranking (Most important to least important):")
print(ranking_backward)

# Refit the tree on the surviving columns only and score it on the test rows.
selected_features_backward = X_train.columns[backward_selector.support_]
decision_tree.fit(X_train[selected_features_backward], y_train)

y_pred_backward = decision_tree.predict(X_test[selected_features_backward])
accuracy_backward = accuracy_score(y_test, y_pred_backward)

print("\nFinal selected features after backward elimination:", selected_features_backward)
print("Accuracy with the final set of features:", accuracy_backward)


Feature Ranking (Least important to most important):
Glucose                     1
BloodPressure               1
BMI                         1
DiabetesPedigreeFunction    1
Age                         1
SkinThickness               2
Insulin                     3
Pregnancies                 4
dtype: int64

Final selected features after backward elimination: Index(['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')
Accuracy with the final set of features: 0.696969696969697


## 11) Random Forest

In [84]:
# Fit a random forest on all predictors and tabulate its feature importances,
# highest first. Uses the train/test split already present in the kernel.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
feature_importances = rfc.feature_importances_

importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)
print(feature_importance_df)


                    Feature  Importance
1                   Glucose    0.282089
5                       BMI    0.158120
7                       Age    0.142116
6  DiabetesPedigreeFunction    0.113127
2             BloodPressure    0.084052
0               Pregnancies    0.080552
3             SkinThickness    0.070559
4                   Insulin    0.069385


In [85]:
# Retrain the random forest on the five highest-ranked features only and
# measure accuracy on the same held-out rows.
top_5_features = feature_importance_df['Feature'].iloc[:5].values

X_train_reduced = X_train.loc[:, top_5_features]
X_test_reduced = X_test.loc[:, top_5_features]

rf_reduced = RandomForestClassifier(random_state=42).fit(X_train_reduced, y_train)
y_pred_reduced = rf_reduced.predict(X_test_reduced)

accuracy_reduced = accuracy_score(y_true=y_test, y_pred=y_pred_reduced)
print("Accuracy of the Model after Random Forest selecting Top 5 features :", accuracy_reduced)


Accuracy of the Model after Random Forest selecting Top 5 features : 0.7489177489177489
