In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
df = pd.read_csv("dataset/wdbc.csv")
df = df.drop("id", axis=1)
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [3]:
encoder = LabelEncoder()

df["diagnosis"] = encoder.fit_transform(df["diagnosis"])

In [4]:
X = df.drop("diagnosis", axis=1)
y = pd.DataFrame(df["diagnosis"]).copy()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel(), test_size= 0.20, random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
clf = LogisticRegression(max_iter=3000, random_state=42)
clf.fit(X_train, y_train)
print(f"Accuracy : {clf.score(X_test, y_test):.3f}")

Accuracy : 0.956


## 1. Univariate Selection

In [8]:
from sklearn.feature_selection import SelectKBest, chi2

best_features = SelectKBest(score_func=chi2, k=5).fit(X_train, y_train)
selected_features = X_train.columns[best_features.get_support()]

print(f"Number of selected features: {len(selected_features)}")
print(f"Selected features : {list(selected_features)}")

Number of selected features: 5
Selected features : ['perimeter_mean', 'area_mean', 'area_se', 'perimeter_worst', 'area_worst']


In [9]:
X_train_selected = best_features.transform(X_train)
X_test_selected = best_features.transform(X_test)

clf.fit(X_train_selected, y_train)
print(f"Accuracy : {clf.score(X_test_selected, y_test):.3f}")

Accuracy : 0.974


## 2. Recursive Feature Elimination

In [10]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

best_features = RFE(estimator=ExtraTreesClassifier(n_estimators=100), n_features_to_select=5)
best_features.fit(X_train, y_train)
selected_features = X.columns[best_features.support_]

print(f"Number of selected features: {len(selected_features)}")
print(f"Selected features : {list(selected_features)}")

Number of selected features: 5
Selected features : ['perimeter_mean', 'concave points_mean', 'radius_worst', 'perimeter_worst', 'concave points_worst']


In [11]:
X_train_selected = best_features.transform(X_train)
X_test_selected = best_features.transform(X_test)

clf.fit(X_train_selected, y_train)
print(f"Accuracy : {clf.score(X_test_selected, y_test):.3f}")

Accuracy : 0.974


## 3. LASSO

In [17]:
from sklearn.linear_model import Lasso

best_features = Lasso(alpha=0.01)
best_features.fit(X, y)
feature_coefficients = pd.Series(best_features.coef_, index=X.columns)
selected_features = feature_coefficients[feature_coefficients != 0].index

print(f"Number of selected features: {len(selected_features)}")
print(f"Selected features : {list(selected_features)}")

Number of selected features: 8
Selected features : ['texture_mean', 'area_mean', 'area_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'concavity_worst']


In [20]:
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

clf.fit(X_train_selected, y_train)
print(f"Accuracy : {clf.score(X_test_selected, y_test):.3f}")

Accuracy : 0.965
