In [None]:
!pip install deap

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score
from deap import base, creator, tools, algorithms
import random
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

def set_seed(seed):
    """Seed Python's `random` module and NumPy's global RNG with *seed*.

    Both generators are seeded with the same value so that code drawing
    from either one produces repeatable results across runs.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the RNG state before anything random happens.
set_seed(9)

# Wisconsin Diagnostic Breast Cancer data (UCI repository). The file has no
# header row, so the column names are supplied explicitly: an id, the
# diagnosis label, then ten measurements reported as mean / se / worst.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
names = ['id_number', 'diagnosis'] + [
    f"{measurement}_{statistic}"
    for statistic in ('mean', 'se', 'worst')
    for measurement in (
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave_points', 'symmetry',
        'fractal_dimension',
    )
]

data = pd.read_csv(url, names=names)

print(len(data))

# Preprocessing: the id column carries no signal, and the diagnosis letter
# (M/B) becomes a binary label (1/0).
data.drop(columns=['id_number'], inplace=True)
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Features / target
y = data['diagnosis']
X = data.drop(columns=['diagnosis'])

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Positional index of every feature column (used by the GA fitness function)
cols = list(range(X_train.shape[1]))

# Feature scaling: statistics are learned on the training split only and
# then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Genetic Algorithm Feature Selection
def evaluate_individual(individual, X, y):
    """Fitness function for the GA: score one feature-subset bit mask.

    Parameters
    ----------
    individual : sequence of 0/1
        Bit mask; position ``i`` being truthy means column ``i`` of *X*
        is included in the subset.
    X : 2-D array indexable as ``X[:, list_of_column_indices]``
        Feature matrix the subset is evaluated on.
    y : array-like
        Target labels aligned with the rows of *X*.

    Returns
    -------
    tuple
        One-element tuple (DEAP expects fitness values as a sequence)
        holding the mean 5-fold cross-validated score of a linear-kernel
        SVC on the selected columns, or ``0.0`` when no column is selected.
    """
    # Column indices come straight from the mask positions, so the function
    # no longer depends on a module-level index list.
    selected_features = [idx for idx, mask in enumerate(individual) if mask]
    if not selected_features:
        # An empty subset cannot be fitted; give it the worst fitness.
        return (0.0,)

    clf = SVC(kernel='linear')
    X_selected = X[:, selected_features]
    scores = cross_val_score(clf, X_selected, y, cv=5)
    return (scores.mean(),)

# DEAP registers these classes on the `creator` module itself. Guard the
# calls so re-running this notebook cell does not re-create them.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# An individual is a list of 0/1 flags, one per feature column.
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X_train.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Fitness is evaluated on the scaled training data.
toolbox.register("evaluate", evaluate_individual, X=X_train_scaled, y=y_train)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

population = toolbox.population(n=10)
# eaSimple has no elitism, so the fittest individual can be lost before the
# last generation. The hall of fame keeps the best one seen over the run.
hall_of_fame = tools.HallOfFame(1)
algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=5,
                    halloffame=hall_of_fame, verbose=True)

best_individual = hall_of_fame[0]
selected_idx = [idx for idx, mask in enumerate(best_individual) if mask]
if not selected_idx:
    raise ValueError("Genetic algorithm selected no features; cannot train models.")
selected_features = [X_train.columns[idx] for idx in selected_idx]

# Build the model inputs from the SCALED arrays (the same data the GA was
# scored on), restoring column names and the original row index so they
# stay aligned with y_train / y_test.
X_train_selected = pd.DataFrame(X_train_scaled[:, selected_idx],
                                columns=selected_features, index=X_train.index)
X_test_selected = pd.DataFrame(X_test_scaled[:, selected_idx],
                               columns=selected_features, index=X_test.index)

# RFC Model Training
# Random forest fitted through GridSearchCV (single-point grid) using
# 5-fold cross-validation and F1 as the selection metric.
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}
scorer = make_scorer(f1_score)
grid_search = GridSearchCV(model, param_grid, scoring=scorer, cv=5)
grid_search.fit(X_train_selected, y_train)
best_model = grid_search.best_estimator_

# Model Evaluation
y_pred = best_model.predict(X_test_selected)
print(classification_report(y_test, y_pred))

# ROC analysis needs a continuous score, not hard 0/1 labels: with labels
# the curve collapses to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
y_score = best_model.predict_proba(X_test_selected)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_score))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
# NOTE: `display` is reassigned by the later model sections; call
# display.plot() here if this curve should be shown.
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='RFC')


#SVM

svm = SVC()

# Define the parameter grid for GridSearchCV
# NOTE: gamma is not used by the linear kernel; it is kept so the printed
# best parameters are unchanged.
param_grid = {
    'C': [0.1],
    'gamma': [0.01],
    'kernel': ['linear']
}

# Create GridSearchCV object
grid_search = GridSearchCV(svm, param_grid, cv=5)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_selected, y_train)

# Get the best parameters and best estimator from GridSearchCV
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best estimator on the test set
y_pred = best_estimator.predict(X_test_selected)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC analysis needs a continuous score, not hard 0/1 labels. SVC is built
# without probability estimates here, so use the signed distance to the
# separating hyperplane instead.
y_score = best_estimator.decision_function(X_test_selected)
print("ROC-AUC Score:", roc_auc_score(y_test, y_score))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
# NOTE: `display` is reassigned by the MLP section below; call
# display.plot() here if this curve should be shown.
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='SVM')

#MLP
mlp = MLPClassifier(max_iter=100)

# Define the parameter grid for GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(100,50)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.01],
}

# Create GridSearchCV object
grid_search = GridSearchCV(mlp, param_grid, cv=5)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_selected, y_train)

# Get the best parameters and best estimator from GridSearchCV
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best estimator on the test set
y_pred = best_estimator.predict(X_test_selected)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC analysis needs a continuous score, not hard 0/1 labels. Column 1 of
# predict_proba is the probability of the positive class (label 1).
y_score = best_estimator.predict_proba(X_test_selected)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_score))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)
# This curve belongs to the MLP, so label it as such (it was 'SVM').
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='MLP')

In [None]:
# Draw the ROC curve currently held in `display`. Each model section
# reassigns `display`, so this shows the curve of the last one run (the MLP
# section when the notebook is executed top to bottom).
display.plot()
plt.show()

In [None]:
# Confusion matrix of the test labels against the current `y_pred`, i.e. the
# predictions of the last model evaluated (the MLP section when the notebook
# is executed top to bottom).
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()


In [None]:
# Number of columns left in `data` after `id_number` was dropped
# (the diagnosis label plus all feature columns).
len(data.columns)

In [None]:
# Number of features kept by the genetic-algorithm selection step.
len(selected_features)