In [49]:
# install pandas and its dependencies
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

# Import the libraries again after reinstallation
import pandas as pd  # for data manipulation or analysis
import numpy as np  # for numeric calculation
import matplotlib.pyplot as plt  # for data visualization
import seaborn as sns  # for data visualization



In [50]:
import pandas as pd
# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Thesis/Dataset/data 1.csv')

# Drop ID column
df.drop(columns=['id'], inplace=True)

# Encode diagnosis column: M = 1, B = 0
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Separate features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Check basic info
print(df.info())
print("Missing values:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   radius_mean              569 non-null    float64
 1   texture_mean             569 non-null    float64
 2   perimeter_mean           569 non-null    float64
 3   area_mean                569 non-null    float64
 4   smoothness_mean          569 non-null    float64
 5   compactness_mean         569 non-null    float64
 6   concavity_mean           569 non-null    float64
 7   concave points_mean      569 non-null    float64
 8   symmetry_mean            569 non-null    float64
 9   fractal_dimension_mean   569 non-null    float64
 10  radius_se                569 non-null    float64
 11  texture_se               569 non-null    float64
 12  perimeter_se             569 non-null    float64
 13  area_se                  569 non-null    float64
 14  smoothness_se            5

Check Initial Class Distribution

In [51]:
print("Original class distribution:\n", y.value_counts())


Original class distribution:
 diagnosis
0    357
1    212
Name: count, dtype: int64


Remove Outliers Using Isolation Forest

In [52]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.05, random_state=42)
y_pred_outliers = iso.fit_predict(X)
mask = y_pred_outliers != -1

X_clean = X[mask]
y_clean = y[mask]

print("After outlier removal:\n", y_clean.value_counts())


After outlier removal:
 diagnosis
0    350
1    190
Name: count, dtype: int64


Feature Scaling

In [53]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)


Split Data (80/20 with Stratification)

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_clean, test_size=0.2, random_state=42, stratify=y_clean)


Handle Imbalanced Dataset with SMOTE

In [55]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:\n", Counter(y_train_resampled))


Class distribution after SMOTE:
 Counter({0: 280, 1: 280})


Feature Selection – ANOVA F-test + GridSearchCV

In [56]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

pipeline = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),
    ('clf', LogisticRegression(max_iter=1000))
])

param_grid = {
    'select__k': [5, 10, 15, 20],
    'clf__C': [0.1, 1, 10]
}

grid = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1')
grid.fit(X_train_resampled, y_train_resampled)

print("Best Params:", grid.best_params_)
print("Best F1 Score (train):", grid.best_score_)


Best Params: {'clf__C': 1, 'select__k': 20}
Best F1 Score (train): 0.9730473912052859


PCA

In [48]:
'''from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Apply PCA to selected features
k = grid.best_params_['select__k']
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(X_train_resampled, y_train_resampled)

pca = PCA().fit(X_selected)

# Plot explained variance
plt.figure(figsize=(8, 4))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()
'''

"from sklearn.decomposition import PCA\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Apply PCA to selected features\nk = grid.best_params_['select__k']\nselector = SelectKBest(score_func=f_classif, k=k)\nX_selected = selector.fit_transform(X_train_resampled, y_train_resampled)\n\npca = PCA().fit(X_selected)\n\n# Plot explained variance\nplt.figure(figsize=(8, 4))\nplt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')\nplt.xlabel('Number of Principal Components')\nplt.ylabel('Cumulative Explained Variance')\nplt.grid(True)\nplt.show()\n"

Shared Step: Prepare Selected Features

In [57]:
# Use best_k from GridSearchCV earlier
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=best_k)
X_train_sel = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_sel = selector.transform(X_test)

In [58]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_sel, y_train_resampled)

y_pred_rf = rf_model.predict(X_test_sel)
y_prob_rf = rf_model.predict_proba(X_test_sel)[:, 1]

print("🔍 Random Forest Evaluation:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_rf))


🔍 Random Forest Evaluation:
[[70  0]
 [ 3 35]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        70
           1       1.00      0.92      0.96        38

    accuracy                           0.97       108
   macro avg       0.98      0.96      0.97       108
weighted avg       0.97      0.97      0.97       108

ROC AUC Score: 0.9994360902255639


In [59]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_model.fit(X_train_sel, y_train_resampled)

y_pred_mlp = mlp_model.predict(X_test_sel)
y_prob_mlp = mlp_model.predict_proba(X_test_sel)[:, 1]

print("🔍 MLP Classifier Evaluation:")
print(confusion_matrix(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_mlp))


🔍 MLP Classifier Evaluation:
[[70  0]
 [ 0 38]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        38

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108

ROC AUC Score: 1.0


Evaluate Model on Test Data

In [60]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Retrain using best params on full training set
best_k = grid.best_params_['select__k']
best_C = grid.best_params_['clf__C']

selector = SelectKBest(score_func=f_classif, k=best_k)
X_train_best = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_best = selector.transform(X_test)

model = LogisticRegression(C=best_C, max_iter=1000)
model.fit(X_train_best, y_train_resampled)

y_pred = model.predict(X_test_best)
y_prob = model.predict_proba(X_test_best)[:, 1]

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


[[70  0]
 [ 0 38]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        38

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108

ROC AUC Score: 1.0


In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(y_true, y_pred, y_prob, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_prob)
    }

# Gather metrics
results = []
results.append(get_metrics(y_test, y_pred, y_prob, 'Logistic Regression'))
results.append(get_metrics(y_test, y_pred_rf, y_prob_rf, 'Random Forest'))
results.append(get_metrics(y_test, y_pred_mlp, y_prob_mlp, 'MLP Classifier'))

# Create DataFrame
results_df = pd.DataFrame(results)

# Format and display
print("🔍 Model Performance Comparison:")
print(results_df.round(4))



🔍 Model Performance Comparison:
                 Model  Accuracy  Precision  Recall  F1 Score  ROC AUC
0  Logistic Regression    1.0000        1.0  1.0000    1.0000   1.0000
1        Random Forest    0.9722        1.0  0.9211    0.9589   0.9994
2       MLP Classifier    1.0000        1.0  1.0000    1.0000   1.0000
