In [None]:
import pandas as pd

# Path of the raw breast-cancer CSV that the rest of the notebook works on.
data_path = '/content/Breast_Cancer_dataset.csv'
data = pd.read_csv(data_path)

# DataFrame.info() prints its summary and returns None, so it is called for the
# printed side effect only; `data_info` is therefore None and is kept just so
# the name stays defined.
data_info = data.info()
data_head = data.head()

# Display only the preview rows rather than a (None, DataFrame) tuple.
data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     3823 non-null   float64
 1   Race                    3622 non-null   object 
 2   Marital Status          3703 non-null   object 
 3   T Stage                 4024 non-null   object 
 4   N Stage                 4024 non-null   object 
 5   6th Stage               4024 non-null   object 
 6   differentiate           4024 non-null   object 
 7   Grade                   4024 non-null   object 
 8   A Stage                 4024 non-null   object 
 9   Tumor Size              3622 non-null   float64
 10  Estrogen Status         3823 non-null   object 
 11  Progesterone Status     4024 non-null   object 
 12  Regional Node Examined  3421 non-null   float64
 13  Reginol Node Positive   4024 non-null   int64  
 14  Survival Months         4024 non-null   

(None,
     Age   Race Marital Status T Stage  N Stage 6th Stage  \
 0  68.0  White        Married       T1      N1       IIA   
 1  50.0  White            NaN       T2      N2      IIIA   
 2  58.0  White       Divorced       T3      N3      IIIC   
 3  58.0  White        Married       T1      N1       IIA   
 4  47.0  White        Married       T2      N1       IIB   
 
                differentiate Grade   A Stage  Tumor Size Estrogen Status  \
 0      Poorly differentiated     3  Regional         4.0        Positive   
 1  Moderately differentiated     2  Regional        35.0        Positive   
 2  Moderately differentiated     2  Regional        63.0        Positive   
 3      Poorly differentiated     3  Regional         NaN        Positive   
 4      Poorly differentiated     3  Regional        41.0             NaN   
 
   Progesterone Status  Regional Node Examined  Reginol Node Positive  \
 0            Positive                    24.0                      1   
 1            P

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Preprocessing, part 1: fill missing values with SimpleImputer
# (column mean for numeric columns, most frequent value for categorical ones).
# NOTE(review): the imputers and the scaler below are fitted on the whole frame,
# before the train/test split done in a later cell, so test rows influence the
# fitted statistics — consider fitting on the training split only.

# Partition the columns by dtype
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=['float64', 'int64']).columns

# One imputer per column group
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='mean')

data[num_cols] = imputer_num.fit_transform(data[num_cols])
data[cat_cols] = imputer_cat.fit_transform(data[cat_cols])

# Preprocessing, part 2: standardize the numeric columns
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# Sanity check: total count of remaining nulls plus a preview of the result
data_head_after_processing = data.head()
missing_after_imputation = data.isnull().sum().sum()

missing_after_imputation, data_head_after_processing


(0,
         Age   Race Marital Status T Stage  N Stage 6th Stage  \
 0  1.608907  White        Married       T1      N1       IIA   
 1 -0.449611  White        Married       T2      N2      IIIA   
 2  0.465286  White       Divorced       T3      N3      IIIC   
 3  0.465286  White        Married       T1      N1       IIA   
 4 -0.792697  White        Married       T2      N1       IIB   
 
                differentiate Grade   A Stage  Tumor Size Estrogen Status  \
 0      Poorly differentiated     3  Regional   -1.306632        Positive   
 1  Moderately differentiated     2  Regional    0.218417        Positive   
 2  Moderately differentiated     2  Regional    1.595881        Positive   
 3      Poorly differentiated     3  Regional    0.000000        Positive   
 4      Poorly differentiated     3  Regional    0.513588        Positive   
 
   Progesterone Status  Regional Node Examined  Reginol Node Positive  \
 0            Positive                1.280868              -0.6181

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column, keeping each fitted encoder under its
# column name so the codes can be mapped back to the original labels later.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder().fit(data[col])
    data[col] = le.transform(data[col])
    label_encoders[col] = le

# 'Status' is the target; every other column is a feature
y = data['Status']
X = data.drop(columns=['Status'])

# Hold out 20% of the rows as the test set (seeded for repeatability)
# NOTE(review): the split is not stratified on y; with imbalanced classes
# `stratify=y` may be worth considering — confirm before changing results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((3219, 15), (805, 15), (3219,), (805,))

In [None]:
import numpy as np
from collections import Counter

# Implementing KNN from scratch
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    ``fit`` memorises the training data; ``predict`` labels each query row with
    the most common label among its ``k`` closest training rows.
    """

    def __init__(self, k=3):
        """Store the neighbourhood size.

        Args:
            k: Number of nearest neighbours that vote on each prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples and their labels.

        Inputs are converted to NumPy arrays so that lists, DataFrames and
        Series (whatever their index) are all handled positionally.

        Args:
            X: 2-D array-like of training samples, one row per sample.
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` contain a different number of rows.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X_train)} and {len(self.y_train)}"
            )

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: 2-D array-like of query samples, one row per sample.

        Returns:
            np.ndarray holding one predicted label per query row.
        """
        # Coerce first: iterating a DataFrame directly would yield column
        # labels instead of rows.
        X = np.asarray(X)
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        """Return the majority label among the k training rows closest to ``x``."""
        # Euclidean distance from x to every training row
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Positions of the k smallest distances, nearest first
        k_indices = np.argsort(distances)[:self.k]
        # Positional lookup of the neighbours' labels
        k_nearest_labels = self.y_train[k_indices].tolist()
        # Most frequent label among the neighbours
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

# Fit the scratch KNN (k=5 for this example) on the NumPy view of the training split
knn = KNN(k=5)
knn.fit(X_train.to_numpy(), y_train.to_numpy())

# Label every test row, then show the first ten predictions as a quick sanity check
y_pred_knn = knn.predict(X_test.to_numpy())
y_pred_knn[:10]


array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Score the KNN predictions against the held-out labels.
# Precision, recall and F1 all use average='binary' (adjust `average` if the
# labels are not binary).
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn, recall_knn, f1_knn = (
    metric(y_test, y_pred_knn, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

# Collect the headline numbers in one record
knn_results = dict(
    zip(
        ("Model", "Accuracy", "Precision", "Recall", "F1 Score"),
        ("KNN", accuracy_knn, precision_knn, recall_knn, f1_knn),
    )
)

knn_results


KNN Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       685
           1       0.70      0.35      0.47       120

    accuracy                           0.88       805
   macro avg       0.80      0.66      0.70       805
weighted avg       0.87      0.88      0.86       805

KNN Confusion Matrix:
 [[667  18]
 [ 78  42]]


{'Model': 'KNN',
 'Accuracy': 0.8807453416149068,
 'Precision': 0.7,
 'Recall': 0.35,
 'F1 Score': 0.4666666666666667}

In [28]:
# Restrict both splits to the columns they share (inner join along the column axis).
# NOTE(review): the earlier cells visible here label-encode the categoricals rather
# than one-hot encoding them, so both splits presumably already have identical
# columns and this alignment changes nothing — confirm before relying on it.
X_train, X_test = X_train.align(X_test, join='inner', axis=1)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Gaussian Naïve Bayes: fit on the training split, predict the test split
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Entropy-criterion decision tree with a fixed seed
dt = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Summarise each model: accuracy plus binary-averaged precision / recall / F1
nb_results = {
    "Model": "Naïve Bayes",
    "Accuracy": accuracy_score(y_test, y_pred_nb),
    **{
        label: scorer(y_test, y_pred_nb, average='binary')
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1 Score", f1_score),
        )
    },
}

dt_results = {
    "Model": "Decision Tree",
    "Accuracy": accuracy_score(y_test, y_pred_dt),
    **{
        label: scorer(y_test, y_pred_dt, average='binary')
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1 Score", f1_score),
        )
    },
}

nb_results, dt_results


({'Model': 'Naïve Bayes',
  'Accuracy': 0.8385093167701864,
  'Precision': 0.4603174603174603,
  'Recall': 0.48333333333333334,
  'F1 Score': 0.4715447154471545},
 {'Model': 'Decision Tree',
  'Accuracy': 0.8459627329192546,
  'Precision': 0.48360655737704916,
  'Recall': 0.49166666666666664,
  'F1 Score': 0.48760330578512395})

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: train the baseline Random Forest model (default settings, fixed seed)
rf_baseline = RandomForestClassifier(random_state=42)
rf_baseline.fit(X_train, y_train)
y_pred_rf_baseline = rf_baseline.predict(X_test)

# Step 2: evaluate the baseline Random Forest model on the test split
rf_baseline_results = {
    "Model": "Random Forest (Baseline)",
    "Accuracy": accuracy_score(y_test, y_pred_rf_baseline),
    "Precision": precision_score(y_test, y_pred_rf_baseline, average='binary'),
    "Recall": recall_score(y_test, y_pred_rf_baseline, average='binary'),
    "F1 Score": f1_score(y_test, y_pred_rf_baseline, average='binary')
}

print("Baseline Random Forest Results:", rf_baseline_results)

# Step 3: hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each hyperparameter
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# 5-fold cross-validated search over the grid, selecting by accuracy
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Step 4: retrieve the best parameters
best_params_rf = grid_search_rf.best_params_
print("Best Parameters:", best_params_rf)

# Step 5: reuse the estimator that GridSearchCV already refitted on the whole
# training split with the best parameters (refit=True is the default), instead
# of training an identical forest a second time
rf_tuned = grid_search_rf.best_estimator_
y_pred_rf_tuned = rf_tuned.predict(X_test)

# Step 6: evaluate the tuned Random Forest model on the test split
rf_tuned_results = {
    "Model": "Random Forest (Tuned)",
    "Accuracy": accuracy_score(y_test, y_pred_rf_tuned),
    "Precision": precision_score(y_test, y_pred_rf_tuned, average='binary'),
    "Recall": recall_score(y_test, y_pred_rf_tuned, average='binary'),
    "F1 Score": f1_score(y_test, y_pred_rf_tuned, average='binary')
}

print("Tuned Random Forest Results:", rf_tuned_results)


Baseline Random Forest Results: {'Model': 'Random Forest (Baseline)', 'Accuracy': 0.9142857142857143, 'Precision': 0.8227848101265823, 'Recall': 0.5416666666666666, 'F1 Score': 0.6532663316582915}
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Tuned Random Forest Results: {'Model': 'Random Forest (Tuned)', 'Accuracy': 0.9142857142857143, 'Precision': 0.8311688311688312, 'Recall': 0.5333333333333333, 'F1 Score': 0.649746192893401}


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split with SMOTE (seeded)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))


Original class distribution: Counter({0: 2723, 1: 496})
Resampled class distribution: Counter({0: 2723, 1: 2723})


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination driven by a seeded Random Forest: drop one
# feature per step until 10 remain, fitting on the resampled training data
selector = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=10,
    step=1,
).fit(X_train_resampled, y_train_resampled)

# Keep only the retained features in both the resampled training data and the test split
X_train_selected, X_test_selected = (
    selector.transform(split) for split in (X_train_resampled, X_test)
)


In [None]:
# Train a Random Forest on the resampled, feature-selected training data.
# The hyperparameters come from `best_params_rf` (the earlier grid search result)
# instead of being copied by hand from its printed output, so the two cannot drift apart.
# Note: this rebinds `rf_tuned` / `y_pred_rf_tuned` from the earlier grid-search cell.
rf_tuned = RandomForestClassifier(**best_params_rf, random_state=42)
rf_tuned.fit(X_train_selected, y_train_resampled)
y_pred_rf_tuned = rf_tuned.predict(X_test_selected)

# Evaluate on the (feature-selected) test split
from sklearn.metrics import classification_report, confusion_matrix

print("Random Forest (Tuned) Classification Report:\n", classification_report(y_test, y_pred_rf_tuned))
print("Random Forest (Tuned) Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_tuned))


Random Forest (Tuned) Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94       685
           1       0.67      0.63      0.65       120

    accuracy                           0.90       805
   macro avg       0.80      0.79      0.80       805
weighted avg       0.90      0.90      0.90       805

Random Forest (Tuned) Confusion Matrix:
 [[648  37]
 [ 44  76]]


In [None]:
from sklearn.model_selection import cross_val_score

# Second Random Forest grid search, this time on the resampled, feature-selected
# training data and selecting by F1 instead of accuracy.
# NOTE(review): the data was SMOTE-resampled before cross-validation, so synthetic
# rows can land in the validation folds; that presumably inflates the CV score
# relative to the untouched test split — consider resampling inside each fold.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='f1')
grid_search_rf.fit(X_train_selected, y_train_resampled)

print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Cross-Validation Score:", grid_search_rf.best_score_)

# Reuse the estimator GridSearchCV already refitted on the full training data
# with the best parameters (refit=True is the default) rather than training an
# identical forest again
rf_optimized = grid_search_rf.best_estimator_
y_pred_rf_optimized = rf_optimized.predict(X_test_selected)

# Evaluate the optimized Random Forest model on the test split
print("Optimized Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf_optimized))
print("Optimized Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_optimized))


Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 300}
Best Cross-Validation Score: 0.9198255229073627
Optimized Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94       685
           1       0.67      0.62      0.65       120

    accuracy                           0.90       805
   macro avg       0.80      0.79      0.79       805
weighted avg       0.90      0.90      0.90       805

Optimized Random Forest Confusion Matrix:
 [[648  37]
 [ 45  75]]


In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

def summarize_binary_metrics(model_name, y_true, y_pred):
    """Build the results record used to compare models in this notebook.

    Args:
        model_name: Label stored under the "Model" key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with "Model", "Accuracy", "Precision", "Recall" and "F1 Score";
        precision, recall and F1 are computed with average='binary'.
    """
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='binary'),
        "Recall": recall_score(y_true, y_pred, average='binary'),
        "F1 Score": f1_score(y_true, y_pred, average='binary')
    }


# Decision tree with the entropy criterion (same configuration as the earlier
# Decision Tree cell), reported here under the "C4.5" label.
# NOTE(review): scikit-learn documents its trees as an optimised CART;
# criterion='entropy' gives information-gain style splits but presumably not
# the full C4.5 algorithm — confirm the label is acceptable.
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Evaluate the decision tree
dt_results = summarize_binary_metrics("Decision Tree (C4.5)", y_test, y_pred_dt)
print("C4.5 Decision Tree Results:", dt_results)


# Gradient Boosting
# Train the Gradient Boosting model with default settings and a fixed seed
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# Evaluate the Gradient Boosting model
gb_results = summarize_binary_metrics("Gradient Boosting", y_test, y_pred_gb)
print("Gradient Boosting Results:", gb_results)


# Neural Network (MLPClassifier)
# Train the MLP with a fixed seed and at most 300 training iterations
mlp = MLPClassifier(random_state=42, max_iter=300)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

# Evaluate the Neural Network model
mlp_results = summarize_binary_metrics("Neural Network (MLP)", y_test, y_pred_mlp)
print("Neural Network Results:", mlp_results)


C4.5 Decision Tree Results: {'Model': 'Decision Tree (C4.5)', 'Accuracy': 0.8459627329192546, 'Precision': 0.48360655737704916, 'Recall': 0.49166666666666664, 'F1 Score': 0.48760330578512395}
Gradient Boosting Results: {'Model': 'Gradient Boosting', 'Accuracy': 0.915527950310559, 'Precision': 0.8421052631578947, 'Recall': 0.5333333333333333, 'F1 Score': 0.6530612244897959}
Neural Network Results: {'Model': 'Neural Network (MLP)', 'Accuracy': 0.9080745341614906, 'Precision': 0.8108108108108109, 'Recall': 0.5, 'F1 Score': 0.6185567010309279}




In [31]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for Gradient Boosting
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

# 5-fold cross-validated grid search, selecting by accuracy
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)

best_params_gb = grid_search_gb.best_params_
print("Best Parameters for Gradient Boosting:", best_params_gb)

# Reuse the estimator GridSearchCV already refitted on the whole training split
# with the best parameters (refit=True is the default) instead of fitting an
# identical model a second time
best_gb = grid_search_gb.best_estimator_
y_pred_best_gb = best_gb.predict(X_test)

# Evaluate the tuned Gradient Boosting model on the test split
best_gb_results = {
    "Model": "Tuned Gradient Boosting",
    "Accuracy": accuracy_score(y_test, y_pred_best_gb),
    "Precision": precision_score(y_test, y_pred_best_gb, average='binary'),
    "Recall": recall_score(y_test, y_pred_best_gb, average='binary'),
    "F1 Score": f1_score(y_test, y_pred_best_gb, average='binary')
}
print("Tuned Gradient Boosting Results:", best_gb_results)


Best Parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Tuned Gradient Boosting Results: {'Model': 'Tuned Gradient Boosting', 'Accuracy': 0.9080745341614906, 'Precision': 0.8484848484848485, 'Recall': 0.4666666666666667, 'F1 Score': 0.6021505376344086}
