In [None]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler


## Step 1: Preprocessing

In [None]:


# Step 1: Load Data
# Alternative source (direct download instead of the mounted Drive):
# data = pd.read_csv('https://drive.google.com/file/d/1WXAFHw2AMZg7pf2VjadF1sKEETguJwdT/export?format=csv')
from google.colab import drive
drive.mount('/content/drive')

# Access the file from Google Drive
data = pd.read_csv('/content/drive/My Drive/Breast_Cancer_dataset.csv')

# Separate the features from the target column `Status`
X = data.drop(columns=['Status'])
y = data['Status']

# Split the dataset into training and testing sets (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def prepare_model_features(train_features, test_features):
    """Impute, scale and encode the feature frames that the models consume.

    Every transformer is fitted on ``train_features`` only and then applied to
    ``test_features``, so no statistic of the held-out rows leaks into training.

    * float64/int64 columns: missing values are replaced by the training mean,
      then the column is standardised with the training mean/std.
    * object columns: missing values are replaced by the most frequent training
      value, then each column is ordinal-encoded (one integer code per
      category, so the column layout stays 1:1 with the input). A category
      that occurs only in ``test_features`` is encoded as -1 instead of
      raising, which is what ``LabelEncoder.transform`` would do.

    Parameters
    ----------
    train_features, test_features : pandas.DataFrame
        Frames with identical columns. Neither input is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_prepared, test_prepared)`` with the same columns, column
        order and index as the inputs.
    """
    # Work on explicit copies so the column assignments below never write
    # into the frames handed in by the caller.
    train_prepared = train_features.copy()
    test_prepared = test_features.copy()

    num_cols = train_prepared.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = train_prepared.select_dtypes(include=['object']).columns

    # The transformers reject a zero-column selection, hence the guards.
    if len(num_cols) > 0:
        for step in (SimpleImputer(strategy='mean'), StandardScaler()):
            train_prepared[num_cols] = step.fit_transform(train_prepared[num_cols])
            test_prepared[num_cols] = step.transform(test_prepared[num_cols])

    if len(cat_cols) > 0:
        mode_imputer = SimpleImputer(strategy='most_frequent')
        train_cat = mode_imputer.fit_transform(train_prepared[cat_cols])
        test_cat = mode_imputer.transform(test_prepared[cat_cols])

        # astype(str) gives every column a single comparable type before the
        # encoder sorts its categories.
        ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        train_prepared[cat_cols] = ordinal_encoder.fit_transform(train_cat.astype(str))
        test_prepared[cat_cols] = ordinal_encoder.transform(test_cat.astype(str))

    # Kept from the original flow as a safety net: force numeric dtypes and
    # zero-fill anything that is still missing. For float64/int64/object
    # columns the steps above have already filled every gap.
    train_prepared = train_prepared.apply(pd.to_numeric, errors='coerce').fillna(0)
    test_prepared = test_prepared.apply(pd.to_numeric, errors='coerce').fillna(0)
    return train_prepared, test_prepared


# Model-ready features: imputed, standardised and encoded with train-only statistics
X_train, X_test = prepare_model_features(X_train, X_test)

# NOTE(review): Steps 2-6 below transform `data` only. The model cells in this
# notebook train on X_train / X_test, not on `data`, `data_no_outliers` or
# `data_reduced`.

# Step 2: Handle Missing Values
# Display missing values for each column
print(data.isnull().sum())

# Impute missing values for numerical columns with the column mean
num_imputer = SimpleImputer(strategy='mean')
data[data.select_dtypes(include=['float64', 'int64']).columns] = num_imputer.fit_transform(data.select_dtypes(include=['float64', 'int64']))

# Impute missing values for categorical columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
data[data.select_dtypes(include=['object']).columns] = cat_imputer.fit_transform(data.select_dtypes(include=['object']))

# Step 3: Outlier Detection and Handling (using Z-score)
# Applying Z-score to numerical columns
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
# Keep only rows whose every numeric column has |z| < 3.
# NOTE(review): `data_no_outliers` is not used by the later steps in this cell,
# which keep operating on `data` - confirm whether that is intended.
data_no_outliers = data[(np.abs(zscore(data[numeric_cols])) < 3).all(axis=1)]

# Step 4: Standardization/Normalization of Numerical Features
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Step 5: Encoding Categorical Features
# One-Hot Encoding for categorical columns (first level of each column dropped)
# NOTE(review): `Status` is still a column of `data` here, so if it is an object
# column it is one-hot encoded and later passed to PCA together with the
# features - confirm this is intended.
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_cols = data.select_dtypes(include=['object']).columns
encoded_cat_data = pd.DataFrame(encoder.fit_transform(data[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))

# Concatenate encoded columns back to the dataset (rows are aligned on the index)
data = pd.concat([data.drop(categorical_cols, axis=1), encoded_cat_data], axis=1)

# Step 6: Dimensionality Reduction (Optional)
# Apply PCA if you have a large number of features and want to reduce dimensionality
pca = PCA(n_components=0.95)  # Retain 95% variance
data_reduced = pca.fit_transform(data)

# Display the preprocessed data
print(data.head())


Mounted at /content/drive
Age                       201
Race                      402
Marital Status            321
T Stage                     0
N Stage                     0
6th Stage                   0
differentiate               0
Grade                       0
A Stage                     0
Tumor Size                402
Estrogen Status           201
Progesterone Status         0
Regional Node Examined    603
Reginol Node Positive       0
Survival Months             0
Status                      0
dtype: int64
        Age  Tumor Size  Regional Node Examined  Reginol Node Positive  \
0  1.608907   -1.306632                1.280868              -0.618172   
1 -0.449611    0.218417               -0.046684               0.164807   
2  0.465286    1.595881               -0.046684               0.556296   
3  0.465286    0.000000               -1.639745              -0.618172   
4 -0.792697    0.513588               -1.506990              -0.618172   

   Survival Months  Race_Other  Race

## Step 2: Modeling

### KNN

In [None]:
import numpy as np
from collections import Counter

# KNN function implemented from scratch
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote. If ``k`` exceeds the number of training
        rows, every training row votes.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """

    def __init__(self, k=3):
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training data; KNN does no work until prediction time.

        Accepts a pandas DataFrame/Series as well as NumPy arrays or nested
        lists. Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, is empty, or its row count differs from
            the number of labels.
        """
        features = np.asarray(X_train, dtype=float)
        labels = np.asarray(y_train)
        if features.ndim != 2:
            raise ValueError("X_train must be 2-dimensional (samples x features)")
        if features.shape[0] == 0:
            raise ValueError("X_train must contain at least one sample")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")
        self.X_train = features
        self.y_train = labels
        return self

    def predict(self, X_test):
        """Return a NumPy array with one predicted label per row of ``X_test``."""
        queries = np.asarray(X_test, dtype=float)
        return np.array([self._predict(row) for row in queries])

    def _predict(self, x):
        """Predict the label of a single sample ``x`` (1-D feature vector)."""
        # One vectorised norm over all training rows instead of a Python loop
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Stable sort: equally distant neighbours are taken in training order,
        # so the result is deterministic.
        k_indices = np.argsort(distances, kind='stable')[:self.k]
        # Majority vote; Counter keeps first-seen order, so a tied vote goes to
        # the label whose first voter is nearest.
        votes = Counter(self.y_train[k_indices])
        return votes.most_common(1)[0][0]

# Fit the from-scratch KNN on the training split, then label the test split.
K_NEIGHBORS = 5  # number of neighbours that vote on each prediction
knn = KNN(k=K_NEIGHBORS)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

### Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split, then label the test split
nb = GaussianNB().fit(X_train, y_train)
nb_pred = nb.predict(X_test)


### Decision Tree (C4.5-style: entropy criterion on scikit-learn's CART)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train and predict with a depth-limited decision tree that splits on entropy.
# random_state is pinned like the other seeded models in this notebook:
# scikit-learn permutes the candidate features at each split, so ties between
# equally good splits can otherwise resolve differently from run to run.
dt = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest of 100 depth-limited trees: fit on the training split, label the test split
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
rf_pred = rf.predict(X_test)


### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded gradient-boosted ensemble: fit on the training split, label the test split
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)
gb_pred = gb.predict(X_test)


### Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

# Seeded multi-layer perceptron with one hidden layer of 50 units:
# fit on the training split, label the test split
nn = MLPClassifier(
    hidden_layer_sizes=(50,),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
nn_pred = nn.predict(X_test)


## Step 3: Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# --- Random Forest: exhaustive 5-fold search over size, depth and split threshold ---
rf_param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
)
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best Random Forest Params:", rf_grid_search.best_params_)
print("Best Random Forest Score:", rf_grid_search.best_score_)

# --- Gradient Boosting: exhaustive 5-fold search over size, learning rate and depth ---
gb_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best Gradient Boosting Params:", gb_grid_search.best_params_)
print("Best Gradient Boosting Score:", gb_grid_search.best_score_)


Best Random Forest Params: {'max_depth': 15, 'min_samples_split': 10, 'n_estimators': 100}
Best Random Forest Score: 0.903696280053708
Best Gradient Boosting Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Gradient Boosting Score: 0.9005897240226808


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Test-split predictions of every model, keyed by the label shown in the report.
# NOTE(review): this cell scores whatever rf_pred / gb_pred currently hold; it does
# not reference rf_grid_search / gb_grid_search.
model_predictions = {
    'KNN': knn_pred,
    'Naive Bayes': nb_pred,
    'Decision Tree': dt_pred,
    'Random Forest': rf_pred,
    'Gradient Boosting': gb_pred,
    'Neural Network': nn_pred
}

# One row per model: accuracy plus class-frequency-weighted precision / recall / F1
results_summary = []
for model_name, predictions in model_predictions.items():
    results_summary.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions, average='weighted'),
        'Recall': recall_score(y_test, predictions, average='weighted'),
        'F1 Score': f1_score(y_test, predictions, average='weighted')
    })

# Tabulate and show the comparison
results_df = pd.DataFrame(results_summary)
print("Model Performance Summary:")
print(results_df)

# Importances exposed by the tree ensembles, keyed by the label shown in the report
feature_importances = {
    'Random Forest': rf.feature_importances_,
    'Gradient Boosting': gb.feature_importances_
}

# Pair each importance with its feature name and list the most important first
for model_name, importances in feature_importances.items():
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    print(f"\nFeature Importance for {model_name}:")
    print(feature_importance_df)


Model Performance Summary:
               Model  Accuracy  Precision    Recall  F1 Score
0                KNN  0.885714   0.873081  0.885714  0.872689
1        Naive Bayes  0.847205   0.850912  0.847205  0.848978
2      Decision Tree  0.913043   0.907968  0.913043  0.904636
3      Random Forest  0.909317   0.912353  0.909317  0.893871
4  Gradient Boosting  0.900621   0.892311  0.900621  0.892093
5     Neural Network  0.894410   0.885450  0.894410  0.880074

Feature Importance for Random Forest:
                   Feature  Importance
14         Survival Months    0.585354
13   Reginol Node Positive    0.071184
4                  N Stage    0.058566
0                      Age    0.044557
9               Tumor Size    0.040039
5                6th Stage    0.039734
10         Estrogen Status    0.038408
11     Progesterone Status    0.034703
12  Regional Node Examined    0.021924
7                    Grade    0.020117
3                 T Stage     0.013860
2           Marital Status    0.