# Machine Learning Classification Models
## Breast Cancer Wisconsin Dataset

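This notebook trains six classification models (Logistic Regression, Decision Tree, K-Nearest Neighbors, Gaussian Naive Bayes, Random Forest, and XGBoost) on the Breast Cancer Wisconsin dataset and evaluates each one on a held-out test set using accuracy, AUC, precision, recall, F1, and MCC.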

In [1]:
!pip install -r ../requirements.txt

Collecting streamlit==1.31.0 (from -r ../requirements.txt (line 1))
  Using cached streamlit-1.31.0-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting scikit-learn==1.4.0 (from -r ../requirements.txt (line 2))
  Using cached scikit-learn-1.4.0.tar.gz (7.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): still running...
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numpy==1.24.3 (from -r ../requirements.txt (line 3))
  Using cached numpy-1.24.3.tar.gz (10.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [32 lines of output]
      Traceback (most recent call last):
        File [35m"C:\Users\yogesh.tolani\AppData\Local\Programs\Python\Python313\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py"[0m, line [35m389[0m, in [35m<module>[0m
          [31mmain[0m[1;31m()[0m
          [31m~~~~[0m[1;31m^^[0m
        File [35m"C:\Users\yogesh.tolani\AppData\Local\Programs\Python\Python313\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py"[0m, line [35m373[0m, in [35mmain[0m
          json_out["return_val"] = [31mhook[0m[1;31m(**hook_input["kwargs"])[0m
                                   [31m~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^[0m
        File [35m"C:\Users\yogesh.tolani\AppData\Local\Programs\Python\Python313\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py"[0m, line [

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import pickle
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Prepare Dataset

In [3]:
# Load the dataset shipped with scikit-learn and wrap it in pandas containers:
# X holds the named feature columns, y holds the 0/1 target labels.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Basic dimensions of the feature matrix (rows = instances, columns = features).
n_instances, n_features = X.shape
print(f"Dataset Shape: {X.shape}")
print(f"Number of Features: {n_features}")
print(f"Number of Instances: {n_instances}")

# Class balance, followed by the full list of feature names.
print("\nTarget Distribution:")
print(y.value_counts())
print("\nFeature Names:")
print(list(X.columns))

Dataset Shape: (569, 30)
Number of Features: 30
Number of Instances: 569

Target Distribution:
target
1    357
0    212
Name: count, dtype: int64

Feature Names:
['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']


In [4]:
# Preview the first rows of the feature frame as a sanity check on the load.
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## 2. Train-Test Split (80-20)

In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

for split_label, split_frame in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_frame.shape}")

Training set: (455, 30)
Test set: (114, 30)


In [6]:
# Persist both splits with the label appended as a trailing 'target' column.
# .values drops the Series index so labels are attached positionally.
train_data = X_train.assign(target=y_train.values)
train_data.to_csv('../train_data.csv', index=False)

test_data = X_test.assign(target=y_test.values)
test_data.to_csv('../test_data.csv', index=False)

# Feature-only copy of the test split, for running predictions without labels.
X_test.to_csv('../test_data_without_labels.csv', index=False)

for message in (
    "Saved CSV files:",
    "- train_data.csv (with target labels)",
    "- test_data.csv (with target labels)",
    "- test_data_without_labels.csv (for predictions only)",
):
    print(message)

Saved CSV files:
- train_data.csv (with target labels)
- test_data.csv (with target labels)
- test_data_without_labels.csv (for predictions only)


## 3. Feature Scaling

In [7]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the fitted scaler so the same transformation can be reapplied later.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print("Scaler saved to scaler.pkl")

Scaler saved to scaler.pkl


## 4. Define Evaluation Function

In [8]:
def evaluate_model(model_name, y_true, y_pred, y_pred_proba=None):
    """Compute binary-classification metrics for one model's predictions.

    Parameters
    ----------
    model_name
        Label stored under the ``'Model'`` key of the result.
    y_true
        Ground-truth labels.
    y_pred
        Predicted class labels, compared against ``y_true``.
    y_pred_proba : optional
        Scores passed to ``roc_auc_score`` (the callers in this notebook pass
        the positive-class probability). If omitted, AUC is reported as 0.0.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1'``, ``'MCC'`` and ``'AUC'``. An AUC of 0.0 is a sentinel meaning
        "not available", not a measured score.
    """
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='binary', zero_division=0),
        'Recall': recall_score(y_true, y_pred, average='binary', zero_division=0),
        'F1': f1_score(y_true, y_pred, average='binary', zero_division=0),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }

    if y_pred_proba is not None:
        try:
            metrics['AUC'] = roc_auc_score(y_true, y_pred_proba)
        except ValueError:
            # roc_auc_score raises ValueError when AUC is undefined (e.g. only
            # one class present in y_true). Catch just that case instead of a
            # bare `except:`, which would also hide KeyboardInterrupt and
            # unrelated programming errors.
            metrics['AUC'] = 0.0
    else:
        metrics['AUC'] = 0.0

    return metrics

## 5. Train Models and Evaluate

### 5.1 Logistic Regression

In [9]:
# Logistic regression is trained and evaluated on the standardized features.
lr_model = LogisticRegression(max_iter=10000, random_state=42).fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
# Second predict_proba column: probability of label 1, used for AUC.
lr_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

lr_metrics = evaluate_model('Logistic Regression', y_test, lr_pred, lr_pred_proba)
print("Logistic Regression Metrics:")
for metric_name, metric_value in lr_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a numeric metric
    print(f"{metric_name}: {metric_value:.4f}")

# The pickled model expects inputs already transformed by the fitted scaler.
with open('model_logistic_regression.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)
print("\nModel saved to model_logistic_regression.pkl")

Logistic Regression Metrics:
Accuracy: 0.9825
Precision: 0.9861
Recall: 0.9861
F1: 0.9861
MCC: 0.9623
AUC: 0.9954

Model saved to model_logistic_regression.pkl


### 5.2 Decision Tree Classifier

In [10]:
# The decision tree is trained and evaluated on the raw (unscaled) features.
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
# Second predict_proba column: probability of label 1, used for AUC.
dt_pred_proba = dt_model.predict_proba(X_test)[:, 1]

dt_metrics = evaluate_model('Decision Tree', y_test, dt_pred, dt_pred_proba)
print("Decision Tree Metrics:")
for metric_name, metric_value in dt_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a numeric metric
    print(f"{metric_name}: {metric_value:.4f}")

with open('model_decision_tree.pkl', 'wb') as model_file:
    pickle.dump(dt_model, model_file)
print("\nModel saved to model_decision_tree.pkl")

Decision Tree Metrics:
Accuracy: 0.9123
Precision: 0.9559
Recall: 0.9028
F1: 0.9286
MCC: 0.8174
AUC: 0.9157

Model saved to model_decision_tree.pkl


### 5.3 K-Nearest Neighbor Classifier

In [11]:
# k-NN is trained and evaluated on the standardized features.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
knn_pred = knn_model.predict(X_test_scaled)
# Second predict_proba column: probability of label 1, used for AUC.
knn_pred_proba = knn_model.predict_proba(X_test_scaled)[:, 1]

knn_metrics = evaluate_model('K-Nearest Neighbor', y_test, knn_pred, knn_pred_proba)
print("K-Nearest Neighbor Metrics:")
for metric_name, metric_value in knn_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a numeric metric
    print(f"{metric_name}: {metric_value:.4f}")

# The pickled model expects inputs already transformed by the fitted scaler.
with open('model_k-nearest_neighbor.pkl', 'wb') as model_file:
    pickle.dump(knn_model, model_file)
print("\nModel saved to model_k-nearest_neighbor.pkl")

K-Nearest Neighbor Metrics:
Accuracy: 0.9561
Precision: 0.9589
Recall: 0.9722
F1: 0.9655
MCC: 0.9054
AUC: 0.9788

Model saved to model_k-nearest_neighbor.pkl


### 5.4 Naive Bayes Classifier (Gaussian)

In [12]:
# Gaussian Naive Bayes is trained and evaluated on the standardized features.
nb_model = GaussianNB().fit(X_train_scaled, y_train)
nb_pred = nb_model.predict(X_test_scaled)
# Second predict_proba column: probability of label 1, used for AUC.
nb_pred_proba = nb_model.predict_proba(X_test_scaled)[:, 1]

nb_metrics = evaluate_model('Naive Bayes', y_test, nb_pred, nb_pred_proba)
print("Naive Bayes Metrics:")
for metric_name, metric_value in nb_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a numeric metric
    print(f"{metric_name}: {metric_value:.4f}")

# The pickled model expects inputs already transformed by the fitted scaler.
with open('model_naive_bayes.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)
print("\nModel saved to model_naive_bayes.pkl")

Naive Bayes Metrics:
Accuracy: 0.9298
Precision: 0.9444
Recall: 0.9444
F1: 0.9444
MCC: 0.8492
AUC: 0.9868

Model saved to model_naive_bayes.pkl


### 5.5 Random Forest (Ensemble)

In [13]:
# The random forest is trained and evaluated on the raw (unscaled) features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
# Second predict_proba column: probability of label 1, used for AUC.
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

rf_metrics = evaluate_model('Random Forest', y_test, rf_pred, rf_pred_proba)
print("Random Forest Metrics:")
for metric_name, metric_value in rf_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a numeric metric
    print(f"{metric_name}: {metric_value:.4f}")

with open('model_random_forest.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print("\nModel saved to model_random_forest.pkl")

Random Forest Metrics:
Accuracy: 0.9561
Precision: 0.9589
Recall: 0.9722
F1: 0.9655
MCC: 0.9054
AUC: 0.9937

Model saved to model_random_forest.pkl


### 5.6 XGBoost (Ensemble)

In [14]:
# XGBoost is trained and evaluated on the raw (unscaled) features.
xgb_model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
# Second predict_proba column: probability of label 1, used for AUC.
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

xgb_metrics = evaluate_model('XGBoost', y_test, xgb_pred, xgb_pred_proba)
print("XGBoost Metrics:")
for metric_name, metric_value in xgb_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a numeric metric
    print(f"{metric_name}: {metric_value:.4f}")

with open('model_xgboost.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
print("\nModel saved to model_xgboost.pkl")

XGBoost Metrics:
Accuracy: 0.9561
Precision: 0.9467
Recall: 0.9861
F1: 0.9660
MCC: 0.9058
AUC: 0.9901

Model saved to model_xgboost.pkl


## 6. Comparison of All Models

In [15]:
# Collect the per-model metric dicts in the order the models were trained.
all_metrics = [
    lr_metrics,
    dt_metrics,
    knn_metrics,
    nb_metrics,
    rf_metrics,
    xgb_metrics,
]

# One row per model; columns are reordered so AUC sits next to Accuracy.
results_df = pd.DataFrame(all_metrics)[
    ['Model', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
]

print(f"\n{'=' * 80}")
print("FINAL RESULTS COMPARISON")
print('=' * 80)
print(results_df.to_string(index=False))

# Persist the comparison table alongside the pickled models.
results_df.to_csv('model_results.csv', index=False)
print("\nResults saved to model_results.csv")


FINAL RESULTS COMPARISON
              Model  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression  0.982456 0.995370   0.986111 0.986111 0.986111 0.962302
      Decision Tree  0.912281 0.915675   0.955882 0.902778 0.928571 0.817412
 K-Nearest Neighbor  0.956140 0.978836   0.958904 0.972222 0.965517 0.905447
        Naive Bayes  0.929825 0.986772   0.944444 0.944444 0.944444 0.849206
      Random Forest  0.956140 0.993717   0.958904 0.972222 0.965517 0.905447
            XGBoost  0.956140 0.990079   0.946667 0.986111 0.965986 0.905824

Results saved to model_results.csv


In [16]:
# Render the comparison table with the notebook's rich DataFrame display.
results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.982456,0.99537,0.986111,0.986111,0.986111,0.962302
1,Decision Tree,0.912281,0.915675,0.955882,0.902778,0.928571,0.817412
2,K-Nearest Neighbor,0.95614,0.978836,0.958904,0.972222,0.965517,0.905447
3,Naive Bayes,0.929825,0.986772,0.944444,0.944444,0.944444,0.849206
4,Random Forest,0.95614,0.993717,0.958904,0.972222,0.965517,0.905447
5,XGBoost,0.95614,0.990079,0.946667,0.986111,0.965986,0.905824
