# Machine Learning Classification Models
## Breast Cancer Wisconsin Dataset

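This notebook trains six classification models (Logistic Regression, Decision Tree, K-Nearest Neighbors, Gaussian Naive Bayes, Random Forest, and XGBoost) on the Breast Cancer Wisconsin dataset and evaluates each one with accuracy, AUC, precision, recall, F1, and MCC.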

In [None]:
!pip install -r ../requirements.txt

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import pickle
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Prepare Dataset

In [None]:
# Pull the bundled dataset and wrap it in pandas objects:
# X holds one named column per feature, y holds the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Basic size summary (rows are instances, columns are features).
print(
    f"Dataset Shape: {X.shape}",
    f"Number of Features: {X.shape[1]}",
    f"Number of Instances: {X.shape[0]}",
    sep="\n",
)

# Class balance of the target.
print(f"\nTarget Distribution:")
print(y.value_counts())

# Column names, printed as a plain Python list.
print(f"\nFeature Names:")
print(X.columns.tolist())

In [None]:
# Preview the first five rows of the feature table.
X.head(n=5)

## 2. Train-Test Split (80-20)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the class proportions, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"Training set: {X_train.shape}",
    f"Test set: {X_test.shape}",
    sep="\n",
)

In [None]:
# Export the split to the parent directory. `assign` returns a new frame with
# the label column appended, so X_train / X_test themselves are left untouched.
train_data = X_train.assign(target=y_train.values)
train_data.to_csv('../train_data.csv', index=False)

test_data = X_test.assign(target=y_test.values)
test_data.to_csv('../test_data.csv', index=False)

# Feature-only copy of the test split, for running predictions without labels.
X_test.to_csv('../test_data_without_labels.csv', index=False)

print(
    "Saved CSV files:",
    "- train_data.csv (with target labels)",
    "- test_data.csv (with target labels)",
    "- test_data_without_labels.csv (for predictions only)",
    sep="\n",
)

## 3. Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the notebook for later reuse.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print("Scaler saved to scaler.pkl")

## 4. Define Evaluation Function

In [None]:
def evaluate_model(model_name, y_true, y_pred, y_pred_proba=None):
    """Compute the binary-classification metrics reported for one model.

    Parameters
    ----------
    model_name : str
        Label stored under the 'Model' key of the result.
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_proba : array-like, optional
        Scores passed to ``roc_auc_score``. When omitted, AUC is not computed.

    Returns
    -------
    dict
        Keys, in insertion order: 'Model', 'Accuracy', 'Precision', 'Recall',
        'F1', 'MCC', 'AUC'. 'AUC' is the sentinel 0.0 when no scores were
        supplied or when ``roc_auc_score`` rejects the inputs with a
        ``ValueError``; it is not a measured score in those cases.
    """
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        # zero_division=0 yields 0 instead of a warning when a denominator is empty.
        'Precision': precision_score(y_true, y_pred, average='binary', zero_division=0),
        'Recall': recall_score(y_true, y_pred, average='binary', zero_division=0),
        'F1': f1_score(y_true, y_pred, average='binary', zero_division=0),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }

    # Sentinel used whenever AUC cannot be computed.
    auc = 0.0
    if y_pred_proba is not None:
        try:
            auc = roc_auc_score(y_true, y_pred_proba)
        except ValueError:
            # Only swallow the "metric is undefined / inputs invalid" error.
            # A bare `except:` would also hide KeyboardInterrupt, SystemExit
            # and genuine programming errors behind a fake 0.0 score.
            auc = 0.0
    metrics['AUC'] = auc

    return metrics

## 5. Train Models and Evaluate

### 5.1 Logistic Regression

In [None]:
# Logistic regression, trained on the standardized features.
lr_model = LogisticRegression(random_state=42, max_iter=10000)
lr_model.fit(X_train_scaled, y_train)

# Second predict_proba column feeds the AUC; hard labels feed the other metrics.
lr_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
lr_pred = lr_model.predict(X_test_scaled)

lr_metrics = evaluate_model('Logistic Regression', y_test, lr_pred, lr_pred_proba)
print("Logistic Regression Metrics:")
for metric_name, metric_value in lr_metrics.items():
    if metric_name == 'Model':
        continue  # the label is not numeric, so skip it
    print(f"{metric_name}: {metric_value:.4f}")

# Serialize the fitted estimator next to the notebook.
with open('model_logistic_regression.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)
print("\nModel saved to model_logistic_regression.pkl")

### 5.2 Decision Tree Classifier

In [None]:
# Decision tree (depth capped at 10), trained on the raw, unscaled features.
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_model.fit(X_train, y_train)

# Second predict_proba column feeds the AUC; hard labels feed the other metrics.
dt_pred_proba = dt_model.predict_proba(X_test)[:, 1]
dt_pred = dt_model.predict(X_test)

dt_metrics = evaluate_model('Decision Tree', y_test, dt_pred, dt_pred_proba)
print("Decision Tree Metrics:")
for metric_name, metric_value in dt_metrics.items():
    if metric_name == 'Model':
        continue  # the label is not numeric, so skip it
    print(f"{metric_name}: {metric_value:.4f}")

# Serialize the fitted estimator next to the notebook.
with open('model_decision_tree.pkl', 'wb') as model_file:
    pickle.dump(dt_model, model_file)
print("\nModel saved to model_decision_tree.pkl")

### 5.3 K-Nearest Neighbor Classifier

In [None]:
# 5-nearest-neighbour classifier, trained on the standardized features.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Second predict_proba column feeds the AUC; hard labels feed the other metrics.
knn_pred_proba = knn_model.predict_proba(X_test_scaled)[:, 1]
knn_pred = knn_model.predict(X_test_scaled)

knn_metrics = evaluate_model('K-Nearest Neighbor', y_test, knn_pred, knn_pred_proba)
print("K-Nearest Neighbor Metrics:")
for metric_name, metric_value in knn_metrics.items():
    if metric_name == 'Model':
        continue  # the label is not numeric, so skip it
    print(f"{metric_name}: {metric_value:.4f}")

# Serialize the fitted estimator next to the notebook.
with open('model_k-nearest_neighbor.pkl', 'wb') as model_file:
    pickle.dump(knn_model, model_file)
print("\nModel saved to model_k-nearest_neighbor.pkl")

### 5.4 Naive Bayes Classifier (Gaussian)

In [None]:
# Gaussian naive Bayes, trained on the standardized features.
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

# Second predict_proba column feeds the AUC; hard labels feed the other metrics.
nb_pred_proba = nb_model.predict_proba(X_test_scaled)[:, 1]
nb_pred = nb_model.predict(X_test_scaled)

nb_metrics = evaluate_model('Naive Bayes', y_test, nb_pred, nb_pred_proba)
print("Naive Bayes Metrics:")
for metric_name, metric_value in nb_metrics.items():
    if metric_name == 'Model':
        continue  # the label is not numeric, so skip it
    print(f"{metric_name}: {metric_value:.4f}")

# Serialize the fitted estimator next to the notebook.
with open('model_naive_bayes.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)
print("\nModel saved to model_naive_bayes.pkl")

### 5.5 Random Forest (Ensemble)

In [None]:
# Random forest with 100 trees, trained on the raw, unscaled features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Second predict_proba column feeds the AUC; hard labels feed the other metrics.
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]
rf_pred = rf_model.predict(X_test)

rf_metrics = evaluate_model('Random Forest', y_test, rf_pred, rf_pred_proba)
print("Random Forest Metrics:")
for metric_name, metric_value in rf_metrics.items():
    if metric_name == 'Model':
        continue  # the label is not numeric, so skip it
    print(f"{metric_name}: {metric_value:.4f}")

# Serialize the fitted estimator next to the notebook.
with open('model_random_forest.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print("\nModel saved to model_random_forest.pkl")

### 5.6 XGBoost (Ensemble)

In [None]:
# Gradient-boosted trees (100 rounds, log-loss eval metric), trained on the
# raw, unscaled features.
xgb_model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Second predict_proba column feeds the AUC; hard labels feed the other metrics.
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = xgb_model.predict(X_test)

xgb_metrics = evaluate_model('XGBoost', y_test, xgb_pred, xgb_pred_proba)
print("XGBoost Metrics:")
for metric_name, metric_value in xgb_metrics.items():
    if metric_name == 'Model':
        continue  # the label is not numeric, so skip it
    print(f"{metric_name}: {metric_value:.4f}")

# Serialize the fitted estimator next to the notebook.
with open('model_xgboost.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
print("\nModel saved to model_xgboost.pkl")

## 6. Comparison of All Models

In [None]:
# Stack the per-model metric dicts into one table, one row per model.
all_metrics = [lr_metrics, dt_metrics, knn_metrics, nb_metrics, rf_metrics, xgb_metrics]
results_df = pd.DataFrame(all_metrics)

# Put the columns in reporting order (AUC right after Accuracy).
results_df = results_df.loc[:, ['Model', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']]

# Plain-text report framed by 80-character rules.
print("\n" + "="*80, "FINAL RESULTS COMPARISON", "="*80, sep="\n")
print(results_df.to_string(index=False))

# Persist the comparison table to the parent directory.
results_df.to_csv('../model_results.csv', index=False)
print("\nResults saved to model_results.csv")

In [None]:
# Bare last expression: shows the comparison table as the cell's output.
results_df