# Mini Project 1: Classification Problem

**Dataset**: UCI Breast Cancer Dataset (auto-download via sklearn)

**Goal**: Predict if a tumor is malignant or benign using classification models.

In [1]:

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}

best_model = None
best_score = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    if acc > best_score:
        best_score = acc
        best_model = model


Logistic Regression Accuracy: 0.9561
Random Forest Accuracy: 0.9649
Naive Bayes Accuracy: 0.9737
KNN Accuracy: 0.9561
Decision Tree Accuracy: 0.9474


In [3]:

# Save best model
with open("best_classification_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print("Best classification model saved as 'best_classification_model.pkl'")


Best classification model saved as 'best_classification_model.pkl'
