# Classification using Logistic Regression - Breast Cancer Detection

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [9]:
csv_path = Path("breast_cancer_bd.csv")
if not csv_path.exists():
    raise FileNotFoundError("breast_cancer_bd.csv not found in this folder.")

df = pd.read_csv(csv_path)
df.shape
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [14]:
# Clean data
df = df.replace('?', np.nan)
df = df.dropna()
df.shape

(683, 11)

In [21]:
# get y by only selecting column 'Class' and x by having all except column 'Class'
y = df['Class']
x = df.drop(columns=['Class'])

# Split the data to train and test - 70/30
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

(478, 10)
(205, 10)


In [28]:
# Standardization the data to have mean of 0 and standard deviation of 1
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


# Train a Logistic Regression model on train data and test it on test data

logistic_regression = LogisticRegression(max_iter=1200)
logistic_regression.fit(x_train_scaled, y_train)

predicted_y = logistic_regression.predict(x_test_scaled)

predicted_y[:5]

array([2, 2, 2, 2, 4])

In [32]:
# Evaluates model performance

accuracy = accuracy_score(y_test, predicted_y)
cm = confusion_matrix(y_test, predicted_y)
report = classification_report(y_test, predicted_y)

print(f"Accuracy : {accuracy}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

Accuracy : 0.9560975609756097

Confusion Matrix:
 [[128   5]
 [  4  68]]

Classification Report:
               precision    recall  f1-score   support

           2       0.97      0.96      0.97       133
           4       0.93      0.94      0.94        72

    accuracy                           0.96       205
   macro avg       0.95      0.95      0.95       205
weighted avg       0.96      0.96      0.96       205

