# Init.

In [None]:
from sklearn.datasets import load_breast_cancer  # Built-in dataset: breast cancer classification
from sklearn.model_selection import train_test_split  # To split data into training and testing
from sklearn.linear_model import LogisticRegression   # ML model: Logistic Regression (linear classifier)
from sklearn.tree import DecisionTreeClassifier       # ML model: Decision Tree (non-linear classifier)
from sklearn.metrics import classification_report, confusion_matrix  # To evaluate model performance
import math   # For mathematical operations (not directly needed yet)

# For visualization (we may use it later to plot data)
import matplotlib.pyplot as plt
import seaborn as sns   
from sklearn.tree import plot_tree

# Ignore warnings to keep the output clean
import warnings
warnings.filterwarnings("ignore")


# Data Prep.

In [None]:
# Load the Breast Cancer dataset that ships with sklearn.
# It contains numerical features describing breast tumor cells
# and a target (malignant = cancerous, benign = non-cancerous).
data_loader = load_breast_cancer(as_frame=True)  # as_frame=True -> the data comes back as pandas objects
data_loader  # last expression of the cell: shows the loaded object

In [None]:
# The loader returns a dictionary-like object; list its keys to see what it offers.
data_loader.keys()

In [None]:
# Names of the classes we want to predict (cancerous or not).
data_loader.target_names

In [None]:
# Print the human-readable description bundled with the dataset.
print(data_loader.DESCR)

In [None]:
# Preview the feature table (no target column)
data_loader.data

In [None]:
# "frame" holds features + target together in a single DataFrame, which is easier to work with.
data = data_loader["frame"]
data

# EDA (Simple)

In [None]:
# Number of samples per target class
# (according to the dataset metadata: 0 = malignant, 1 = benign)
data["target"].value_counts()

In [None]:
# Human-readable names for the numeric class labels
target_meaning = {0: "malignant", 1: "benign"}

# Bar chart of the number of samples per (named) class
(
    data["target"]
    .map(target_meaning)
    .value_counts()
    .plot.bar(rot=0, title="class count")
);

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
data.describe()

In [None]:
# Absolute correlation of every column with the target, strongest first
# (the closer to 1, the stronger the relationship, whatever its sign)
(
    data_loader.frame
    .corr()
    .abs()
    .loc["target"]
    .sort_values(ascending=False)
)

In [None]:
# Histogram of a single feature.
# "worst concave points" is one of the features most correlated with the target.
data.hist(column="worst concave points", bins=50)

In [None]:
# Overlay the "worst concave points" distributions of the two target classes
# (malignant vs benign) to see how well this feature separates them
sns.histplot(data, x="worst concave points", hue="target", bins=50)

Notes:  

- Class balance: always check how many samples per class. Imbalanced classes can bias the model.  
- `describe()`: a quick way to understand the scale and spread of each feature.  
- Correlation with target: helps identify which features are most useful for prediction.  
- Visualizations: plotting features (like worst concave points) by target shows if they separate classes well.  

# Prep

In [None]:
# Columns used as model inputs.
# We start with a single feature, "worst concave points";
# to train on ALL features instead, use data.columns[:-1].
features = ["worst concave points"]
features

In [None]:
# Use the name of the last DataFrame column as the target (0 = malignant, 1 = benign).
# NOTE(review): this relies on the target being the final column of the frame — confirm if the frame is ever rebuilt.
target = data.columns[-1]
target

In [None]:
# Split the table into model inputs (X = features) and labels (y = target),
# then show both
X, y = data[features], data[target]
display(X, y)

## Split Data

In [None]:
# Hold out part of the data for evaluation:
# - 80% training set  -> used to fit the models
# - 20% test set      -> used to measure performance on unseen samples
# A fixed random_state makes the split reproducible (same split on every run).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# [Optional] Preview train set (features only)
X_train

In [None]:
# [Optional] Preview the held-out test set (features only)
X_test

In [None]:
# [Optional] Preview the training labels (target only)
y_train

In [None]:
# [Optional] Preview the held-out test labels (target only)
y_test

# Modeling

In [None]:
# Logistic Regression: a simple linear classifier.
# Its decision boundary is always linear: a single cut-off value with one feature,
# a straight line with two, a flat hyperplane with more (never a curve).
lr = LogisticRegression(random_state=42).fit(X_train, y_train)  # fit() returns the trained estimator
y_pred_lr = lr.predict(X_test)  # predicted classes for the test samples

In [None]:
# Decision Tree: a non-linear model.
# It routes each sample down branches by comparing feature values to thresholds
# (a sequence of yes/no questions).
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)  # fit() returns the trained estimator
y_pred_dt = dt.predict(X_test)  # predicted classes for the test samples

Note:  

- Both models are trained using the same training set but may perform differently.  
- Logistic Regression = good baseline, simple, interpretable.  
- Decision Tree = flexible, can capture more complex patterns, but may overfit.  
- Predictions (y_pred_lr, y_pred_dt) are what the models think the test samples should be classified as.  

# Evaluation

## Confusion Matrix

In [None]:
# Confusion matrix of the Logistic Regression predictions
# Rows    = actual labels
# Columns = predicted labels
# Top-left cell     = malignant samples predicted as malignant
# Bottom-right cell = benign samples predicted as benign
confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

In [None]:
# Confusion matrix of the Decision Tree predictions (rows = actual, columns = predicted)
confusion_matrix(y_true=y_test, y_pred=y_pred_dt)

## Classification Report

In [None]:
# Precision, recall, F1-score and accuracy of the Logistic Regression predictions
print(classification_report(y_true=y_test, y_pred=y_pred_lr))

In [None]:
# Precision, recall, F1-score and accuracy of the Decision Tree predictions
print(classification_report(y_true=y_test, y_pred=y_pred_dt))

Note:  

- Precision: Of all samples predicted as positive, how many were correct?  
- Recall: Of all actual positives, how many did we find?  
- F1-score: Balance between Precision and Recall.  
- Accuracy: Overall percentage of correct predictions.  

# Use Model

In [None]:
# "worst concave points" value of the single sample classified in the cells below.
# Other values to try: .11, .1105, .1125
worst_concave_points_test = .03

In [None]:
# Classify the single value with the trained Decision Tree.
# The model expects a 2D array (rows = samples, columns = features),
# so the value is wrapped as one row containing one column: [[value]].
test_pred = dt.predict(X=[[worst_concave_points_test]])
test_pred

## Model Prediction

In [None]:
# Turn the numeric prediction (0 or 1) into its human-readable label
target_meaning[int(test_pred[0])]

In [None]:
# Draw the trained Decision Tree, save the figure to disk, then display it
plt.figure(figsize=(24, 12))
plot_tree(
    dt,
    feature_names=features,                   # label splits with the feature name(s)
    class_names=[*target_meaning.values()],   # class labels as a plain list
    filled=True,
    rounded=True,
)
plt.savefig("tree.jpg", dpi=500)
plt.show()


Note:

- Input must follow the same structure as training features (even if it’s only one feature, it must be passed as a 2D array).
- The model output is 0 or 1, but mapping it back with target_meaning makes it understandable ("malignant" or "benign").

## Manual Prediction

1. **Linear function `f(x)`**  

   `f(x) = w * x + b`, where:  

   * `w` = coefficient from `lr.coef_`
   * `b` = intercept from `lr.intercept_`
   * Input `x` = `worst_concave_points_test`

   This gives you a **raw score** (sometimes called logit).

In [None]:
display(lr.coef_, lr.intercept_)


def f_x(x):
    """Linear score f(x) = w * x + b, using the fitted model's coefficient and intercept."""
    w = lr.coef_[0][0]
    b = lr.intercept_[0]
    return w * x + b


f_x(worst_concave_points_test)

2. **Sigmoid function `g(f(x))`**   
   
   `g(z) = 1 / (1 + exp(-z))`

   * Converts the raw score into a probability between **0 and 1**.
   * Example: If `f(x) = -2`, then `g(-2) ≈ 0.12` → low probability of benign.

In [None]:
def sigmoid_fn(z):
    """Sigmoid g(z) = 1 / (1 + exp(-z)): maps a raw score to a probability."""
    return 1 / (1 + math.exp(-z))


# g(f(x))
sigmoid_fn(f_x(worst_concave_points_test))

3. **Threshold classifier `h(g(f(x)))`**
   
   `h(p) = 0 if p < 0.5 else 1`

   * If probability ≥ 0.5 → **class 1**
   * If probability < 0.5 → **class 0**

   That’s what your `classifier()` function does.

In [None]:
def classifier(threshold, p):
    """Threshold rule h: return 0 when probability p is below threshold, otherwise 1."""
    return 0 if p < threshold else 1


# h(g(f(x)))
given_threshold = .5
# Pass both values by keyword: classifier takes (threshold, p), and handing them over
# positionally in (p, threshold) order swaps their roles and flips the predicted class.
classifier(
    threshold=given_threshold,
    p=sigmoid_fn(f_x(worst_concave_points_test)),
)

4. **Final mapping**

   * You then map class `0` or `1` into **meaningful labels** using `target_meaning`.
   * Example:

     * `0 → malignant`
     * `1 → benign`

In [None]:
# Map the thresholded class (0 or 1) to its label.
# classifier takes (threshold, p); keyword arguments make sure the probability is
# compared against the threshold and not the other way round.
target_meaning[
    classifier(
        threshold=given_threshold,
        p=sigmoid_fn(f_x(worst_concave_points_test)),
    )
]

Note: 

1. Multiply the input by its weight and add the intercept (linear formula `w * x + b`).
2. Pass the result through the **sigmoid** to squash it into a probability in (0, 1).
3. Compare with **threshold (0.5)** to decide the class.