[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wasim/Data-Science/blob/main/data-analyst-roadmap/10_machine_learning_basics/03_logistic_regression.ipynb)

# Logistic Regression

Classify data into categories.

## What is Logistic Regression?
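- Binary classification: predicts one of two classes (e.g., Yes/No)
- Predicts a probability between 0 and 1 for the positive class
- Uses the sigmoid function, $\sigma(z) = \frac{1}{1 + e^{-z}}$, to map a linear score to that probability
- A foundation for more advanced classification methods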

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler

# Notebook-wide plotting defaults: seaborn's white-grid style and a
# (10, 6) default figure size for every matplotlib figure below.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Binary Classification

In [None]:
# Load the breast cancer dataset bundled with scikit-learn
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

# Feature matrix as a DataFrame, with the class label appended as 'target'
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    target=data.target
)

# Quick sanity check: overall size and how many samples fall in each class
print(f"Dataset shape: {df.shape}")
print("\nTarget distribution:")
print(df['target'].value_counts())
print("\n0 = Malignant, 1 = Benign")

In [None]:
# Select key features
# Four of the dataset's "mean" measurements are used as model inputs
features = ['mean radius', 'mean texture', 'mean perimeter', 'mean area']

# Feature matrix (X) and label vector (y)
X = df.loc[:, features]
y = df.loc[:, 'target']

X.head()

In [None]:
# Visualize feature distributions
# One panel per feature, with the two classes overlaid as histograms
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axes = axes.flatten()

for ax, feature in zip(axes, features):
    for target in (0, 1):
        # Values of this feature for the rows belonging to the current class
        class_values = df.loc[df['target'] == target, feature]
        ax.hist(class_values, bins=20, alpha=0.6, label=f"Class {target}")
    ax.set(xlabel=feature, ylabel='Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Split data
# Hold out 20% of the rows for testing. `stratify=y` keeps the
# malignant/benign ratio the same in the training and test sets, so the
# evaluation metrics are not skewed by an unlucky class mix in a small
# test set. `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)}")
print(f"Test set: {len(X_test)}")

In [None]:
# Scale features
# Learn the scaling statistics from the training split only, then apply
# that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled!")

In [None]:
# Train model
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

print("Model trained!")
print("\nCoefficients:")
# Pair each learned weight (first row of coef_) with its feature name
coefficient_by_feature = dict(zip(features, model.coef_[0]))
for feature, coef in coefficient_by_feature.items():
    print(f"{feature}: {coef:.3f}")

In [None]:
# Make predictions
y_pred = model.predict(X_test_scaled)              # hard class labels
y_pred_proba = model.predict_proba(X_test_scaled)  # per-class probabilities

# Show sample predictions
# First ten test rows: true label, predicted label, and the probability
# columns (column 0 is shown as Malignant, column 1 as Benign)
sample = slice(0, 10)
results = pd.DataFrame({
    'Actual': y_test.iloc[sample].to_numpy(),
    'Predicted': y_pred[sample],
    'Prob_Malignant': y_pred_proba[sample, 0],
    'Prob_Benign': y_pred_proba[sample, 1],
})
results

## 2. Model Evaluation

In [None]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Element-wise comparison of the predictions against the true labels
is_correct = y_pred == y_test
n_correct = is_correct.sum()
n_incorrect = (~is_correct).sum()

print(f"Accuracy: {accuracy:.3f}")
print(f"\nCorrect: {n_correct}")
print(f"Incorrect: {n_incorrect}")

In [None]:
# Confusion Matrix
# Rows are the actual classes and columns are the predicted classes.
# Passing `labels` pins the row/column order to [0 = Malignant, 1 = Benign]
# and guarantees a 2x2 matrix even if a class were missing from the test
# split, so the unpacking below cannot fail on shape.
class_names = ['Malignant', 'Benign']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

# The TN/FP/FN/TP names below take label 1 (Benign) as the "positive" class.
# That is the reverse of the usual clinical wording, so each count is
# spelled out: here a "false positive" is a malignant tumour predicted
# as benign.
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix (positive class = Benign):")
print(f"True Negatives (Malignant predicted Malignant): {tn}")
print(f"False Positives (Malignant predicted Benign): {fp}")
print(f"False Negatives (Benign predicted Malignant): {fn}")
print(f"True Positives (Benign predicted Benign): {tp}")

In [None]:
# Classification Report
# Per-class metrics for the test set, shown with readable class names
report = classification_report(
    y_test, y_pred, target_names=['Malignant', 'Benign']
)
print(report)

## 3. ROC Curve and AUC

In [None]:
# Calculate ROC curve
# Score every test sample by its predicted probability for class 1
positive_scores = y_pred_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
auc = roc_auc_score(y_test, positive_scores)

# Plot ROC curve
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, linewidth=2, label=f'ROC (AUC = {auc:.3f})')
# Dashed diagonal as the reference line for a random classifier
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid(True)
plt.show()

print(f"AUC Score: {auc:.3f}")

## 4. Threshold Tuning

In [None]:
# Try different thresholds
thresholds_to_test = [0.3, 0.5, 0.7]
class1_proba = y_pred_proba[:, 1]  # predicted probability of class 1

for threshold in thresholds_to_test:
    # Predict class 1 when its probability reaches the cut-off, else class 0
    y_pred_custom = np.where(class1_proba >= threshold, 1, 0)

    acc = accuracy_score(y_test, y_pred_custom)

    print(f"\nThreshold: {threshold}")
    print(f"Accuracy: {acc:.3f}")
    # zero_division=0 reports 0 instead of warning when a metric is undefined
    threshold_report = classification_report(
        y_test,
        y_pred_custom,
        target_names=['Malignant', 'Benign'],
        zero_division=0
    )
    print(threshold_report)

## 5. Feature Importance

In [None]:
# Feature importance
# Rank the features by coefficient magnitude, largest first
importance = pd.DataFrame(
    {'Feature': features, 'Coefficient': model.coef_[0]}
).sort_values(by='Coefficient', key=lambda col: col.abs(), ascending=False)

# Horizontal bars keep the sign of each coefficient visible
plt.figure(figsize=(10, 6))
plt.barh(y=importance['Feature'], width=importance['Coefficient'])
plt.xlabel('Coefficient')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

importance

## 6. Making Predictions

In [None]:
# New patient data
# A single-row frame using the same four feature columns as the model inputs
new_patient = pd.DataFrame(
    [[15.0, 20.0, 95.0, 700.0]],
    columns=['mean radius', 'mean texture', 'mean perimeter', 'mean area']
)

# Scale and predict
# Reuse the already-fitted scaler so the new row is scaled like the training data
new_patient_scaled = scaler.transform(new_patient)
prediction = model.predict(new_patient_scaled)
probability = model.predict_proba(new_patient_scaled)

# Unpack the single prediction and its two class probabilities
predicted_label = prediction[0]
if predicted_label == 1:
    predicted_class = 'Benign'
else:
    predicted_class = 'Malignant'
prob_malignant, prob_benign = probability[0]

print(f"Prediction: {predicted_label}")
print(f"Class: {predicted_class}")
print("\nProbabilities:")
print(f"Malignant: {prob_malignant:.3f}")
print(f"Benign: {prob_benign:.3f}")

## Practice Exercises

### Exercise 1
Build a model to predict customer churn 
(stayed vs left).

In [None]:
# Your code here


### Exercise 2
Find the optimal threshold for your 
use case (minimize false negatives).

In [None]:
# Your code here


## Key Takeaways

✅ **Logistic Regression** - Binary classification  
✅ **Probabilities** - 0 to 1 predictions  
✅ **Confusion Matrix** - Understand errors  
✅ **ROC/AUC** - Model performance  
✅ **Threshold** - Tune for use case  

**Next:** [Model Evaluation](04_model_evaluation.ipynb) →