# Lung Cancer ML Model
This notebook builds a production-style ML pipeline including:
- Imputation
- Standard Scaling
- Logistic Regression
- Hyperparameter tuning (C regularization)
- Evaluation metrics
- ROC Curve
- Feature Importance visualization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    roc_auc_score,
)

%matplotlib inline

## 1. Load Dataset

In [None]:
# Read the raw CSV into a DataFrame. Update the path if the file lives elsewhere.
df = pd.read_csv(filepath_or_buffer="lung_cancer_dataset.csv")

# Preview the first five rows as a quick sanity check of the load.
df.head(5)

## 2. Dataset Info

In [None]:
# Print the per-column summary pandas produces for the frame.
df.info()

# Count missing values per column; as the last expression this is the cell output.
df.isna().sum()

## 3. Feature & Target Split

In [None]:
TARGET = "LUNG_CANCER"  # Update if different

# Pull the label column out first, then keep every remaining column as a feature.
y = df[TARGET]
X = df.drop(TARGET, axis="columns")

# Display both shapes to confirm the row counts agree.
(X.shape, y.shape)

## 4. Train/Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions comparable between the two splits, and the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

(X_train.shape, X_test.shape)

## 5. Build Pipeline

In [None]:
# Preprocessing and model chained into a single estimator, so every step is
# fitted on the training data only whenever the pipeline is fitted.
# The step names matter: the hyperparameter grid addresses the final step as "model".
pipeline = Pipeline(
    [
        # Fill missing values with each column's median.
        ("imputer", SimpleImputer(strategy="median")),
        # Standardize the features before the linear model.
        ("scaler", StandardScaler()),
        # Logistic regression with a generous iteration cap.
        ("model", LogisticRegression(max_iter=5000)),
    ]
)

## 6. Hyperparameter Tuning (GridSearchCV)

In [None]:
# Candidate settings for the pipeline's "model" step. Only the regularization
# strength C is actually varied; penalty and solver are pinned to one value each.
param_grid = {
    "model__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "model__penalty": ["l2"],  # L2 regularization
    "model__solver": ["lbfgs"],
}

# 5-fold cross-validated search scored by ROC-AUC, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)

# The search only ever sees the training split; the test split stays untouched.
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

## 7. Model Evaluation

In [None]:
# Pipeline with the best cross-validated parameters found by the grid search.
best_model = grid.best_estimator_

# Scores for the second class (column index 1) and hard class predictions
# on the held-out test split.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Threshold-free ranking quality (ROC-AUC) and plain accuracy.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Test Accuracy:", acc)
print("Test ROC-AUC:", roc_auc)
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred))

## 8. Confusion Matrix

In [None]:
# Pin the row/column order to the fitted model's classes so the axes of the
# heatmap can be labelled with the actual class values.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
im = ax.imshow(cm)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# One tick per class, labelled with the class value instead of a pixel index.
tick_positions = np.arange(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)

# Write the count into every cell. With the default colormap low counts are
# dark and high counts are light, so the text colour flips at the midpoint.
midpoint = cm.max() / 2
for row, col in np.ndindex(cm.shape):
    ax.text(
        col,
        row,
        str(cm[row, col]),
        ha="center",
        va="center",
        color="black" if cm[row, col] > midpoint else "white",
    )

fig.colorbar(im, ax=ax)
plt.show()

## 9. ROC Curve

In [None]:
# y_proba was taken from column 1 of predict_proba, i.e. it scores
# best_model.classes_[1]. Passing that class as pos_label keeps the curve
# consistent with the scores and also works when the target is not coded as
# 0/1 (roc_curve refuses to guess the positive class for e.g. string labels).
positive_class = best_model.classes_[1]
fpr, tpr, _ = roc_curve(y_test, y_proba, pos_label=positive_class)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Model")
# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", label="Chance")
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
plt.show()

## 10. Feature Importance (Coefficients)

In [None]:
feature_names = X.columns
# Coefficients of the fitted logistic regression; row 0 is the single
# coefficient vector of a binary model.
coefficients = best_model.named_steps["model"].coef_[0]

# The names come from the raw feature frame while the coefficients come from
# the end of the pipeline. Fail with a clear message if they no longer line up
# (for example because a preprocessing step changed the number of columns).
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"Got {len(feature_names)} feature names but {len(coefficients)} "
        "coefficients; the pipeline changed the number of columns."
    )

# Rank features by absolute coefficient size, strongest first.
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", key=abs, ascending=False)

fig, ax = plt.subplots()
ax.barh(importance_df["Feature"], importance_df["Coefficient"])
# barh draws the first row at the bottom; flip the axis so the strongest
# feature is at the top, matching the sort order above.
ax.invert_yaxis()
# Zero line separates positive from negative coefficients.
ax.axvline(0, color="black", linewidth=0.8)
ax.set_title("Feature Importance (Logistic Regression Coefficients)")
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
plt.show()