# Lung Cancer Survival Prediction
## Logistic Regression with Proper Preprocessing & GridSearchCV

This notebook follows professional ML engineering standards:
- No row dropping
- Proper numeric & categorical imputation
- Date feature engineering
- ColumnTransformer preprocessing
- Standard scaling (numeric only)
- OneHotEncoding (categorical only)
- GridSearchCV tuning for C regularization
- ROC-AUC evaluation
- Confusion matrix
- ROC Curve
- Feature importance extraction


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    roc_auc_score,
)

%matplotlib inline

## 1. Load Dataset

In [None]:
# Location of the raw CSV; update the path if the file lives elsewhere.
DATA_PATH = "lung_cancer_dataset.csv"

# Read the full table into memory and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## 2. Date Feature Engineering
Convert dates and create treatment duration feature.

In [None]:
# Replace the two raw date columns with a single numeric feature:
# the number of whole days between diagnosis and end of treatment.
#
# The derivation is guarded so the cell can be re-run on the same kernel:
# once the date columns have been dropped, a second execution is a no-op
# instead of raising KeyError on the missing columns.
DATE_COLUMNS = ["diagnosis_date", "end_treatment_date"]

if set(DATE_COLUMNS).issubset(df.columns):
    diagnosis_dates = pd.to_datetime(df["diagnosis_date"])
    end_treatment_dates = pd.to_datetime(df["end_treatment_date"])

    # Build a new frame rather than mutating column by column; unparseable
    # dates still raise here, exactly as before (no silent coercion).
    df = df.assign(
        treatment_duration_days=(end_treatment_dates - diagnosis_dates).dt.days
    ).drop(columns=DATE_COLUMNS)
elif "treatment_duration_days" not in df.columns:
    # Neither the raw dates nor the derived feature are available: the
    # input does not have the expected schema, so fail loudly.
    raise KeyError(
        f"Expected columns {DATE_COLUMNS} to derive 'treatment_duration_days'"
    )

df.head()

## 3. Feature & Target Split

In [None]:
TARGET = "survived"

# Pull out the label vector first, then keep every remaining column
# as the feature matrix.
y = df[TARGET]
X = df.drop(TARGET, axis="columns")

X.shape, y.shape

## 4. Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same class balance, and fix the seed for repeatability.
split_options = dict(test_size=0.2, random_state=42, stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape

## 5. Identify Numeric & Categorical Columns

In [None]:
# Partition the feature columns into two groups that together cover every
# column of X:
#   * numeric_features     - any numeric dtype (not only int64 / float64)
#   * categorical_features - everything else (object, category, bool,
#                            dedicated string dtypes, ...)
#
# The ColumnTransformer built from these lists uses its default
# remainder="drop", so a column that lands in neither list would be
# silently discarded from the model. Splitting on "number" vs. "not number"
# guarantees no column is left unassigned.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

print("Numeric Columns:", list(numeric_features))
print("Categorical Columns:", list(categorical_features))

## 6. Build Preprocessing Pipelines

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; levels unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

## 7. Full Model Pipeline

In [None]:
# Chain preprocessing and the classifier so that cross-validation refits
# the preprocessing on each training fold together with the model.
classifier = LogisticRegression(max_iter=5000)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", classifier),
    ]
)

## 8. Hyperparameter Tuning (GridSearchCV)

In [None]:
# Candidate values for C, the regularisation parameter of the "model" step.
param_grid = {"model__C": [0.001, 0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search scored by ROC-AUC, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated score.
for caption, value in (
    ("Best Parameters:", grid.best_params_),
    ("Best CV ROC-AUC:", grid.best_score_),
):
    print(caption, value)

## 9. Model Evaluation

In [None]:
# Best pipeline found by the search (refit by GridSearchCV on the training data).
best_model = grid.best_estimator_

# Hard class predictions plus the probability taken from column 1 of
# predict_proba, which is what the ROC-AUC is computed from.
y_pred = best_model.predict(X_test)
test_probabilities = best_model.predict_proba(X_test)
y_proba = test_probabilities[:, 1]

test_accuracy = accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_proba)
report_text = classification_report(y_test, y_pred)

print("Test Accuracy:", test_accuracy)
print("Test ROC-AUC:", test_roc_auc)
print("\nClassification Report:\n")
print(report_text)

## 10. Confusion Matrix

In [None]:
# Fix the class order explicitly so the tick labels drawn below are
# guaranteed to line up with the rows/columns of the matrix.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
image = ax.imshow(cm)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# One tick per class, labelled with the class value (instead of the default
# fractional ticks that carry no meaning for a matrix).
tick_positions = np.arange(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)

# Write the count inside every cell; switch text colour around the midpoint
# of the colour scale so the number stays legible on dark and light cells.
contrast_threshold = cm.max() / 2
for (row, col), count in np.ndenumerate(cm):
    ax.text(
        col,
        row,
        str(count),
        ha="center",
        va="center",
        color="black" if count > contrast_threshold else "white",
    )

fig.colorbar(image, ax=ax)
plt.show()

## 11. ROC Curve

In [None]:
# ROC points computed from the held-out labels and predicted probabilities.
fpr, tpr, _ = roc_curve(y_test, y_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)          # model
ax.plot([0, 1], [0, 1])    # diagonal reference line
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
plt.show()

## 12. Feature Importance

In [None]:
# Ask the fitted preprocessor for its output column names instead of
# rebuilding them by hand. A hand-built list (numeric names + one-hot names)
# goes out of sync with the coefficient vector whenever a transformer changes
# the column count, e.g. when the imputer discards a column that was entirely
# missing during fit.
fitted_preprocessor = best_model.named_steps["preprocessor"]
all_features = [
    # Strip the "<transformer>__" prefix added by ColumnTransformer so the
    # labels read as plain column / column_level names.
    name.split("__", 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# First (here: only) row of the fitted coefficient matrix.
coefficients = best_model.named_steps["model"].coef_[0]

if len(all_features) != len(coefficients):
    raise ValueError(
        f"Got {len(all_features)} feature names for {len(coefficients)} coefficients"
    )

# Rank features by absolute coefficient size, largest first.
importance_df = pd.DataFrame({
    "Feature": all_features,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", key=abs, ascending=False)

top_features = importance_df.head(20)

fig, ax = plt.subplots()
ax.barh(top_features["Feature"], top_features["Coefficient"])
# barh draws the first row at the bottom; flip the axis so the feature with
# the largest absolute coefficient is shown at the top of the chart.
ax.invert_yaxis()
ax.set_title("Top 20 Feature Importances")
ax.set_xlabel("Coefficient")
ax.set_ylabel("Feature")
plt.show()