# Lung Cancer Survival Prediction
## Logistic Regression with GridSearchCV + Prediction Section

This notebook includes:
- Proper preprocessing
- Imputation (no row dropping)
- Date feature engineering
- ColumnTransformer
- GridSearchCV tuning
- Evaluation metrics
- ROC Curve
- Feature importance
- Sample prediction section


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    roc_auc_score,
)

%matplotlib inline

## 1. Load Dataset

In [None]:
# Read the raw CSV into a DataFrame and preview the first rows.
# Adjust the file path below if the dataset lives somewhere else.
df = pd.read_csv(
    "lung_cancer_dataset.csv",
)
df.head()

## 2. Date Feature Engineering

In [None]:
# Derive the treatment length (whole days between diagnosis and end of
# treatment) and then discard the two raw date columns, which the model
# cannot consume directly.
#
# The guard makes this cell safe to re-run: once the raw date columns have
# been dropped, executing the cell a second time would otherwise fail with a
# KeyError. On a first run the date columns are still required, so a missing
# column is reported exactly as before.
# NOTE(review): this assumes the raw CSV does not already ship a column named
# "treatment_duration_days" -- confirm against the dataset.
DATE_COLUMNS = ["diagnosis_date", "end_treatment_date"]

if "treatment_duration_days" not in df.columns:
    diagnosis_date = pd.to_datetime(df["diagnosis_date"])
    end_treatment_date = pd.to_datetime(df["end_treatment_date"])

    # assign()/drop() build a new frame instead of mutating the loaded one.
    df = df.assign(
        treatment_duration_days=(end_treatment_date - diagnosis_date).dt.days
    ).drop(columns=DATE_COLUMNS)

## 3. Feature & Target Split

In [None]:
# Name of the label column; every other column is treated as a feature.
TARGET = "survived"

# y is the label series, X is the frame with the label column removed.
y = df[TARGET]
X = df.drop(columns=[TARGET])

## 4. Train/Test Split

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

## 5. Identify Column Types

In [None]:
# Split the feature columns by dtype so each group gets its own preprocessing.
#
# "number" matches every numeric width (int32, float32, nullable Int64, ...),
# not only int64/float64. The categorical selector also accepts pandas'
# dedicated string dtype and "category" columns, which a plain "object"
# filter can miss.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(
    include=["object", "string", "category"]
).columns

# A ColumnTransformer drops any column it was not told about (its default is
# remainder="drop"), so report columns that landed in neither group instead
# of letting them vanish from the model unnoticed.
unassigned_features = X.columns.difference(
    list(numeric_features) + list(categorical_features)
)
if len(unassigned_features) > 0:
    print(
        "Warning: columns not used by the model (unsupported dtype):",
        list(unassigned_features),
    )

## 6. Preprocessing Pipelines

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" lets the encoder accept categories at
# predict time that it did not see while fitting.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

## 7. Full Pipeline

In [None]:
# Chain preprocessing and the classifier into one estimator, so the
# preprocessing is fitted together with the model on whatever data
# the pipeline is fitted on.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=5000)),
    ]
)

## 8. GridSearchCV

In [None]:
# Candidate values for the regularisation parameter C of the "model" step.
param_grid = {"model__C": [0.001, 0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search over the whole pipeline, scored by ROC-AUC
# and run on all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV ROC-AUC:", grid.best_score_)

## 9. Evaluation

In [None]:
# The pipeline that GridSearchCV selected as the best estimator.
best_model = grid.best_estimator_

# Hard class predictions, plus the predicted probability taken from the
# second column of predict_proba.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

## 10. Confusion Matrix

In [None]:
# Pin the label order explicitly so the tick labels below are guaranteed to
# line up with the rows (actual) and columns (predicted) of the matrix.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
im = ax.imshow(cm, cmap="Blues")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Label each axis with the actual class values rather than pixel indices.
tick_positions = range(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticklabels(class_labels)

# Write the raw count into every cell; switch to white text on dark cells so
# the numbers stay legible.
text_threshold = cm.max() / 2
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(
            col,
            row,
            f"{cm[row, col]:d}",
            ha="center",
            va="center",
            color="white" if cm[row, col] > text_threshold else "black",
        )

fig.colorbar(im, ax=ax)
plt.show()

## 11. ROC Curve

In [None]:
# ROC curve of the tuned model on the held-out test set.
# The thresholds get a real name: binding "_" would overwrite IPython's
# last-output variable.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
test_auc = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots()
# Label both lines so the figure can be read on its own: the model's curve
# (with its AUC) and the diagonal a random classifier would produce.
ax.plot(fpr, tpr, label=f"Logistic regression (AUC = {test_auc:.3f})")
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
plt.show()

## 12. Sample Prediction

In [None]:
# Use the first test-set row as a demo input. The double brackets keep it a
# one-row DataFrame, which is the input shape the pipeline expects.
sample = X_test.iloc[[0]]

# Predicted class for that row, and the probability from the second
# column of predict_proba.
prediction = best_model.predict(sample)[0]
probability = best_model.predict_proba(sample)[0, 1]

print("Sample Features:")
display(sample)

print("Predicted Survival:", prediction)
print("Predicted Survival Probability:", probability)