# Survival Analysis with the Lung Cancer Dataset
This notebook demonstrates survival analysis using the classic lung cancer dataset.

In [None]:
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Load the lung cancer dataset (R `survival::lung`) from the Rdatasets CSV mirror
from lifelines.datasets import load_rossi
# The first CSV column holds R row labels, not data. Its header is blank in
# older Rdatasets exports (pandas then names it 'Unnamed: 0') and 'rownames'
# in newer ones, so drop whichever is present instead of failing on a KeyError.
lung = (
    pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/survival/lung.csv")
    .drop(columns=['Unnamed: 0', 'rownames'], errors='ignore')
)
lung.head()

## Data Overview
Let's check the column types and non-null counts:

In [None]:
# Summarise column dtypes and non-null counts to spot missing values
lung.info()

## Kaplan-Meier Survival Curve
We'll fit a Kaplan-Meier estimator to visualize the survival function.

In [None]:
# Durations and event indicator for the whole cohort.
# `status` is coded 1 = censored, 2 = dead in this dataset.
T = lung['time']
E = lung['status'].eq(2)

kmf = KaplanMeierFitter()
kmf.fit(durations=T, event_observed=E)

# The plot call returns the Axes it drew on; label that Axes directly.
ax = kmf.plot_survival_function()
ax.set_title('Kaplan-Meier Survival Curve (Lung Cancer)')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Survival Probability')
plt.show()

## Group Comparison
Let's compare survival between males and females.

In [None]:
# `sex` is coded 1 = male, 2 = female in this dataset.
sex_labels = {1: 'Male', 2: 'Female'}

# Draw both curves on one explicit Axes rather than relying on the implicit
# "current axes" of the pyplot state machine.
fig, ax = plt.subplots()
for gender, grouped_df in lung.groupby('sex'):
    # Use a fresh estimator per group so the overall `kmf` fitted in the
    # previous section is not silently overwritten.
    group_kmf = KaplanMeierFitter()
    group_kmf.fit(
        grouped_df['time'],
        event_observed=grouped_df['status'] == 2,  # 1 = censored, 2 = dead
        # Fall back to the raw code instead of mislabelling unknown values.
        label=sex_labels.get(gender, f'sex={gender}'),
    )
    group_kmf.plot_survival_function(ax=ax)
ax.set_title('Survival Curves by Gender')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Survival Probability')
ax.legend()
plt.show()