In [1]:
import pandas as pd
from lifelines import CoxPHFitter
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

In [2]:
# Read both input tables: the labelled training set first, then the set to be scored.
df, df_pred = (
    pd.read_csv(csv_path)
    for csv_path in ("data/survival_dataset.csv", "data/survival_predict.csv")
)

In [3]:
# Schema check, direction 1: names present in the training frame only.
print("Columns in train but not in pred:", set(df.columns).difference(df_pred.columns), sep="\n")

# Schema check, direction 2: names present in the prediction frame only.
print("Columns in pred but not in train:", set(df_pred.columns).difference(df.columns), sep="\n")

Columns in train but not in pred:
{'duration', 'event'}
Columns in pred but not in train:
set()


In [4]:
# Expand the three categorical columns into dummy indicators; the first level of
# each is dropped so it acts as the reference category.
df_cox = pd.get_dummies(
    data=df,
    columns=["sector", "location", "purpose"],
    drop_first=True,
)

In [5]:
# Cox proportional-hazards fit: every column of df_cox other than the
# duration/event pair is passed to the fitter alongside them.
cph = CoxPHFitter()
cph.fit(
    df=df_cox,
    duration_col="duration",
    event_col="event",
)

<lifelines.CoxPHFitter: fitted with 5000 total observations, 3191 right-censored observations>

In [6]:
# Persist the fitted model under models/ (directory is created on first run).
os.makedirs(name="models", exist_ok=True)

with open("models/survival_model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(cph))

In [7]:
# The prediction frame has to expose exactly the covariates the model was fitted
# on (same dummy columns, same order), so record them here: every training
# column except the two outcome columns.
feature_cols = df_cox.columns.drop(["duration", "event"]).tolist()

In [8]:
# One-hot encode the prediction set so it can be aligned with the training features.
#
# drop_first is deliberately NOT used here. With drop_first=True pandas drops the
# first level *present in this frame*; if the training baseline level does not
# occur in the prediction data, a different (real) category would be dropped,
# then re-created as an all-zero column by the reindex below, and those rows
# would silently be scored as the baseline category.
df_pred_encoded = pd.get_dummies(df_pred, columns=["sector", "location", "purpose"])

# Align columns with training. Because feature_cols already omits each training
# baseline dummy, this reindex:
#   - removes the baseline dummies (and any other column not used in training),
#   - adds an all-zero column for training categories absent from this frame,
#   - puts the columns in the training order.
# NOTE: a category never seen in training ends up with all dummies at 0, i.e. it
# is treated as the baseline level.
df_pred_encoded = df_pred_encoded.reindex(columns=feature_cols, fill_value=0)

In [9]:
### VIZ

# Output folder for the saved plots; a no-op when it already exists.
os.makedirs(name="figures", exist_ok=True)

In [10]:
# Get only the common columns shared between both datasets.
# df_pred_encoded was reindexed to feature_cols, so the outcome columns
# ("duration", "event") of df_cox are not part of this intersection.
common_cols = df_cox.columns.intersection(df_pred_encoded.columns)

In [11]:
# For every shared feature, save a two-panel density histogram (Train vs
# Prediction) to figures/ so the two distributions can be compared by eye.
for col in common_cols:
    g = None  # assigned once the FacetGrid exists, so `finally` knows what to close
    try:
        # Drop NA values
        train_vals = df_cox[col].dropna()
        pred_vals = df_pred_encoded[col].dropna()

        # Skip columns that are constant in BOTH datasets (nothing to compare)
        if train_vals.nunique() <= 1 and pred_vals.nunique() <= 1:
            print(f"⚠️ Skipping {col} due to low variance.")
            continue

        # Combine for plotting: long format with a "Dataset" label per row
        df_plot = pd.concat([
            pd.DataFrame({col: train_vals, "Dataset": "Train"}),
            pd.DataFrame({col: pred_vals, "Dataset": "Prediction"})
        ])

        # Plot side-by-side histograms (faceted by dataset).
        # Shared bins/x-axis keep the panels comparable; y is left independent.
        g = sns.displot(
            data=df_plot,
            x=col,
            col="Dataset",
            bins=20,
            kde=False,
            stat="density",
            common_bins=True,
            facet_kws={"sharey": False, "sharex": True},
            height=4,
            aspect=1.2
        )
        g.fig.suptitle(f"Histogram of {col} by Dataset", fontsize=14)
        g.fig.tight_layout()
        g.fig.subplots_adjust(top=0.85)  # leave room for the suptitle

        # Save. Path separators inside a column name (e.g. a category value
        # containing "/") would otherwise point at a non-existent subdirectory.
        safe_col = str(col).replace("/", "_").replace("\\", "_")
        filename = f"figures/facet_hist_{safe_col}.png".replace(" ", "_").replace("(", "").replace(")", "")
        g.savefig(filename)

    except Exception as e:
        # Best effort: one bad column must not stop the remaining plots.
        print(f"⚠️ Skipping {col} due to error: {e}")

    finally:
        # Close this iteration's figure explicitly, also when titling/saving
        # failed, so figures do not pile up in memory across the loop.
        if g is not None:
            plt.close(g.fig)

In [12]:
# Read the model written earlier in this notebook back from disk.
with open("models/survival_model.pkl", "rb") as model_file:
    loaded_model = pickle.loads(model_file.read())

In [13]:
# Predict: for every row, the survival probability evaluated at that row's own
# "term_months".
#
# - The model reloaded from disk (loaded_model) is used, so the pickled artefact
#   is what actually produces the scores.
# - The survival function is requested directly at the needed times via
#   `times=`. Looking the term up in the default index and falling back to the
#   LAST point of the curve when it is not an exact training duration returns
#   the probability at the wrong time.
# - One batched call replaces a per-row model call.
terms = df_pred_encoded["term_months"]
eval_times = sorted(terms.dropna().unique())  # lifelines expects increasing times

if eval_times:
    # Result layout: one row per requested time, one column per row of the input.
    surv_at_terms = loaded_model.predict_survival_function(df_pred_encoded, times=eval_times)
    surv_values = surv_at_terms.to_numpy()

    # Positional lookup avoids any int-vs-float label mismatch on the time index.
    row_of_term = {t: i for i, t in enumerate(eval_times)}
    survival_probs = [
        surv_values[row_of_term[t], j] if pd.notna(t) else float("nan")
        for j, t in enumerate(terms)
    ]
else:
    # No usable term at all (empty frame or all terms missing).
    survival_probs = [float("nan")] * len(terms)

In [14]:
# Attach the scores to the un-encoded prediction frame, row for row.
df_pred["survival_probability"] = pd.Series(survival_probs, index=df_pred.index)

In [None]:
# Write the scored prediction set to disk (row index omitted) for the
# downstream credit risk model.
df_pred.to_csv(
    path_or_buf="data/scored_survival_dataset.csv",
    index=False,
)