In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global generator so every run produces the same dataset.
np.random.seed(153)

# Number of synthetic loan records to generate.
n = 5000

# Category values that the later cells sample from.
sectors = [
    "Consumer Goods",
    "Technology & Communications",
    "Financials",
    "Health Care",
    "Transportation",
    "Services",
    "Infrastructure",
    "Resource Transformation",
]
purposes = [
    "Working Capital",
    "Equipment",
    "Expansion",
    "Product Development",
]
locations = [
    "London",
    "Manchester",
    "Birmingham",
    "Leeds",
    "Glasgow",
]

In [None]:
# Build the raw feature table, one row per simulated loan application.
# The dict entries are evaluated top to bottom and every draw consumes
# NumPy's global random state, so the order of the entries determines the
# generated values (and the column order of the frame).
df = pd.DataFrame({
    # --- company profile ---
    "sector": np.random.choice(sectors, size=n),
    "location": np.random.choice(locations, size=n),
    # integers 0..19 (upper bound is exclusive)
    "years_operating": np.random.randint(low=0, high=20, size=n),
    # Poisson head-count, floored at one employee
    "num_employees": np.clip(np.random.poisson(lam=15, size=n), 1, None),
    # --- financial performance ---
    "annual_revenue": np.random.lognormal(12, 0.5, size=n),
    # rounded to one decimal; negative margins are allowed
    "profit_margin": np.random.normal(loc=5, scale=10, size=n).round(1),
    # --- credit history ---
    "late_payments": np.clip(np.random.poisson(lam=1.5, size=n), 0, 10),
    # bounded to 300..1000, then truncated to integers
    "credit_score": np.random.normal(loc=600, scale=100, size=n).clip(300, 1000).astype(int),
    # debt cannot go below zero
    "existing_debt": np.random.normal(loc=80000, scale=50000, size=n).clip(min=0),
    # --- loan request ---
    "purpose": np.random.choice(purposes, size=n),
    # rounded to one decimal, then floored at zero
    "productivity_gain": np.random.normal(loc=10, scale=10, size=n).round(1).clip(min=0),
    # integers 1..99 (upper bound is exclusive)
    "dnb_risk_score": np.random.randint(low=1, high=100, size=n),
    # requested amount, at least 5000
    "loan_amount": np.random.normal(loc=50000, scale=20000, size=n).clip(min=5000),
    # term lengths with unequal weights; 24 months is the most likely
    "term_months": np.random.choice(
        [12, 18, 24, 36, 48, 60],
        size=n,
        p=[0.1, 0.2, 0.3, 0.2, 0.1, 0.1],
    ),
    # --- balance-sheet ratios ---
    "current_ratio": np.random.uniform(low=0.5, high=3.0, size=n),
    "debt_to_equity": np.random.uniform(low=0.1, high=3.0, size=n),
})

In [None]:
# Loan size relative to yearly revenue, bounded to the range [0.01, 2.0].
df["loan_to_revenue_ratio"] = (
    df["loan_amount"].div(df["annual_revenue"]).clip(lower=0.01, upper=2.0)
)

# Loan amount spread evenly over the term (no interest is applied here).
df["monthly_payment"] = df["loan_amount"].div(df["term_months"])

In [None]:
# duration: observed time in months, drawn from an exponential distribution
# with scale 36 and cut down to whole months (draws below 1 become 0)
df["duration"] = np.floor(np.random.exponential(scale=36, size=n)).astype(int)

# event: whether failure (1) or still active (0), based on features
def simulate_failure(row):
    """Draw a random failure indicator for one loan record.

    Each risk condition that holds for ``row`` adds its weight to a score.
    The failure probability is ``0.1 + 0.1 * score``, capped at 0.95, and the
    result is a single Bernoulli draw from NumPy's global random state.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (a DataFrame row, a dict) that
        provides ``current_ratio``, ``debt_to_equity``, ``dnb_risk_score``,
        ``loan_to_revenue_ratio``, ``late_payments``, ``credit_score``,
        ``term_months``, ``sector`` and ``location``.

    Returns
    -------
    The value of ``np.random.binomial(1, prob)``: 1 for failure, 0 otherwise.
    """
    # A long term only counts against the loan when the risk score is low too.
    long_term_and_low_score = row["term_months"] > 36 and row["dnb_risk_score"] < 40

    # (condition, weight) pairs; sector and location weigh double.
    risk_factors = (
        (row["current_ratio"] < 1.0, 1),
        (row["debt_to_equity"] > 2.0, 1),
        (row["dnb_risk_score"] < 30, 1),
        (row["loan_to_revenue_ratio"] > 0.75, 1),
        (row["late_payments"] > 2, 1),
        (row["credit_score"] < 500, 1),
        (long_term_and_low_score, 1),
        (row["sector"] in {"Services", "Consumer Goods"}, 2),
        (row["location"] in {"Glasgow", "Leeds"}, 2),
    )
    score = sum(weight for triggered, weight in risk_factors if triggered)

    # Base rate of 10%, plus 10 points per score unit, never above 95%.
    prob = min(0.1 + 0.1 * score, 0.95)
    return np.random.binomial(1, prob)

# One random failure indicator per row (rows are passed to the function one at a time).
df["event"] = df.apply(simulate_failure, axis="columns")

In [None]:
# Time scale in months
def simulate_company_lifetime(row):
    """Draw a random company lifetime, in whole months, for one record.

    The scale of the exponential draw is built from the row: 6 for the
    Technology & Communications or Financials sectors, plus 4 when there are
    more than 10 employees, plus 2 for London. If none apply, a scale of 1
    is used instead of 0.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name that provides ``sector``,
        ``num_employees`` and ``location``.

    Returns
    -------
    int
        The exponential draw truncated to an integer.
    """
    scale_months = 0
    if row["sector"] in {"Technology & Communications", "Financials"}:
        scale_months += 6
    if row["num_employees"] > 10:
        scale_months += 4
    if row["location"] == "London":
        scale_months += 2

    # The sum is never negative, so this only replaces a scale of 0 with 1.
    scale_months = max(scale_months, 1)
    return int(np.random.exponential(scale=scale_months))

# Flag the rows whose simulated lifetime is longer than the observed duration.
# NOTE(review): despite its name, the "lifetime" column therefore stores
# booleans rather than a number of months - confirm this is intended.
df["lifetime"] = df.apply(simulate_company_lifetime, axis="columns").gt(df["duration"])

In [None]:
# Snapshot the finished frame under its own name before writing it out.
df_survival = df.copy()

# to_csv does not create missing parent folders, so a fresh checkout without
# a data/ directory would fail here; create the folder first.
output_path = Path("data/survival_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_survival.to_csv(output_path, index=False)

# as_posix() keeps the forward-slash form of the path on every platform.
print(f"Saved survival dataset to: {output_path.as_posix()}")