In [1]:
import pandas as pd
import numpy as np
import random

np.random.seed(42)
random.seed(42)


In [2]:
N = 1200  # >= 1000 as required

In [3]:
data = pd.DataFrame({
    "age": np.random.randint(21, 65, N),
    "monthly_income": np.random.randint(2000, 15000, N),
    "credit_utilization_ratio": np.round(np.random.uniform(0.1, 1.0, N), 2),
    "loan_amount": np.random.randint(3000, 50000, N),
    "loan_duration_months": np.random.choice([12, 24, 36, 48, 60], N),
    "num_late_payments": np.random.poisson(1.5, N),
    "existing_loans_count": np.random.randint(0, 5, N),
    "account_tenure_years": np.random.randint(0, 15, N)
})


In [4]:
data["employment_type"] = np.random.choice(
    ["Salaried", "Self-Employed", "Student", "Unemployed"], N, p=[0.55, 0.25, 0.15, 0.05]
)

data["education_level"] = np.random.choice(
    ["High School", "Diploma", "Bachelor", "Master"], N
)

data["marital_status"] = np.random.choice(
    ["Single", "Married", "Divorced"], N
)

data["region"] = np.random.choice(
    ["Urban", "Suburban", "Rural"], N
)


In [5]:
positive_statements = [
    "My income is stable and I always pay my bills on time.",
    "I manage my expenses carefully and save monthly.",
    "I have a secure job and no financial stress."
]

neutral_statements = [
    "My finances are manageable but expenses are increasing.",
    "I sometimes feel pressure but can still handle payments.",
    "My income is okay but savings are limited."
]

negative_statements = [
    "I struggle to pay bills due to high expenses.",
    "My income is unstable and debts are increasing.",
    "I often miss payments and face financial stress."
]

def generate_statement():
    choice = np.random.rand()
    if choice < 0.5:
        return random.choice(positive_statements)
    elif choice < 0.8:
        return random.choice(neutral_statements)
    else:
        return random.choice(negative_statements)

data["customer_financial_statement"] = [generate_statement() for _ in range(N)]


In [6]:
def sentiment_from_text(text):
    if "struggle" in text or "unstable" in text or "miss payments" in text:
        return "Negative"
    elif "pressure" in text or "manageable" in text:
        return "Neutral"
    else:
        return "Positive"

data["sentiment"] = data["customer_financial_statement"].apply(sentiment_from_text)


In [7]:
def stress_level(row):
    if row["num_late_payments"] >= 3 or row["sentiment"] == "Negative":
        return "High"
    elif row["num_late_payments"] == 1:
        return "Medium"
    else:
        return "Low"

data["financial_stress_level"] = data.apply(stress_level, axis=1)


In [8]:
def risk_category(row):
    if row["credit_utilization_ratio"] > 0.8 or row["financial_stress_level"] == "High":
        return "High Risk"
    elif row["credit_utilization_ratio"] > 0.5:
        return "Watchlist"
    else:
        return "Low Risk"

data["risk_category"] = data.apply(risk_category, axis=1)


In [9]:
def generate_default(row):
    score = 0

    if row["credit_utilization_ratio"] > 0.8:
        score += 2
    if row["num_late_payments"] >= 3:
        score += 2
    if row["financial_stress_level"] == "High":
        score += 2
    if row["monthly_income"] < 4000:
        score += 1

    return 1 if score >= 4 else 0

data["default_risk"] = data.apply(generate_default, axis=1)


In [10]:
data.head()


Unnamed: 0,age,monthly_income,credit_utilization_ratio,loan_amount,loan_duration_months,num_late_payments,existing_loans_count,account_tenure_years,employment_type,education_level,marital_status,region,customer_financial_statement,sentiment,financial_stress_level,risk_category,default_risk
0,59,10966,0.3,18417,12,3,2,4,Salaried,Diploma,Married,Suburban,I have a secure job and no financial stress.,Positive,High,High Risk,1
1,49,4198,0.41,27729,48,3,3,5,Salaried,Diploma,Married,Rural,I struggle to pay bills due to high expenses.,Negative,High,High Risk,1
2,35,13348,0.44,49956,12,1,4,6,Salaried,Diploma,Married,Rural,My income is stable and I always pay my bills ...,Positive,Medium,Low Risk,0
3,63,6242,0.41,33099,60,0,2,9,Salaried,High School,Married,Suburban,My income is okay but savings are limited.,Positive,Low,Low Risk,0
4,28,5695,0.32,35598,12,3,2,9,Salaried,Master,Married,Suburban,I sometimes feel pressure but can still handle...,Neutral,High,High Risk,1


In [11]:
data["default_risk"].value_counts(normalize=True)


default_risk
0    0.799167
1    0.200833
Name: proportion, dtype: float64

In [12]:
data.to_csv("../data/credit_risk_dataset.csv", index=False)
print("Dataset saved successfully!")


Dataset saved successfully!
