In [1]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd


In [2]:
# Step 2: Set Random Seed
# Seeds NumPy's global (legacy) random generator so that the np.random draws
# made later in this notebook are repeatable on a fresh kernel.
np.random.seed(seed=42)

In [3]:
# Step 3: Set Dataset Size
# Number of synthetic customer records (rows) to generate.
num_rows = 25_000

In [4]:
# Step 4: Generate Features
# NOTE: every draw below consumes NumPy's global random stream, so the ORDER
# of these statements determines the generated values. Do not reorder them.

# Sequential identifiers 1..num_rows.
customer_id = np.arange(num_rows) + 1

# Normal(mean=35, sd=10), limited to [18, 65], then truncated to whole numbers.
age = np.random.normal(loc=35, scale=10, size=num_rows).clip(18, 65).astype(int)

# Categorical attributes, each category drawn with equal probability.
gender = np.random.choice(['Male', 'Female'], size=num_rows)
location = np.random.choice(['Urban', 'Semi-Urban', 'Rural'], size=num_rows)

# Normal(mean=50000, sd=15000), limited to [10000, 200000].
monthly_income = np.random.normal(loc=50000, scale=15000, size=num_rows).clip(10000, 200000)

# EMI outflow is a uniform 10%-70% share of income; the outstanding balance is
# a uniform 2x-20x multiple of that EMI outflow.
monthly_emi_outflow = monthly_income * np.random.uniform(low=0.1, high=0.7, size=num_rows)
current_outstanding = monthly_emi_outflow * np.random.uniform(low=2, high=20, size=num_rows)

# The remaining features are drawn independently of the ones above.
credit_utilization_ratio = np.random.uniform(low=0, high=1, size=num_rows)
num_open_loans = np.random.poisson(lam=3, size=num_rows)
repayment_history_score = np.random.uniform(low=0, high=100, size=num_rows)
dpd_last_3_months = np.random.poisson(lam=2, size=num_rows)
num_hard_inquiries_last_6m = np.random.poisson(lam=1, size=num_rows)
recent_credit_card_usage = np.random.uniform(low=1000, high=100000, size=num_rows)
recent_loan_disbursed_amount = np.random.uniform(low=0, high=500000, size=num_rows)
total_credit_limit = np.random.uniform(low=50000, high=1000000, size=num_rows)

# Integers in 0..60 inclusive (randint's upper bound is exclusive).
months_since_last_default = np.random.randint(low=0, high=61, size=num_rows)

In [5]:
# Step 5: Define Target Variable based on heuristics
# Rules are applied in priority order (first match wins), exactly like an
# if / elif / else chain evaluated per customer:
#   1. 'decrease': dpd_last_3_months > 3 AND credit_utilization_ratio > 0.7
#                  AND num_hard_inquiries_last_6m > 2
#   2. 'increase': EMI-to-income ratio < 0.3 AND repayment_history_score > 70
#   3. 'stable'  : everything else
# Computed on whole arrays instead of a Python loop over every row.
emi_to_income_ratio = monthly_emi_outflow / monthly_income

decrease_mask = (
    (dpd_last_3_months > 3)
    & (credit_utilization_ratio > 0.7)
    & (num_hard_inquiries_last_6m > 2)
)
increase_mask = (emi_to_income_ratio < 0.3) & (repayment_history_score > 70)

# np.select returns, per row, the choice of the first condition that is True
# and `default` when none is. .tolist() keeps the result a plain Python list
# of str, matching what the loop-based version produced.
target_credit_score_movement = np.select(
    [decrease_mask, increase_mask],
    ['decrease', 'increase'],
    default='stable',
).tolist()

In [6]:
# Step 6: Combine into DataFrame
# Float-valued features are rounded to 2 decimal places for storage; the
# remaining columns are stored as generated. Column order follows the keyword
# order below.
df = pd.DataFrame(dict(
    customer_id=customer_id,
    age=age,
    gender=gender,
    location=location,
    monthly_income=np.round(monthly_income, 2),
    monthly_emi_outflow=np.round(monthly_emi_outflow, 2),
    current_outstanding=np.round(current_outstanding, 2),
    credit_utilization_ratio=np.round(credit_utilization_ratio, 2),
    num_open_loans=num_open_loans,
    repayment_history_score=np.round(repayment_history_score, 2),
    dpd_last_3_months=dpd_last_3_months,
    num_hard_inquiries_last_6m=num_hard_inquiries_last_6m,
    recent_credit_card_usage=np.round(recent_credit_card_usage, 2),
    recent_loan_disbursed_amount=np.round(recent_loan_disbursed_amount, 2),
    total_credit_limit=np.round(total_credit_limit, 2),
    months_since_last_default=months_since_last_default,
    target_credit_score_movement=target_credit_score_movement,
))

In [7]:
# Step 7: Save to CSV
# Writes the frame to 'credit_data.csv' relative to the working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv(path_or_buf='credit_data.csv', index=False)

In [8]:
# Step 8: Print a completion message (this does not itself verify that the
# file exists on disk).
print("✅ Dataset generated and saved as 'credit_data.csv'.")

✅ Dataset generated and saved as 'credit_data.csv'.


In [9]:
# Preview the first 5 rows of the generated dataset.
df.head(n=5)

Unnamed: 0,customer_id,age,gender,location,monthly_income,monthly_emi_outflow,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,dpd_last_3_months,num_hard_inquiries_last_6m,recent_credit_card_usage,recent_loan_disbursed_amount,total_credit_limit,months_since_last_default,target_credit_score_movement
0,1,39,Male,Semi-Urban,37960.91,15102.3,237947.63,0.66,5,71.68,1,0,58789.72,147061.87,480349.24,23,stable
1,2,33,Female,Urban,97982.6,38453.99,734534.97,0.7,4,43.36,0,3,2874.54,455301.88,799049.0,10,stable
2,3,41,Male,Semi-Urban,72757.31,17259.65,102888.54,0.98,3,97.79,2,0,87967.9,180375.33,145336.22,3,increase
3,4,50,Male,Rural,46591.58,22157.16,197129.05,0.39,5,68.24,3,0,47122.19,167716.44,172741.41,56,stable
4,5,32,Male,Semi-Urban,30893.97,6813.03,81831.67,0.42,3,82.94,1,0,38802.13,88811.09,945603.45,52,increase
