In [1]:
import google.generativeai as genai
import pandas as pd
import json
import time
import os
from dotenv import load_dotenv

In [2]:
# Setup Gemini API
# The key is never hardcoded: put GEMINI_API_KEY=<your key> in a .env file
# (or export it in the shell) and it is picked up from the environment here.
load_dotenv()
# GOOGLE_API_KEY is accepted as a fallback name for the same key.
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
if not api_key:
    # Fail fast: without a key every later request fails, which would otherwise
    # only show up as repeated batch errors in the generation loop.
    raise RuntimeError(
        "No Gemini API key found. Set GEMINI_API_KEY in your environment or .env file."
    )
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-2.5-flash')

def generate_financial_data_batch(batch_size, batch_id):
    """
    Ask Gemini for one batch of synthetic credit-risk records.

    Sends a prompt to the module-level ``model``, strips Markdown code fences
    from the reply, and parses the remaining text as JSON.

    Args:
        batch_size: Number of records requested in the prompt. The size of the
            model's reply is not checked against this value.
        batch_id: Batch number; embedded in the prompt and in error messages.

    Returns:
        list[dict]: The parsed records. An empty list is returned (after
        printing an error) when the request fails, the reply is not valid
        JSON, or the reply is not a JSON list. Entries that are not JSON
        objects are dropped.
    """
    prompt = f"""
    You are a data generator
    Generate {batch_size} unique, realistic financial records for a credit risk dataset (Batch #{batch_id}).
    
    CRITICAL INSTRUCTIONS:
    - Do NOT use random placeholders. 
    - Ensure realistic correlations: (e.g., younger people may have lower tenure, high credit utilization should correlate with higher risk categories).
    - Vary the personas: Include some high-earners, some struggling students, and middle-class families.
    - Output ONLY a valid JSON list of objects.

    Each object must contain these keys:
    "age": (int 21-70),
    "monthly_income": (int 1500-20000),
    "credit_utilization_ratio": (float 0.1-1.0),
    "loan_amount": (int 1000-50000),
    "loan_duration_months": (choice 12, 24, 36, 48, 60),
    "num_late_payments": (int 0-10),
    "existing_loans_count": (int 0-5),
    "account_tenure_years": (int 0-20),
    "employment_type": ("Salaried", "Self-Employed", "Student", "Unemployed"),
    "education_level": ("High School", "Diploma", "Bachelor", "Master"),
    "marital_status": ("Single", "Married", "Divorced"),
    "region": ("Urban", "Suburban", "Rural"),
    "customer_financial_statement": (A unique 1-sentence statement in first person about their money),
    "sentiment": (Evaluate the statement: "Positive", "Neutral", "Negative"),
    "financial_stress_level": (Evaluate the profile: "Low", "Medium", "High"),
    "risk_category": (Evaluate data: "Low Risk", "Watchlist", "High Risk"),
    "default_risk": (0 or 1, where 1 is likely to default based on the data)
    """

    try:
        response = model.generate_content(prompt)
        # Clean the response text to ensure it's valid JSON
        raw_text = response.text.strip().replace('```json', '').replace('```', '')
        # Keep only the outermost [...] so stray prose around the array
        # ("Here is the data: ...") does not break parsing.
        start, end = raw_text.find('['), raw_text.rfind(']')
        if start != -1 and end > start:
            raw_text = raw_text[start:end + 1]
        records = json.loads(raw_text)
    except Exception as e:
        # Deliberately broad: any API or parsing failure marks the batch as
        # failed so the caller can decide whether to retry.
        print(f"Error in batch {batch_id}: {e}")
        return []

    # The caller extends a list with this value; a dict or scalar here would
    # silently corrupt the dataset, so anything but a list counts as a failure.
    if not isinstance(records, list):
        print(f"Error in batch {batch_id}: expected a JSON list, got {type(records).__name__}")
        return []

    valid_records = [record for record in records if isinstance(record, dict)]
    dropped = len(records) - len(valid_records)
    if dropped:
        print(f"Warning in batch {batch_id}: dropped {dropped} non-object entries")
    return valid_records

In [3]:
# --- Data Generation Loop ---
TOTAL_RECORDS_NEEDED = 1500
BATCH_SIZE = 25  # Smaller batches ensure higher quality and avoid token limits
# Stop after this many failed batches in a row instead of retrying forever
# (e.g. invalid API key or exhausted quota).
MAX_CONSECUTIVE_FAILURES = 5
all_records = []
consecutive_failures = 0

print(f"Starting GenAI Data Simulation for {TOTAL_RECORDS_NEEDED} records...")

while len(all_records) < TOTAL_RECORDS_NEEDED:
    # Batch id is derived from how many records have been collected so far.
    current_batch_id = (len(all_records) // BATCH_SIZE) + 1
    print(f"Generating batch {current_batch_id}...")

    batch_data = generate_financial_data_batch(BATCH_SIZE, current_batch_id)

    if batch_data:
        all_records.extend(batch_data)
        consecutive_failures = 0
        print(f"Successfully added {len(batch_data)} records. Total: {len(all_records)}")
    else:
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            # Records gathered so far remain available in `all_records`.
            raise RuntimeError(
                f"Aborting: {consecutive_failures} consecutive batches failed "
                f"({len(all_records)} of {TOTAL_RECORDS_NEEDED} records collected)."
            )
        print("Batch failed, retrying...")

    # Respect API rate limits
    time.sleep(2)

# Create DataFrame
df = pd.DataFrame(all_records)

Starting GenAI Data Simulation for 1500 records...
Generating batch 1...
Successfully added 25 records. Total: 25
Generating batch 2...
Successfully added 25 records. Total: 50
Generating batch 3...
Successfully added 25 records. Total: 75
Generating batch 4...
Successfully added 25 records. Total: 100
Generating batch 5...
Successfully added 25 records. Total: 125
Generating batch 6...
Successfully added 25 records. Total: 150
Generating batch 7...
Successfully added 25 records. Total: 175
Generating batch 8...
Successfully added 25 records. Total: 200
Generating batch 9...
Successfully added 25 records. Total: 225
Generating batch 10...
Successfully added 25 records. Total: 250
Generating batch 11...
Successfully added 25 records. Total: 275
Generating batch 12...
Successfully added 25 records. Total: 300
Generating batch 13...
Successfully added 25 records. Total: 325
Generating batch 14...
Successfully added 25 records. Total: 350
Generating batch 15...
Successfully added 25 record

In [4]:
# Trim any surplus rows so the dataset has at most TOTAL_RECORDS_NEEDED records
# (the last batch can push the total past the target).
df = df.head(TOTAL_RECORDS_NEEDED)

# Save to CSV
output_path = "./data/credit_risk_dataset.csv"
# to_csv does not create missing parent directories, so make sure ./data exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

In [5]:
# Completion banner, output location, and the heading for the preview below.
print(
    "\nSimulation Complete!",
    f"Dataset saved to: {output_path}",
    "\nSample Data:",
    sep="\n",
)
# Preview the first five records, restricted to a few key columns.
print(df.head()[['age', 'monthly_income', 'customer_financial_statement', 'risk_category', 'default_risk']])

# Quick look at how the generated records split across risk categories.
print("\nRisk Category Distribution (Generated by GenAI):")
print(df.loc[:, 'risk_category'].value_counts())


Simulation Complete!
Dataset saved to: ./data/credit_risk_dataset.csv

Sample Data:
   age  monthly_income                       customer_financial_statement  \
0   22            1800  I'm really struggling to make ends meet each m...   
1   45           12500  My finances are stable, and I'm actively savin...   
2   38            6800  We're managing, but recent unexpected expenses...   
3   55            4500  I'm comfortable with my current financial situ...   
4   28            3200  I'm constantly worried about my credit card de...   

  risk_category  default_risk  
0     High Risk             1  
1      Low Risk             0  
2     Watchlist             1  
3      Low Risk             0  
4     High Risk             1  

Risk Category Distribution (Generated by GenAI):
risk_category
Low Risk     660
High Risk    438
Watchlist    402
Name: count, dtype: int64


In [7]:
# Share of records per default_risk label (proportions rather than raw counts).
print(df.loc[:, 'default_risk'].value_counts(normalize=True))

default_risk
0    0.635333
1    0.364667
Name: proportion, dtype: float64
