In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.32.0-py3-none-any.whl.metadata (14 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.1 (from sdv)
  Downloading ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.18.2 (from sdv)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.24.0-py3-none-any.whl.metadata (9.3 kB)
Collecting Faker!=37.11.0,>=17 (from rdt>=1.18.2->sdv)
  Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->ctgan>=0.11.1->sdv)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->ctgan>=0.11.1->sdv)
  Downloading nvidia_cuda_runtime_cu12-12.4

In [3]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd
import random

#Fetch Data
url = "https://raw.githubusercontent.com/shrikant-temburwar/Loan-Prediction-Dataset/master/train.csv"
df = pd.read_csv(url)

df = df[['Gender', 'Education', 'Self_Employed', 'ApplicantIncome',
         'CoapplicantIncome', 'LoanAmount', 'Credit_History',
         'Property_Area', 'Loan_Status']]

#Clean
df = df.dropna()
df['Credit_History'] = df['Credit_History'].astype(str)

#Train CTGAN
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)
model = CTGANSynthesizer(metadata)
model.fit(df)

#Generate 1000 Synthetic Records
synthetic_data = model.sample(1000)

reasons = [
    "I need to renovate my house immediately.",
    "Planning to expand my small business operations.",
    "Urgent medical expenses covered by loan.",
    "Buying a new car for daily commute.",
    "Consolidating high-interest debts.",
    "Investment in a new startup opportunity."
]

qualifiers = [
    "I am very confident in repayment.",
    "I might struggle slightly initially.",
    "Business is booming right now.",
    "Facing some financial stress currently."
]

def generate_text(row):
    reason = random.choice(reasons)
    if row['ApplicantIncome'] > 10000:
        qualifier = "I am very confident in repayment."
    else:
        qualifier = random.choice(qualifiers)
    return f"{reason} {qualifier}"

synthetic_data['Application_Note'] = synthetic_data.apply(generate_text, axis=1)

#Save
print(synthetic_data.head())
synthetic_data.to_csv('synthetic_financial_data_enhanced.csv', index=False)



   Gender     Education Self_Employed  ApplicantIncome  CoapplicantIncome  \
0    Male      Graduate            No             1969           0.000000   
1    Male  Not Graduate            No             1544           0.000000   
2    Male      Graduate            No              388         345.682079   
3    Male  Not Graduate           Yes              150         145.880326   
4  Female      Graduate           Yes             1691          32.047091   

   LoanAmount Credit_History Property_Area Loan_Status  \
0       247.0            0.0         Rural           N   
1       268.0            1.0     Semiurban           Y   
2       121.0            1.0     Semiurban           Y   
3       216.0            1.0     Semiurban           Y   
4       103.0            1.0         Rural           N   

                                    Application_Note  
0  Urgent medical expenses covered by loan. Facin...  
1  Urgent medical expenses covered by loan. Busin...  
2  Buying a new car for