In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.30.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.42.9-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.42.9-py3-none-any.whl.metadata (5.9 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.1 (from sdv)
  Downloading ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.18.2 (from sdv)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.24.0-py3-none-any.whl.metadata (9.3 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3t

In [5]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd
import random

# Load: the loan-prediction training CSV is read straight from GitHub,
# so this cell needs network access.
url = "https://raw.githubusercontent.com/shrikant-temburwar/Loan-Prediction-Dataset/master/train.csv"

# Clean, as a single pipeline:
#   1. keep only the nine columns used for modelling,
#   2. drop every row that has a missing value,
#   3. store Credit_History as strings.
# NOTE(review): the string cast presumably makes the metadata detection treat
# the flag as categorical rather than numeric — confirm.
df = (
    pd.read_csv(url)
    .loc[:, [
        'Gender', 'Education', 'Self_Employed', 'ApplicantIncome',
        'CoapplicantIncome', 'LoanAmount', 'Credit_History',
        'Property_Area', 'Loan_Status',
    ]]
    .dropna()
    .astype({'Credit_History': str})
)

# Train CTGAN
# The table metadata is detected from the cleaned frame instead of being
# declared by hand, then handed to the synthesizer before fitting on df.
# NOTE(review): newer SDV releases reportedly deprecate SingleTableMetadata in
# favour of sdv.metadata.Metadata — confirm against the installed version.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)
model = CTGANSynthesizer(metadata)
model.fit(df)

# Generate 1000 synthetic records from the fitted model
synthetic_data = model.sample(1000)

# Phrase pools for the free-text application note: one entry from `reasons`
# is paired with one entry from `qualifiers`.
reasons = [
    "I need to renovate my house immediately.",
    "Planning to expand my small business operations.",
    "Urgent medical expenses covered by loan.",
    "Buying a new car for daily commute.",
    "Consolidating high-interest debts.",
    "Investment in a new startup opportunity."
]

qualifiers = [
    "I am very confident in repayment.",
    "I might struggle slightly initially.",
    "Business is booming right now.",
    "Facing some financial stress currently."
]

def generate_text(row, rng=None):
    """Build a random application note: one reason followed by one qualifier.

    Args:
        row: Ignored. It is accepted only so the function can be passed
            directly to ``DataFrame.apply(..., axis=1)``; the note does not
            depend on the row's values.
        rng: Optional object exposing ``choice(seq)`` (for example a
            ``random.Random`` instance). Defaults to the module-level
            ``random`` generator, so ``random.seed(...)`` controls the output
            when no generator is supplied.

    Returns:
        str: ``"<reason> <qualifier>"`` with each part drawn from the
        ``reasons`` and ``qualifiers`` lists.
    """
    picker = random if rng is None else rng
    # Draw the reason first, then the qualifier, so a seeded generator yields
    # the same sequence as the original single f-string expression.
    reason = picker.choice(reasons)
    qualifier = picker.choice(qualifiers)
    return f"{reason} {qualifier}"

# Seed the stdlib RNG that generate_text draws from so the note column is the
# same on every run. This pins only the notes; nothing in this cell seeds the
# synthetic rows themselves.
random.seed(42)
synthetic_data['Application_Note'] = synthetic_data.apply(generate_text, axis=1)

# Preview the first rows
print(synthetic_data.head())

# Save
synthetic_data.to_csv('synthetic_financial_data_enhanced.csv', index=False)



  Gender     Education Self_Employed  ApplicantIncome  CoapplicantIncome  \
0   Male  Not Graduate           Yes             7933           0.000000   
1   Male      Graduate            No            20370           0.000000   
2   Male  Not Graduate            No             5922        3264.663018   
3   Male  Not Graduate            No             6269          20.201380   
4   Male      Graduate            No             1429         435.718039   

   LoanAmount Credit_History Property_Area Loan_Status  \
0        71.0            1.0         Rural           Y   
1       222.0            1.0         Rural           N   
2       235.0            0.0         Urban           N   
3       204.0            1.0         Urban           N   
4         9.0            1.0     Semiurban           N   

                                    Application_Note  
0  Urgent medical expenses covered by loan. I mig...  
1  Planning to expand my small business operation...  
2  Consolidating high-interes