In [1]:
from pathlib import Path

import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

# Columns removed right after loading; none of them is declared in the
# synthesizer metadata, so they must not reach `fit`.
excluded_columns = ['ID', 'Customer_ID', 'Name', 'SSN', 'Type_of_Loan', 'Payment_Behaviour']

# Read the cleaned credit-score table and drop the excluded columns in one pass.
data = pd.read_csv('data/cleaned_credit_score_v2.csv').drop(columns=excluded_columns)

# SDV sdtype declared for each modelled column. Insertion order is kept when
# the nested metadata dict is built from this mapping.
column_sdtypes = {
    "Month": "categorical",
    "Age": "numerical",
    "Occupation": "categorical",
    "Annual_Income": "numerical",
    "Monthly_Inhand_Salary": "numerical",
    "Num_Bank_Accounts": "numerical",
    "Num_Credit_Card": "numerical",
    "Interest_Rate": "numerical",
    "Num_of_Loan": "numerical",
    "Delay_from_due_date": "numerical",
    "Num_of_Delayed_Payment": "numerical",
    "Changed_Credit_Limit": "numerical",
    "Num_Credit_Inquiries": "numerical",
    "Credit_Mix": "categorical",
    "Outstanding_Debt": "numerical",
    "Credit_Utilization_Ratio": "numerical",
    "Credit_History_Age": "numerical",
    "Payment_of_Min_Amount": "categorical",
    "Total_EMI_per_month": "numerical",
    "Amount_invested_monthly": "numerical",
    "Monthly_Balance": "numerical",
}

# Plain-dict form of the metadata: {"columns": {<name>: {"sdtype": <type>}}}.
metadata_dict = {
    "columns": {name: {"sdtype": sdtype} for name, sdtype in column_sdtypes.items()}
}

# The raw dict and the Metadata object live under different names, so
# `metadata` only ever refers to the object handed to the synthesizer and to
# the quality evaluation.
metadata = Metadata.load_from_dict(metadata_dict)

# Output location, number of synthetic datasets, and rows per dataset.
OUTPUT_DIR = Path("data/credit_score")
N_DATASETS = 30
ROWS_PER_DATASET = 33769  # NOTE(review): presumably the row count of the real table - confirm

# Fit the Gaussian copula model once; every sample below is drawn from this
# single fitted synthesizer.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data=data)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing the first file.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for i in range(1, N_DATASETS + 1):
    # Draw one synthetic dataset and write it to its own numbered CSV file.
    # After the loop, `synthetic_data` still holds the last batch drawn.
    synthetic_data = synthesizer.sample(num_rows=ROWS_PER_DATASET)
    synthetic_data.to_csv(OUTPUT_DIR / f"sdv-copula_credit_score_{i}.csv", index=False)



In [2]:
from sdv.evaluation.single_table import evaluate_quality

# Score the synthetic table against the real one (column shapes and column
# pair trends). `synthetic_data` is the variable left bound by the sampling
# loop, i.e. only the final batch is evaluated here.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 21/21 [00:00<00:00, 66.73it/s]|
Column Shapes Score: 92.97%

(2/2) Evaluating Column Pair Trends: |██████████| 210/210 [00:01<00:00, 142.34it/s]|
Column Pair Trends Score: 92.33%

Overall Score (Average): 92.65%

