In [None]:
!pip install sdmetrics matplotlib pandas
!pip install seaborn
!pip install nbformat

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sdmetrics.single_table import SingleTableMetric, NewRowSynthesis
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.visualization import get_column_plot, get_column_pair_plot
from sdmetrics.single_column import RangeCoverage, CategoryCoverage, StatisticSimilarity
import plotly.io as pio
%matplotlib inline

# Real training table and the generated (synthetic) table.
real_table = pd.read_csv("./octgan/data/csv/loan/train.csv")
synthetic_table = pd.read_csv("./results/loan/syn.csv")

# Column names are assigned to the synthetic table purely by position.
# NOTE(review): assumes syn.csv has exactly these 13 columns in this order.
synthetic_table.columns = [
    'BTH_YR', 'GENDER', 'SCTR_CD',
    'LN_CD_1', 'LN_CD_2', 'LN_CD_3',
    'LN_DT', 'LN_AMT', 'label',
    'RPY_CD', 'EXP_DT', 'RATE', 'RPY_AMT',
]

# Numerical columns are listed explicitly; every remaining column, in table
# order, is treated as categorical.
num_cols = ["LN_AMT", "RATE", "RPY_AMT"]
cat_cols = [name for name in synthetic_table.columns if name not in num_cols]

In [36]:
# Metadata dict passed to the metrics below: one {"sdtype": ...} entry per
# column, listed in the same order as the renamed synthetic table columns.
metadata = {
    "columns": {
        column: {"sdtype": sdtype}
        for column, sdtype in (
            ("BTH_YR", "categorical"),
            ("GENDER", "categorical"),
            ("SCTR_CD", "categorical"),
            ("LN_CD_1", "categorical"),
            ("LN_CD_2", "categorical"),
            ("LN_CD_3", "categorical"),
            ("LN_DT", "categorical"),
            ("LN_AMT", "numerical"),
            ("label", "categorical"),
            ("RPY_CD", "categorical"),
            ("EXP_DT", "categorical"),
            ("RATE", "numerical"),
            ("RPY_AMT", "numerical"),
        )
    }
}

In [29]:
# Build the SDMetrics quality report comparing the real and synthetic tables.
report = QualityReport()
report.generate(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

# Single aggregate score plus the per-property breakdown table.
overall_quality_score, metric_scores = report.get_score(), report.get_properties()
print(f"Overall Quality Score: {overall_quality_score}")

# Last expression: rendered as the cell's output.
metric_scores

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<00:00, 50.60it/s]|
Column Shapes Score: 31.52%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:01<00:00, 70.91it/s]|
Column Pair Trends Score: 13.55%

Overall Score (Average): 22.53%

Overall Quality Score: 0.22534646943177541


Unnamed: 0,Property,Score
0,Column Shapes,0.315234
1,Column Pair Trends,0.135459


In [50]:
# Select the Plotly renderer used by fig.show().
# NOTE(review): 'colab' is environment-specific; adjust if run elsewhere.
pio.renderers.default = 'colab'

# Real-vs-synthetic distribution plot for a single column.
fig = get_column_plot(
    real_data=real_table, synthetic_data=synthetic_table,
    column_name='RPY_AMT', plot_type='distplot',
)
fig.show()

In [46]:
# RangeCoverage score for every numerical column, keyed by column name.
range_coverage_results = {
    col: RangeCoverage.compute(
        real_data=real_table[col],
        synthetic_data=synthetic_table[col],
    )
    for col in num_cols
}

# Report each score rounded to two decimals.
for col, score in range_coverage_results.items():
    print(f"{col}: {score:.2f}")

LN_AMT: 0.16
RATE: 0.97
RPY_AMT: 0.39


In [42]:
# CategoryCoverage score for every categorical column, keyed by column name.
cat_coverage_results = {
    col: CategoryCoverage.compute(
        real_data=real_table[col],
        synthetic_data=synthetic_table[col],
    )
    for col in cat_cols
}

# Report each score rounded to two decimals.
for col, score in cat_coverage_results.items():
    print(f"{col}: {score:.2f}")

BTH_YR: 0.20
GENDER: 1.00
SCTR_CD: 0.60
LN_CD_1: 0.67
LN_CD_2: 0.53
LN_CD_3: 0.83
LN_DT: 0.08
label: 1.00
RPY_CD: 0.33
EXP_DT: 0.03


In [45]:
# StatisticSimilarity (on the mean) for every numerical column, keyed by
# column name.
stat_results = {
    col: StatisticSimilarity.compute(
        real_data=real_table[col],
        synthetic_data=synthetic_table[col],
        statistic='mean',
    )
    for col in num_cols
}

# Report each score rounded to two decimals.
for col, score in stat_results.items():
    print(f"{col}: {score:.2f}")

LN_AMT: 0.98
RATE: 0.39
RPY_AMT: 0.97


In [49]:
# NewRowSynthesis over the full tables; the returned score is the cell output.
# NOTE(review): 60269 is a hard-coded sample size, presumably the synthetic
# row count; confirm against len(synthetic_table).
NewRowSynthesis.compute(
    real_data=real_table, synthetic_data=synthetic_table, metadata=metadata,
    numerical_match_tolerance=0.01,  # tolerance for matching numerical values
    synthetic_sample_size=60269,
)


1.0