In [123]:
import pandas as pd
import networkx as nx
import pgmpy
import scipy
import xgboost as xgb
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BayesianEstimator
from pgmpy.models import BayesianNetwork

from sklearn.metrics import f1_score

In [9]:
# Column holding the target, and the twenty predictor columns used as features.
label_attr = 'good-credit'
non_label_attrs = [
    'status',
    'duration',
    'credit-history',
    'purpose',
    'credit-amount',
    'saving-account',
    'employment',
    'installment-rate',
    'sex-marst',
    'other-debtors',
    'residence',
    'property',
    'age',
    'other-installment-plans',
    'housing',
    'existing-credits',
    'job',
    'people-liable',
    'telephone',
    'foreign-worker',
]

# Every column (target included) is read with the pandas 'category' dtype.
df = pd.read_csv('german.csv', dtype='category')

In [10]:
# Separate the predictor columns from the target column.
data, label = df[non_label_attrs], df[label_attr]

In [12]:
# Wrap predictors and target in an XGBoost DMatrix; enable_categorical is needed
# because the feature columns carry the pandas 'category' dtype.
dtrain = xgb.DMatrix(data=data, label=label, enable_categorical=True)

In [163]:
# Booster configuration. Per the XGBoost parameter docs, 'binary:hinge' makes
# the model emit hard 0/1 predictions rather than probabilities.
params = {
    'objective': 'binary:hinge',
    'max_depth': 7,
    'eta': 1,
    'eval_metric': 'auc',
}

In [164]:
# Fit the booster on the full training matrix; the number of boosting rounds is
# left at the library default.
bst = xgb.train(params, dtrain)

In [165]:
# Predict on the same DMatrix the booster was trained on (in-sample predictions).
ypred = bst.predict(dtrain)

In [166]:
# The target was read as a categorical, so cast it to float before comparing it
# with the numeric predictions. NOTE: ypred comes from the training matrix, so
# this is an in-sample score, not an estimate of generalisation.
f1_score(y_true=label.astype(float), y_pred=ypred)

0.9978617248752674

In [172]:
# Learn a DAG over the predictor columns with hill-climbing structure search.
# max_iter is an iteration count, so pass it as an integer rather than a float.
non_label_dag = HillClimbSearch(data).estimate(max_iter=10_000_000)

# Building the network from the edge list alone silently drops any column that
# the search left without a parent or child. Such a column would then be absent
# from the simulated frame and break the later `syn_non_label[non_label_attrs]`
# selection, so re-add every node of the learned DAG explicitly.
non_label_model = BayesianNetwork(non_label_dag.edges())
non_label_model.add_nodes_from(non_label_dag.nodes())

# Estimate the conditional probability tables from the same predictor frame.
non_label_model.fit(data, estimator=BayesianEstimator)

  0%|          | 0/10000000 [00:00<?, ?it/s]

In [181]:
# Draw one million synthetic predictor rows from the fitted network. Sampling is
# stochastic, so pin the seed to make the synthetic dataset reproducible on a
# fresh "Restart & Run All".
SIMULATION_SEED = 42
syn_non_label = non_label_model.simulate(n_samples=1_000_000, seed=SIMULATION_SEED)

  0%|          | 0/20 [00:00<?, ?it/s]

  f"Probability values don't exactly sum to 1. Differ by: {error}. Adjusting values."


In [182]:
# Cast each synthetic column to the *training* column's categorical dtype rather
# than a fresh .astype('category'). A fresh cast derives its categories from the
# values that happen to appear in the sample, so a level missing from the
# synthetic data would shift the integer category codes. XGBoost versions that do
# not store the training categories rely on those codes matching between
# training and inference.
syn_features = syn_non_label[non_label_attrs].astype(data.dtypes.to_dict())
syn_label = bst.predict(xgb.DMatrix(syn_features, enable_categorical=True))

In [183]:
# Attach the predicted labels to a copy of the synthetic predictors and store
# them as integers.
syn_df = (
    syn_non_label
    .assign(**{label_attr: syn_label})
    .astype({label_attr: 'int'})
)

In [184]:
# Persist the synthetic table as CSV; index=False leaves the row index out of the file.
syn_df.to_csv('synthetic_german.csv', index=False)

In [185]:
# For each predictor, print the mean predicted label ("answer") and the row count
# per category value. The display options do not change between iterations, so
# the context manager is entered once around the whole loop.
display_opts = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
)
with pd.option_context(*display_opts):
    for attr in non_label_attrs:
        per_value = syn_df.groupby(attr).agg(
            answer=(label_attr, 'mean'),
            count=(label_attr, 'count'),
        )
        print(per_value)

                                              answer   count
status                                                      
... < 0 DM                                  0.574466  269135
... >= 200 DM / salary for at least 1 year  0.863093  393428
0<= ... < 200 DM                            0.756344   63922
no checking account                         0.526574  273515
                    answer   count
duration                          
1 <= ... < 4 yrs  0.672489  623606
4 <= ... < 7 yrs  0.614016   17366
< 1 yr            0.714474  359028
                                               answer   count
credit-history                                               
all credits at this bank paid back duly      0.732166  292532
critical account/other credits elsewhere     0.400374   49681
delay in paying off in the past              0.626543   40998
existing credits paid back duly till now     0.681759   88612
no credits taken/all credits paid back duly  0.693660  528177
                       an