In [11]:
from tabgan.sampler import OriginalGenerator, GANGenerator
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
# example

# Random input data.
# A dedicated, seeded RandomState makes the generated train/target/test frames
# identical on every run without touching NumPy's global RNG state.
# (This only pins the input data; the generators below may still use their
# own sources of randomness.)
rng = np.random.RandomState(42)
train = pd.DataFrame(rng.randint(-10, 150, size=(150, 4)), columns=list("ABCD"))
target = pd.DataFrame(rng.randint(0, 2, size=(150, 1)), columns=list("Y"))
test = pd.DataFrame(rng.randint(0, 100, size=(100, 4)), columns=list("ABCD"))

# Generate data with both generators using their default settings.
new_train1, new_target1 = OriginalGenerator().generate_data_pipe(train, target, test)
new_train2, new_target2 = GANGenerator().generate_data_pipe(train, target, test)

# Example with all params defined.
new_train3, new_target3 = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC", "max_depth": 2, "max_bin": 100,
        "learning_rate": 0.02, "random_state": 42, "n_estimators": 50,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 500, "patience": 25, "epochs": 500},
).generate_data_pipe(
    train, target, test,
    deep_copy=True, only_adversarial=False, use_adversarial=True,
)

In [12]:
from ctgan import CTGANSynthesizer
from ctgan import load_demo

# Demo table provided by ctgan's load_demo().
data = load_demo()

# Names of the columns that are discrete; passed to fit() below.
discrete_columns = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
    'sex', 'native-country', 'income',
]

# Train a synthesizer for 10 epochs on the demo table.
ctgan = CTGANSynthesizer(epochs=10)
ctgan.fit(data, discrete_columns)

# Synthetic copy: draw 1000 rows from the fitted synthesizer.
samples = ctgan.sample(1000)

In [13]:
# Display the demo table returned by load_demo() (the data the synthesizer was fitted on).
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [14]:
# Display the 1000 synthetic rows sampled from the fitted CTGANSynthesizer.
samples

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,31,Private,179025,Some-college,10,Married-civ-spouse,Adm-clerical,Not-in-family,Black,Male,8332,-4,44,United-States,>50K
1,48,Private,63367,12th,6,Never-married,Sales,Unmarried,Black,Male,131,-5,64,United-States,>50K
2,37,Private,373810,Some-college,9,Never-married,Other-service,Husband,White,Female,62,-2,40,Cuba,<=50K
3,37,Federal-gov,140985,HS-grad,10,Widowed,?,Wife,White,Male,-43,1740,40,United-States,<=50K
4,30,Private,461481,Doctorate,9,Divorced,Machine-op-inspct,Not-in-family,White,Female,-46,-3,40,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,68,Private,261754,HS-grad,9,Married-civ-spouse,Adm-clerical,Not-in-family,Black,Male,112,-5,51,United-States,>50K
996,36,Private,179792,Assoc-voc,9,Never-married,Exec-managerial,Own-child,White,Female,135,-7,27,Iran,>50K
997,48,Private,303037,HS-grad,9,Never-married,Tech-support,Wife,White,Male,5521,1894,34,United-States,>50K
998,31,Private,184972,Bachelors,9,Never-married,Craft-repair,Wife,White,Male,155,-5,40,United-States,>50K


In [15]:
from sdv.demo import load_tabular_demo

# Load SDV's 'student_placements' tabular demo.
# NOTE: this rebinds `data`, which previously held the ctgan demo table.
data = load_tabular_demo('student_placements')
# Preview the first rows.
data.head()

Unnamed: 0,student_id,gender,second_perc,high_perc,high_spec,degree_perc,degree_type,work_experience,experience_years,employability_perc,mba_spec,mba_perc,salary,placed,start_date,end_date,duration
0,17264,M,67.0,91.0,Commerce,58.0,Sci&Tech,False,0,55.0,Mkt&HR,58.8,27000.0,True,2020-07-23,2020-10-12,3.0
1,17265,M,79.33,78.33,Science,77.48,Sci&Tech,True,1,86.5,Mkt&Fin,66.28,20000.0,True,2020-01-11,2020-04-09,3.0
2,17266,M,65.0,68.0,Arts,64.0,Comm&Mgmt,False,0,75.0,Mkt&Fin,57.8,25000.0,True,2020-01-26,2020-07-13,6.0
3,17267,M,56.0,52.0,Science,52.0,Sci&Tech,False,0,66.0,Mkt&HR,59.43,,False,NaT,NaT,
4,17268,M,85.8,73.6,Commerce,73.3,Comm&Mgmt,False,0,96.8,Mkt&Fin,55.5,42500.0,True,2020-07-04,2020-09-27,3.0


In [16]:
from sdv.tabular import CTGAN

# Fit SDV's CTGAN model, constructed with no arguments, on the table currently bound to `data`.
model = CTGAN()
model.fit(data)

In [17]:
# Draw 200 synthetic rows from the fitted model and display them.
new_data = model.sample(200)
new_data

Unnamed: 0,student_id,gender,second_perc,high_perc,high_spec,degree_perc,degree_type,work_experience,experience_years,employability_perc,mba_spec,mba_perc,salary,placed,start_date,end_date,duration
0,17478,F,44.58,74.98,Science,66.37,Comm&Mgmt,False,0,66.57,Mkt&HR,52.58,,False,2020-01-10,2020-09-04,
1,17445,M,66.81,57.38,Science,61.30,Sci&Tech,True,1,80.12,Mkt&Fin,52.47,28700.0,True,2020-01-03,2020-09-13,
2,17381,M,72.62,63.87,Science,70.52,Comm&Mgmt,True,0,76.96,Mkt&HR,65.90,,False,NaT,2020-08-05,3.0
3,17452,M,86.14,69.56,Science,73.89,Sci&Tech,True,0,84.03,Mkt&Fin,54.52,26300.0,False,2020-09-05,2020-08-16,
4,17345,M,53.75,65.85,Commerce,65.94,Comm&Mgmt,True,0,79.82,Mkt&HR,64.54,,False,NaT,2021-03-05,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,17427,M,76.00,60.90,Commerce,72.65,Comm&Mgmt,False,0,94.77,Mkt&HR,52.03,,False,NaT,2020-09-03,3.0
196,17356,F,51.74,55.77,Commerce,68.05,Sci&Tech,True,0,97.84,Mkt&HR,65.77,28400.0,True,NaT,NaT,5.0
197,17409,F,52.35,40.45,Commerce,84.09,Comm&Mgmt,False,0,60.81,Mkt&HR,60.67,30000.0,True,NaT,2020-09-05,5.0
198,17471,F,40.89,55.51,Commerce,85.45,Sci&Tech,False,0,71.22,Mkt&HR,64.73,,False,NaT,2020-08-19,


In [18]:
from sdv.evaluation import evaluate
# Score the sampled rows against the table the model was fitted on.
# Argument order here is (synthetic, real).
evaluate(new_data, data)

0.5511460186324222

In [19]:
from sdv.metrics.tabular import CSTest, KSTest

# Compute the CSTest and KSTest metrics and show them as a tuple.
# Argument order here is (real, synthetic) — the reverse of evaluate().
CSTest.compute(data, new_data), KSTest.compute(data, new_data)

(0.8931348345301781, 0.8199870801033592)

In [20]:
# Load the Amazon keyword CSV; the path is relative, so it resolves against
# the notebook's current working directory.
# NOTE: this rebinds `data` again, replacing the student placements table.
data = pd.read_csv('src/amazon_data_without_nan_keywords.csv')
data

Unnamed: 0,Targeting,Match Type,Bid Value,Suggested lower bid range,Suggested upper bid range,Impressions,Spend,Click-Thru Rate (CTR),Total Return on Advertising Spend (RoAS),Clicks,14 Day Total Sales (₹),14 Day Conversion Rate,14 Day Total Units (#),14 Day Total Orders (#),Category
0,knee pain oil,EXACT,31.35,14.788293,36.2406,14.0,31.35,0.071429,0.00,1.0,0.00,0.000000,0.0,0.0,['Косметика']
1,knee pain oil,EXACT,31.35,14.790000,36.2406,21.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
2,knee pain oil,EXACT,31.35,14.790000,36.2406,42.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
3,knee pain oil,EXACT,31.35,14.790000,36.2406,41.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
4,knee pain oil,EXACT,31.35,14.790000,36.2406,11.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,vitamin e oil,EXACT,26.48,2.810000,44.8400,284.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
3021,vitamin e oil,EXACT,26.48,2.810000,44.8400,407.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
3022,vitamin e oil,EXACT,26.48,2.810000,44.8400,212.0,0.00,0.000000,0.00,0.0,0.00,0.000000,0.0,0.0,['Косметика']
3023,hair oil,EXACT,50.45,47.390000,17.7200,8719.0,2068.44,0.004702,0.15,41.0,311.61,0.024390,1.0,1.0,['Косметика']


In [21]:
# Fit a fresh CTGAN (no constructor arguments) on the table now bound to
# `data` and draw 1000 synthetic rows.
# NOTE: this rebinds both `model` and `samples` from earlier cells.
model = CTGAN()
model.fit(data)
samples = model.sample(1000)

In [23]:
# Display the 1000 synthetic rows sampled from the most recently fitted model.
samples

Unnamed: 0,Targeting,Match Type,Bid Value,Suggested lower bid range,Suggested upper bid range,Impressions,Spend,Click-Thru Rate (CTR),Total Return on Advertising Spend (RoAS),Clicks,14 Day Total Sales (₹),14 Day Conversion Rate,14 Day Total Units (#),14 Day Total Orders (#),Category
0,knee pain oil,PHRASE,29.916197,48.090800,6.659658,39.0,44.97,0.000000,0.000000,0.0,1.87,0.000000,0.0,0.0,['Косметика']
1,bhringraj oil,EXACT,5.014795,2.976179,0.010000,0.0,64.92,0.004313,43.738310,3.0,700.39,0.113231,1.0,0.0,['Косметика']
2,pain relief oil,EXACT,25.604857,2.023053,4.631972,8931.0,78.77,0.000000,0.000000,2.0,2.45,1.074387,0.0,1.0,['Косметика']
3,sunscreen spf 50,EXACT,1.000000,3.551827,1.312007,40.0,15.79,0.001278,0.436988,1.0,982.33,0.083515,1.0,1.0,['Косметика']
4,substitutes,EXACT,3.220611,2.953952,73.877018,8166.0,423.98,0.002764,49.487989,7.0,0.00,0.002342,4.0,1.0,['Косметика']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,knee pain relief,PHRASE,16.485743,55.226268,27.748092,32.0,3.53,0.000000,0.798342,1.0,0.94,0.000000,0.0,0.0,['Косметика']
996,knee oil,PHRASE,26.165966,46.291777,52.396799,26.0,3.09,0.005705,0.187695,0.0,4.00,0.000000,0.0,0.0,['Косметика']
997,knee oil,PHRASE,33.277047,17.118350,34.089867,39.0,0.00,0.004219,0.000000,0.0,0.00,0.001148,0.0,0.0,['Косметика']
998,sunscreen,EXACT,6.774709,3.017399,4.190591,400.0,117.45,0.354122,724.190534,9.0,750.95,0.781834,0.0,1.0,['Косметика']


In [33]:
from sdv.metrics.tabular import BNLikelihood, BNLogLikelihood, GMLogLikelihood
from sdv.metrics.tabular import LogisticDetection, SVCDetection

# CSTest, KSTest and evaluate are imported in earlier cells of this notebook.
# All metric classes below are called as compute(real, synthetic).

# Statistical metrics.
cs_score = CSTest.compute(data, samples)
ks_score = KSTest.compute(data, samples)
print('Statistic metrics', cs_score, ks_score)

# Likelihood metrics.
raw_score_bn = BNLikelihood.compute(data, samples)
raw_score_gm = GMLogLikelihood.compute(data, samples)
print('Likelihood metrics', raw_score_bn, raw_score_gm)

# Detection metrics: 1 - ROCAUC
logistic_score = LogisticDetection.compute(data, samples)
svc_score = SVCDetection.compute(data, samples)
print('Detection metrics', logistic_score, svc_score)

# Overall score; evaluate() is called as (synthetic, real).
average_score = evaluate(samples, data)
print('Average score', average_score)

Statistic metrics 0.9990675647860189 0.7239359504132231
Likelihood metrics 0.01431702479338843 -501.93321566289575
Detection metrics 0.6826974363466047 0.26488584200415255
Average score 0.5369502864435555
