# Load Datasets

In [6]:
import pandas as pd
# Path of the dataset folder
# label: nonpotential_label; cillabel: nonpotential_cillabel
path  = 'Data/nonpotential_label'

# Path of the folder where you save Dataset_Utility, CTAB-GAN and Tabula codes
path_pro = 'Generative_Models'

df_train = pd.read_csv(f"{path}/df_train.csv")
print("data is loaded...")

data is loaded...


In [7]:
from Dataset_Utility import utility_functions as uf
uf.calculate_label_rate2(df_train,'label')

Total Sample size is 19999, Positive Sample size is 1398, Negative Sample size is 18601, label rate is 0.0752


In [3]:
print(df_train.columns)
discrete_columns = ['u_phonePrice', 'u_browserLifeCycle', 'u_browserMode',
       'u_feedLifeCycle', 'u_refreshTimes', 'i_regionEntity', 'i_cat',
       'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch', 'e_m', 'e_po', 'e_rn',
       'e_section', 'label', 'cillabel', 'pro']

Index(['u_phonePrice', 'u_browserLifeCycle', 'u_browserMode',
       'u_feedLifeCycle', 'u_refreshTimes', 'i_regionEntity', 'i_cat',
       'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch', 'e_m', 'e_po', 'e_rn',
       'e_section', 'label', 'cillabel', 'pro'],
      dtype='object')


# 1 Synthetic Data Generation

## 1.1 Generate Data Using CTGAN

https://github.com/sdv-dev/CTGAN

In [None]:
!pip install ctgan



In [None]:
from ctgan import CTGAN
ctgan = CTGAN()
ctgan.fit(df_train, discrete_columns)
ctgan_sample = ctgan.sample(df_train.shape[0])

uf.calculate_label_rate2(ctgan_sample,'label')

ctgan_sample.to_csv(f"{path}/df_syn_ctgan.csv", index = False)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Total Sample size is 19999, Positive Sample size is 3065, Negative Sample size is 16934, label rate is 0.1810


## 1.2 Generate Data Using CTAB-GAN

https://github.com/Team-TUD/CTAB-GAN

In [8]:
import sys
sys.path.append(f"{path_pro}/CTAB-GAN")
from model.ctabgan import CTABGAN

real_path = f"{path}/df_train.csv" # Path of the training data
ctabgan =  CTABGAN(raw_csv_path = real_path,
                 test_ratio = 0.20,
                 categorical_columns = ['label'],
                 log_columns = [],
                 integer_columns = ['u_phonePrice', 'u_browserLifeCycle', 'u_browserMode',
       'u_feedLifeCycle', 'u_refreshTimes', 'i_regionEntity', 'i_cat',
       'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch', 'e_m', 'e_po', 'e_rn',
       'e_section', 'cillabel', 'pro'],
                 problem_type= {"Classification": 'label'},
                 epochs = 50)
ctabgan.fit()
ctabgan_sample = ctabgan.generate_samples()

# Label in the generated dataset is of "object" type. Convert it to "int64"
ctabgan_sample['label'] = ctabgan_sample['label'].astype("int64")

uf.calculate_label_rate2(ctabgan_sample, 'label')

ctabgan_sample.to_csv(f"{path}/df_syn_ctabgan.csv", index = False)

## 1.3 Generate Data Using TVAE

https://sdv.dev/SDV/user_guides/single_table/tvae.html

In [None]:
from ctgan import TVAE

tvae = TVAE()
discrete_columns = df_train.columns
tvae.fit(df_train, discrete_columns)
tvae_sample = tvae.sample(df_train.shape[0])

uf.calculate_label_rate2(tvae_sample, 'label')

tvae_sample.to_csv(f"{path}/df_syn_tvae.csv", index = False)

Total Sample size is 19999, Positive Sample size is 1179, Negative Sample size is 18820, label rate is 0.0626


## 1.4 Generate Data Using DataSynthesizer

https://github.com/DataResponsibly/DataSynthesizer

In [None]:
!pip install DataSynthesizer

Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl.metadata (4.7 kB)
Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [None]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

epsilon = 0

input_data = f"{path}/df_train.csv"
mode = 'correlated_attribute_mode'
description_file = f'{path}/description.json'
synthetic_data = f'{path}/sythetic_data.csv'

degree_of_bayesian_network = 2
num_tuples_to_generate = df_train.shape[0]

describer = DataDescriber()
describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                        epsilon=epsilon,
                                                        k=degree_of_bayesian_network)
describer.save_dataset_description_to_file(description_file)
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
generator.save_synthetic_data(synthetic_data)

ds_sample = pd.read_csv(synthetic_data)

uf.calculate_label_rate2(ds_sample, 'label')

ds_sample.to_csv(f"{path}/df_syn_ds.csv", index = False)

Adding ROOT e_po


  self.pid = os.fork()


Adding attribute e_section
Adding attribute i_dtype
Adding attribute u_browserMode
Adding attribute u_phonePrice
Adding attribute e_m
Adding attribute u_browserLifeCycle
Adding attribute u_feedLifeCycle
Adding attribute u_refreshTimes
Adding attribute i_cat
Adding attribute i_regionEntity
Adding attribute e_ch
Adding attribute i_dislikeTimes
Adding attribute i_upTimes
Adding attribute e_rn
Adding attribute pro
Adding attribute label
Adding attribute cillabel
Total Sample size is 19999, Positive Sample size is 1359, Negative Sample size is 18640, label rate is 0.0729


## 1.5 Generate Data Using GReaT

https://github.com/kathrinse/be_great/tree/main

In [2]:
!pip install be-great
!pip install transformers[torch]
!pip install accelerate -U

Collecting be-great
  Downloading be_great-0.0.7-py3-none-any.whl.metadata (5.1 kB)
Collecting datasets>=2.5.2 (from be-great)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.5.2->be-great)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.5.2->be-great)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets>=2.5.2->be-great)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets>=2.5.2->be-great)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets>=2.5.2->be-great)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets>=2.5.2->be-great)
  Downloading fsspec-2024.

In [None]:
from be_great import GReaT
import numpy as np
great = GReaT(llm='distilgpt2', batch_size = 100, epochs = 30)
great.fit(df_train)
great_sample = great.sample(n_samples=df_train.shape[0])

uf.calculate_label_rate2(great_sample, 'label')

great_sample.to_csv(f"{path}/df_syn_great.csv", index = False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.23 GiB. GPU 

## 1.6 Generate Data Using Tabula

https://github.com/zhao-zilong/Tabula

In [9]:
import sys
sys.path.append(f"{path_pro}/Tabula")
from tabula_middle_padding import Tabula
import torch

tabula = Tabula(llm='distilgpt2', experiment_dir = "df_train", batch_size=100, epochs=30)
tabula.fit(df_train, conditional_col = df_train.columns[0])
torch.save(tabula.model.state_dict(), "df_train.pt")
tabula_sample = tabula.sample(n_samples=df_train.shape[0])

uf.calculate_label_rate2(tabula_sample, 'label')

tabula_sample.to_csv(f"{path}/df_syn_tabula.csv", index = False)