In [1]:
import pandas as pd
import numpy as np
from sklearn import cluster

from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

In [2]:
# Read the 12 preprocessed group files (group_0 .. group_11) into one list of
# dataframes, in group order.
data_subgroups = [
    pd.read_parquet(f"../../data/preprocessed_tabular/group_{group_index}.parquet")
    for group_index in range(12)
]

# Stack all groups into a single frame with a fresh 0..n-1 index, then display it.
data = pd.concat(data_subgroups, ignore_index=True)
data


Unnamed: 0,cmcouplerfriction,cmdoorfriction,cmelectronicage,cmvibration,doorcyclecounter,mean_doorforce,mean_doorspeed,mean_doorposition,median_doorforce,median_doorspeed,median_doorposition,min_doorforce,min_doorspeed,min_doorposition,max_doorforce,max_doorspeed,max_doorposition
0,113.800003,107.099998,1.02,1.2,3082,30.502028,-0.003726,0.202940,53.599998,-0.0017,0.24430,-170.199997,-0.1040,-0.0431,167.399994,0.2190,0.3742
1,127.300003,101.500000,0.84,1.0,3083,26.110151,0.000225,0.199377,42.599998,-0.0017,0.24070,-170.199997,-0.1082,-0.0431,165.100006,0.2483,0.3742
2,126.199997,104.199997,0.86,0.9,3084,25.425653,0.000661,0.195084,31.400000,-0.0017,0.22740,-170.300003,-0.1177,-0.0431,168.399994,0.2574,0.3742
3,127.300003,106.099998,0.89,0.9,3085,24.361139,0.000030,0.192046,27.000000,-0.0017,0.21830,-170.199997,-0.1226,-0.0431,168.000000,0.2638,0.3742
4,126.000000,105.800003,0.88,0.8,3086,25.225254,-0.000226,0.192948,32.049999,-0.0017,0.22315,-170.199997,-0.1064,-0.0431,168.899994,0.2611,0.3742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4875,112.800003,152.399994,0.80,0.9,3287,29.174458,-0.000060,0.191778,44.650002,-0.0017,0.22565,-170.199997,-0.1406,-0.0431,193.199997,0.2664,0.3742
4876,143.500000,146.899994,0.80,1.0,3288,29.236170,0.000293,0.193939,55.150002,-0.0017,0.23120,-170.100006,-0.1359,-0.0431,185.600006,0.2456,0.3742
4877,134.100006,152.699997,0.81,1.1,3289,29.148911,-0.000028,0.192583,43.200001,-0.0017,0.22610,-170.199997,-0.1423,-0.0431,191.199997,0.2711,0.3742
4878,134.500000,146.000000,0.80,1.0,3290,30.970966,-0.000782,0.196144,59.500000,-0.0017,0.23930,-170.199997,-0.1358,-0.0431,186.399994,0.2446,0.3742


In [3]:
# Data processing and analysis
# `num_cols` keeps the columns in the order they appear in `data`.
num_cols = data.columns.tolist()
print(f'Dataset columns: {num_cols}')

# `sorted_cols` groups the per-signal statistics together: the five standalone
# columns first, then mean/median/min/max for force, speed and position.
standalone_cols = [
    'cmcouplerfriction',
    'cmdoorfriction',
    'cmelectronicage',
    'cmvibration',
    'doorcyclecounter',
]
signal_stat_cols = [
    f'{stat}_{signal}'
    for signal in ('doorforce', 'doorspeed', 'doorposition')
    for stat in ('mean', 'median', 'min', 'max')
]
sorted_cols = standalone_cols + signal_stat_cols

# Independent copies so later cells never modify `data` itself.
processed_data = data[sorted_cols].copy()
train_data = processed_data.copy()


Dataset columns: ['cmcouplerfriction', 'cmdoorfriction', 'cmelectronicage', 'cmvibration', 'doorcyclecounter', 'mean_doorforce', 'mean_doorspeed', 'mean_doorposition', 'median_doorforce', 'median_doorspeed', 'median_doorposition', 'min_doorforce', 'min_doorspeed', 'min_doorposition', 'max_doorforce', 'max_doorspeed', 'max_doorposition']


In [4]:
# Create a class label per record using KMeans - this will mainly be useful if
# we want to leverage a conditional GAN, which needs a discrete condition column.
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
algorithm = cluster.KMeans
# `n_init` is set explicitly: relying on the implicit default triggers a
# scikit-learn warning because that default changes between releases.
# 10 matches the default the recorded run used, so the clustering is unchanged.
args, kwds = (), {'n_clusters': 12, 'random_state': 0, 'n_init': 10}

# `labels` holds one cluster id (0..11) per row of `train_data`.
labels = algorithm(*args, **kwds).fit_predict(train_data[num_cols])


Dataset info: Number of records - 4880 Number of variables - 17


  super()._check_params_vs_input(X, default_n_init=10)


GAN Training

In [5]:
# Hyperparameters for the Conditional GAN and its training run.

# Model settings (later passed to ModelParameters as noise_dim, layers_dim,
# batch_size, lr and betas).
noise_dim, dim, batch_size = 32, 128, 128
beta_1, beta_2 = 0.5, 0.9
learning_rate = 5e-4

# Training settings (later passed to TrainParameters as sample_interval and epochs).
log_step = 100
epochs = 3

# Directory name kept alongside the other settings.
models_dir = '../cache'


In [6]:
# Bundle the hyperparameters defined above into the two ydata-synthetic
# parameter containers: one for the model, one for the training run.
model_kwargs = dict(
    batch_size=batch_size,
    lr=learning_rate,
    betas=(beta_1, beta_2),
    noise_dim=noise_dim,
    layers_dim=dim,
)
gan_args = ModelParameters(**model_kwargs)

train_kwargs = dict(
    epochs=epochs,
    cache_prefix='',
    sample_interval=log_step,
)
train_args = TrainParameters(**train_kwargs)

In [7]:
# Conditional GAN (CGAN): the model is conditioned on a label column, so the
# training frame has to contain that column and `fit` has to be told its name.
# The KMeans cluster ids computed earlier (`labels`) are used as the condition.
label_col = 'Class'
cgan_train_data = train_data.assign(**{label_col: labels})

# Init the Conditional GAN.
# NOTE(review): the recorded run ended with "TypeError: object.__init__() takes
# exactly one argument". If that still occurs at this line after the fixes below,
# it presumably comes from the installed ydata-synthetic / dependency versions
# rather than from this notebook - confirm against the pinned environment.
synth = RegularSynthesizer(modelname='cgan', model_parameters=gan_args)

# Train the Conditional GAN. The label column goes in `label_cols` and is not
# listed in `num_cols`; there are no other categorical features, hence the
# empty `cat_cols`.
synth.fit(data=cgan_train_data,
          label_cols=[label_col],
          train_arguments=train_args,
          num_cols=num_cols,
          cat_cols=[])

# Generate synthetic data. A conditional model is sampled from a frame of
# condition values (one row per synthetic record) instead of a bare count, so
# draw 1000 cluster ids following the empirical label distribution.
n_samples = 1000
condition_rng = np.random.default_rng(0)  # fixed seed keeps the conditions reproducible
condition = pd.DataFrame({label_col: condition_rng.choice(labels, size=n_samples)})
synthetic_data = synth.sample(condition)

# Keep only the original feature columns (drops the helper label column).
synthetic_df = pd.DataFrame(synthetic_data, columns=num_cols)

# Save synthetic data to a Parquet file
synthetic_df.to_parquet('synthetic_data.parquet', index=False)


TypeError: object.__init__() takes exactly one argument (the instance to initialize)