In [1]:
!pip install numpy==1.21.0 torch==1.9.1 pandas==1.2.4 sklearn==0.24.1 dython==0.6.4.post1 scipy==1.4.1

Collecting numpy==1.21.0
  Downloading numpy-1.21.0.zip (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[31mERROR: Could not find a version that satisfies the requirement torch==1.9.1 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.9.1[0m[31m
[0m

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.mixture import BayesianGaussianMixture


class DataTransformer():

    """
    Transformer class responsible for processing data to train the CTABGANSynthesizer model

    Variables:
    1) train_data -> input dataframe
    2) categorical_columns -> list of positional indices of categorical columns
    3) mixed_columns -> dictionary mapping positional indices of mixed columns to their list of special (categorical) values
    4) n_clusters -> number of modes to fit bayesian gaussian mixture (bgm) model
    5) eps -> threshold for ignoring less prominent modes in the mixture model
    6) ordering -> stores original ordering for modes of numeric columns (rebuilt on every transform() call)
    7) output_info -> stores dimension and output activations of columns (i.e., tanh for numeric, softmax for categorical)
    8) output_dim -> stores the final column width of the transformed data
    9) components -> stores the valid modes used by numeric columns
    10) filter_arr -> stores, per mixed column, the boolean row filter of the continuous component seen during fit()
    11) meta -> stores column information corresponding to different data types i.e., categorical/mixed/numerical
    12) model -> per column: fitted bgm (continuous), tuple of two bgms (mixed) or None (categorical)

    Methods:
    1) __init__() -> initializes transformer object and computes meta information of columns
    2) get_metadata() -> builds an inventory of individual columns and stores their relevant properties
    3) fit() -> fits the required bgm models to process the input data
    4) transform() -> executes the transformation required to train the model
    5) inverse_transform() -> executes the reverse transformation on data generated from the model

    """

    # value used inside a mixed column's modal list to mark missing values;
    # it is always encoded with a normalized value of 0
    _MISSING_VALUE = -9999999

    def __init__(self, train_data=pd.DataFrame, categorical_list=None, mixed_dict=None, n_clusters=5, eps=0.005):

        """
        Stores the configuration and computes the per-column metadata.

        Inputs:
        1) train_data -> dataframe to be transformed; None or the bare pd.DataFrame class is treated as an empty dataframe
        2) categorical_list -> positional indices of categorical columns (None means no categorical columns)
        3) mixed_dict -> positional index -> list of special values for mixed columns (None means no mixed columns)
        4) n_clusters -> number of mixture components fitted per numeric column
        5) eps -> minimum mixture weight for a component to be kept
        """

        # the historical default was the DataFrame class itself, which cannot be inspected by get_metadata()
        if train_data is None or train_data is pd.DataFrame:
            train_data = pd.DataFrame()

        self.meta = None
        self.train_data = train_data
        # fresh containers per instance: mutable default arguments would be shared between instances
        self.categorical_columns = [] if categorical_list is None else categorical_list
        self.mixed_columns = {} if mixed_dict is None else mixed_dict
        self.n_clusters = n_clusters
        self.eps = eps
        self.ordering = []
        self.output_info = []
        self.output_dim = 0
        self.components = []
        self.filter_arr = []
        self.model = []
        self.meta = self.get_metadata()

    def get_metadata(self):

        """
        Builds one metadata dictionary per column of train_data.

        Outputs:
        1) meta -> list of dictionaries with keys "name" and "type", plus "size"/"i2s" for categorical columns,
                   "min"/"max"/"modal" for mixed columns and "min"/"max" for continuous columns
        """

        meta = []

        for index in range(self.train_data.shape[1]):
            column = self.train_data.iloc[:, index]
            if index in self.categorical_columns:
                # categories are listed in the order returned by value_counts()
                mapper = column.value_counts().index.tolist()
                meta.append({
                    "name": index,
                    "type": "categorical",
                    "size": len(mapper),
                    "i2s": mapper
                })
            elif index in self.mixed_columns:
                meta.append({
                    "name": index,
                    "type": "mixed",
                    "min": column.min(),
                    "max": column.max(),
                    "modal": self.mixed_columns[index]
                })
            else:
                meta.append({
                    "name": index,
                    "type": "continuous",
                    "min": column.min(),
                    "max": column.max(),
                })

        return meta

    def _new_mixture(self):

        """Creates an unfitted bgm model with the settings shared by all numeric columns."""

        return BayesianGaussianMixture(
            n_components=self.n_clusters,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,  # lower values result in lesser modes being active
            max_iter=500, n_init=5, random_state=42, tol=1e-3)

    def _active_components(self, gm, values):

        """Returns a list of booleans flagging modes whose weight exceeds eps and that are predicted for at least one value."""

        heavy_enough = gm.weights_ > self.eps
        predicted = set(np.unique(gm.predict(values)).tolist())
        return [bool(i in predicted and heavy_enough[i]) for i in range(self.n_clusters)]

    def fit(self):

        """
        Fits the bgm models on train_data and fills model, components, filter_arr, output_info and output_dim.
        The derived state is reset first so that calling fit() again does not accumulate entries.
        """

        data = self.train_data.values

        self.components = []
        self.filter_arr = []
        self.output_info = []
        self.output_dim = 0

        # stores the corresponding bgm models for processing numeric data
        model = []

        # iterating through column information
        for id_, info in enumerate(self.meta):
            if info['type'] == "continuous":
                column = data[:, id_].reshape([-1, 1])
                gm = self._new_mixture()
                gm.fit(column)
                model.append(gm)
                # keeping only relevant modes that have higher weight than eps and are used to fit the data
                comp = self._active_components(gm, column)
                n_active = int(np.sum(comp))
                self.components.append(comp)
                self.output_info += [(1, 'tanh'), (n_active, 'softmax')]
                self.output_dim += 1 + n_active

            elif info['type'] == "mixed":
                # in case of mixed columns, two bgm models are used
                gm1 = self._new_mixture()
                gm2 = self._new_mixture()

                # first bgm model is fit to the entire data only for the purposes of obtaining a normalized value of any particular categorical mode
                gm1.fit(data[:, id_].reshape([-1, 1]))

                # main bgm model used to fit the continuous component and serves the same purpose as with purely numeric columns
                filter_arr = [element not in info['modal'] for element in data[:, id_]]
                self.filter_arr.append(filter_arr)

                continuous_part = data[:, id_][filter_arr].reshape([-1, 1])
                gm2.fit(continuous_part)

                model.append((gm1, gm2))

                # similarly keeping only relevant modes with higher weight than eps and are used to fit strictly continuous data
                comp = self._active_components(gm2, continuous_part)
                n_active = int(np.sum(comp))
                self.components.append(comp)

                # modes of the categorical component are appended to modes produced by the main bgm model
                self.output_info += [(1, 'tanh'), (n_active + len(info['modal']), 'softmax')]
                self.output_dim += 1 + n_active + len(info['modal'])

            else:
                # in case of categorical columns, bgm model is ignored
                model.append(None)
                self.components.append(None)
                self.output_info += [(info['size'], 'softmax')]
                self.output_dim += info['size']

        self.model = model

    @staticmethod
    def _sample_modes(probs):

        """Samples one mode index per row from the (smoothed and re-normalized) rows of probs."""

        n_opts = probs.shape[1]
        chosen = np.zeros(len(probs), dtype='int')
        for i in range(len(probs)):
            pp = probs[i] + 1e-6
            pp = pp / sum(pp)
            chosen[i] = np.random.choice(np.arange(n_opts), p=pp)
        return chosen

    def _encode_with_mixture(self, gm, active, values):

        """
        Applies mode-specific normalization to a column of shape (rows, 1).

        Outputs:
        1) normalized value under the sampled mode, clipped to [-.99, .99], shape (rows, 1)
        2) one-hot-encoding of the sampled mode over the active modes, shape (rows, number of active modes)
        """

        # means and stds of the modes are obtained from the corresponding fitted bgm model
        means = gm.means_.reshape((1, self.n_clusters))
        stds = np.sqrt(gm.covariances_).reshape((1, self.n_clusters))
        # note 4 is a multiplier to ensure values lie between -1 to 1 but this is not always guaranteed
        normalized = (values - means) / (4 * stds)

        # the mode of each data point is sampled from the probability mass distribution across the active modes
        probs = gm.predict_proba(values.reshape([-1, 1]))
        probs = probs[:, active]
        chosen = self._sample_modes(probs)

        onehot = np.zeros_like(probs)
        onehot[np.arange(len(probs)), chosen] = 1

        normalized = normalized[:, active]
        normalized = normalized[np.arange(len(normalized)), chosen].reshape([-1, 1])
        return np.clip(normalized, -.99, .99), onehot

    @staticmethod
    def _sort_by_frequency(onehot):

        """Re-orders one-hot columns in descending order of how often they are selected; returns (re-ordered copy, order)."""

        order = np.argsort(-1 * onehot.sum(axis=0))
        return onehot[:, order], order

    @staticmethod
    def _restore_order(encoded, order):

        """Undoes _sort_by_frequency(): column k of encoded is written back to position order[k]."""

        restored = np.zeros_like(encoded)
        restored[:, order] = encoded
        return restored

    def transform(self, data):

        """
        Transforms a 2-d array whose columns follow self.meta into the representation used to train the model.

        Inputs:
        1) data -> 2-d numpy array of raw values

        Outputs:
        1) 2-d numpy array: per numeric column a normalized value followed by a one-hot mode encoding,
           per categorical column a one-hot-encoding

        Side effects: self.ordering is rebuilt, so inverse_transform() always matches the latest transform() call.
        """

        # stores the transformed values
        values = []

        # orderings from an earlier call must not shift the positions read by inverse_transform()
        self.ordering = []

        # iterating through column information
        for id_, info in enumerate(self.meta):
            current = data[:, id_]
            if info['type'] == "continuous":
                features, probs_onehot = self._encode_with_mixture(
                    self.model[id_], self.components[id_], current.reshape([-1, 1]))

                # re-ordering the one-hot-encoding of modes in descending order as per their frequency of being selected
                re_ordered_phot, largest_indices = self._sort_by_frequency(probs_onehot)

                # storing the original ordering for invoking inverse transform
                self.ordering.append(largest_indices)

                # storing transformed numeric column represented as normalized values and corresponding modes
                values += [features, re_ordered_phot]

            elif info['type'] == "mixed":
                modal = info['modal']
                n_modal = len(modal)

                # means and standard deviation of modes obtained from the first fitted bgm model
                means_0 = self.model[id_][0].means_.reshape([-1])
                stds_0 = np.sqrt(self.model[id_][0].covariances_).reshape([-1])

                # normalized value of every categorical mode, kept aligned with the order of info['modal']
                mode_vals = []
                for mode in modal:
                    if mode == self._MISSING_VALUE:
                        # for the mode representing missing values, the normalized value is simply 0
                        mode_vals.append(0)
                    else:
                        # the closest bgm mode to the categorical value provides its mean and std
                        nearest = np.argmin(np.abs(mode - means_0))
                        mode_vals.append(np.clip((mode - means_0[nearest]) / (4 * stds_0[nearest]), -.99, .99))

                # rows holding one of the categorical values; derived from the data actually being transformed
                is_modal = np.array([val in modal for val in current], dtype=bool)

                # transforming continuous component of mixed columns similar to purely numeric columns using second fitted bgm model
                continuous_part = current.reshape([-1, 1])[~is_modal]
                features, probs_onehot = self._encode_with_mixture(
                    self.model[id_][1], self.components[id_], continuous_part)

                # layout: [normalized value | one slot per categorical mode | one slot per active continuous mode]
                final = np.zeros([len(data), 1 + probs_onehot.shape[1] + n_modal])
                final[~is_modal, 0] = features[:, 0]
                final[~is_modal, (1 + n_modal):] = probs_onehot

                for row in np.flatnonzero(is_modal):
                    category_ = modal.index(current[row])
                    final[row, 0] = mode_vals[category_]
                    final[row, (category_ + 1)] = 1

                # re-ordering the one-hot-encoding of modes in descending order as per their frequency of being selected
                re_ordered_jhot, largest_indices = self._sort_by_frequency(final[:, 1:])

                final_features = final[:, 0].reshape([-1, 1])

                # storing the original ordering for invoking inverse transform
                self.ordering.append(largest_indices)

                values += [final_features, re_ordered_jhot]

            else:
                # for categorical columns, standard one-hot-encoding is applied in the category order stored in info['i2s']
                self.ordering.append(None)
                col_t = np.zeros([len(data), info['size']])
                idx = list(map(info['i2s'].index, current))
                col_t[np.arange(len(data)), idx] = 1
                values.append(col_t)

        return np.concatenate(values, axis=1)

    def inverse_transform(self, data):

        """
        Converts data in the transformed representation back to one value per original column.

        Inputs:
        1) data -> 2-d numpy array laid out as produced by transform()

        Outputs:
        1) data_t -> float array of shape (rows, number of original columns)
        """

        # stores the final inverse transformed generated data
        data_t = np.zeros([len(data), len(self.meta)])

        # used to iterate through the columns of the raw generated data
        st = 0

        # iterating through original column information
        for id_, info in enumerate(self.meta):
            if info['type'] == "continuous":
                active = self.components[id_]
                n_active = int(np.sum(active))

                # obtaining the generated normalized values and clipping for stability
                u = np.clip(data[:, st], -1, 1)

                # one-hot-encoding of the modes, put back into their original ordering
                v = self._restore_order(data[:, st + 1:st + 1 + n_active], self.ordering[id_])

                # ensuring un-used modes are represented with -100 such that they can be ignored when computing argmax
                v_t = np.ones((data.shape[0], self.n_clusters)) * -100
                v_t[:, active] = v

                # obtaining means and stds of the selected mode for each data point based on fitted bgm model
                means = self.model[id_].means_.reshape([-1])
                stds = np.sqrt(self.model[id_].covariances_).reshape([-1])
                p_argmax = np.argmax(v_t, axis=1)

                # executing the inverse transformation
                data_t[:, id_] = u * 4 * stds[p_argmax] + means[p_argmax]

                # moving to the next set of columns in the raw generated data in correspondance to original column information
                st += 1 + n_active

            elif info['type'] == "mixed":
                modal = info['modal']
                n_modal = len(modal)
                active = self.components[id_]
                n_active = int(np.sum(active))

                # obtaining the generated normalized values and corresponding modes
                u = np.clip(data[:, st], -1, 1)
                full_v = self._restore_order(
                    data[:, (st + 1):(st + 1) + n_modal + n_active], self.ordering[id_])

                # modes of categorical component
                mixed_v = full_v[:, :n_modal]

                # modes of continuous component; slicing from n_modal stays correct when no mode is active
                v = full_v[:, n_modal:]

                # similarly ensuring un-used modes are represented with -100 to be ignored while computing argmax
                v_t = np.ones((data.shape[0], self.n_clusters)) * -100
                v_t[:, active] = v
                p_argmax = np.argmax(np.concatenate([mixed_v, v_t], axis=1), axis=1)

                # obtaining the means and stds of the continuous component using second fitted bgm model
                means = self.model[id_][1].means_.reshape([-1])
                stds = np.sqrt(self.model[id_][1].covariances_).reshape([-1])

                # used to store the inverse-transformed data points
                result = np.zeros_like(u)

                # in case of categorical mode being selected, the mode value itself is simply assigned
                picks_modal = p_argmax < n_modal
                result[picks_modal] = np.asarray(modal, dtype=float)[p_argmax[picks_modal]]

                # in case of continuous mode being selected, similar inverse-transform for purely numeric values is applied
                mode_idx = p_argmax[~picks_modal] - n_modal
                result[~picks_modal] = u[~picks_modal] * 4 * stds[mode_idx] + means[mode_idx]

                data_t[:, id_] = result

                st += 1 + n_active + n_modal

            else:
                # reversing one hot encoding back to label encoding for categorical columns
                current = data[:, st:st + info['size']]
                idx = np.argmax(current, axis=1)
                data_t[:, id_] = list(map(info['i2s'].__getitem__, idx))
                st += info['size']
        return data_t

class ImageTransformer():

    """
    Transformer responsible for translating data rows to images and vice versa

    Variables:
    1) height -> height/width of the (square) image

    Methods:
    1) __init__() -> initializes image transformer object with given input
    2) transform() -> converts tabular data records into square image format
    3) inverse_transform() -> converts square images into tabular format

    """

    def __init__(self, side):

        self.height = side

    def transform(self, data):

        """
        Reshapes a 2-d tensor of records into images of shape (rows, 1, height, height).

        Records narrower than height * height are right-padded with zeros. The width is read from
        data.shape so that an empty batch (0 rows) is handled as well.
        """

        n_pixels = self.height * self.height
        n_features = data.shape[1]

        if n_pixels > n_features:
            # tabular data records are padded with 0 to conform to square shaped images;
            # the padding is allocated directly on the device of the input
            padding = torch.zeros((data.shape[0], n_pixels - n_features), device=data.device)
            data = torch.cat([data, padding], dim=1)

        return data.view(-1, 1, self.height, self.height)

    def inverse_transform(self, data):

        """
        Flattens images back to rows of width height * height (any zero padding is kept).
        """

        return data.view(-1, self.height * self.height)



In [3]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data
import torch.optim as optim
from torch.optim import Adam
from torch.nn import functional as F
from torch.nn import (Dropout, LeakyReLU, Linear, Module, ReLU, Sequential,
Conv2d, ConvTranspose2d, BatchNorm2d, Sigmoid, init, BCELoss, CrossEntropyLoss,SmoothL1Loss)
from tqdm import tqdm


def random_choice_prob_index_sampling(probs,col_idx):

    """
    Draws one category per entry of col_idx from the distribution stored for that entry in probs

    Inputs:
    1) probs -> sequence of probability mass distributions, one per one-hot-encoding
    2) col_idx -> array of indices into probs, one per sample to draw

    Outputs:
    1) array of drawn category indices with the same shape as col_idx

    """

    def _draw(pmf):
        # a small constant is added before re-normalizing, for improved stability
        smoothed = pmf + 1e-6
        smoothed = smoothed / sum(smoothed)
        return np.random.choice(np.arange(len(pmf)), p=smoothed)

    drawn = [_draw(probs[which]) for which in col_idx]

    return np.array(drawn).reshape(col_idx.shape)

class Condvec(object):

    """
    This class is responsible for sampling conditional vectors to be supplied to the generator

    Variables:
    1) model -> list containing an index of highlighted categories in their corresponding one-hot-encoded represenations
    2) interval -> an array holding the respective one-hot-encoding starting positions and sizes
    3) n_col -> total no. of one-hot-encoding representations
    4) n_opt -> total no. of distinct categories across all one-hot-encoding representations
    5) p_log_sampling -> list containing log of probability mass distribution of categories within their respective one-hot-encoding representations
    6) p_sampling -> list containing probability mass distribution of categories within their respective one-hot-encoding representations

    Methods:
    1) __init__() -> takes transformed input data with respective column information to compute class variables
    2) sample_train() -> used to sample the conditional vector during training of the model
    3) sample() -> used to sample the conditional vector for generating data after training is finished

    """


    def __init__(self, data, output_info):

        self.model = []
        self.interval = []
        self.n_col = 0
        self.n_opt = 0
        self.p_log_sampling = []
        self.p_sampling = []

        # iterating through the transformed input data columns
        st = 0
        for item in output_info:
            # ignoring columns that do not represent one-hot-encodings
            if item[1] == 'tanh':
                st += item[0]
                continue
            elif item[1] == 'softmax':
                # using starting (st) and ending (ed) position of any given one-hot-encoded representation to obtain relevant information
                ed = st + item[0]
                block = data[:, st:ed]
                self.model.append(np.argmax(block, axis=-1))
                # (offset of this encoding inside the conditional vector, number of categories)
                self.interval.append((self.n_opt, item[0]))
                self.n_col += 1
                self.n_opt += item[0]
                freq = np.sum(block, axis=0)
                # log(freq + 1), normalized to sum to 1, is used while training
                log_freq = np.log(freq + 1)
                self.p_log_sampling.append(log_freq / np.sum(log_freq))
                # the plain relative frequency is used when generating
                self.p_sampling.append(freq / np.sum(freq))
                st = ed

        self.interval = np.asarray(self.interval)

    def _draw_conditions(self, batch, pmfs):

        """
        Shared sampling routine of sample_train() and sample()

        Inputs:
        1) batch -> no. of conditional vectors to create
        2) pmfs -> list of distributions (one per one-hot-encoding) used to pick the category

        Outputs:
        1) vec -> (batch x n_opt) float32 matrix with exactly one entry set to 1 per row
        2) idx -> chosen one-hot-encoding per row
        3) opt1prime -> chosen category within the chosen one-hot-encoding per row
        """

        # choosing one specific one-hot-encoding from all possible one-hot-encoded representations
        idx = np.random.choice(np.arange(self.n_col), batch)

        # producing a list of selected categories within each of selected one-hot-encoding representation
        opt1prime = random_choice_prob_index_sampling(pmfs, idx)

        # each conditional vector in vec is a one-hot vector used to highlight a specific category across all possible
        # one-hot-encoded representations (i.e., including modes of continuous and mixed columns)
        vec = np.zeros((batch, self.n_opt), dtype='float32')
        vec[np.arange(batch), self.interval[idx, 0] + opt1prime] = 1

        return vec, idx, opt1prime

    def sample_train(self, batch):

        """
        Used to create the conditional vectors for feeding it to the generator during training

        Inputs:
        1) batch -> no. of data records to be generated in a batch

        Outputs (None is returned instead when there are no one-hot-encodings):
        1) vec -> a matrix containing a conditional vector for each data point to be generated
        2) mask -> a matrix to identify chosen one-hot-encodings across the batch
        3) idx -> list of chosen one-hot encoding across the batch
        4) opt1prime -> selected categories within chosen one-hot-encodings

        """

        if self.n_col == 0:
            return None

        vec, idx, opt1prime = self._draw_conditions(batch, self.p_log_sampling)

        # matrix of shape (batch x total no. of one-hot-encoded representations) with 1 in indexes of chosen representations and 0 elsewhere
        mask = np.zeros((batch, self.n_col), dtype='float32')
        mask[np.arange(batch), idx] = 1

        return vec, mask, idx, opt1prime

    def sample(self, batch):

        """
        Used to create the conditional vectors for feeding it to the generator after training is finished

        Inputs:
        1) batch -> no. of data records to be generated in a batch

        Outputs:
        1) vec -> an array containing a conditional vector for each data point to be generated
                  (None when there are no one-hot-encodings)
        """

        if self.n_col == 0:
            return None

        vec, _, _ = self._draw_conditions(batch, self.p_sampling)

        return vec

def cond_loss(data, output_info, c, m):

    """
    Computes the conditional loss: the cross entropy between every generated one-hot-encoding and the matching
    slice of the conditional vector, masked so that only the chosen encoding of each row contributes

    Inputs:
    1) data -> raw data synthesized by the generator
    2) output_info -> column information corresponding to the data transformer
    3) c -> conditional vectors used to synthesize a batch of data
    4) m -> a matrix to identify chosen one-hot-encodings across the batch

    Outputs:
    1) loss -> masked cross entropy summed over the batch and divided by the number of rows in data

    """

    # one cross entropy vector (one value per row) for each one-hot-encoding
    per_encoding = []
    # read position inside the generated data
    data_pos = 0
    # read position inside the conditional vector, which only holds the one-hot-encodings
    cond_pos = 0

    for spec in output_info:
        width, activation = spec[0], spec[1]
        if activation == 'tanh':
            # numeric columns have no counterpart in the conditional vector
            data_pos += width
        elif activation == 'softmax':
            logits = data[:, data_pos:data_pos + width]
            target = torch.argmax(c[:, cond_pos:cond_pos + width], dim=1)
            per_encoding.append(F.cross_entropy(logits, target, reduction='none'))
            data_pos += width
            cond_pos += width

    # the mask keeps only the encoding that was actually conditioned on in each row
    stacked = torch.stack(per_encoding, dim=1)

    return (stacked * m).sum() / data.size()[0]

class Sampler(object):

    """
    Draws rows of the transformed real data that match a requested category

    Variables:
    1) data -> real transformed input data
    2) model -> for every one-hot-encoding, a list holding (per category) the row indices with a non-zero entry
    3) n -> size of the input data

    Methods:
    1) __init__() -> stores the data and indexes its rows by one-hot-encoding and category
    2) sample() -> takes as input the number of rows to be sampled (n), chosen column (col)
                   and category within the column (opt) to sample real records accordingly
    """

    def __init__(self, data, output_info):

        super().__init__()

        self.data = data
        self.model = []
        self.n = len(data)

        # position of the current column inside the transformed data
        offset = 0
        for spec in output_info:
            width, activation = spec[0], spec[1]
            if activation == 'tanh':
                # numeric columns are skipped
                offset += width
            elif activation == 'softmax':
                # row indices of the records belonging to each category of this one-hot-encoding
                rows_per_category = [np.nonzero(data[:, offset + k])[0] for k in range(width)]
                self.model.append(rows_per_category)
                offset += width

    def sample(self, n, col, opt):

        # without a chosen one-hot-encoding, n rows are drawn uniformly (with replacement)
        if col is None:
            return self.data[np.random.choice(np.arange(self.n), n)]

        # for every (encoding, category) pair one matching row index is drawn at random
        picked = [np.random.choice(self.model[which][category]) for which, category in zip(col, opt)]

        return self.data[picked]

def get_st_ed(target_col_index,output_info):

    """
    Locates the target column inside the transformed data so that the classifier can separate it from the features

    Inputs:
    1) target_col_index -> number of 'softmax' entries of output_info that precede the target column
                           (used by the caller as the index of the target column in the raw data)
    2) output_info -> column information (width, activation) of the transformed data

    Outputs:
    1) tuple (st, ed) with the starting and ending positions of the target column in the transformed data

    """

    offset = 0      # start position accumulated over all entries walked so far
    seen = 0        # number of 'softmax' entries walked so far
    position = 0    # index of the current entry in output_info

    # walking the entries until the requested number of 'softmax' entries has been passed
    while position < len(output_info) and seen != target_col_index:
        width, activation = output_info[position][0], output_info[position][1]
        if activation == 'tanh':
            offset += width
        elif activation == 'softmax':
            offset += width
            seen += 1
        position += 1

    # the width of the entry reached gives the ending position
    # (an IndexError is raised here when the walk ran past the last entry)
    return (offset, offset + output_info[position][0])

class Classifier(Module):

    """
    Auxiliary classifier trained along side the discriminator; it predicts the target column from the remaining columns

    Variables:
    1) dim -> column dimensionality of the transformed input data after removing the target column
    2) str_end -> tuple containing the starting and ending positions of the target column in the transformed input data
    3) seq -> fully connected network producing the prediction

    Methods:
    1) __init__() -> builds the layers of the classifier module
    2) forward() -> splits the input into features and target, returning the predictions
                    together with the true labels read from the target column

    """

    def __init__(self,input_dim, class_dims,st_ed):
        super(Classifier,self).__init__()
        # width of the one-hot-encoded target column
        target_width = st_ed[1] - st_ed[0]
        # the target column is not part of the features fed to the network
        self.dim = input_dim - target_width
        self.str_end = st_ed

        # hidden stack: Linear -> LeakyReLU -> Dropout for every requested hidden size
        blocks = []
        in_features = self.dim
        for out_features in list(class_dims):
            blocks.extend((Linear(in_features, out_features), LeakyReLU(0.2), Dropout(0.5)))
            in_features = out_features

        if target_width == 2:
            # binary target: a single output squashed to a probability
            blocks.extend((Linear(in_features, 1), Sigmoid()))
        else:
            # multi-class target: one raw score per class
            blocks.append(Linear(in_features, target_width))

        self.seq = Sequential(*blocks)

    def forward(self, input):
        lo, hi = self.str_end[0], self.str_end[1]

        # true labels are the position of the largest entry within the target column block
        label = torch.argmax(input[:, lo:hi], axis=-1)

        # features are everything to the left and to the right of the target column block
        features = torch.cat((input[:, :lo], input[:, hi:]), 1)

        scores = self.seq(features)
        if (hi - lo) == 2:
            # binary case: flatten the (batch, 1) probabilities to a vector
            scores = scores.view(-1)
        return scores, label

class Discriminator(Module):

    """
    Discriminator network of the model

    Variables:
    1) seq -> all layers, producing the final prediction of the discriminator
    2) seq_info -> the same layers without the last two, producing the features used for the information loss

    Methods:
    1) __init__() -> wraps the given layers into the two sequential stacks
    2) forward() -> returns the final prediction together with the features of the truncated stack

    """

    def __init__(self, layers):
        super().__init__()
        # the last two layers are left out of the feature extractor
        info_depth = len(layers) - 2
        self.seq = Sequential(*layers)
        self.seq_info = Sequential(*layers[:info_depth])

    def forward(self, input):
        prediction = self.seq(input)
        features = self.seq_info(input)
        return prediction, features

class Generator(Module):

    """
    Generator network of the model

    Variables:
    1) seq -> layers of the network used by the generator

    Methods:
    1) __init__() -> wraps the given layers into a sequential stack
    2) forward() -> runs the input (noise) through the stack to generate data

    """

    def __init__(self, layers):
        super().__init__()
        self.seq = Sequential(*layers)

    def forward(self, input):
        generated = self.seq(input)
        return generated

def determine_layers_disc(side, num_channels):

    """
    Builds the list of layers of the discriminator network, following the DCGAN layout
    (https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html, https://arxiv.org/abs/1511.06434)

    Inputs:
    1) side -> height/width of the input fed to the discriminator
    2) num_channels -> no. of channels of the first hidden layer, doubled with every further hidden layer

    Outputs:
    1) layers_D -> layers of the discriminator network

    """

    # (channels, height/width) of the input followed by every hidden layer;
    # channels double and the side halves until the side is at most 3 or four entries exist
    dims = [(1, side), (num_channels, side // 2)]
    while dims[-1][1] > 3 and len(dims) < 4:
        channels, extent = dims[-1]
        dims.append((channels * 2, extent // 2))

    # one strided convolution + batch norm + leaky relu per transition between consecutive entries
    layers_D = []
    for idx in range(1, len(dims)):
        in_ch, out_ch = dims[idx - 1][0], dims[idx][0]
        layers_D.append(Conv2d(in_ch, out_ch, 4, 2, 1, bias=False))
        layers_D.append(BatchNorm2d(out_ch))
        layers_D.append(LeakyReLU(0.2, inplace=True))

    # final convolution covers the whole remaining side, giving a single value squashed by a sigmoid
    last_channels, last_extent = dims[-1]
    layers_D.append(Conv2d(last_channels, 1, last_extent, 1, 0))
    layers_D.append(Sigmoid())

    return layers_D

def determine_layers_gen(side, random_dim, num_channels):

    """
    Builds the list of layers of the generator network, mirroring the discriminator layout
    (recommendations of https://arxiv.org/abs/1511.06434)

    Inputs:
    1) side -> height/width of the image produced by the generator
    2) random_dim -> no. of channels of the noise input fed to the first layer
    3) num_channels -> no. of channels used to decide the size of respective hidden layers

    Outputs:
    1) layers_G -> layers of the generator network

    """

    # same (channels, height/width) schedule as used for the discriminator
    dims = [(1, side), (num_channels, side // 2)]
    while dims[-1][1] > 3 and len(dims) < 4:
        channels, extent = dims[-1]
        dims.append((channels * 2, extent // 2))

    # first layer maps the noise channels to the widest hidden layer at the smallest side
    top_channels, top_extent = dims[-1]
    layers_G = [
        ConvTranspose2d(random_dim, top_channels, top_extent, 1, 0, output_padding=0, bias=False)
    ]

    # walking the schedule backwards: every step uses the channel count of the previous entry
    # and a stride-2 transposed convolution
    for idx in range(len(dims) - 1, 0, -1):
        in_ch, out_ch = dims[idx][0], dims[idx - 1][0]
        layers_G.append(BatchNorm2d(in_ch))
        layers_G.append(ReLU(True))
        layers_G.append(ConvTranspose2d(in_ch, out_ch, 4, 2, 1, output_padding=0, bias=True))

    return layers_G

def apply_activate(data, output_info):

    """
    Applies the final activation to every block of generated columns as described by the column information

    Inputs:
    1) data -> data generated by the model, laid out like the transformed input data
    2) output_info -> column information (width, activation) associated with the transformed input data

    Outputs:
    1) act_data -> resulting data after applying the respective activations

    """

    pieces = []
    # start position of the current block
    cursor = 0
    for info in output_info:
        kind = info[1]
        # entries with any other activation are skipped without moving the cursor
        if kind not in ('tanh', 'softmax'):
            continue
        stop = cursor + info[0]
        chunk = data[:, cursor:stop]
        if kind == 'tanh':
            # numeric blocks get a tanh activation
            pieces.append(torch.tanh(chunk))
        else:
            # one-hot-encoded blocks get a gumbel softmax (https://arxiv.org/pdf/1611.01144.pdf), which samples
            # near-discrete categories while still allowing back propagation; a small tau pushes the
            # output towards a one-hot vector
            pieces.append(F.gumbel_softmax(chunk, tau=0.2))
        cursor = stop

    act_data = torch.cat(pieces, dim=1)
    return act_data

def weights_init(model):

    """
    Initializes the learnable parameters of convolutional and batch norm layers in place
    (intended to be passed to Module.apply so that it is called on every sub-module)

    Inputs:
    1) model -> layer whose parameters need to be initialized; the layer type is detected from its class name

    Outputs:
    1) nothing is returned; matching layers have their parameters overwritten, other layers are left untouched

    """

    layer_name = model.__class__.__name__

    if 'Conv' in layer_name:
        # convolution weights ~ N(0, 0.02); the bias (if any) is left as is
        init.normal_(model.weight.data, 0.0, 0.02)
        return

    if 'BatchNorm' in layer_name:
        # batch norm scale ~ N(1, 0.02) and shift set to zero
        init.normal_(model.weight.data, 1.0, 0.02)
        init.constant_(model.bias.data, 0)

class CTABGANSynthesizer:

    """
    This class represents the main model used for training the model and generating synthetic data


    Variables:
    1) random_dim -> size of the noise vector fed to the generator
    2) class_dim -> tuple containing dimensionality of hidden layers for the classifier network
    3) num_channels -> no. of channels for deciding respective hidden layers of discriminator and generator networks
    4) dside -> height/width of the input data fed to discriminator network (set by fit())
    5) gside -> height/width of the input data generated by the generator network (set by fit())
    6) l2scale -> parameter to decide strength of regularization of the network based on constraining l2 norm of weights
    7) batch_size -> no. of records to be processed in each mini-batch of training
    8) epochs -> no. of epochs to train the model
    9) device -> type of device to be used for training (i.e., gpu/cpu)
    10) generator -> generator network from which data can be generated after training the model
    11) transformer -> data transformer fitted in fit(), used to encode training data and decode generated data
    12) cond_generator -> conditional vector sampler built in fit()
    13) Gtransformer / Dtransformer -> image transformers for the generator / discriminator sides, built in fit()

    Methods:
    1) __init__() -> initializes the model with user specified parameters
    2) fit() -> takes the pre-processed training data and associated parameters as input to fit the CTABGANSynthesizer model
    3) sample() -> takes as input the no. of data rows to be generated and synthesizes the corresponding no. of data rows

    """

    def __init__(self,
                 class_dim=(256, 256, 256, 256),
                 random_dim=100,
                 num_channels=64,
                 l2scale=1e-5,
                 batch_size=500,
                 epochs=1):

        self.random_dim = random_dim
        self.class_dim = class_dim
        self.num_channels = num_channels
        self.dside = None
        self.gside = None
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.generator = None

    @staticmethod
    def _smallest_side(col_size, sides=(4, 8, 16, 24, 32)):

        """
        Returns the smallest supported height/width whose square image can hold col_size values

        Raises:
        1) ValueError -> when col_size does not fit into the largest supported square image
        """

        for side in sides:
            if side * side >= col_size:
                return side
        raise ValueError(
            "transformed data has %d columns, which exceeds the largest supported image of %dx%d"
            % (col_size, sides[-1], sides[-1])
        )

    def fit(self, train_data=pd.DataFrame, categorical=[], mixed={}, type={}):

        """
        Fits the data transformer and trains the generator (together with the discriminator and, when a target
        column is given, the classifier) on the pre-processed training data

        Inputs:
        1) train_data -> pre-processed training dataframe
        2) categorical -> categorical columns, forwarded to the data transformer as categorical_list
        3) mixed -> mixed columns, forwarded to the data transformer as mixed_dict
        4) type -> dictionary whose first key is the problem type and whose value is the name of the target column;
                   when empty, no classifier is trained

        Note: the default arguments are only read, never modified, by this method

        Raises:
        1) ValueError -> when the transformed data is too wide to be arranged as a 32x32 image
        """

        # obtaining the column index of the target column used for ML tasks
        problem_type = None
        target_index = None

        if type:
            problem_type = list(type.keys())[0]
            if problem_type:
                target_index = train_data.columns.get_loc(type[problem_type])

        # transforming pre-processed training data according to different data types
        # i.e., mode specific normalisation for numeric and mixed columns and one-hot-encoding for categorical columns
        self.transformer = DataTransformer(train_data=train_data, categorical_list=categorical, mixed_dict=mixed)
        self.transformer.fit()
        train_data = self.transformer.transform(train_data.values)
        # storing column size of the transformed training data
        data_dim = self.transformer.output_dim

        # initializing the sampler object to execute training-by-sampling
        data_sampler = Sampler(train_data, self.transformer.output_info)
        # initializing the condvec object to sample conditional vectors during training
        self.cond_generator = Condvec(train_data, self.transformer.output_info)

        # obtaining the desired height/width for converting tabular data records to square images
        # the discriminator takes the transformed training data concatenated by the corresponding conditional vectors as input
        self.dside = self._smallest_side(data_dim + self.cond_generator.n_opt)
        # the generator only has to produce the transformed data itself
        self.gside = self._smallest_side(data_dim)

        # constructing the generator and discriminator networks
        layers_G = determine_layers_gen(self.gside, self.random_dim+self.cond_generator.n_opt, self.num_channels)
        layers_D = determine_layers_disc(self.dside, self.num_channels)
        self.generator = Generator(layers_G).to(self.device)
        discriminator = Discriminator(layers_D).to(self.device)

        # assigning the respective optimizers for the generator and discriminator networks
        optimizer_params = dict(lr=2e-4, betas=(0.5, 0.9), eps=1e-3, weight_decay=self.l2scale)
        optimizerG = Adam(self.generator.parameters(), **optimizer_params)
        optimizerD = Adam(discriminator.parameters(), **optimizer_params)

        st_ed = None
        classifier = None
        optimizerC = None
        c_loss = None
        is_binary = False
        if target_index is not None:
            # obtaining the one-hot-encoding starting and ending positions of the target column in the transformed data
            st_ed = get_st_ed(target_index,self.transformer.output_info)
            # configuring the classifier network and its optimizer accordingly
            classifier = Classifier(data_dim,self.class_dim,st_ed).to(self.device)
            optimizerC = Adam(classifier.parameters(),**optimizer_params)
            # the loss criterion does not change between iterations, so it is created once:
            # binary cross entropy for a binary target, standard cross entropy for a multi-class target
            is_binary = (st_ed[1] - st_ed[0]) == 2
            c_loss = BCELoss() if is_binary else CrossEntropyLoss()

        # initializing learnable parameters of the discriminator and generator networks
        self.generator.apply(weights_init)
        discriminator.apply(weights_init)

        # initializing the image transformer objects for the generator and discriminator networks for transitioning between image and tabular domain
        self.Gtransformer = ImageTransformer(self.gside)
        self.Dtransformer = ImageTransformer(self.dside)

        # initiating the training by computing the number of iterations per epoch
        steps_per_epoch = max(1, len(train_data) // self.batch_size)
        for _epoch in tqdm(range(self.epochs)):
            for _step in range(steps_per_epoch):

                # sampling noise vectors using a standard normal distribution
                noisez = torch.randn(self.batch_size, self.random_dim, device=self.device)
                # sampling conditional vectors (the mask is not needed for the discriminator update)
                condvec = self.cond_generator.sample_train(self.batch_size)
                c, _, col, opt = condvec
                c = torch.from_numpy(c).to(self.device)
                # concatenating conditional vectors and converting resulting noise vectors into the image domain to be fed to the generator as input
                noisez = torch.cat([noisez, c], dim=1)
                noisez = noisez.view(self.batch_size,self.random_dim+self.cond_generator.n_opt,1,1)

                # sampling real data according to the conditional vectors and shuffling it before feeding to discriminator to isolate conditional loss on generator
                perm = np.arange(self.batch_size)
                np.random.shuffle(perm)
                real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                real = torch.from_numpy(real.astype('float32')).to(self.device)

                # storing shuffled ordering of the conditional vectors
                c_perm = c[perm]
                # generating synthetic data as an image
                fake = self.generator(noisez)
                # converting it into the tabular domain as per format of the transformed training data
                faket = self.Gtransformer.inverse_transform(fake)
                # applying final activation on the generated data (i.e., tanh for numeric and gumbel-softmax for categorical)
                fakeact = apply_activate(faket, self.transformer.output_info)

                # the generated data is then concatenated with the corresponding condition vectors
                fake_cat = torch.cat([fakeact, c], dim=1)
                # the real data is also similarly concatenated with corresponding conditional vectors
                real_cat = torch.cat([real, c_perm], dim=1)

                # transforming the real and synthetic data into the image domain for feeding it to the discriminator
                real_cat_d = self.Dtransformer.transform(real_cat)
                fake_cat_d = self.Dtransformer.transform(fake_cat)

                # executing the gradient update step for the discriminator
                optimizerD.zero_grad()
                # computing the probability of the discriminator to correctly classify real samples hence y_real should ideally be close to 1
                y_real,_ = discriminator(real_cat_d)
                # computing the probability of the discriminator to correctly classify fake samples hence y_fake should ideally be close to 0
                y_fake,_ = discriminator(fake_cat_d)
                # computing the loss to essentially maximize the log likelihood of correctly classifying real and fake samples as log(D(x))+log(1−D(G(z)))
                # or equivalently minimizing the negative of log(D(x))+log(1−D(G(z))) as done below
                loss_d = (-(torch.log(y_real + 1e-4).mean()) - (torch.log(1. - y_fake + 1e-4).mean()))
                # back-propagating the loss to compute the gradients
                loss_d.backward()
                # updating the weights of the discriminator
                optimizerD.step()

                # similarly sample noise vectors and conditional vectors
                noisez = torch.randn(self.batch_size, self.random_dim, device=self.device)
                condvec = self.cond_generator.sample_train(self.batch_size)
                c, m, col, opt = condvec
                c = torch.from_numpy(c).to(self.device)
                m = torch.from_numpy(m).to(self.device)
                noisez = torch.cat([noisez, c], dim=1)
                noisez = noisez.view(self.batch_size,self.random_dim+self.cond_generator.n_opt,1,1)

                # executing the gradient update step for the generator
                optimizerG.zero_grad()

                # similarly generating synthetic data and applying final activation
                fake = self.generator(noisez)
                faket = self.Gtransformer.inverse_transform(fake)
                fakeact = apply_activate(faket, self.transformer.output_info)
                # concatenating conditional vectors and converting it to the image domain to be fed to the discriminator
                fake_cat = torch.cat([fakeact, c], dim=1)
                fake_cat = self.Dtransformer.transform(fake_cat)

                # computing the probability of the discriminator classifying fake samples as real
                # along with feature representations of fake data resulting from the penultimate layer
                y_fake,info_fake = discriminator(fake_cat)
                # extracting feature representation of real data from the penultimate layer of the discriminator
                _,info_real = discriminator(real_cat_d)
                # computing the conditional loss to ensure the generator generates data records with the chosen category as per the conditional vector
                cross_entropy = cond_loss(faket, self.transformer.output_info, c, m)

                # computing the loss to train the generator where we want y_fake to be close to 1 to fool the discriminator
                # and cross_entropy to be close to 0 to ensure generator's output matches the conditional vector
                g = -(torch.log(y_fake + 1e-4).mean()) + cross_entropy
                # the information loss below shares part of its computation graph with this loss,
                # so the graph is retained for the second backward pass; both passes accumulate into the same gradients
                g.backward(retain_graph=True)
                # computing the information loss by comparing means and stds of real/fake feature representations extracted from discriminator's penultimate layer
                loss_mean = torch.norm(torch.mean(info_fake.view(self.batch_size,-1), dim=0) - torch.mean(info_real.view(self.batch_size,-1), dim=0), 1)
                loss_std = torch.norm(torch.std(info_fake.view(self.batch_size,-1), dim=0) - torch.std(info_real.view(self.batch_size,-1), dim=0), 1)
                loss_info = loss_mean + loss_std
                # accumulating the gradients of the information loss
                loss_info.backward()
                # updating the weights of the generator
                optimizerG.step()

                # the classifier module is used in case there is a target column associated with ML tasks
                if classifier is not None:

                    # updating the weights of the classifier
                    optimizerC.zero_grad()
                    # computing classifier's target column predictions on the real data along with returning corresponding true labels
                    real_pre, real_label = classifier(real)
                    if is_binary:
                        real_label = real_label.type_as(real_pre)
                    # computing the loss to train the classifier so that it can perform well on the real data
                    loss_cc = c_loss(real_pre, real_label)
                    loss_cc.backward()
                    optimizerC.step()

                    # updating the weights of the generator
                    optimizerG.zero_grad()
                    # generate synthetic data and apply the final activation
                    fake = self.generator(noisez)
                    faket = self.Gtransformer.inverse_transform(fake)
                    fakeact = apply_activate(faket, self.transformer.output_info)
                    # computing classifier's target column predictions on the fake data along with returning corresponding true labels
                    fake_pre, fake_label = classifier(fakeact)
                    if is_binary:
                        fake_label = fake_label.type_as(fake_pre)
                    # computing the loss to train the generator to improve semantic integrity between target column and rest of the data
                    loss_cg = c_loss(fake_pre, fake_label)
                    loss_cg.backward()
                    optimizerG.step()


    def sample(self, n):

        """
        Generates n synthetic rows with the trained generator

        Inputs:
        1) n -> no. of rows to generate (non-negative)

        Outputs:
        1) the first n rows of the generated data after applying the inverse transform of the data transformer

        Raises:
        1) ValueError -> when n is negative
        """

        if n < 0:
            raise ValueError("n must be non-negative, got %r" % (n,))

        # turning the generator into inference mode to effectively use running statistics in batch norm layers
        self.generator.eval()
        # column information associated with the transformer fit to the pre-processed training data
        output_info = self.transformer.output_info

        # no. of batches needed to cover n rows (ceiling division); at least one batch is generated
        # so that n == 0 still yields an empty result with the right columns
        steps = max(1, -(-n // self.batch_size))
        data = []
        # no gradients are needed for generation, so autograd bookkeeping is switched off
        with torch.no_grad():
            for _ in range(steps):
                # generating synthetic data using sampled noise and conditional vectors
                noisez = torch.randn(self.batch_size, self.random_dim, device=self.device)
                condvec = self.cond_generator.sample(self.batch_size)
                c = torch.from_numpy(condvec).to(self.device)
                noisez = torch.cat([noisez, c], dim=1)
                noisez = noisez.view(self.batch_size,self.random_dim+self.cond_generator.n_opt,1,1)
                fake = self.generator(noisez)
                faket = self.Gtransformer.inverse_transform(fake)
                fakeact = apply_activate(faket,output_info)
                data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)

        # applying the inverse transform and returning synthetic data in a similar form as the original pre-processed training data
        result = self.transformer.inverse_transform(data)

        return result[0:n]


In [5]:
# Colab-only: mount Google Drive at /content/gdrive so that files stored there can be read by path
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the Titanic training set from the mounted Google Drive folder
data = pd.read_csv('/content/gdrive/MyDrive/CTGAN/train.csv')

# Name of the target column (it is only used by name below; no existence check is performed here)
target_column = 'Survived'

# Identify categorical and numerical columns
categorical_cols = ['Sex', 'Embarked', 'Pclass']  # Pclass is actually numerical but often treated as categorical
# Every remaining column except the target and the free-text columns (Name, Ticket, Cabin) is treated as numerical.
# NOTE(review): PassengerId is not in the exclusion list, so the row identifier is kept as a numerical
# feature (it shows up in the printed frame below) — confirm this is intended.
numerical_cols = data.columns.difference(categorical_cols + [target_column, 'Name', 'Ticket', 'Cabin'])  # Exclude target column and the free-text columns

# Impute missing values for categorical columns with the most frequent value (overwrites the columns of `data`)
imputer_categorical = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer_categorical.fit_transform(data[categorical_cols])

# Impute missing values for numerical columns with the column mean (overwrites the columns of `data`)
# NOTE(review): the printed frame shows count/id columns as floats afterwards (e.g. Parch 0.0, PassengerId 1.0)
imputer_numerical = SimpleImputer(strategy='mean')
data[numerical_cols] = imputer_numerical.fit_transform(data[numerical_cols])

# Label encode categorical columns as integer codes; the fitted encoders are kept per column,
# presumably so that the codes can be mapped back to the original labels later
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Combine numerical columns, encoded categorical columns and the target (in that order) into the training frame
data_combined = pd.concat([pd.DataFrame(data[numerical_cols]), data[categorical_cols], data[[target_column]]], axis=1)

# Show the first few rows of the processed dataset
print(data_combined.head())


    Age     Fare  Parch  PassengerId  SibSp  Sex  Embarked  Pclass  Survived
0  22.0   7.2500    0.0          1.0    1.0    1         2       2         0
1  38.0  71.2833    0.0          2.0    1.0    0         0       0         1
2  26.0   7.9250    0.0          3.0    0.0    0         2       2         1
3  35.0  53.1000    0.0          4.0    1.0    0         2       0         1
4  35.0   8.0500    0.0          5.0    0.0    1         2       2         0


In [13]:
# Build the synthesizer with its default hyper-parameters (e.g. batch_size=500, epochs=1)
synthesizer = CTABGANSynthesizer()

# Fit the model on the dataset
# NOTE(review): no `categorical`, `mixed` or `type` argument is passed, so fit() uses its empty defaults:
# `categorical_cols` from the preprocessing cell is not forwarded to the data transformer and no target
# column is declared, which means the classifier part of the training loop is skipped.
# Confirm this is intended, and check which form (names or positions) DataTransformer expects for
# categorical columns before passing them.
synthesizer.fit(data_combined)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
100%|██████████| 1/1 [00:02<00:00,  2.29

In [15]:
# Generate 1000 synthetic rows; sample() generates whole batches and returns only the first n rows
synthetic_data = synthesizer.sample(n=1000)

In [16]:
# Sanity check: expected to be (1000, number of columns of the training frame)
synthetic_data.shape

(1000, 9)

In [17]:
# Shape of the training frame, for comparison with the synthetic data above
data_combined.shape

(891, 9)