# Data Preprocessors

## Loading Data

In [1]:
import os
import pandas as pd

In [2]:
# Loading data in pandas dataframe
current_directory = os.path.abspath(os.getcwd())
parent_directory = os.path.join(current_directory, '..')
grandparent_directory = os.path.join(parent_directory, '..')
data_directory = os.path.join(grandparent_directory, 'data')
csv_path = os.path.join(data_directory, 'healthcare_dataset.csv')

df = pd.read_csv(csv_path)

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
Name,Tiffany Ramirez,Ruben Burns,Chad Byrd,Antonio Frederick,Mrs. Brandy Flowers
Age,81,35,61,49,51
Gender,Female,Male,Male,Male,Male
Blood Type,O-,O+,B-,B-,O-
Medical Condition,Diabetes,Asthma,Obesity,Asthma,Arthritis
Date of Admission,2022-11-17,2023-06-01,2019-01-09,2020-05-02,2021-07-09
Doctor,Patrick Parker,Diane Jackson,Paul Baker,Brian Chandler,Dustin Griffin
Hospital,Wallace-Hamilton,"Burke, Griffin and Cooper",Walton LLC,Garcia Ltd,"Jones, Brown and Murray"
Insurance Provider,Medicare,UnitedHealthcare,Medicare,Medicare,UnitedHealthcare
Billing Amount,37490.983364,47304.064845,36874.896997,23303.322092,18086.344184


In [4]:
df.shape

(10000, 15)

In [5]:
df.drop(['Name', 'Hospital', 'Doctor'], axis=1, inplace=True)
df.shape

(10000, 12)

In [6]:
# Custom class to process data
from DataProcessor_OLD import DataProcessor

# CTGAN

In [7]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import TensorDataset

In [8]:
# Functions in the training loop
def create_masks(N_d, m):
    return torch.randint(0, 2, (m, N_d), dtype=torch.float32)

def sample_z(m, latent_dim):
    return torch.randn(m, latent_dim)

def create_pacs(data, pac):
    m = data.size(0)
    return torch.stack([torch.bitwise_xor(data[k*pac:(k+1)*pac].int()).float() for k in range(m // pac)])

def gradient_penalty(critic, real, fake, cond):
    alpha = torch.rand(real.size(0), 1, device=real.device)
    interpolates = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    d_interpolates = critic(interpolates, cond)
    fake_grad = torch.autograd.grad(
        outputs=d_interpolates,
        inputs=interpolates,
        grad_outputs=torch.ones_like(d_interpolates),
        create_graph=True,
        retain_graph=True,
        only_inputs=True
    )[0]
    grad_penalty = ((fake_grad.norm(2, dim=1) - 1) ** 2).mean()
    return grad_penalty

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

torch.cuda.empty_cache()

# Generator
class Generator(nn.Module):
    def __init__(self, z_dim, cond_dim, m, D):
        super(Generator, self).__init__()
        self.z_dim = z_dim
        self.cond_dim = cond_dim
        self.input_dim = z_dim + cond_dim
        self.hidden_dim_1 = 256
        self.hidden_dim_2 = 512
        self.m = m
        self.D = D
        
        self.fc1 = nn.Linear(self.input_dim, self.hidden_dim_1)
        self.bn1 = nn.BatchNorm1d(self.hidden_dim_1)
        
        self.fc2 = nn.Linear(self.hidden_dim_1 + self.input_dim, self.hidden_dim_1)
        self.bn2 = nn.BatchNorm1d(self.hidden_dim_1)
        
        self.alpha_layers = nn.ModuleList([nn.Linear(self.hidden_dim_2 + self.input_dim, 1) for _ in range(self.m)])
        self.d_hat_layers = nn.ModuleList([nn.Linear(self.hidden_dim_2 + self.input_dim, self.D) for _ in range(self.D)])
    
    def forward(self, z, cond):
        z = z.unsqueeze(0)
        cond = cond.unsqueeze(0)
        
        print(z.shape, cond.shape)

        h0 = torch.cat([z, cond], dim=1)

        h1 = self.fc1(h0)
        # h1 = self.bn1(h1)
        h1 = F.relu(h1)
        h1 = torch.cat([h0, h1], dim=1)
        
        h2 = self.fc2(h1)
        # h2 = self.bn2(h2)
        h2 = F.relu(h2)
        h2 = torch.cat([h1, h2], dim=1)

        # Generate continuous outputs
        alpha = [torch.tanh(layer(h2)) for layer in self.alpha_layers]
        
        # Generate discrete outputs using Gumbel-Softmax
        d_hat = [F.gumbel_softmax(layer(h2), tau=1.0, hard=True) for layer in self.d_hat_layers]
        
        # Concatenate all outputs
        r = torch.cat(alpha + d_hat, dim=1)
        print(r.shape)
        return r

# Critic
class Critic(nn.Module):
    def __init__(self, pac, r_dim, cond_dim):
        super(Critic, self).__init__()
        self.pac = pac
        self.input_dim = (r_dim + cond_dim) * pac
        self.hidden_dim_1 = 256
                
        self.fc1 = nn.Linear(self.input_dim, self.hidden_dim_1)
        self.leaky = nn.LeakyReLU(0.2)
        self.drop = nn.Dropout(0.2)
        
        self.fc2 = nn.Linear(self.hidden_dim_1, self.hidden_dim_1)
        
        self.fc3 = nn.Linear(self.hidden_dim_1, 1)
    
    def forward(self, r_pac, cond_pac):
        r_pac = r_pac.unsqueeze(0)
        cond_pac = cond_pac.unsqueeze(0)
        
        print(r_pac.shape, cond_pac.shape, 'Expected', self.input_dim)
                
        h0 = torch.cat([r_pac, cond_pac], dim=1)
        
        h1 = self.fc1(h0)
        h1 = self.leaky(h1)
        h1 = self.drop(h1)
        
        h2 = self.fc2(h1)
        h2 = self.leaky(h2)
        h2 = self.drop(h2)
        
        C = self.fc3(h2)
        
        return C

# CTGAN class
class CTGAN:
    def __init__(self, df, epochs=300, pac=10, batch_size=500):
        self.df, self.m, self.D = DataProcessor(df).fit_transform()
        self.size = df.shape[0]
        self.epochs = epochs
        self.pac = pac
        self.batch_size = batch_size
        
        print('Recieved sizes:', self.size, self.m, self.D)

        self.df = self.df.sample(frac=1)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Device:', self.device)

    def create_mask(self):
        masks = []
        ind = np.random.choice(self.df.index, self.batch_size, replace=False)
        batch = self.df.loc[ind]
        number_of_batches = max(self.df.shape[0] // self.batch_size, 1)
        
        for _ in range(number_of_batches):
            batches = []
            for _, rows in batch.iterrows():
                condvec = rows.values.flatten().tolist()
                batches.append(condvec)
            masks.append(batches)
        return np.array(masks)

    def sampler(self, df, cond):
        real_samples = torch.tensor([]).to(self.device)
        cond_df = pd.DataFrame([cond], columns=df.columns)

        # Filter training data based on the condition vector
        filtered_data = df.copy()
        for col in df.columns:
            filtered_data = filtered_data[filtered_data[col] == cond_df[col].values[0]]

        # Randomly sample from the filtered data
        if not filtered_data.empty:
            sample = filtered_data.sample(n=1).values
            real_samples.append(sample)
        else:
            # Handle case where no samples match the condition
            real_samples = torch.cat([real_samples,torch.tensor(np.zeros((1, df.shape[1]))).to(self.device)], axis=0)

        return real_samples
        

    def Loss_G(self, real_pac, real_cond, fake_pac, fake_cond):
        pass
    
    def Loss_C(self, real_pac, real_cond, fake_pac, fake_cond):
        pass
        
    def train(self):
        masks = self.create_mask()
        latent_dim = 100
        z = torch.randn(latent_dim).to(self.device)
        
        cond_dim = self.df.shape[1]
        r_dim = self.m + self.D + 1
                
        self.generator = Generator(latent_dim, cond_dim, self.m, self.D).to(self.device)
        self.critic = Critic(self.pac, r_dim, cond_dim).to(self.device)
        
        self.optimizerG = optim.Adam(
            self.generator.parameters(),
            lr=2e-4,
            betas=(0.5, 0.9),
            weight_decay=1e-6
        )
        self.optimizerC = optim.Adam(
            self.critic.parameters(),
            lr=2e-4,
            betas=(0.5, 0.9),
            weight_decay=1e-6
        )
        
        data_size = len(self.df)
        pacs_per_batch = self.batch_size // self.pac
        batches_per_epoch = data_size // self.batch_size
        
        print(f"Data Size: {data_size} | Batches per Epoch: {batches_per_epoch} | PACs per Batch: {pacs_per_batch}")
        
        ###### CHANGE THIS
        self.epochs = 3
        
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            
            for batch in range(batches_per_epoch):
                batch_start = batch * self.batch_size
                batch_end = (batch + 1) * self.batch_size
                batch_df = self.df.iloc[batch_start:batch_end]
                
                for pac_num in range(pacs_per_batch):
                    pac_start = pac_num * self.pac
                    pac_end = min(pac_start + self.pac, len(batch_df))
                    pac_df = batch_df.iloc[pac_start:pac_end]
                    cond_pac = torch.tensor([]).to(self.device).float()
                    r_real_pac = torch.tensor([]).to(self.device).float()
                    r_fake_pac = torch.tensor([]).to(self.device).float()
                    
                    for l, (index, row) in enumerate(pac_df.iterrows()):
                        cond = torch.tensor(row.values).float().to(self.device)
                        r_real = self.sampler(self.df, cond).to(self.device).float()
                        r_fake = self.generator(z, cond).to(self.device).float()
                                                
                        cond_pac = torch.cat([cond_pac, cond], dim=0).to(self.device).float()
                        r_real_pac = torch.cat([r_real_pac, r_real], dim=1).to(self.device).float()
                        r_fake_pac = torch.cat([r_fake_pac, r_fake], dim=1).to(self.device).float()
                                        
                    r_real_pac = r_real_pac.reshape(-1)
                    r_fake_pac = r_fake_pac.reshape(-1)
                    
                    print("Batch Number:", batch, "Pac Number:", pac_num, "Pac Count", l, "Record number:", row[0],flush=True)
                    print('\tExpected dim:', ((self.m + self.D + 1) * self.pac), 'r_real_pac dim:', r_real_pac.shape,flush=True)
                    print('\tExpected dim:', ((self.m + self.D + 1) * self.pac), 'r_fake_pac dim:', r_fake_pac.shape,flush=True)
                    
                    C_real = self.critic(r_real_pac, cond_pac)
                    C_fake = self.critic(r_fake_pac, cond_pac)
                    
                    print('Critic output shape:', C_real.shape, C_fake.shape)

            
'''
                        print('Epoch', i + 1, 'Batch Number:', batch, 'Pac Number:', pac_num, 'Pac Count', l, 'Record number:', row[0], flush=True)
                    pac_start_index += self.pac
                    print('Finished One Pac', flush=True)
                start_index += self.batch_size
                print('Finished One Batch', flush=True)
            print('--------------------------------------------------')
'''
'''
            for i in range(len(masks)):
                
                for j in range(len(masks) // self.pac):
                    cond = torch.tensor(mask[0]).float().to(self.device)
                    print('Shape of condition: ', cond.shape)
                    r_pac_fake = self.generator(z, cond).to(self.device)
                    print('Shape of fake data: ', r_pac_fake.shape)
                    
                    r_pac_real = self.sampler(self.df, cond).to(self.device)
                    print('Shape of real data: ', r_pac_real.shape)
                    
                    cond_pac = cond.to(self.device)
                    C_real, C_fake = 0, 0
                    for _ in range(self.pac - 1):
                        cond = torch.tensor(mask).float().to(self.device)
                        cond_pac = torch.cat([cond_pac, cond], dim=1).to(self.device)
                        
                        r_fake = self.generator(z, cond).to(self.device)
                        r_real = self.sampler(self.df, cond).to(self.device)
                                                
                        r_pac_fake = torch.cat([r_pac_fake, r_fake], dim=1).to(self.device)
                        r_pac_real = torch.cat([r_pac_real, r_real], dim=1).to(self.device)
                    
                    C_real += self.critic(r_pac_real, cond_pac)
                    C_fake += self.critic(r_pac_fake, cond_pac)
                    
                    LossC = C_fake.mean() - C_real.mean()
                    
                    
                    self.optimizerG.zero_grad()
                    self.optimizerC.zero_grad()
                    
                    self.optimizerC.zero_grad()

            print(f"Epoch: {epoch}, LossC: {LossC}")
'''
test = CTGAN(df)
test.train()

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Recieved sizes: 10000 5 40
Device: cuda
Data Size: 10000 | Batches per Epoch: 20 | PACs per Batch: 50
Epoch 1/3
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
torch.Size([1, 100]) torch.Size([1, 45])
torch.Size([1, 1605])
Batch Number: 0 Pac Number: 0 Pac Count 9 Record number: 35.0
	Expected dim: 460 r_real_pac dim: torch.Size([450])
	Expected dim: 460 r_fake_pac dim: torch.Size([16050])
torch.Size([1, 450]) torch.Size([1, 450]) Expected 910


  print("Batch Number:", batch, "Pac Number:", pac_num, "Pac Count", l, "Record number:", row[0],flush=True)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x900 and 910x256)

In [None]:
dp, m, D = DataProcessor(df).fit_transform()

In [None]:
epochs = 5
batch_size = 500
pac_size = 10
device = 'cuda'

data_size = len(dp)
pacs_per_batch = batch_size // pac_size
batches_per_epoch = data_size // batch_size

print(f"Data Size: {data_size} | Batches per Epoch: {batches_per_epoch} | PACs per Batch: {pacs_per_batch}")


for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in range(batches_per_epoch):
        batch_start = batch * batch_size
        batch_end = min(batch_start + batch_size, data_size)
        batch_df = dp.iloc[batch_start:batch_end]
        print(f"  Batch {batch + 1}/{batches_per_epoch}")

        for pac_num in range(pacs_per_batch):
            pac_start = pac_num * pac_size
            pac_end = min(pac_start + pac_size, len(batch_df))
            pac_df = batch_df.iloc[pac_start:pac_end]
            print(f"    Pac {pac_num + 1}/{pacs_per_batch}")
            cond_pac = torch.tensor([]).to(device)

            for l, (index, row) in enumerate(pac_df.iterrows()):
                cond = torch.tensor(row.values).to(device)
                cond_pac = torch.cat((cond_pac, cond), dim=0)
    print("--------------------------------------------------")