In [1]:
!pip install ctgan



In [7]:

from ctgan import CTGAN


# Setup SageMaker session
import sagemaker
import boto3
from sagemaker.pytorch import PyTorch
import os

boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session)
role = "arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928"
role_name = role.split('/')[-1]  # Extract just the role name from the ARN

# Attach AdministratorAccess policy to your existing role
iam_client = boto3.client('iam')
iam_client.attach_role_policy(
    RoleName=role_name,
    PolicyArn="arn:aws:iam::aws:policy/AdministratorAccess"
)
print(f"Attached AdministratorAccess policy to role: {role}")

# Assume blood.csv is in S3 already - if not, upload it first
input_data_s3_uri = "s3://blue-blood-data/final_df.csv"

import time

# Sweep settings
learning_rates = [0.001, 0.002, 0.02, 0.005]
epochs = 300
batch_size = 500
base_output_path = "s3://blue-blood-data/ctgan-outputs"

learning_rates = [0.001, 0.002, 0.003, 0.005]  # 👈 Try different values

for lr in learning_rates:
    print(f"\n Starting training with learning rate: {lr}")

    estimator = PyTorch(
        entry_point='train_ctgan.py',
        role=role,
        framework_version='1.12.0',
        py_version='py38',
        instance_count=1,
        instance_type='ml.m5.xlarge',
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'epochs': 300,
            'batch-size': 500,
        },
        dependencies=['requirements.txt']
    )

    estimator.fit({'train': input_data_s3_uri})

    output_path = estimator.model_data
    print(f"✅ Run complete. Model artifacts stored at: {output_path}")


Attached AdministratorAccess policy to role: arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928

 Starting training with learning rate: 0.001


2025-04-08 03:00:57 Starting - Starting the training job...
2025-04-08 03:01:18 Starting - Preparing the instances for training...
2025-04-08 03:01:58 Downloading - Downloading the training image......
2025-04-08 03:02:49 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2025-04-08 03:02:56,698 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-04-08 03:02:56,700 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-08 03:02:56,708 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-04-08 03:02:56,710 sagemaker_pytorch_container.training INFO     Invoking user training script.
2025-04-08 03:02:56,987 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.8 -m pip install -r requirements.

In [3]:
import boto3
import tarfile
import os
import pandas as pd
from io import BytesIO

# S3 bucket and key extraction
s3_uri = "s3://sagemaker-us-east-1-211125439249/pytorch-training-2025-03-21-04-33-20-724/output/output.tar.gz"
parts = s3_uri.replace("s3://", "").split("/")
bucket_name = parts[0]
key = "/".join(parts[1:])

# Initialize S3 client
s3 = boto3.client('s3')

# Download the .tar.gz file
response = s3.get_object(Bucket=bucket_name, Key=key)
file_obj = BytesIO(response['Body'].read())

# Extract the .tar.gz file
with tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
    for member in tar.getmembers():
        if "synthetic_data.csv" in member.name:
            csv_file = tar.extractfile(member)
            df = pd.read_csv(csv_file)
            break

# Display the DataFrame
df

Unnamed: 0,prescription_dose_val_rx,prescription_dose_unit_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,...,P118,P119,P120,P121,P122,P123,P124,P125,P126,P127
0,0.001414,3,0.714855,0.394011,0.181301,-0.002940,0.476277,0.484996,0.010056,0.007761,...,1.526871,-0.226891,-0.562963,-0.571600,0.833258,-0.366861,-0.101112,-0.138971,0.903566,0.160276
1,-0.008318,3,0.755034,0.665916,0.429736,0.002194,0.486233,0.509745,0.012129,0.011334,...,0.133386,1.487494,-0.317584,1.150079,0.659608,-0.164488,0.520313,1.457416,-0.637841,-0.527525
2,0.009602,3,0.575826,0.604175,0.175497,-0.001985,0.883870,0.472304,0.006683,0.005170,...,0.473507,-0.872985,-0.854821,0.317275,0.736494,-0.236524,0.822562,1.309453,-0.262809,-0.408042
3,0.031796,3,0.568649,0.472849,0.201809,-0.000932,0.500979,0.881549,-0.004801,-0.012761,...,-0.736157,0.231719,-0.220861,0.276359,0.466031,-0.016229,-1.211064,0.497239,0.252757,1.231792
4,0.007656,3,0.463221,0.564624,0.101114,-0.000798,0.474374,0.431181,-0.009773,0.786163,...,-0.351883,1.104538,-1.855656,-0.640339,0.824865,2.791273,-0.193441,-0.803403,0.627992,-0.826140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.033166,3,0.566082,0.562798,0.246193,0.002698,0.482202,0.356896,0.000045,-0.008382,...,-0.529967,0.615780,1.202921,0.235030,-0.947755,-0.191961,1.253938,2.514703,0.283438,-1.773356
96,-0.001568,3,0.590410,0.475273,0.124659,-0.001120,0.486416,0.468453,0.010148,0.003373,...,1.428672,1.073474,-0.063989,-0.611268,-0.244905,-0.312786,-1.532022,-0.607202,-0.461784,-0.002399
97,0.011926,10,0.573495,0.641641,0.419101,-0.000895,0.523242,0.537552,0.003332,0.003079,...,1.188481,1.345886,1.163857,0.419345,-0.363016,0.550361,0.758469,-1.076191,-1.132977,-0.533249
98,0.006822,3,0.586478,0.916336,0.244199,-0.002852,0.347821,0.485628,0.008791,0.004538,...,0.879364,0.368527,1.135426,-0.443087,0.700429,0.198101,0.555337,0.355721,0.574814,-0.118663


In [4]:
import boto3
import pandas as pd
import io

def upload_synthetic_data_to_s3(df, bucket_name, folder_prefix="ctgan-outputs", file_prefix="synthetic-data"):
    # set up S3 client using boto
    s3 = boto3.client('s3')

    # list existing objects to determine next run number so we can name the new file correctly
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix + "/") 
    existing_files = []
    
    
    if "Contents" in response:
        for obj in response['Contents']:
            key = obj['Key']
            if key.endswith(".csv") and file_prefix in key:
                existing_files.append(key)
    
    # Determine the next number in the sequence
    run_numbers = []
    for file in existing_files:
        try:
            num = int(file.split(f"{file_prefix}-")[1].split(".csv")[0])
            run_numbers.append(num)
        except:
            continue

    next_run = max(run_numbers) + 1 if run_numbers else 1

    # Define file name
    file_name = f"{file_prefix}-{next_run}.csv"
    full_s3_path = f"{folder_prefix}/{file_name}"

    # Convert DataFrame to CSV in memory
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    # Upload to S3
    s3.put_object(Bucket=bucket_name, Key=full_s3_path, Body=csv_buffer.getvalue())

    print(f"✅ Uploaded to s3://{bucket_name}/{full_s3_path}")
    return full_s3_path


Uploading the synthetic data to s3


In [5]:
# Assume synthetic_data is your generated DataFrame
output_path = upload_synthetic_data_to_s3(
    df=df,
    bucket_name="blue-blood-data",  # change this
    folder_prefix="ctgan-outputs",
    file_prefix="synthetic-data"
)


✅ Uploaded to s3://blue-blood-data/ctgan-outputs/synthetic-data-3.csv


In [6]:
df.to_csv('synthetic_data_1.csv', index = False)