In [1]:
!pip install ctgan



In [2]:
from ctgan import CTGAN


# Setup SageMaker session
import sagemaker
import boto3
from sagemaker.pytorch import PyTorch
import os

boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session)
role = "arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928"
role_name = role.split('/')[-1]  # Extract just the role name from the ARN

# Attach AdministratorAccess policy to your existing role
iam_client = boto3.client('iam')
iam_client.attach_role_policy(
    RoleName=role_name,
    PolicyArn="arn:aws:iam::aws:policy/AdministratorAccess"
)
print(f"Attached AdministratorAccess policy to role: {role}")

# Assume blood.csv is in S3 already - if not, upload it first
input_data_s3_uri = "s3://blue-blood-data/final_data_april22.csv"


# Create a PyTorch estimator
estimator = PyTorch(
    entry_point='train_ctgan2.py',  
    role=role,
    framework_version='1.12.0',  
    py_version='py38',
    instance_count=1,
    instance_type='ml.m5.xlarge',
    sagemaker_session=sagemaker_session,
    hyperparameters={
        'epochs': 300,
        'batch-size': 500
    },
    dependencies=['requirements.txt']  # Add dependencies
)

#put it in a s3 bucket folder, so that you can reference back to it, name it simple, (output ctgan folder)
# with each new run, when testing different hyperparameters - name it with synthetic data-1 or s

# Start training
estimator.fit({'train': input_data_s3_uri})

# After training, you can access the output
output_path = estimator.model_data
print(f"Model artifacts stored at: {output_path}")




sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/neha/Library/Application Support/sagemaker/config.yaml


Attached AdministratorAccess policy to role: arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928


2025-04-22 06:14:32 Starting - Starting the training job...
2025-04-22 06:14:58 Starting - Preparing the instances for training...
2025-04-22 06:15:39 Downloading - Downloading the training image......
2025-04-22 06:16:29 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2025-04-22 06:16:37,252 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-04-22 06:16:37,254 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-04-22 06:16:37,262 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-04-22 06:16:37,265 sagemaker_pytorch_container.training INFO     Invoking user training script.
2025-04-22 06:16:37,471 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.8 -m pip install -r requirements.

In [5]:
#step 1

import boto3

s3 = boto3.client('s3')

bucket = 'sagemaker-us-east-1-211125439249'
key = 'pytorch-training-2025-04-22-06-14-30-742/output/model.tar.gz'
local_tar_path = 'model.tar.gz'

s3.download_file(bucket, key, local_tar_path)
print("Downloaded model.tar.gz")


Downloaded model.tar.gz


In [6]:
#step 2

import tarfile
import os

output_dir = './model_output'
os.makedirs(output_dir, exist_ok=True)

with tarfile.open(local_tar_path, 'r:gz') as tar:
    tar.extractall(path=output_dir)

print("Extracted the following files:")
print(os.listdir(output_dir))


Extracted the following files:
['ctgan_model.pkl']


In [8]:


import boto3
import tarfile
import pandas as pd
import joblib
from io import BytesIO

# Your S3 model output URI from SageMaker
s3_uri = "s3://sagemaker-us-east-1-211125439249/pytorch-training-2025-04-22-06-14-30-742/output/model.tar.gz"

# Extract bucket and key
bucket_name, key = s3_uri.replace("s3://", "").split("/", 1)

# Connect to S3
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=key)
file_obj = BytesIO(response['Body'].read())

# Extract ctgan_model.pkl from tar.gz
with tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
    for member in tar.getmembers():
        if member.name.endswith(".pkl"):
            model_file = tar.extractfile(member)
            with open("ctgan_model.pkl", "wb") as f:
                f.write(model_file.read())
            print("✅ Model file extracted: ctgan_model.pkl")
            break

# Load the trained CTGAN model
model = joblib.load("ctgan_model.pkl")
print("✅ Model loaded successfully")

# Generate 750 samples
synthetic_df_apr22 = model.sample(2000)
print(f"✅ Generated synthetic dataframe with {len(synthetic_df_apr22)} rows")

# Save to CSV locally
synthetic_df_apr22.to_csv("synthetic_output_apr22.csv", index=False)
print("✅ Saved as: synthetic_output_apr22.csv")

# Display the dataframe
synthetic_df_apr22



✅ Model file extracted: ctgan_model.pkl


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


✅ Model loaded successfully
✅ Generated synthetic dataframe with 2000 rows
✅ Saved as: synthetic_output_apr22.csv


Unnamed: 0,prescription_dose_val_rx,prescription_dose_unit_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,...,P118,P119,P120,P121,P122,P123,P124,P125,P126,P127
0,0.015912,10,0.594248,0.527314,0.142489,-1.000842,-0.879999,0.532785,-0.997184,-1.014971,...,0.443874,-0.322464,1.015162,0.749739,0.715762,-0.976574,-1.061133,-0.058379,1.215269,-0.272993
1,-0.019336,6,0.708165,-0.050647,-0.051054,-1.000892,-1.198152,0.531444,-0.984167,-1.014930,...,1.191342,0.886632,-4.417329,-0.444727,-0.119065,0.105628,1.769705,-1.522937,1.171722,0.485828
2,0.011186,8,0.587773,0.401715,0.134474,-1.002261,-1.135831,0.047771,-0.991515,-1.004722,...,3.651039,1.110791,0.002523,-1.581717,1.822959,0.123961,-0.680477,-1.223425,-1.134826,-0.375754
3,-1.193432,10,0.456588,0.514074,0.087774,-0.999985,0.114600,0.163194,-0.985047,-1.010880,...,-0.894870,-0.776358,-1.607747,-0.560384,2.595142,-0.747404,1.151902,1.494811,-0.218585,2.274749
4,-0.013073,10,0.646255,0.547561,0.343893,-1.002241,0.477224,0.309903,-0.985125,-1.013326,...,-1.020241,0.828670,1.044978,-3.050448,-0.723636,0.381234,1.417423,-1.281786,0.281012,-0.204389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.005730,6,0.669306,0.547630,0.082716,-1.000139,0.578961,0.953350,-0.997746,-0.995859,...,-0.412728,-0.143222,-1.948098,1.076576,0.158893,-0.047997,-0.368626,-0.742388,0.836348,0.119326
1996,0.004195,5,0.718803,0.419873,0.085823,-1.001464,0.455249,0.551772,-0.997325,-1.009278,...,-1.036751,-0.181421,2.120468,-0.614653,0.746705,-0.588871,-0.269255,0.154045,3.682704,1.368463
1997,0.010331,5,0.645914,0.417574,0.007226,-0.997235,0.419593,0.479415,-0.990039,-0.995604,...,4.064640,0.691035,-0.176160,-0.179039,-0.612655,0.061707,1.281673,0.443664,-0.643681,2.630039
1998,-0.005547,-1,0.742288,0.388134,0.119721,-1.001901,0.402738,0.455816,-0.980879,-1.009173,...,-0.892733,0.678204,1.134211,0.529190,0.025141,0.259433,0.552065,0.237499,0.619307,0.216607


In [11]:
import boto3

s3 = boto3.client('s3')

local_file = 'synthetic_output_apr22.csv'  # Your local file
bucket_name = 'blue-blood-data'
s3_key = 'blue-blood-synthetic-final.csv'  # Name it as you want others to access it

# Upload to S3
s3.upload_file(local_file, bucket_name, s3_key)

print(f"✅ File uploaded to s3://{bucket_name}/{s3_key}")


✅ File uploaded to s3://blue-blood-data/blue-blood-synthetic-final.csv


all the code beneath is old version

In [4]:
'''
import boto3
import tarfile
import os
import pandas as pd
from io import BytesIO


# S3 bucket and key extraction
s3_uri = "s3://sagemaker-us-east-1-211125439249/pytorch-training-2025-04-14-04-44-29-365/output/model.tar.gz"
parts = s3_uri.replace("s3://", "").split("/")
bucket_name = parts[0]
key = "/".join(parts[1:])

# Initialize S3 client
s3 = boto3.client('s3')

# Download the .tar.gz file
response = s3.get_object(Bucket=bucket_name, Key=key)
file_obj = BytesIO(response['Body'].read())

# Extract the .tar.gz file
with tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
    for member in tar.getmembers():
        if "synthetic_data_9.csv" in member.name:
            csv_file = tar.extractfile(member)
            df_synthetic = pd.read_csv(csv_file)
            break

# Display the DataFrame
df_synthetic

'''


'\nimport boto3\nimport tarfile\nimport os\nimport pandas as pd\nfrom io import BytesIO\n\n\n# S3 bucket and key extraction\ns3_uri = "s3://sagemaker-us-east-1-211125439249/pytorch-training-2025-04-14-04-44-29-365/output/model.tar.gz"\nparts = s3_uri.replace("s3://", "").split("/")\nbucket_name = parts[0]\nkey = "/".join(parts[1:])\n\n# Initialize S3 client\ns3 = boto3.client(\'s3\')\n\n# Download the .tar.gz file\nresponse = s3.get_object(Bucket=bucket_name, Key=key)\nfile_obj = BytesIO(response[\'Body\'].read())\n\n# Extract the .tar.gz file\nwith tarfile.open(fileobj=file_obj, mode=\'r:gz\') as tar:\n    for member in tar.getmembers():\n        if "synthetic_data_9.csv" in member.name:\n            csv_file = tar.extractfile(member)\n            df_synthetic = pd.read_csv(csv_file)\n            break\n\n# Display the DataFrame\ndf_synthetic\n\n'