# In [None]:
# ==============================================================================
# Notebook 2: CTGAN Training and Synthetic Data Generation
#
# This notebook trains a CTGAN model on the real fraud data to learn its
# distribution and then generates a new, larger synthetic dataset.
# ==============================================================================

import pandas as pd
import os
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# --- 1. Load the Real Fraud Data ----------------------------------------------
# Location of the input CSV.
# NOTE(review): this absolute path is machine-specific; consider making it
# configurable before running the notebook elsewhere.
data_path = os.path.join('C:\\Users\\pvgre\\Desktop\\Proposals\\GNCIPL_mini_projects\\Synthetic_Fraud_AI_Project\\Data', 'creditcard.csv')
try:
    # Only the read itself is guarded; nothing else here can raise
    # FileNotFoundError.
    original_data = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found.")
    print("Please ensure 'creditcard.csv' is in a 'Data' subfolder.")
    # Stop with a non-zero status so callers can see the run failed.
    # `exit()` is an interactive helper added by the `site` module and
    # terminates with status 0, which would mask the failure.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")

# Separate the real fraud data (the minority class): rows whose 'Class'
# column equals 1.
real_fraud_data = original_data[original_data['Class'] == 1]
print(f"Real fraud data shape: {real_fraud_data.shape}")

# --- 2. Generate Synthetic Fraud Data with CTGAN ------------------------------
# CTGANSynthesizer is constructed from a metadata object, so build one first
# by letting SDV infer it from the fraud-only rows.
print("\nDetecting metadata from the real fraud data...")
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_fraud_data)
print("Metadata detection complete.")

# Build the synthesizer from the detected metadata and fit it on the
# fraud-only rows. The epoch count (300) comes from the project document.
print("\nTraining the CTGAN model on real fraud data...")
ctgan = CTGANSynthesizer(
    metadata=metadata,
    epochs=300,
)
ctgan.fit(real_fraud_data)
print("CTGAN training complete.")

# Draw new synthetic fraud rows from the fitted model.
# The sample count is chosen to significantly augment the minority class.
num_synthetic_samples = 5_000
print(f"Generating {num_synthetic_samples} synthetic fraud samples...")
synthetic_fraud_data = ctgan.sample(num_rows=num_synthetic_samples)
print(f"Synthetic fraud data shape: {synthetic_fraud_data.shape}")

# --- 3. Save the Synthetic Data -----------------------------------------------
# Directory the CSV is written to.
# NOTE(review): this absolute path is machine-specific; consider making it
# configurable before running the notebook elsewhere.
synthetic_data_dir = 'C:\\Users\\pvgre\\Desktop\\Proposals\\GNCIPL_mini_projects\\Synthetic_Fraud_AI_Project\\Data'

# Ensure the directory we actually write to exists. The earlier check created
# a relative 'Data' folder in the current working directory, which is not
# where the file is saved. exist_ok=True makes this a no-op if the directory
# already exists.
os.makedirs(synthetic_data_dir, exist_ok=True)

synthetic_data_path = os.path.join(synthetic_data_dir, 'synthetic_fraud.csv')
# index=False keeps the DataFrame index out of the CSV.
synthetic_fraud_data.to_csv(synthetic_data_path, index=False)
print(f"\nSynthetic fraud data saved to '{synthetic_data_path}'.")

