In [None]:
# ==============================================================================
# Notebook 4: Visualization and Quality Assessment
#
# This notebook visualizes the original and synthetic fraud data to assess the
# quality of the CTGAN model. It uses plots to compare feature distributions.
# ==============================================================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.manifold import TSNE

# --- 1. Load Datasets ---------------------------------------------------------
# Both CSV files are read from the same project 'Data' folder, so the directory
# is written once and reused for each file path.
data_dir = 'C:\\Users\\pvgre\\Desktop\\Proposals\\GNCIPL_mini_projects\\Synthetic_Fraud_AI_Project\\Data'
data_path = os.path.join(data_dir, 'creditcard.csv')
synthetic_data_path = os.path.join(data_dir, 'synthetic_fraud.csv')

try:
    original_data = pd.read_csv(data_path)
    synthetic_fraud_data = pd.read_csv(synthetic_data_path)
    print("Datasets loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: One of the required files was not found: {e}")
    print("Please ensure 'creditcard.csv' and 'synthetic_fraud.csv' exist in the 'Data' folder.")
    # Abort with a non-zero status. The bare `exit()` helper is injected by the
    # `site` module (not guaranteed to exist) and reports success (status 0).
    raise SystemExit(1) from e

# Separate the real fraud data: keep only rows labelled Class == 1, then drop
# the label so that only feature columns remain.
real_fraud_data = original_data[original_data['Class'] == 1].drop(columns=['Class'])
# The synthetic file may or may not carry a 'Class' column; drop it when it is
# present and carry on without a KeyError when it is not.
synthetic_fraud_data = synthetic_fraud_data.drop(columns=['Class'], errors='ignore')

print(f"Shape of real fraud data: {real_fraud_data.shape}")
print(f"Shape of synthetic fraud data: {synthetic_fraud_data.shape}")

# --- 2. Visualize Feature Distributions (Example: 'Amount') -------------------
# Overlay the kernel density estimate of 'Amount' for the real and the synthetic
# fraud rows on a single set of axes.
amount_fig, amount_ax = plt.subplots(figsize=(10, 6))
for frame, shade, tag in (
    (real_fraud_data, 'blue', 'Real Fraud'),
    (synthetic_fraud_data, 'red', 'Synthetic Fraud'),
):
    sns.kdeplot(frame['Amount'], color=shade, label=tag, fill=True, alpha=0.5, ax=amount_ax)
amount_ax.set_title('Distribution of Transaction Amount: Real vs. Synthetic Fraud')
amount_ax.set_xlabel('Amount')
amount_ax.set_ylabel('Density')
amount_ax.legend()
plt.show()

# --- 3. Visualize a few other features (e.g., V1, V2) -------------------------
# One panel per feature, each overlaying the real and synthetic density curves.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for panel, feature in zip(axes, ('V1', 'V2')):
    # Real data first, then synthetic, so the draw order and legend order
    # are the same in every panel.
    sns.kdeplot(real_fraud_data[feature], color='blue', label='Real Fraud',
                fill=True, alpha=0.5, ax=panel)
    sns.kdeplot(synthetic_fraud_data[feature], color='red', label='Synthetic Fraud',
                fill=True, alpha=0.5, ax=panel)
    panel.set_title(f'Distribution of Feature {feature}')
    panel.legend()

plt.tight_layout()
plt.show()

# --- 4. 2D Visualization with t-SNE -------------------------------------------
# Create a small, balanced subset of data for t-SNE visualization.
# The sample size is capped by BOTH datasets: sampling without replacement
# raises ValueError if more rows are requested than a frame contains, which
# would happen whenever the synthetic set is smaller than the real one.
sample_size = min(len(real_fraud_data), len(synthetic_fraud_data), 1000)
if sample_size == 0:
    raise ValueError(
        "Cannot run t-SNE: the real or the synthetic fraud dataset is empty."
    )

tsne_data = pd.concat([
    real_fraud_data.sample(sample_size, random_state=42),
    synthetic_fraud_data.sample(sample_size, random_state=42)
], ignore_index=True)

# Add a label column to distinguish real vs. synthetic data. The order matches
# the concat above: the first `sample_size` rows are real, the rest synthetic.
tsne_data['Label'] = ['Real'] * sample_size + ['Synthetic'] * sample_size

# t-SNE requires perplexity to be strictly smaller than the number of samples;
# keep the usual value of 30 and only shrink it for very small datasets.
tsne_perplexity = min(30, len(tsne_data) - 1)

print("\nApplying t-SNE to visualize the data in 2D...")
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity, max_iter=1000)
tsne_results = tsne.fit_transform(tsne_data.drop(columns=['Label']))

# Both frames share the same 0..N-1 index (ignore_index=True above), so the
# labels line up row-for-row with the embedded points.
tsne_df = pd.DataFrame(tsne_results, columns=['tsne-2d-one', 'tsne-2d-two'])
tsne_df['Label'] = tsne_data['Label']

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="Label",
    palette=sns.color_palette("hls", 2),
    data=tsne_df,
    legend="full",
    alpha=0.6
)
plt.title('t-SNE Visualization of Real vs. Synthetic Fraud Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

