# Step 3: Create Combined Dataset with t-SNE Selection

This notebook applies t-SNE-based selection to the dominant classes of the real ODIR data, combines the reduced real data with the synthetic data, and writes the missing `combined_tsne_new-1.csv` file.

## Process:
1. Load real ODIR data
2. Load synthetic data (generated in previous step)
3. Apply t-SNE selection to reduce dominant classes
4. Combine real and synthetic data
5. Create the combined dataset CSV file


In [None]:
import os
import re
import cv2
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from PIL import Image

print("Dependencies imported successfully!")


In [None]:
# Notebook-wide settings
class Config:
    """Paths and hyper-parameters shared by the cells of this notebook."""

    # --- Input locations ---
    odir_path = "/kaggle/input/ocular-disease-recognition-odir5k/"
    synthetic_path = "/kaggle/working/synthetic_data/"
    synthetic_metadata_path = "/kaggle/working/synthetic_metadata.csv"

    # --- Destination of the combined CSV ---
    output_path = "/kaggle/working/combined_tsne_new-1.csv"

    # Side length (in pixels) images are resized to before being flattened
    img_size = 224

    # --- t-SNE / clustering settings ---
    tsne_perplexity = 30
    tsne_n_iter = 1000
    n_clusters = 50  # upper bound on the K-means cluster count per reduced class

    # Short class code -> disease name
    classes = dict(
        G="Glaucoma",
        C="Cataract",
        A="Age Related Macular Degeneration",
        H="Hypertension",
        M="Myopia",
    )

    # Class codes that are thinned out with t-SNE selection (based on original paper)
    dominant_classes = ["G", "C", "A"]


config = Config()
print(f"Configuration loaded. Output will be saved to: {config.output_path}")


In [None]:
# Read the real ODIR metadata table and derive a compact class code per row
print("Loading real ODIR data...")
real_csv_path = os.path.join(config.odir_path, "full_df.csv")
df_real = pd.read_csv(real_csv_path)

# Keep only the alphabetic runs of each raw label string, joined by single spaces
df_real["class"] = [
    " ".join(re.findall("[a-zA-Z]+", raw_label)) for raw_label in df_real["labels"]
]

# Restrict the table to the class codes declared in the configuration
target_classes = list(config.classes)
keep_mask = df_real["class"].isin(target_classes)
df_real = df_real[keep_mask]

print(f"Real data loaded: {len(df_real)} samples")
print("Real data distribution:")
print(df_real["class"].value_counts())


In [None]:
# Read the metadata of the generated (synthetic) images, when it exists
print("Loading synthetic data...")
if not os.path.exists(config.synthetic_metadata_path):
    # Nothing was generated yet: explain how to produce it and continue with
    # an empty table so the later cells can still run on real data only.
    for message in (
        "❌ Synthetic data not found!",
        "Please run the synthetic data generation first:",
        "1. Run generate_synthetic_data.py",
        "2. Or run the synthetic data generation notebook",
    ):
        print(message)
    df_synthetic = pd.DataFrame()
else:
    df_synthetic = pd.read_csv(config.synthetic_metadata_path)
    print(f"Synthetic data loaded: {len(df_synthetic)} samples")
    print("Synthetic data distribution:")
    print(df_synthetic["disease_name"].value_counts())


In [None]:
# Image loading helper used by the t-SNE selection step
def load_images_for_tsne(df, images_path, max_samples=None):
    """Read the images listed in ``df`` and flatten each one into a pixel vector.

    Rows whose file does not exist under ``images_path``, or that OpenCV
    cannot decode, are skipped without any message.

    Args:
        df: Table with a "filename" and a "class" column.
        images_path: Directory the filenames are resolved against.
        max_samples: Stop as soon as this many images were collected; a falsy
            value (``None`` or 0) disables the limit.

    Returns:
        A tuple ``(features, labels, filenames)``: a NumPy array built from
        the flattened RGB images (each resized to ``config.img_size`` squared),
        and two parallel lists with the class and filename of every kept row.
    """
    print("Loading images for t-SNE analysis...")

    vectors, class_labels, kept_names = [], [], []

    for _, record in df.iterrows():
        if max_samples and len(vectors) >= max_samples:
            break

        full_path = os.path.join(images_path, record["filename"])
        if not os.path.exists(full_path):
            continue

        bgr_image = cv2.imread(full_path)
        if bgr_image is None:
            # File exists but could not be decoded as an image
            continue

        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        side = config.img_size
        resized = cv2.resize(rgb_image, (side, side))

        # One flat vector per image is what t-SNE consumes
        vectors.append(resized.flatten())
        class_labels.append(record["class"])
        kept_names.append(record["filename"])

    return np.array(vectors), class_labels, kept_names

# Apply t-SNE selection to dominant classes
def apply_tsne_selection(df, images_path, class_name, max_samples=300,
                         samples_per_cluster=3, random_state=42):
    """Pick a reproducible, cluster-balanced subset of one class.

    The rows of ``class_name`` are (optionally) subsampled, their images are
    embedded in 2-D with t-SNE, the embedding is clustered with K-means and a
    few images are drawn at random from every cluster.

    Args:
        df: Table with at least a "filename" and a "class" column.
        images_path: Directory containing the image files named in ``df``.
        class_name: Value of the "class" column to select from.
        max_samples: Upper bound on the rows fed to t-SNE; larger classes are
            randomly subsampled first.
        samples_per_cluster: Maximum number of images kept per cluster.
        random_state: Seed shared by the subsampling, t-SNE, K-means and the
            per-cluster draw, so repeated runs select the same rows.

    Returns:
        An empty DataFrame when the class has no rows; the (possibly
        subsampled) class rows unchanged when fewer than 10 images could be
        loaded; otherwise the rows whose filename was selected.
    """
    print(f"Applying t-SNE selection for {class_name}...")

    # Get samples for this class
    class_df = df[df["class"] == class_name].copy()
    if class_df.empty:
        return pd.DataFrame()

    # Limit samples for computational efficiency
    if len(class_df) > max_samples:
        class_df = class_df.sample(n=max_samples, random_state=random_state)

    features, _, filenames = load_images_for_tsne(class_df, images_path)
    n_images = len(features)

    if n_images < 10:  # Need minimum samples for t-SNE
        return class_df

    print(f"  Loaded {n_images} images for t-SNE")

    # scikit-learn requires perplexity < n_samples; clamp it so that small
    # classes (10..perplexity images) do not raise a ValueError.
    perplexity = min(config.tsne_perplexity, n_images - 1)

    print("  Running t-SNE...")
    tsne_kwargs = {
        "n_components": 2,
        "perplexity": perplexity,
        "random_state": random_state,
    }
    try:
        # scikit-learn >= 1.5 names the iteration budget ``max_iter``
        tsne = TSNE(max_iter=config.tsne_n_iter, **tsne_kwargs)
    except TypeError:
        # Older releases only know the previous name ``n_iter``
        tsne = TSNE(n_iter=config.tsne_n_iter, **tsne_kwargs)
    tsne_results = tsne.fit_transform(features)

    print("  Applying K-means clustering...")
    n_clusters = min(config.n_clusters, n_images // 2)
    # n_init is pinned because its default differs between scikit-learn
    # versions, which would otherwise change the clustering across installs.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    cluster_labels = kmeans.fit_predict(tsne_results)

    # Draw the representatives from a locally seeded generator instead of the
    # global NumPy state, so the selection is reproducible like the steps above.
    rng = np.random.default_rng(random_state)
    selected_filenames = set()
    for cluster_id in np.unique(cluster_labels):
        members = np.flatnonzero(cluster_labels == cluster_id)
        n_select = min(samples_per_cluster, len(members))
        picks = rng.choice(members, size=n_select, replace=False)
        selected_filenames.update(filenames[i] for i in picks)

    # Create filtered dataframe
    filtered_df = class_df[class_df["filename"].isin(selected_filenames)].copy()

    print(f"  Selected {len(filtered_df)} samples from {len(class_df)} original samples")

    return filtered_df


In [None]:
# Thin out the dominant classes of the real data; minority classes pass through
print("Applying t-SNE selection to real data...")
real_images_dir = os.path.join(config.odir_path, "preprocessed_images")
df_real_filtered = []

for class_short in target_classes:
    class_name = config.classes[class_short]

    if class_short not in config.dominant_classes:
        # Minority class: every sample is kept
        filtered_class_df = df_real[df_real["class"] == class_short].copy()
        print(f"Keeping all {len(filtered_class_df)} samples for {class_name}")
    else:
        # Dominant class: keep only the t-SNE/K-means representatives
        filtered_class_df = apply_tsne_selection(df_real, real_images_dir, class_short)

    df_real_filtered.append(filtered_class_df)

# Stack the per-class tables into one frame with a fresh index
df_real_combined = pd.concat(df_real_filtered, ignore_index=True)
print(f"\nReal data after t-SNE selection: {len(df_real_combined)} samples")
print("Distribution:")
print(df_real_combined["class"].value_counts())


In [None]:
# Process synthetic data (if available)
if df_synthetic.empty:
    print("No synthetic data available - creating dataset with real data only")
    df_synthetic_final = pd.DataFrame()
else:
    print("Processing synthetic data...")

    # Work on a renamed copy so the loaded metadata table stays untouched
    df_synthetic_processed = df_synthetic.rename(columns={"disease_name": "class_name"})

    # Fail up front with a clear message instead of part-way through the data
    required_columns = ["filename", "class", "labels"]
    missing_columns = [
        column for column in required_columns
        if column not in df_synthetic_processed.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Synthetic metadata is missing required column(s): {missing_columns}"
        )

    # Keep only the columns shared with the real data and tag the origin;
    # a column selection replaces the former row-by-row rebuild.
    df_synthetic_final = (
        df_synthetic_processed[required_columns]
        .assign(source="synthetic")
        .reset_index(drop=True)
    )
    print(f"Synthetic data processed: {len(df_synthetic_final)} samples")

# Combine real and synthetic data
print("Combining real and synthetic data...")

# Add source column to real data
df_real_combined["source"] = "real"

# Combine datasets (real rows first, then synthetic rows)
if df_synthetic_final.empty:
    df_combined = df_real_combined.copy()
else:
    df_combined = pd.concat([df_real_combined, df_synthetic_final], ignore_index=True)

print(f"Combined dataset: {len(df_combined)} samples")
print("Final distribution:")
print(df_combined["class"].value_counts())
print("\nSource distribution:")
print(df_combined["source"].value_counts())


In [None]:
# Save the combined dataset
print(f"Saving combined dataset to: {config.output_path}")

# Make sure the destination directory exists before writing
output_dir = os.path.dirname(config.output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_combined.to_csv(config.output_path, index=False)

print("✅ Combined dataset saved successfully!")
print(f"📁 File: {config.output_path}")
print(f"📊 Total samples: {len(df_combined)}")

# Display sample of the combined dataset
print("\n📋 Sample of combined dataset:")
print(df_combined.head(10))

# Create summary statistics
print("\n📈 Dataset Summary:")
print(f"Real data: {len(df_combined[df_combined['source'] == 'real'])} samples")
if not df_synthetic_final.empty:
    print(f"Synthetic data: {len(df_combined[df_combined['source'] == 'synthetic'])} samples")

print("\n🎯 Class distribution:")
for class_short, class_name in config.classes.items():
    class_count = len(df_combined[df_combined['class'] == class_short])
    print(f"  {class_name} ({class_short}): {class_count} samples")

# Report the location the file was actually written to; the previous message
# named a path under /kaggle/input, which is not where to_csv() above writes.
print("\n🎉 Combined dataset creation completed!")
print(f"The combined dataset has been written to '{config.output_path}'.")
print(
    "NOTE: the file is expected at "
    "'/kaggle/input/combined-tsne-new-1/combined_tsne_new-1.csv'; "
    "attach the saved CSV as an input dataset at that location or point the "
    "downstream notebook at the path above."
)
print("You can then run the 'resnet_synthetic_and_real_data_combined.ipynb' notebook.")
