In [2]:
import os
import pandas as pd
import random

# Paths to Male and Female folders
female_folder = '/ifs/loni/faculty/thompson/four_d/cjagad/LDM/2D/2D_SYN_DATA/LDM_FEMALE'#FEMALE_80AE_2000LDM
male_folder = '/ifs/loni/faculty/thompson/four_d/cjagad/LDM/2D/2D_SYN_DATA/LDM_MALE'

# Output directory for the CSV files
output_folder = '/ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D'
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Fixed seed so that Restart & Run All reproduces exactly the same split.
SEED = 42
random.seed(SEED)

# List all image files in each folder.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# first; otherwise the seeded shuffle below would start from a different
# order on a different filesystem and produce a different split.
female_images = sorted(os.path.join(female_folder, img) for img in os.listdir(female_folder) if img.endswith('.npy'))
male_images = sorted(os.path.join(male_folder, img) for img in os.listdir(male_folder) if img.endswith('.npy'))

# Shuffle images to ensure random distribution
random.shuffle(female_images)
random.shuffle(male_images)

num_images = len(female_images) + len(male_images)

# Define dataset sizes (totals across both sexes; each sex contributes half)
train_size = 800
validation_size = 100
test_size = 100

# Per-sex slice boundaries. The test slice is bounded by test_size so that
# extra images in a folder do not silently inflate the test set.
train_end = train_size // 2
validation_end = (train_size + validation_size) // 2
test_end = (train_size + validation_size + test_size) // 2

# Slicing past the end of a list silently yields a shorter (or empty) split,
# so make a shortfall visible instead of writing undersized CSVs unnoticed.
if min(len(female_images), len(male_images)) < test_end:
    print(
        f"WARNING: need {test_end} images per sex but found "
        f"{len(female_images)} female and {len(male_images)} male; "
        "some splits will be smaller than requested."
    )

# Split data for Male and Female equally
train_male = male_images[:train_end]
train_female = female_images[:train_end]
validation_male = male_images[train_end:validation_end]
validation_female = female_images[train_end:validation_end]
test_male = male_images[validation_end:test_end]
test_female = female_images[validation_end:test_end]

# Helper: wrap a list of image paths and a single class label in a dataframe
def create_dataframe(image_paths, label):
    """Build a two-column frame pairing every image path with ``label``.

    Args:
        image_paths: Sized sequence of image file paths.
        label: Value repeated in the 'Sex' column, one per path.

    Returns:
        pandas.DataFrame with columns 'Location' and 'Sex', one row per path.
    """
    repeated_label = [label] * len(image_paths)
    columns = {'Location': image_paths, 'Sex': repeated_label}
    return pd.DataFrame(columns)

# Create dataframes.
# Each split stacks the male rows on top of the female rows and then shuffles
# the rows so the two classes are interleaved in the CSV. A fixed
# random_state keeps the row order identical across re-runs.
SHUFFLE_SEED = 42
train_df = pd.concat([create_dataframe(train_male, 'Male'), create_dataframe(train_female, 'Female')]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
validation_df = pd.concat([create_dataframe(validation_male, 'Male'), create_dataframe(validation_female, 'Female')]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = pd.concat([create_dataframe(test_male, 'Male'), create_dataframe(test_female, 'Female')]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save to CSV files in the specified output folder
train_csv_path = os.path.join(output_folder, 'training.csv')
validation_csv_path = os.path.join(output_folder, 'validation.csv')
test_csv_path = os.path.join(output_folder, 'testing.csv')

# index=False: the row index is just the post-shuffle position, not data.
train_df.to_csv(train_csv_path, index=False)
validation_df.to_csv(validation_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

print(f"CSV files created:\n- {train_csv_path}\n- {validation_csv_path}\n- {test_csv_path}")


CSV files created:
- /ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D/training.csv
- /ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D/validation.csv
- /ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D/testing.csv


In [4]:
# Report how many image paths were collected per sex, and overall.
female_count = len(female_images)
male_count = len(male_images)
print(f"Number of Female images: {female_count}")
print(f"Number of Male images: {male_count}")
print(f"Total images: {female_count + male_count}")


Number of Female images: 500
Number of Male images: 500
Total images: 1000


In [7]:
import pandas as pd

# Where the three split CSVs live on disk
train_csv_path = '/ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D/training.csv'
validation_csv_path = '/ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D/validation.csv'
test_csv_path = '/ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D/testing.csv'

# Load the splits back from disk, in train / validation / test order
train_df, validation_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (train_csv_path, validation_csv_path, test_csv_path)
]

# Tally the rows of a split by the value in its 'Sex' column
def count_gender(df):
    """Return the number of rows in ``df`` for each distinct 'Sex' value.

    Args:
        df: DataFrame containing a 'Sex' column.

    Returns:
        pandas.Series indexed by the distinct 'Sex' values, holding row counts.
    """
    sex_column = df['Sex']
    return sex_column.value_counts()

# Print the per-sex row counts of every split, each under its own heading.
for heading, split_df in (
    ("Training Set:", train_df),
    ("\nValidation Set:", validation_df),
    ("\nTesting Set:", test_df),
):
    print(heading)
    print(count_gender(split_df))

Training Set:
Female    250
Male      250
Name: Sex, dtype: int64

Validation Set:
Female    125
Male      125
Name: Sex, dtype: int64

Testing Set:
Female    125
Male      125
Name: Sex, dtype: int64


In [1]:
import os
import pandas as pd
import random

# Paths to Male and Female folders
female_folder = '/ifs/loni/faculty/thompson/four_d/cjagad/LDM/2D/2d_data/female/axial'
male_folder = '/ifs/loni/faculty/thompson/four_d/cjagad/LDM/2D/2d_data/male/axial'
output_folder = '/ifs/loni/faculty/thompson/four_d/vdesai/Syn/LDM 2D Real'
os.makedirs(output_folder, exist_ok=True)

# Fixed seed so that Restart & Run All reproduces exactly the same split.
SEED = 42
random.seed(SEED)

# List all image files.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# first; otherwise the seeded shuffle below would start from a different
# order on a different filesystem and produce a different split.
female_images = sorted(os.path.join(female_folder, img) for img in os.listdir(female_folder) if img.endswith('.npy'))
male_images = sorted(os.path.join(male_folder, img) for img in os.listdir(male_folder) if img.endswith('.npy'))

# Shuffle images
random.shuffle(female_images)
random.shuffle(male_images)

# Calculate splits based on available data.
# The split is 80/10/10 and is applied to each sex separately, so every
# subset keeps the female/male proportion of the source folders.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1  # informational only: the test split receives the remainder

# Calculate split sizes for each gender.
# int() truncates, so train and validation are rounded down and the test
# split absorbs whatever is left; no image is dropped.
train_size_female = int(len(female_images) * train_ratio)
val_size_female = int(len(female_images) * val_ratio)
test_size_female = len(female_images) - train_size_female - val_size_female

train_size_male = int(len(male_images) * train_ratio)
val_size_male = int(len(male_images) * val_ratio)
test_size_male = len(male_images) - train_size_male - val_size_male

# Split the data: consecutive, non-overlapping slices of the shuffled lists
train_female = female_images[:train_size_female]
val_female = female_images[train_size_female:train_size_female + val_size_female]
test_female = female_images[train_size_female + val_size_female:]

train_male = male_images[:train_size_male]
val_male = male_images[train_size_male:train_size_male + val_size_male]
test_male = male_images[train_size_male + val_size_male:]

# Helper: wrap a list of image paths and a single class label in a dataframe
def create_dataframe(image_paths, label):
    """Build a two-column frame pairing every image path with ``label``.

    Args:
        image_paths: Sized sequence of image file paths.
        label: Value repeated in the 'Sex' column, one per path.

    Returns:
        pandas.DataFrame with columns 'Location' and 'Sex', one row per path.
    """
    repeated_label = [label] * len(image_paths)
    columns = {'Location': image_paths, 'Sex': repeated_label}
    return pd.DataFrame(columns)

# Create dataframes.
# Each split stacks the male rows on top of the female rows and then shuffles
# the rows so the two classes are interleaved in the CSV. A fixed
# random_state keeps the row order identical across re-runs.
SHUFFLE_SEED = 42
train_df = pd.concat([create_dataframe(train_male, 'Male'), 
                     create_dataframe(train_female, 'Female')]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
validation_df = pd.concat([create_dataframe(val_male, 'Male'), 
                         create_dataframe(val_female, 'Female')]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = pd.concat([create_dataframe(test_male, 'Male'), 
                    create_dataframe(test_female, 'Female')]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save to CSV files
train_csv_path = os.path.join(output_folder, 'training.csv')
validation_csv_path = os.path.join(output_folder, 'validation.csv')
test_csv_path = os.path.join(output_folder, 'testing.csv')

# index=False: the row index is just the post-shuffle position, not data.
train_df.to_csv(train_csv_path, index=False)
validation_df.to_csv(validation_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

# Summarise the three splits: per-sex counts first, then total rows.
print("Dataset Statistics:")
for heading, split_df in (
    ("\nTraining Set:", train_df),
    ("\nValidation Set:", validation_df),
    ("\nTesting Set:", test_df),
):
    print(heading)
    print(split_df['Sex'].value_counts())

n_train, n_validation, n_test = len(train_df), len(validation_df), len(test_df)
print("\nTotal images per set:")
print(f"Training: {n_train}")
print(f"Validation: {n_validation}")
print(f"Testing: {n_test}")

Dataset Statistics:

Training Set:
Female    253
Male      251
Name: Sex, dtype: int64

Validation Set:
Male      31
Female    31
Name: Sex, dtype: int64

Testing Set:
Female    33
Male      32
Name: Sex, dtype: int64

Total images per set:
Training: 504
Validation: 62
Testing: 65
