# Data Preparation Script for Coffee Bean Defect Detection
Creates stratified train/validation/test splits and 5-fold cross-validation folds, then saves the split indices, per-split file lists, and dataset statistics.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import shutil
import json
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold
from collections import defaultdict

# Single seed shared by every stochastic step in this notebook (also passed
# explicitly as `random_state` to the scikit-learn splitters further down).
RANDOM_SEED = 42
# Seed NumPy's legacy global generator as well.
np.random.seed(seed=RANDOM_SEED)

In [2]:
# Set project paths.
# The project root can be overridden with the IATE_PROJECT_ROOT environment
# variable so the notebook runs on machines other than the original author's;
# when the variable is unset, the original location is used unchanged.
PROJECT_ROOT = Path(os.environ.get('IATE_PROJECT_ROOT',
                                   '/home/tony/research_project/iate_project'))
DATA_DIR = PROJECT_ROOT / 'data' / 'raw'              # input images, read by the collection step
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'   # per-split output folders
SPLITS_DIR = PROJECT_ROOT / 'data' / 'splits'         # split indices, file lists, statistics

In [3]:
# Make sure every output location exists before anything is written.
# `exist_ok=True` keeps the cell safe to re-run.
for split_name in ('train', 'val', 'test'):
    (PROCESSED_DIR / split_name).mkdir(parents=True, exist_ok=True)
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

# 1. COLLECTING IMAGE PATHS

In [4]:
# Walk the raw data tree, laid out as <class>/<process>/<roast>/<image>, and
# record one path, one label and one metadata dict per image. The three lists
# stay index-aligned: position i in each describes the same image.
all_image_paths = []
all_labels = []
all_metadata = []

for class_idx, class_name in enumerate(['normal', 'defect']):
    class_path = DATA_DIR / class_name
    if not class_path.exists():
        print(f"Warning: {class_path} does not exist")
        continue

    for process_method in ['dry', 'honey', 'wet']:
        process_path = class_path / process_method
        if not process_path.exists():
            continue

        for roast_level in ['dark', 'medium', 'light']:
            roast_path = process_path / roast_level
            if not roast_path.exists():
                continue

            # Get all images. Path.iterdir() yields entries in arbitrary,
            # filesystem-dependent order; sorting fixes each image's position
            # in the lists, so the seeded splits (which are stored as
            # positional indices) are reproducible across runs and machines.
            image_files = sorted(
                p for p in roast_path.iterdir()
                if p.is_file() and p.suffix.lower() in {'.jpg', '.jpeg', '.png'}
            )

            for img_file in image_files:
                all_image_paths.append(str(img_file))
                all_labels.append(class_idx)  # 0 for Normal, 1 for Defect
                all_metadata.append({
                    'path': str(img_file),
                    'class': class_name,
                    'class_idx': class_idx,
                    'process': process_method,
                    'roast': roast_level,
                    'filename': img_file.name
                })

print(f"Total images collected: {len(all_image_paths)}")
print(f"Normal images: {all_labels.count(0)}")
print(f"Defect images: {all_labels.count(1)}")

Total images collected: 5400
Normal images: 3600
Defect images: 1800


In [5]:
# Promote the collected Python lists to NumPy arrays so the split arrays
# derived from them support boolean masks and `.tolist()` further down.
all_image_paths, all_labels = (
    np.array(collected) for collected in (all_image_paths, all_labels)
)

# 2. CREATING STRATIFIED SPLITS

In [6]:
# Target share of the full dataset for each split.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Stage 1: carve out the test set. The positional indices travel through the
# split alongside paths and labels so every image can be traced back to its
# entry in `all_metadata`.
X_temp, X_test, y_temp, y_test, idx_temp, idx_test = train_test_split(
    all_image_paths,
    all_labels,
    np.arange(len(all_labels)),
    test_size=TEST_RATIO,
    stratify=all_labels,
    random_state=RANDOM_SEED,
)

# Stage 2: divide the remainder into train and validation. The validation
# fraction is rescaled because it is now taken from the non-test portion only.
val_size_adjusted = VAL_RATIO / (TRAIN_RATIO + VAL_RATIO)
X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
    X_temp,
    y_temp,
    idx_temp,
    test_size=val_size_adjusted,
    stratify=y_temp,
    random_state=RANDOM_SEED,
)

# Report size and class balance of each split.
for header, split_paths, split_labels in (
    ("Train set", X_train, y_train),
    ("\nValidation set", X_val, y_val),
    ("\nTest set", X_test, y_test),
):
    normal_count = np.sum(split_labels == 0)
    defect_count = np.sum(split_labels == 1)
    print(f"{header}: {len(split_paths)} images")
    print(f"  Normal: {normal_count} ({normal_count/len(split_labels)*100:.1f}%)")
    print(f"  Defect: {defect_count} ({defect_count/len(split_labels)*100:.1f}%)")


Train set: 3780 images
  Normal: 2520 (66.7%)
  Defect: 1260 (33.3%)

Validation set: 810 images
  Normal: 540 (66.7%)
  Defect: 270 (33.3%)

Test set: 810 images
  Normal: 540 (66.7%)
  Defect: 270 (33.3%)


# 3. SAVING SPLIT INDICES

In [7]:
# Assemble everything needed to reproduce the split: positional indices,
# image paths and labels per split, plus the full metadata list and the seed.
split_arrays = {
    'indices': (idx_train, idx_val, idx_test),
    'paths': (X_train, X_val, X_test),
    'labels': (y_train, y_val, y_test),
}
splits = {
    f'{split_name}_{kind}': values.tolist()   # NumPy arrays -> plain lists for JSON
    for kind, per_split in split_arrays.items()
    for split_name, values in zip(('train', 'val', 'test'), per_split)
}
splits['metadata'] = all_metadata
splits['random_seed'] = RANDOM_SEED

# Save as JSON
json_path = SPLITS_DIR / 'splits.json'
with json_path.open('w') as f:
    json.dump(splits, f, indent=2)
print(f"Saved splits to: {json_path}")

# Save as pickle for faster loading
pickle_path = SPLITS_DIR / 'splits.pkl'
with pickle_path.open('wb') as f:
    pickle.dump(splits, f)
print(f"Saved splits to: {pickle_path}")

Saved splits to: /home/tony/research_project/iate_project/data/splits/splits.json
Saved splits to: /home/tony/research_project/iate_project/data/splits/splits.pkl


# 4. CREATING CROSS-VALIDATION SPLITS

In [8]:
# Build 5 stratified folds over the training split only (validation and test
# images never enter cross-validation).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
cv_splits = []

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    cv_splits.append({
        'fold': fold_idx,  # 0-based in the file; the printout below is 1-based
        # Positions WITHIN the training split (i.e. into X_train / y_train /
        # 'train_paths'), not into the full dataset.
        'train_indices': train_idx.tolist(),
        'val_indices': val_idx.tolist(),
        # The same folds expressed as full-dataset indices, directly comparable
        # with 'train_indices' in splits.json and usable to index the metadata
        # list. Added alongside the original keys so existing readers of
        # cv_splits.json keep working.
        'train_global_indices': idx_train[train_idx].tolist(),
        'val_global_indices': idx_train[val_idx].tolist(),
        'train_size': len(train_idx),
        'val_size': len(val_idx)
    })
    print(f"Fold {fold_idx + 1}: Train={len(train_idx)}, Val={len(val_idx)}")

# Save CV splits
with open(SPLITS_DIR / 'cv_splits.json', 'w') as f:
    json.dump(cv_splits, f, indent=2)
print(f"\nSaved CV splits to: {SPLITS_DIR / 'cv_splits.json'}")

Fold 1: Train=3024, Val=756
Fold 2: Train=3024, Val=756
Fold 3: Train=3024, Val=756
Fold 4: Train=3024, Val=756
Fold 5: Train=3024, Val=756

Saved CV splits to: /home/tony/research_project/iate_project/data/splits/cv_splits.json


# 5. GENERATING DATASET STATISTICS

In [9]:
def _class_key(x):
    if isinstance(x, (int, np.integer)):
        return 'Normal' if x == 0 else 'Defect'
    s = str(x).lower()
    if s == 'normal': return 'Normal'
    if s == 'defect': return 'Defect'
    return s.title()

In [10]:
# Process / roast distribution analysis: for every processing method and every
# roast level, count Normal and Defect images per split.
def _zeroed_split_counts():
    """Return a fresh per-split Normal/Defect counter with all counts at zero."""
    return {split: {'Normal': 0, 'Defect': 0} for split in ('train', 'val', 'test')}

process_dist = defaultdict(_zeroed_split_counts)
roast_dist = defaultdict(_zeroed_split_counts)

# One entry per split: (positional indices, paths, labels).
split_views = {
    'train': (idx_train, X_train, y_train),
    'val': (idx_val, X_val, y_val),
    'test': (idx_test, X_test, y_test),
}

# The positional indices point back into `all_metadata`, which carries the
# process and roast of each image.
for split_name, (member_indices, _, _) in split_views.items():
    for idx in member_indices:
        meta = all_metadata[idx]
        class_label = _class_key(meta['class'])
        process_dist[meta['process']][split_name][class_label] += 1
        roast_dist[meta['roast']][split_name][class_label] += 1

# Create statistics summary
total_images = len(all_image_paths)
stats = {
    'total_images': total_images,
    'class_distribution': {
        'Normal': int(np.count_nonzero(all_labels == 0)),
        'Defect': int(np.count_nonzero(all_labels == 1)),
    },
    'split_sizes': {
        split_name: len(paths) for split_name, (_, paths, _) in split_views.items()
    },
    'split_ratios': {
        split_name: len(paths) / total_images
        for split_name, (_, paths, _) in split_views.items()
    },
    # Plain dicts so the defaultdicts serialise as ordinary JSON objects.
    'process_distribution': dict(process_dist),
    'roast_distribution': dict(roast_dist),
    'class_balance': {
        split_name: {
            'Normal': np.sum(labels == 0) / len(labels),
            'Defect': np.sum(labels == 1) / len(labels),
        }
        for split_name, (_, _, labels) in split_views.items()
    },
}

# Save statistics
stats_path = SPLITS_DIR / 'dataset_stats.json'
with stats_path.open('w') as f:
    json.dump(stats, f, indent=2)
print(f"Saved statistics to: {stats_path}")

Saved statistics to: /home/tony/research_project/iate_project/data/splits/dataset_stats.json


# 6. CREATING FILE LISTS

In [11]:
# One "<path>\t<label>\n" line per image, per split.
train_list = [f"{path}\t{label}\n" for path, label in zip(X_train, y_train)]
val_list = [f"{path}\t{label}\n" for path, label in zip(X_val, y_val)]
test_list = [f"{path}\t{label}\n" for path, label in zip(X_test, y_test)]

# Write each list to its own text file, in train / val / test order.
for list_filename, list_lines in (
    ('train_list.txt', train_list),
    ('val_list.txt', val_list),
    ('test_list.txt', test_list),
):
    list_path = SPLITS_DIR / list_filename
    with list_path.open('w') as f:
        f.write(''.join(list_lines))
    print(f"Created: {list_path}")

Created: /home/tony/research_project/iate_project/data/splits/train_list.txt
Created: /home/tony/research_project/iate_project/data/splits/val_list.txt
Created: /home/tony/research_project/iate_project/data/splits/test_list.txt


In [12]:
# Recap of the dataset and split sizes produced above.
summary_lines = [
    "\nSummary:",
    f"• Total images: {len(all_image_paths):,}",
    f"• Train set: {len(X_train):,} images",
    f"• Validation set: {len(X_val):,} images",
    f"• Test set: {len(X_test):,} images",
    "• Class balance maintained across all splits",
    "• 5-fold CV splits created for robust evaluation",
]
for summary_line in summary_lines:
    print(summary_line)


Summary:
• Total images: 5,400
• Train set: 3,780 images
• Validation set: 810 images
• Test set: 810 images
• Class balance maintained across all splits
• 5-fold CV splits created for robust evaluation


In [13]:
# List every artifact this notebook writes into SPLITS_DIR.
print("\nFiles created:")
for artifact_name in (
    'splits.json',
    'splits.pkl',
    'cv_splits.json',
    'dataset_stats.json',
    'train_list.txt',
    'val_list.txt',
    'test_list.txt',
):
    print(f"• {SPLITS_DIR / artifact_name}")


Files created:
• /home/tony/research_project/iate_project/data/splits/splits.json
• /home/tony/research_project/iate_project/data/splits/splits.pkl
• /home/tony/research_project/iate_project/data/splits/cv_splits.json
• /home/tony/research_project/iate_project/data/splits/dataset_stats.json
• /home/tony/research_project/iate_project/data/splits/train_list.txt
• /home/tony/research_project/iate_project/data/splits/val_list.txt
• /home/tony/research_project/iate_project/data/splits/test_list.txt
