In [None]:
# Combine two datasets: mineralimage5K-98-combined and Minerals_type_images1-combined
from collections import Counter

import pandas as pd
from datasets import (
    ClassLabel,
    DatasetDict,
    Features,
    Image,
    Value,
    concatenate_datasets,
    load_from_disk,
)

# Read both source datasets from their on-disk locations
print("Loading datasets...")
ds1 = load_from_disk("./mineralimage5K-98/mineralimage5K-98-combined")
ds2 = load_from_disk("./Minerals_type_images1/Minerals_type_images1-combined")

# Print the same overview for each dataset: available splits, size of the
# 'train' split, its column names, and the class of its 'label' feature.
loaded_sources = (
    (1, "mineralimage5K-98-combined", ds1),
    (2, "Minerals_type_images1-combined", ds2),
)
for position, source_name, dataset_dict in loaded_sources:
    train_split = dataset_dict['train']
    print(f"\nDataset {position} ({source_name}):")
    print(f"  Splits: {list(dataset_dict.keys())}")
    print(f"  Train size: {len(train_split)}")
    print(f"  Features: {train_split.features.keys()}")
    print(f"  Label type: {type(train_split.features['label'])}")


  from .autonotebook import tqdm as notebook_tqdm


Loading datasets...

Dataset 1 (mineralimage5K-98-combined):
  Splits: ['train']
  Train size: 18326
  Features: dict_keys(['image', 'label'])
  Label type: <class 'datasets.features.features.Value'>

Dataset 2 (Minerals_type_images1-combined):
  Splits: ['train']
  Train size: 43762
  Features: dict_keys(['image', 'label'])
  Label type: <class 'datasets.features.features.Value'>


In [2]:
# Normalize labels to strings for both datasets and then combine
train1 = ds1['train']
train2 = ds2['train']


def normalize_label_feature(dataset, display_name):
    """Return ``dataset`` with its 'label' column stored as ``Value('string')``.

    Args:
        dataset: A ``datasets.Dataset`` that has a 'label' column.
        display_name: Prefix used in progress messages, e.g. "Dataset 1".

    Returns:
        The input dataset unchanged when its labels are already strings,
        otherwise a new dataset whose 'label' column has been converted.

    Raises:
        KeyError: If the dataset has no 'label' column.
    """
    label_feature = dataset.features.get('label')
    if label_feature is None:
        raise KeyError(f"{display_name} has no 'label' column")

    if isinstance(label_feature, ClassLabel):
        print(f"{display_name}: Converting ClassLabel to string")
        class_names = label_feature.names

        # State the output schema explicitly so the result column is a plain
        # string column rather than depending on how the existing ClassLabel
        # schema treats string values written back into it.
        string_features = dataset.features.copy()
        string_features['label'] = Value('string')

        def indices_to_names(labels):
            # Non-integer entries (e.g. missing labels) are passed through.
            return {
                'label': [
                    class_names[idx] if isinstance(idx, int) else idx
                    for idx in labels
                ]
            }

        # Only the 'label' column is handed to the function, so this pass
        # does not need to touch the 'image' column.
        dataset = dataset.map(
            indices_to_names,
            batched=True,
            input_columns=['label'],
            features=string_features,
        )
        print("  ✓ Converted to string")
    elif getattr(label_feature, 'dtype', None) == 'string':
        print(f"{display_name}: Label already string type")
    else:
        # A non-ClassLabel feature is not necessarily a string (it could be an
        # integer Value, for instance), so cast instead of assuming.
        source_dtype = getattr(label_feature, 'dtype', type(label_feature).__name__)
        print(f"{display_name}: Casting {source_dtype} labels to string")
        dataset = dataset.cast_column('label', Value('string'))
        print("  ✓ Converted to string")

    return dataset


def _lowercase_label(label):
    """Lowercase a single label; falsy labels (None, '') are returned as-is."""
    return str(label).lower() if label else label


def ensure_lowercase_string(example):
    """Lowercase the 'label' field of one example dict in place and return it.

    Kept with its original per-example signature for any later cell that
    calls it; the maps below use the batched variant instead.
    """
    if 'label' in example and example['label']:
        example['label'] = _lowercase_label(example['label'])
    return example


def lowercase_label_batch(labels):
    """Batched counterpart of ``ensure_lowercase_string`` for a list of labels."""
    return {'label': [_lowercase_label(label) for label in labels]}


print("Checking and normalizing label types...")
train1 = normalize_label_feature(train1, "Dataset 1")
train2 = normalize_label_feature(train2, "Dataset 2")

# Verify both have matching features
print(f"\nAfter normalization:")
print(f"  Dataset 1 features: {train1.features.keys()}")
print(f"  Dataset 2 features: {train2.features.keys()}")
print(f"  Dataset 1 label type: {type(train1.features['label'])}")
print(f"  Dataset 2 label type: {type(train2.features['label'])}")

# Ensure both have lowercase string labels
print("\nEnsuring all labels are lowercase strings...")
# Batched, label-only maps: the function never receives the image column.
train1 = train1.map(lowercase_label_batch, batched=True, input_columns=['label'])
train2 = train2.map(lowercase_label_batch, batched=True, input_columns=['label'])
print("  ✓ All labels normalized to lowercase strings")


Checking and normalizing label types...
Dataset 1: Label already string type
Dataset 2: Label already string type

After normalization:
  Dataset 1 features: dict_keys(['image', 'label'])
  Dataset 2 features: dict_keys(['image', 'label'])
  Dataset 1 label type: <class 'datasets.features.features.Value'>
  Dataset 2 label type: <class 'datasets.features.features.Value'>

Ensuring all labels are lowercase strings...


Map: 100%|██████████| 18326/18326 [00:12<00:00, 1481.03 examples/s]
Map: 100%|██████████| 43762/43762 [00:38<00:00, 1123.49 examples/s]

  ✓ All labels normalized to lowercase strings





In [3]:
# Combine both datasets into one, export the result, and print a summary
print("Combining datasets...")
print(f"  Dataset 1 size: {len(train1)}")
print(f"  Dataset 2 size: {len(train2)}")

# Cast both datasets to one shared schema so that concatenation sees
# identical feature types on both sides.
common_features = Features({
    'image': Image(),
    'label': Value('string')
})

train1 = train1.cast(common_features)
train2 = train2.cast(common_features)

# Concatenate the datasets
combined_train = concatenate_datasets([train1, train2])

# Guard against silently exporting a dataset with lost or duplicated rows.
assert len(combined_train) == len(train1) + len(train2), (
    "combined size does not equal the sum of the two source datasets"
)

print(f"\nCombined dataset size: {len(combined_train)}")
print(f"Combined features: {combined_train.features.keys()}")

# Create DatasetDict with combined train split
ds_combined = DatasetDict({"train": combined_train})

# Export the combined dataset
export_path = "./combined-datasets"
ds_combined.save_to_disk(export_path)

print(f"\n✓ Combined dataset exported successfully to: {export_path}")

# Show statistics
print("\n" + "="*50)
print("DATASET COMBINATION SUMMARY")
print("="*50)
print(f"Total examples: {len(combined_train)}")
print(f"  - From mineralimage5K-98: {len(train1)}")
print(f"  - From Minerals_type_images1: {len(train2)}")
print(f"\nFeatures: {list(combined_train.features.keys())}")
print(f"Label type: {type(combined_train.features['label'])}")

# Read the label column once and tally it in a single pass; the tallies are
# then reused for both the unique-label list and the per-label counts instead
# of re-reading and re-scanning the whole column for every label printed.
label_counts = Counter(combined_train['label'])
unique_labels = sorted(label_counts)
print(f"\nTotal unique labels: {len(unique_labels)}")
print(f"\nSample labels (first 10):")
for rank, label in enumerate(unique_labels[:10], start=1):
    print(f"  {rank}. '{label}': {label_counts[label]} examples")
if len(unique_labels) > 10:
    print(f"  ... and {len(unique_labels) - 10} more")


Combining datasets...
  Dataset 1 size: 18326
  Dataset 2 size: 43762


Casting the dataset: 100%|██████████| 18326/18326 [00:14<00:00, 1243.42 examples/s]
Casting the dataset: 100%|██████████| 43762/43762 [00:19<00:00, 2269.23 examples/s]



Combined dataset size: 62088
Combined features: dict_keys(['image', 'label'])


Saving the dataset (23/23 shards): 100%|██████████| 62088/62088 [01:07<00:00, 915.57 examples/s] 



✓ Combined dataset exported successfully to: ./combined-datasets

DATASET COMBINATION SUMMARY
Total examples: 62088
  - From mineralimage5K-98: 18326
  - From Minerals_type_images1: 43762

Features: ['image', 'label']
Label type: <class 'datasets.features.features.Value'>

Total unique labels: 5246

Sample labels (first 10):
  1. 'abellaite': 12 examples
  2. 'abelsonite': 1 examples
  3. 'abenakiite-(ce)': 2 examples
  4. 'abernathyite': 20 examples
  5. 'abhurite': 16 examples
  6. 'abramovite': 5 examples
  7. 'abuite': 1 examples
  8. 'acanthite': 20 examples
  9. 'acetamide': 1 examples
  10. 'achalaite': 1 examples
  ... and 5236 more
