In [1]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("Quanli1/Minerals_type_images1")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Transform the dataset: extract first word from 'text' column, rename to 'label', and convert to lowercase
# Check current columns for all splits
print("Current features:", ds['train'].features.keys())
print("Available splits:", list(ds.keys()))

# Function to extract first word from text (split by whitespace) and convert to lowercase
def extract_first_word_and_lowercase(example):
    if 'text' in example:
        # Split by whitespace and take the first word, then convert to lowercase
        if example['text']:
            first_word = example['text'].split()[0].lower() if example['text'].split() else ''
        else:
            first_word = ''
        return {'text': first_word}
    return example

# Apply transformation to extract first word and convert to lowercase for all splits
if 'text' in ds['train'].column_names:
    # Map the function to transform the text column (applies to all splits automatically)
    ds = ds.map(extract_first_word_and_lowercase)
    print("Extracted first word from 'text' column and converted to lowercase in all splits")
    
    # Now rename 'text' to 'label' (applies to all splits automatically)
    ds = ds.rename_column('text', 'label')
    print("Renamed 'text' to 'label' in all splits")
else:
    print("'text' column not found. Available columns:", ds['train'].column_names)

# Verify transformation worked for all splits
print("\nVerifying labels in all splits:")
for split_name in ds.keys():
    if 'label' in ds[split_name].column_names:
        sample_labels = [ds[split_name][i]['label'] for i in range(min(5, len(ds[split_name])))]
        print(f"  {split_name}: sample labels = {sample_labels}")
    else:
        print(f"  {split_name}: 'label' column not found")

# Export the dataset locally
export_path = "./Minerals_type_images1-processed"
ds.save_to_disk(export_path)

print(f"\nDataset exported successfully to: {export_path}")
print(f"Final features: {ds['train'].features.keys()}")
print(f"\nDataset info:")
print(ds['train'].features)


Current features: dict_keys(['image', 'text'])
Available splits: ['train']
Extracted first word from 'text' column and converted to lowercase in all splits
Renamed 'text' to 'label' in all splits

Verifying labels in all splits:
  train: sample labels = ['abellaite', 'abellaite', 'abellaite', 'abellaite', 'abellaite']


Saving the dataset (16/16 shards): 100%|██████████| 43762/43762 [00:31<00:00, 1406.91 examples/s]


Dataset exported successfully to: ./Minerals_type_images1-processed
Final features: dict_keys(['image', 'label'])

Dataset info:
{'image': Image(mode=None, decode=True), 'label': Value('string')}





In [3]:
# Load the processed dataset and combine all splits into one training split
from datasets import load_from_disk, concatenate_datasets, DatasetDict

# Load the processed dataset
processed_ds = load_from_disk("./Minerals_type_images1-processed")

print("Original dataset splits:", list(processed_ds.keys()))
for split_name in processed_ds.keys():
    print(f"{split_name.capitalize()} size: {len(processed_ds[split_name])}")

# Combine all splits into a single training dataset
combined_train = concatenate_datasets([
    processed_ds[split_name] for split_name in processed_ds.keys()
])

print(f"\nCombined dataset size: {len(combined_train)}")

# Create a new DatasetDict with only the combined train split
ds_combined = DatasetDict({"train": combined_train})

# Export the combined dataset
export_path_combined = "./Minerals_type_images1-combined"
ds_combined.save_to_disk(export_path_combined)

print(f"\nCombined dataset exported successfully to: {export_path_combined}")
print(f"Available features: {ds_combined['train'].features.keys()}")

# Show sample labels from combined dataset
if 'label' in ds_combined['train'].column_names:
    print(f"\nSample labels from combined dataset (first 5):")
    for i in range(min(5, len(ds_combined['train']))):
        print(f"  Example {i}: '{ds_combined['train'][i]['label']}'")


Original dataset splits: ['train']
Train size: 43762

Combined dataset size: 43762


Saving the dataset (16/16 shards): 100%|██████████| 43762/43762 [00:28<00:00, 1535.89 examples/s]


Combined dataset exported successfully to: ./Minerals_type_images1-combined
Available features: dict_keys(['image', 'label'])

Sample labels from combined dataset (first 5):
  Example 0: 'abellaite'
  Example 1: 'abellaite'
  Example 2: 'abellaite'
  Example 3: 'abellaite'
  Example 4: 'abellaite'





In [4]:

# Display the classes
from datasets import ClassLabel

if 'train' in ds:
    # Get unique labels from the train split
    if 'label' in ds['train'].features:
        labels = ds['train']['label']
        unique_labels = sorted(set(labels))
        
        # Check if label is ClassLabel type (has names attribute) or Value type (string)
        label_feature = ds['train'].features['label']
        
        if isinstance(label_feature, ClassLabel) and hasattr(label_feature, 'names'):
            # For ClassLabel, use the names attribute
            class_names = [label_feature.names[i] for i in unique_labels]
        else:
            # For Value (string) type, use the label values directly
            class_names = [str(label) for label in unique_labels]
        
        # Create a DataFrame to display classes in column format
        classes_df = pd.DataFrame({
            'Class ID': unique_labels,
            'Class Name': class_names
        })
        print("Dataset Classes:")
        print(classes_df.to_string(index=False))
        print(f"\nTotal number of unique classes: {len(unique_labels)}")
    else:
        print("Available features:", ds['train'].features.keys())
        print("\nDataset info:")
        print(ds['train'].features)
else:
    print("Available splits:", ds.keys())
    print("\nDataset info:")
    print(ds)

Dataset Classes:
                             Class ID                            Class Name
                            abellaite                             abellaite
                           abelsonite                            abelsonite
                      abenakiite-(ce)                       abenakiite-(ce)
                         abernathyite                          abernathyite
                             abhurite                              abhurite
                           abramovite                            abramovite
                               abuite                                abuite
                            acanthite                             acanthite
                            acetamide                             acetamide
                            achalaite                             achalaite
                        achyrophanite                         achyrophanite
                           achávalite                            acháva