In [1]:
from datasets import load_dataset
import pandas as pd
from collections import Counter

dataset = load_dataset("tedqc/mineral-dataset")

# Tally how many training rows carry each label value.
train_split = dataset['train']
labels = train_split['label']
label_counts = Counter(labels)

# If the label feature exposes a non-empty `names` list (as a ClassLabel
# feature does), report readable class names; otherwise fall back to the raw
# label value rendered as a string.
label_feature = train_split.features.get('label')
label_names = getattr(label_feature, 'names', None)
if label_names:
    label_counts_with_names = {
        label_names[int(label)]: count for label, count in label_counts.items()
    }
else:
    label_counts_with_names = {
        str(label): count for label, count in label_counts.items()
    }

# Rank labels from most to least frequent. `most_common()` is a stable
# descending sort on the count, so ties keep first-seen order.
# Passing `columns=` explicitly keeps the 'Label'/'Count' columns present
# even when the split is empty, so the lookups below cannot raise KeyError.
ranked_counts = Counter(label_counts_with_names).most_common()
counts_df = pd.DataFrame(ranked_counts, columns=['Label', 'Count'])

print(f"Total training rows: {len(train_split)}")
print(f"Number of unique labels: {len(label_counts)}")
print("\nTraining rows per label:")
print(counts_df.to_string(index=False))

# Summary statistics over the per-label row counts.
print("\nSummary Statistics:")
if counts_df.empty:
    # Nothing to aggregate; avoid printing NaN-valued statistics.
    print("  (no training rows)")
else:
    per_label = counts_df['Count']
    print(f"  Min rows per label: {per_label.min()}")
    print(f"  Max rows per label: {per_label.max()}")
    print(f"  Mean rows per label: {per_label.mean():.2f}")
    print(f"  Median rows per label: {per_label.median():.2f}")

  from .autonotebook import tqdm as notebook_tqdm


Total training rows: 62088
Number of unique labels: 5316

Training rows per label:
                                Label  Count
                                   11   2023
                                   28    867
                                    7    708
                                    8    679
                                    6    670
                                   37    581
                                   26    543
                                   30    512
                                    2    442
                                    1    403
                                   53    373
                                    9    354
                                   15    349
                                   48    349
                                   50    331
                                   76    331
                                   16    300
                                   62    227
                                   20    227
                 