In [5]:
# Load environment variables from a .env file and read the HuggingFace token.
#
# Names defined by this cell:
#   load_dotenv  - the python-dotenv loader, or None when the package is missing
#   current_dir  - the current working directory
#   search_paths - candidate .env locations that were checked, nearest first
#   env_path     - the .env file that was found, or None
#   HF_TOKEN     - the HuggingFace token read from the environment, or None/empty
import os
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    print("⚠ python-dotenv not installed. Install it with: pip install python-dotenv")
    print("  Will use system environment variables only.")
    load_dotenv = None

# Look for .env in the current directory and then in up to three ancestor
# directories. Building the list from `parents` means a shallow working
# directory does not produce the same candidate several times.
current_dir = Path.cwd()
search_paths = [
    folder / '.env'
    for folder in (current_dir, *list(current_dir.parents)[:3])
]

# Only a regular file is accepted; a directory named ".env" is not loadable.
env_path = next(
    (candidate for candidate in search_paths if candidate.is_file()),
    None,
)

if env_path:
    print(f"Found .env file at: {env_path}")
    if load_dotenv is None:
        print("⚠ python-dotenv not available, cannot load .env file")
    elif load_dotenv(env_path):
        print("✓ Loaded environment variables from .env file")
    else:
        # load_dotenv reports False when it did not set any variable.
        print("⚠ No variables were loaded from the .env file (is it empty?)")
else:
    print("⚠ .env file not found in search paths")
    print("  Searched in:")
    for path in search_paths:
        print(f"    - {path}")
    print("  Will use system environment variables only")

# Get HuggingFace token from environment; the first non-empty variable wins.
HF_TOKEN = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN') or os.getenv('HUGGINGFACE_TOKEN')

if HF_TOKEN:
    # Report only the length so the secret itself never reaches the output.
    print("\n✓ HuggingFace token found in environment")
    print(f"  Token length: {len(HF_TOKEN)} characters")
else:
    print("\n⚠ HuggingFace token not found in environment variables")
    print("  Please set one of the following in your .env file:")
    print("    - HF_TOKEN")
    print("    - HUGGING_FACE_HUB_TOKEN")
    print("    - HUGGINGFACE_TOKEN")
    print("\n  Example .env file content:")
    print("    HF_TOKEN=your_token_here")


Found .env file at: c:\Clones\rocks-and-minerals-identifier\.env
✓ Loaded environment variables from .env file

✓ HuggingFace token found in environment
  Token length: 37 characters


In [6]:
# Load the combined dataset from disk.
#
# The dataset is expected in the notebook's working directory in one of two
# layouts:
#   1. a saved DatasetDict - marked by <cwd>/dataset_dict.json
#   2. a single saved split - stored in <cwd>/train
# The result is always exposed as `ds`, a DatasetDict with a "train" split.
# Raises FileNotFoundError when neither layout is present.
from pathlib import Path  # imported here so the cell does not depend on an earlier one

from datasets import DatasetDict, load_from_disk

current_dir = Path.cwd()
train_dir = current_dir / "train"

print(f"Current directory: {current_dir}")

if (current_dir / "dataset_dict.json").exists():
    # Layout 1: the working directory itself is the saved DatasetDict.
    dataset_path = current_dir
    print(f"Loading dataset from: {dataset_path}")
    ds = load_from_disk(str(dataset_path))
elif train_dir.is_dir():
    # Layout 2: only the train split was saved; wrap it so downstream cells
    # can always index ds['train'].
    dataset_path = train_dir
    print(f"Loading dataset from: {dataset_path}")
    train_ds = load_from_disk(str(dataset_path))
    ds = DatasetDict({"train": train_ds})
else:
    # Fail here with an actionable message instead of letting load_from_disk
    # fail on a path that is known not to exist.
    raise FileNotFoundError(f"Dataset not found in {current_dir}. Make sure you're running this notebook from the combined-datasets folder.")

print("\n✓ Dataset loaded successfully")
print(f"  Splits: {list(ds.keys())}")
print(f"  Train size: {len(ds['train'])}")
print(f"  Features: {list(ds['train'].features.keys())}")
print(f"  Label type: {type(ds['train'].features['label'])}")

# Show some statistics
unique_labels = sorted(set(ds['train']['label']))
print(f"\n  Total unique labels: {len(unique_labels)}")
print(f"  Sample labels (first 10): {unique_labels[:10]}")


Current directory: c:\Clones\rocks-and-minerals-identifier\data\combined-datasets
Loading dataset from: c:\Clones\rocks-and-minerals-identifier\data\combined-datasets\train

✓ Dataset loaded successfully
  Splits: ['train']
  Train size: 62088
  Features: ['image', 'label']
  Label type: <class 'datasets.features.features.Value'>

  Total unique labels: 5246
  Sample labels (first 10): ['abellaite', 'abelsonite', 'abenakiite-(ce)', 'abernathyite', 'abhurite', 'abramovite', 'abuite', 'acanthite', 'acetamide', 'achalaite']


In [7]:
# HuggingFace Hub upload settings.
#
# dataset_repo_id - target repository, written as "<username>/<dataset-name>"
#                   (for example "myusername/combined-minerals-dataset")
# private         - True keeps the dataset private, False makes it public
# max_shard_size  - upper bound for each uploaded shard; adjust if needed
dataset_repo_id = "tedqc/mineral-dataset"
private = True
max_shard_size = "200MB"

# Echo the chosen settings.
print(
    f"Repository ID: {dataset_repo_id}",
    f"Private: {private}",
    f"Max shard size: {max_shard_size}",
    sep="\n",
)

# Summarise the dataset that is about to be pushed.
print("\nDataset info:")
print(f"  Size: {len(ds['train'])} examples")
print(f"  Unique labels: {len(unique_labels)}")
print(f"  Features: {list(ds['train'].features)}")

# The template placeholder must have been replaced before uploading.
if dataset_repo_id != "YOUR_USERNAME/YOUR_DATASET_NAME":
    print(f"\n✓ Ready to upload to: {dataset_repo_id}")
else:
    print("\n⚠ Please update 'dataset_repo_id' above with your HuggingFace username and dataset name")


Repository ID: tedqc/mineral-dataset
Private: True
Max shard size: 200MB

Dataset info:
  Size: 62088 examples
  Unique labels: 5246
  Features: ['image', 'label']

✓ Ready to upload to: tedqc/mineral-dataset


In [8]:
# Push the loaded dataset to the HuggingFace Hub.
#
# The upload only starts when a token is available and the repository ID is
# no longer the template placeholder. Any failure during the upload (or while
# printing the success summary) is reported with troubleshooting hints and a
# traceback instead of being re-raised.
if HF_TOKEN and dataset_repo_id != "YOUR_USERNAME/YOUR_DATASET_NAME":
    print(
        "Uploading dataset to HuggingFace Hub...",
        "This may take a while depending on dataset size...\n",
        f"Repository: {dataset_repo_id}",
        f"Private: {private}",
        f"Max shard size: {max_shard_size}\n",
        sep="\n",
    )

    try:
        # Push to HuggingFace Hub
        ds.push_to_hub(
            repo_id=dataset_repo_id,
            token=HF_TOKEN,
            private=private,
            max_shard_size=max_shard_size,
            num_proc=10,
        )

        # Success banner followed by a short usage hint.
        print(
            "\n" + "=" * 50,
            "✓ DATASET UPLOADED SUCCESSFULLY!",
            "=" * 50,
            sep="\n",
        )
        print(f"\nDataset URL: https://huggingface.co/datasets/{dataset_repo_id}")
        print(f"Dataset size: {len(ds['train'])} examples")
        print(f"Unique labels: {len(unique_labels)}")
        print("\nYou can now use this dataset with:")
        print("  from datasets import load_dataset")
        print(f"  ds = load_dataset('{dataset_repo_id}')")

    except Exception as upload_error:
        print(f"\n✗ Error uploading dataset: {upload_error}")
        print(f"Error type: {type(upload_error).__name__}")
        print(
            "\nTroubleshooting tips:",
            "1. Check that your HuggingFace token is valid",
            "2. Ensure you have write access to the repository",
            "3. Check your internet connection",
            "4. Verify the repository ID is correct (format: username/dataset-name)",
            "5. Make sure the repository doesn't already exist (or you have permission to overwrite)",
            sep="\n",
        )
        import traceback
        traceback.print_exc()
elif not HF_TOKEN:
    # No credentials: nothing can be uploaded.
    print("⚠ Error: HuggingFace token not found!")
    print("Please set HF_TOKEN, HUGGING_FACE_HUB_TOKEN, or HUGGINGFACE_TOKEN in your .env file")
else:
    # Token present, but the repository ID is still the placeholder.
    print("⚠ Please update 'dataset_repo_id' in the previous cell with your HuggingFace username and dataset name")


Uploading dataset to HuggingFace Hub...
This may take a while depending on dataset size...

Repository: tedqc/mineral-dataset
Private: True
Max shard size: 200MB



Uploading the dataset shards (num_proc=10): 100%|██████████| 58/58 [04:20<00:00,  4.49s/ shards]  
No files have been modified since last commit. Skipping to prevent empty commit.



✓ DATASET UPLOADED SUCCESSFULLY!

Dataset URL: https://huggingface.co/datasets/tedqc/mineral-dataset
Dataset size: 62088 examples
Unique labels: 5246

You can now use this dataset with:
  from datasets import load_dataset
  ds = load_dataset('tedqc/mineral-dataset')
