In [11]:
!nvidia-smi
!pip -q install --upgrade pip
!pip -q install torch==2.3.1+cu118 torchvision==0.18.1+cu118 torchaudio==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118
!pip -q install "transformers==4.35.0" "accelerate==0.24.0" "datasets>=2.20" "evaluate" "scikit-learn" "peft>=0.11" "tensorboard" "pyarrow<18" "tiktoken"

Wed Sep  3 10:32:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:C1:00.0 Off |                  N/A |
| 30%   27C    P8             38W /  350W |       2MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  |   00

In [12]:
# Check current versions and model compatibility
import transformers
import accelerate
print(f"🔧 Transformers version: {transformers.__version__}")
print(f"🚀 Accelerate version: {accelerate.__version__}")
print(f"🎯 Using DeBERTa-v3-large - powerful model with slow tokenizer workaround")
print("✅ DeBERTa-v3-large with DebertaV2Tokenizer avoids tiktoken issues")


🔧 Transformers version: 4.56.0
🚀 Accelerate version: 1.10.1
🎯 Using DeBERTa-v3-large - powerful model with slow tokenizer workaround
✅ DeBERTa-v3-large with DebertaV2Tokenizer avoids tiktoken issues


In [13]:
# Check training script status (PROTECTED - READ-ONLY)
import os
import stat
import hashlib

if os.path.exists("train_samo.py"):
    # Check file permissions
    file_stat = os.stat("train_samo.py")
    is_readonly = not (file_stat.st_mode & stat.S_IWRITE)
    
    if is_readonly:
        print("🛡️  Script is PROTECTED (read-only) - cannot be modified!")
    else:
        print("⚠️  Script is writable - may be overwritten")
    
    # Check if main function exists
    with open("train_samo.py", "r") as f:
        content = f.read()
        if "def main():" in content and "trainer.train()" in content:
            print("✅ Script has complete main function with training logic!")
        elif "def main():" in content:
            print("✅ Script has main function")
        else:
            print("❌ Script missing main function!")
    
    # Show script info
    print("📄 Script: train_samo.py")
    print("📏 Size:", os.path.getsize("train_samo.py"), "bytes")
    print("🔐 SHA256:", hashlib.sha256(open("train_samo.py",'rb').read()).hexdigest())
    
    if is_readonly:
        print("🚀 Script is ready for training and protected from modification!")
else:
    print("❌ Script not found!")


🛡️  Script is PROTECTED (read-only) - cannot be modified!
✅ Script has complete main function with training logic!
📄 Script: train_samo.py
📏 Size: 23623 bytes
🔐 SHA256: 27f99c3e904d90f0b23ca891b8ad191b522987d8a58055dd88fa550a29a6c390
🚀 Script is ready for training and protected from modification!


In [14]:
# Verify the training script is complete and ready
import os
import hashlib

# Check if main function exists
with open("train_samo.py", "r") as f:
    content = f.read()
    if "def main():" in content and "if __name__ == \"__main__\":" in content:
        print("✅ Training script is complete with main function")
        print("🚀 Ready to start training!")
    else:
        print("❌ Training script is missing main function!")
        print("The script will not work without the main() function.")

# Show script info
if os.path.exists("train_samo.py"):
    print("📄 Training script: train_samo.py")
    print("📏 Size:", os.path.getsize("train_samo.py"), "bytes")
    print("🔐 SHA256:", hashlib.sha256(open("train_samo.py",'rb').read()).hexdigest())
    print("✅ Script is ready for training!")
else:
    print("❌ Training script not found!")


✅ Training script is complete with main function
🚀 Ready to start training!
📄 Training script: train_samo.py
📏 Size: 23623 bytes
🔐 SHA256: 27f99c3e904d90f0b23ca891b8ad191b522987d8a58055dd88fa550a29a6c390
✅ Script is ready for training!


In [15]:
%%bash
accelerate config default
CONFIG=/workspace/.hf_home/accelerate/default_config.yaml
python3 - <<'PY'
import json
from pathlib import Path
p = Path("/workspace/.hf_home/accelerate/default_config.yaml")
config = json.loads(p.read_text())
config['distributed_type'] = 'MULTI_GPU'
config['mixed_precision'] = 'fp16'
config['num_processes'] = 2
p.write_text(json.dumps(config, indent=2))
print("Accelerate config patched:\n", p.read_text())
PY

Configuration already exists at /workspace/.hf_home/accelerate/default_config.yaml, will not override. Run `accelerate config` manually or pass a different `save_location`.
Accelerate config patched:
 {
  "compute_environment": "LOCAL_MACHINE",
  "debug": false,
  "distributed_type": "MULTI_GPU",
  "downcast_bf16": false,
  "enable_cpu_affinity": false,
  "machine_rank": 0,
  "main_training_function": "main",
  "mixed_precision": "fp16",
  "num_machines": 1,
  "num_processes": 2,
  "rdzv_backend": "static",
  "same_network": false,
  "tpu_use_cluster": false,
  "tpu_use_sudo": false,
  "use_cpu": false
}


In [16]:
# Set up Hugging Face authentication (optional but recommended)
# You can get a token from: https://huggingface.co/settings/tokens
import os

# Option 1: Set environment variable (recommended for security)
os.environ["HF_TOKEN"] = "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"

# Option 2: Use huggingface_hub login (interactive)
# from huggingface_hub import login
# login()  # This will prompt you to enter your token

# Option 3: Check if token is already set
if os.getenv("HF_TOKEN"):
    print("✅ HF_TOKEN is set")
else:
    print("⚠️  HF_TOKEN not set - you may hit rate limits")
    print("To set it: os.environ['HF_TOKEN'] = 'your_token_here'")
    print("Or get a token from: https://huggingface.co/settings/tokens")


✅ HF_TOKEN is set


In [17]:
# Training configuration
OUT_DIR = "./samo_out"
MODEL_NAME = "microsoft/deberta-v3-large"  # The powerful model we want - let's fix the tiktoken issue!
!mkdir -p "$OUT_DIR"

In [18]:
# Test tokenizer loading with robust workaround for DeBERTa-v3-large tiktoken issue
print("🧪 Testing tokenizer loading with robust workaround...")
try:
    from transformers import DebertaV2Tokenizer
    import os
    
    # Set environment variable to force slow tokenizer (workaround for tiktoken issue)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    
    # Use DebertaV2Tokenizer directly (most reliable for DeBERTa-v3)
    tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
    print(f"✅ Tokenizer loaded successfully for {MODEL_NAME} using DebertaV2Tokenizer!")
    print(f"📏 Vocab size: {tokenizer.vocab_size}")
    print(f"🔧 Tokenizer type: {type(tokenizer).__name__}")
    
    # Test tokenization
    test_text = "I love this movie! It's amazing."
    tokens = tokenizer(test_text, return_tensors="pt")
    print(f"🔤 Test tokenization: '{test_text}' -> {tokens['input_ids'].shape}")
    print("🚀 Ready for training with DebertaV2Tokenizer workaround!")
    
except Exception as e:
    print(f"❌ DebertaV2Tokenizer failed: {e}")
    print("🔄 Let's try AutoTokenizer fallback...")
    try:
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
        print(f"✅ Fallback successful with AutoTokenizer (use_fast=False)")
        print(f"📏 Vocab size: {tokenizer.vocab_size}")
    except Exception as e2:
        print(f"❌ All tokenizer methods failed: {e2}")
        print("This indicates a deeper compatibility issue that needs investigation.")


🧪 Testing tokenizer loading with robust workaround...
❌ DebertaV2Tokenizer failed: 
DebertaV2Tokenizer requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.

🔄 Let's try AutoTokenizer fallback...
❌ All tokenizer methods failed: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderT

In [None]:
!accelerate launch --num_processes=2 --mixed_precision=fp16 \
train_samo.py \
--output_dir "$OUT_DIR" \
--model_name "$MODEL_NAME" \
--per_device_train_batch_size 8 --per_device_eval_batch_size 16 \
--gradient_accumulation_steps 4 \
--num_train_epochs 3 \
--learning_rate 1e-5 --lr_scheduler_type cosine --warmup_ratio 0.1 \
--weight_decay 0.01 --fp16 true --tf32 true --gradient_checkpointing true \
--ddp_backend nccl

✅ PEFT available for LoRA fine-tuning
🚀 SAMO - GoEmotions Multi-Label Trainer
📁 Output directory: ./samo_out
🤖 Model: microsoft/deberta-v3-large
📊 Loading GoEmotions dataset using alternative method...
🔑 Authenticating with Hugging Face...
✅ PEFT available for LoRA fine-tuning
🚀 SAMO - GoEmotions Multi-Label Trainer
📁 Output directory: ./samo_out
🤖 Model: microsoft/deberta-v3-large
📊 Loading GoEmotions dataset using alternative method...
🔑 Authenticating with Hugging Face...
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
✅ Successfully authenticated with Hugging Face
🤖 Using DeBERTa-v3-large: microsoft/deberta-v3-large
📊 Loading real GoEmotions dataset directly from Hugging Face hub...
📥 Downloading GoEmotions dataset files...
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
✅ Successfully authenticated with Hugging Face
🤖 Usi

In [None]:
import json, os

eval_report_path = os.path.join(OUT_DIR, "eval_report.json")
if os.path.exists(eval_report_path):
    with open(eval_report_path, "r") as f:
        rep = json.load(f)
    print("F1_micro:", rep["f1_micro"], " F1_macro:", rep["f1_macro"])
    # Show 5 worst & best classes by F1
    pc = rep["per_class"]
    sorted_items = sorted(pc.items(), key=lambda kv: kv[1]["f1"])
    print("\nWorst 5:")
    for k,v in sorted_items[:5]:
        print(k, v)
    print("\nBest 5:")
    for k,v in sorted_items[-5:]:
        print(k, v)
else:
    print(f"❌ Evaluation report not found at {eval_report_path}")
    print("This means training hasn't completed yet or failed.")
    print("Please run Cell 4 (training) first and wait for it to complete.")
    print(f"Output directory contents: {os.listdir(OUT_DIR) if os.path.exists(OUT_DIR) else 'Directory does not exist'}")