# Download InternVL3-2B Model

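This notebook downloads the `OpenGVLab/InternVL3-2B` model from the Hugging Face Hub into `/home/jovyan/nfs_share/models/InternVL3-2B` for vision processing tasks, then verifies the downloaded files and optionally test-loads the model.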


In [None]:
# Install required packages
!pip install huggingface_hub transformers torch torchvision

# Alternative: Use individual file download for guaranteed flat structure
USE_INDIVIDUAL_DOWNLOAD = False  # Set to True for maximum flat file control

In [None]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download, hf_hub_download, list_repo_files
import torch

# Report the PyTorch build and, when a GPU is visible, basic CUDA details.
cuda_ready = torch.cuda.is_available()

print(f"🔧 PyTorch version: {torch.__version__}")
print(f"🔧 CUDA available: {cuda_ready}")

if cuda_ready:
    # These queries are only meaningful when a CUDA device is present.
    print(f"🔧 CUDA device count: {torch.cuda.device_count()}")
    print(f"🔧 Current CUDA device: {torch.cuda.current_device()}")
    print(f"🔧 GPU name: {torch.cuda.get_device_name()}")

print("🔧 HuggingFace Hub version available for flat file download")

In [None]:
# Configuration: Hugging Face repository id and the local folder for its files
MODEL_NAME = "OpenGVLab/InternVL3-2B"
DOWNLOAD_PATH = "/home/jovyan/nfs_share/models/InternVL3-2B"

print(f"📥 Model: {MODEL_NAME}")
print(f"📁 Download path: {DOWNLOAD_PATH}")

# Only the parent folder is created here (no error if it already exists).
models_root = Path(DOWNLOAD_PATH).parent
models_root.mkdir(parents=True, exist_ok=True)
print(f"✅ Directory created/verified: {models_root}")

In [None]:
# Check available disk space
import shutil

def get_disk_usage(path):
    """Return whole-gigabyte disk statistics for the filesystem containing ``path``.

    The result is a dict with the keys ``total_gb``, ``used_gb`` and ``free_gb``.
    Each value is the corresponding byte count from ``shutil.disk_usage``
    floor-divided by 1024**3, so fractions of a gigabyte are dropped.
    """
    bytes_per_gb = 1024 ** 3
    stats = shutil.disk_usage(path)
    return {
        f"{field}_gb": getattr(stats, field) // bytes_per_gb
        for field in ("total", "used", "free")
    }

# Measure free space on the filesystem that will actually receive the model.
# The parent of DOWNLOAD_PATH is used because the configuration cell creates it,
# so it exists even before the first download. Deriving the path from
# DOWNLOAD_PATH keeps this check correct if the destination is changed.
disk_check_path = Path(DOWNLOAD_PATH).parent
usage = get_disk_usage(disk_check_path)
print(f"💾 Disk space on {disk_check_path}:")
print(f"   Total: {usage['total_gb']} GB")
print(f"   Used:  {usage['used_gb']} GB")
print(f"   Free:  {usage['free_gb']} GB")

# InternVL3-2B is approximately 4-6 GB
required_gb = 8  # Safety margin
if usage['free_gb'] < required_gb:
    # Warn only; the download is still attempted by the later cells.
    print(f"⚠️  Warning: Available space ({usage['free_gb']} GB) may be insufficient")
    print(f"   Recommended: At least {required_gb} GB free space")
else:
    print("✅ Sufficient disk space available")

In [None]:
# Check if model already exists and decide whether the download cell should run.
# Sets SKIP_DOWNLOAD, which the download cell reads.
model_dir = Path(DOWNLOAD_PATH)

if model_dir.exists() and any(model_dir.iterdir()):
    print(f"📁 Model directory already exists: {DOWNLOAD_PATH}")
    print(f"📋 Contents:")
    for item in model_dir.iterdir():
        if item.is_file():
            size_mb = item.stat().st_size / (1024**2)
            print(f"   📄 {item.name} ({size_mb:.1f} MB)")
        else:
            print(f"   📁 {item.name}/")

    try:
        response = input("\n🤔 Model appears to already exist. Download anyway? (y/N): ")
    except (EOFError, NotImplementedError):
        # No interactive stdin (e.g. headless "Run All" via nbconvert/papermill,
        # where the kernel raises a NotImplementedError subclass for input()).
        # Fall back to the prompt's default answer instead of aborting the notebook.
        print("ℹ️  No interactive input available; using default answer 'N'")
        response = ""

    # Tolerate surrounding whitespace so " y " is not silently treated as "no".
    if response.strip().lower() not in ['y', 'yes']:
        print("⏹️  Download cancelled")
        SKIP_DOWNLOAD = True
    else:
        SKIP_DOWNLOAD = False
        print("🔄 Proceeding with download (will overwrite existing files)")
else:
    SKIP_DOWNLOAD = False
    print(f"📁 Model directory is empty or doesn't exist")
    print(f"🚀 Ready to download")

In [None]:
# Download the model (FLAT FILE download)
import inspect


def _supported_legacy_kwargs(download_func):
    """Return the legacy flat-file flags that ``download_func`` still accepts.

    ``local_dir_use_symlinks`` and ``resume_download`` are deprecated in newer
    huggingface_hub releases (downloads into ``local_dir`` no longer use symlinks
    and always resume). Older releases still need them to get real file copies,
    so they are passed only when the installed version accepts them.
    """
    legacy = {"local_dir_use_symlinks": False, "resume_download": True}
    params = inspect.signature(download_func).parameters
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
        # Signature is opaque (**kwargs); keep the original call's arguments.
        return legacy
    return {name: value for name, value in legacy.items() if name in params}


if not SKIP_DOWNLOAD:
    print(f"📥 Starting FLAT FILE download of {MODEL_NAME}...")
    print(f"📍 Destination: {DOWNLOAD_PATH}")
    print(f"⏱️  This may take 10-30 minutes depending on your connection...")
    print(f"🔧 Using flat file download (no HuggingFace cache or storage mapping)")

    try:
        if USE_INDIVIDUAL_DOWNLOAD:
            # Method 1: Individual file download (one hf_hub_download call per file)
            print(f"🔍 Listing repository files...")
            repo_files = list_repo_files(MODEL_NAME)
            # Skip git bookkeeping files such as .gitattributes
            model_files = [f for f in repo_files if not f.startswith('.git')]

            print(f"📋 Found {len(model_files)} files to download")

            Path(DOWNLOAD_PATH).mkdir(parents=True, exist_ok=True)

            legacy_kwargs = _supported_legacy_kwargs(hf_hub_download)
            for i, filename in enumerate(model_files, 1):
                print(f"📥 Downloading {i}/{len(model_files)}: {filename}")

                # `local_dir` places the file under DOWNLOAD_PATH; the returned
                # path is not needed here.
                hf_hub_download(
                    repo_id=MODEL_NAME,
                    filename=filename,
                    local_dir=DOWNLOAD_PATH,
                    **legacy_kwargs,
                )

            downloaded_path = DOWNLOAD_PATH
            print(f"✅ Individual file download completed!")

        else:
            # Method 2: Snapshot download of the whole repository.
            # Arguments the original call passed at their default values
            # (cache_dir, force_download, proxies, etag_timeout, token) are
            # omitted; behavior is unchanged and the call tolerates library
            # versions that no longer accept some of them.
            downloaded_path = snapshot_download(
                repo_id=MODEL_NAME,
                local_dir=DOWNLOAD_PATH,
                ignore_patterns=["*.git*"],  # Only ignore git files
                **_supported_legacy_kwargs(snapshot_download),
            )
            print(f"✅ Snapshot download completed!")

        print(f"📁 Model files saved directly to: {downloaded_path}")
        print(f"🔧 All files are actual copies (no symlinks or cache mapping)")

        # Verify flat structure: count regular files only, so sub-directories
        # in the target folder do not inflate the reported number.
        files = sorted(p for p in Path(DOWNLOAD_PATH).glob("*") if p.is_file())
        print(f"📋 Downloaded {len(files)} files directly to target directory")

        # Show actual file paths to confirm flat structure
        print(f"🔍 Sample files in target directory:")
        for file_path in files[:5]:
            size_mb = file_path.stat().st_size / (1024**2)
            print(f"   📄 {file_path.name} ({size_mb:.1f} MB)")

    except Exception as e:
        # Print troubleshooting hints, then re-raise so the failure stays visible.
        print(f"❌ Download failed: {e}")
        print(f"💡 Possible solutions:")
        print(f"   - Check internet connection")
        print(f"   - Verify disk space")
        print(f"   - Clear HuggingFace cache: rm -rf ~/.cache/huggingface")
        print(f"   - Set USE_INDIVIDUAL_DOWNLOAD=True for alternative method")
        print(f"   - Try running again (download will resume)")
        raise
else:
    print(f"⏩ Skipping download")

In [None]:
# Verify the download: list key files, report total size, check essential files.
print(f"🔍 Verifying download...")

if Path(DOWNLOAD_PATH).exists():
    # Collect regular files only (recursively); directories are not counted.
    files = sorted(p for p in Path(DOWNLOAD_PATH).rglob('*') if p.is_file())
    total_size = 0

    print(f"📋 Model directory contents ({len(files)} files):")

    key_files = []
    for file_path in files:
        size_mb = file_path.stat().st_size / (1024**2)
        total_size += size_mb
        rel_path = file_path.relative_to(DOWNLOAD_PATH)

        # Highlight important files
        if any(important in file_path.name.lower() for important in ['config', 'model', 'tokenizer', '.bin', '.safetensors']):
            key_files.append(f"   📄 {rel_path} ({size_mb:.1f} MB)")

    # Show key files
    print(f"\n🔑 Key model files:")
    for key_file in key_files[:10]:  # Show first 10 key files
        print(key_file)

    if len(key_files) > 10:
        print(f"   ... and {len(key_files) - 10} more files")

    print(f"\n📊 Total model size: {total_size:.1f} MB ({total_size/1024:.2f} GB)")

    # Check for essential files by exact name at the top of the model directory.
    # A substring test would let 'tokenizer_config.json' satisfy the
    # 'config.json' check and hide a genuinely missing file.
    essential_files = ['config.json', 'tokenizer_config.json']
    missing_files = [
        essential
        for essential in essential_files
        if not (Path(DOWNLOAD_PATH) / essential).is_file()
    ]

    if missing_files:
        print(f"⚠️  Warning: Some essential files may be missing: {missing_files}")
    else:
        print(f"✅ All essential files appear to be present")

else:
    print(f"❌ Model directory not found: {DOWNLOAD_PATH}")

In [None]:
# Test loading the model (optional)
TEST_LOAD = True  # Set to False to skip model loading test

if not (TEST_LOAD and Path(DOWNLOAD_PATH).exists()):
    print(f"⏩ Skipping model loading test")
else:
    print(f"🧪 Testing model loading...")

    try:
        # Imported here so a missing transformers install is reported as a
        # failed load test rather than an unhandled error.
        from transformers import AutoTokenizer, AutoModel

        print(f"📥 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(DOWNLOAD_PATH, trust_remote_code=True)
        print(f"✅ Tokenizer loaded successfully")

        print(f"📥 Loading model (this may take a few minutes)...")
        load_kwargs = dict(
            torch_dtype=torch.float16,  # Half precision to save memory
            trust_remote_code=True,
            device_map="auto" if torch.cuda.is_available() else "cpu",
        )
        model = AutoModel.from_pretrained(DOWNLOAD_PATH, **load_kwargs)
        print(f"✅ Model loaded successfully")

        # Print whichever of the common configuration fields the model exposes
        if hasattr(model, 'config'):
            print(f"📋 Model configuration:")
            config_fields = (
                ("Hidden size", "hidden_size"),
                ("Attention heads", "num_attention_heads"),
                ("Hidden layers", "num_hidden_layers"),
            )
            for label, attr in config_fields:
                if hasattr(model.config, attr):
                    print(f"   {label}: {getattr(model.config, attr)}")

        # Count parameters across all model tensors
        total_params = sum(p.numel() for p in model.parameters())
        print(f"📊 Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")

        # Drop the references and release cached GPU memory
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"✅ Model test completed successfully")
        print(f"🧹 Memory cleaned up")

    except Exception as e:
        # A failed load is reported but deliberately not re-raised.
        print(f"❌ Model loading test failed: {e}")
        print(f"💡 This doesn't necessarily mean the download failed")
        print(f"   The model files may still be valid for use in your application")

In [None]:
# Final summary
print(f"\n" + "="*60)
print(f"📋 DOWNLOAD SUMMARY")
print(f"="*60)
print(f"🤖 Model: {MODEL_NAME}")
print(f"📁 Location: {DOWNLOAD_PATH}")

# Only regular files count as evidence of a download: an existing but empty
# directory must not be reported as a success.
model_dir_exists = Path(DOWNLOAD_PATH).exists()
files = [f for f in Path(DOWNLOAD_PATH).rglob('*') if f.is_file()] if model_dir_exists else []
model_ready = bool(files)

if model_ready:
    total_size = sum(f.stat().st_size for f in files) / (1024**3)
    print(f"📊 Size: {total_size:.2f} GB")
    print(f"📄 Files: {len(files)}")
    print(f"✅ Status: Download completed successfully")
elif model_dir_exists:
    print(f"❌ Status: Model directory exists but contains no files")
else:
    print(f"❌ Status: Model not found")

if model_ready:
    print(f"\n💡 Next steps:")
    print(f"   1. Use the model in your vision processing applications")
    print(f"   2. Model path: {DOWNLOAD_PATH}")
    print(f"   3. Remember to use trust_remote_code=True when loading")
    print(f"\n🎉 InternVL3-2B model is ready for use!")
else:
    # Do not announce readiness when nothing was found on disk.
    print(f"\n💡 Next steps:")
    print(f"   1. Re-run the download cell and check its output for errors")