# Classifier

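Lightweight inference notebook using pre-merged weights: it downloads the classifier checkpoint, loads it onto the GPU, and predicts which LLM wrote a given text (from the command line of a cell or through a Gradio demo).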

In [1]:
# Install dependencies
# transformers, peft and llm2vec are pinned to exact versions; the version-check
# cell that follows asserts transformers==4.44.2 for llm2vec compatibility.
%pip install --upgrade pip
%pip install -q transformers==4.44.2 peft==0.13.2 huggingface-hub accelerate safetensors
# NOTE(review): huggingface-hub, accelerate, safetensors, gradio, plotly and
# hf_transfer are not pinned, so a fresh environment may resolve newer releases.
%pip install -q llm2vec==0.2.3 gradio plotly hf_transfer

# The kernel has to be restarted so the freshly installed versions are the ones imported.
print("‚úì Dependencies installed")
print("‚ö†Ô∏è  IMPORTANT: Restart the kernel before continuing!")

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
‚úì Dependencies installed
‚ö†Ô∏è  IMPORTANT: Restart the kernel before continuing!


In [2]:
# Fail fast if the imported transformers build is not the pinned 4.44.2
# (for example when the kernel was not restarted after the install cell).
import transformers
print(f"Transformers version: {transformers.__version__}")
# The assertion message records why the pin exists: llm2vec compatibility.
assert transformers.__version__ == "4.44.2", f"Need transformers==4.44.2 for llm2vec compatibility, got {transformers.__version__}"
print("‚úì Version check passed")

Transformers version: 4.44.2
‚úì Version check passed


In [3]:
# Download pre-merged model to PERSISTENT VOLUME (not ephemeral root disk!)
import os
import shutil

from huggingface_hub import snapshot_download

# CRITICAL: Use /workspace for large files to avoid filling the container root disk
CHECKPOINT_PATH = "/workspace/classifier_chat"

# Start from an empty directory, exactly like the previous `rm -rf` did.
shutil.rmtree(CHECKPOINT_PATH, ignore_errors=True)

print(f"Downloading model (15.5 GB) to {CHECKPOINT_PATH}...")
# The Python API raises on network/auth/disk errors, whereas a failing shell
# command would let the cell carry on and print the success messages below.
snapshot_download(repo_id="Yida/classifier_chat", local_dir=CHECKPOINT_PATH)

# The loading cell reads `model.safetensors` from this directory and falls back
# to the base model when it is absent, so verify it here instead of later.
_weights_file = os.path.join(CHECKPOINT_PATH, "model.safetensors")
if not os.path.isfile(_weights_file):
    raise FileNotFoundError(
        f"Download finished but {_weights_file} is missing; "
        "the classifier weights were not fetched."
    )

print("‚úì Download complete")
print(f"‚úì Model saved to persistent volume: {CHECKPOINT_PATH}")

Downloading model (15.5 GB) to /workspace/classifier_chat...


Fetching 8 files:   0%|                                   | 0/8 [00:00<?, ?it/s]Still waiting to acquire lock on /workspace/classifier_chat/.cache/huggingface/.gitignore.lock (elapsed: 0.1 seconds)
Still waiting to acquire lock on /workspace/classifier_chat/.cache/huggingface/.gitignore.lock (elapsed: 0.1 seconds)
Still waiting to acquire lock on /workspace/classifier_chat/.cache/huggingface/.gitignore.lock (elapsed: 0.1 seconds)
Downloading 'scheduler.pt' to '/workspace/classifier_chat/.cache/huggingface/download/-TKHT9st1Ll35ofujHvZcGIOiAc=.505e140348cf428a705b42cb8aafd791b8d0b0ac55c70e82b985098cfa08a46a.incomplete'
Downloading 'training_args.bin' to '/workspace/classifier_chat/.cache/huggingface/download/wDrERwdtvdWRGcZ53Ku-FoERHzQ=.e655c9f4ec242123684bdefde757c27947012689a113dea3c6edf3c217226334.incomplete'

scheduler.pt:   0%|                                 | 0.00/1.06k [00:00<?, ?B/s][ADownloading 'model.safetensors' to '/workspace/classifier_chat/.cache/huggingface/download/xG

In [4]:
# Authenticate against the Hugging Face Hub. The token is prompted for with
# getpass so it is never written into the notebook source.
from huggingface_hub import login
from getpass import getpass
token = getpass("Enter your HuggingFace token: ")
login(token=token)
# NOTE(review): `token` remains in the kernel namespace after this cell;
# consider deleting it once login has succeeded.
print("‚úì Authenticated!")


‚úì Authenticated!


In [5]:
import shutil
import os

# CRITICAL: Clear ALL HuggingFace cache for this model
# Two kinds of cache entries are targeted (the dynamic-module directory and the
# hub snapshot directory), each under /workspace and under the user's home.
_MODULES_SUBPATH = "modules/transformers_modules/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"
_HUB_SUBPATH = "hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"
_CACHE_ROOTS = (
    "/workspace/.cache/huggingface",
    os.path.expanduser("~/.cache/huggingface"),
)

# Order: both module caches first, then both hub caches.
cache_locations = [
    f"{root}/{subpath}"
    for subpath in (_MODULES_SUBPATH, _HUB_SUBPATH)
    for root in _CACHE_ROOTS
]

print("üóëÔ∏è  Clearing HuggingFace cache...")
cleared_any = False
for cache_dir in cache_locations:
    # Nothing to do for locations that were never populated.
    if not os.path.exists(cache_dir):
        continue

    print(f"  Deleting: {cache_dir}")
    try:
        shutil.rmtree(cache_dir)
    except Exception as err:
        # Best effort: report the failure and keep going with the other paths.
        print(f"  ‚ö†Ô∏è  Failed to clear {cache_dir}: {err}")
    else:
        cleared_any = True
        print(f"  ‚úì Cleared: {cache_dir}")

print(
    "\n‚úÖ Cache cleared! OLD model code will be downloaded on next load."
    if cleared_any
    else "\n‚úì No cache found. OLD model code will be downloaded on first load."
)

print("\n‚ö†Ô∏è  IMPORTANT: Run this cell, THEN immediately run the model loading cell!")

üóëÔ∏è  Clearing HuggingFace cache...
  Deleting: /root/.cache/huggingface/hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp
  ‚úì Cleared: /root/.cache/huggingface/hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp

‚úÖ Cache cleared! OLD model code will be downloaded on next load.

‚ö†Ô∏è  IMPORTANT: Run this cell, THEN immediately run the model loading cell!


In [6]:
import torch
import numpy as np
from transformers import AutoConfig, AutoModel, AutoTokenizer
from llm2vec import LLM2Vec
import gc
import os

# CRITICAL: Use persistent volume paths to avoid filling ephemeral root disk
# Default directory load_classifier reads `model.safetensors` and `head.pt` from.
CHECKPOINT_PATH = "/workspace/classifier_chat"
# Directory load_classifier creates and hands to from_pretrained as `offload_folder`.
OFFLOAD_PATH = "/workspace/offload"

def load_classifier(checkpoint_path=CHECKPOINT_PATH, num_labels=5):
    """
    Build the classifier (LLM2Vec encoder + linear head) on cuda:0.

    Memory-optimized model loading that avoids the double-load RAM spike.
    - Uses /workspace for all large files (persistent volume)
    - Loads checkpoint weights directly to GPU to avoid CPU RAM exhaustion

    Args:
        checkpoint_path: Directory searched for ``model.safetensors`` (fine-tuned
            encoder weights) and ``head.pt`` (state dict of the linear head).
        num_labels: Number of output classes of the classification head.

    Returns:
        The ``LLM2Vec`` wrapper in eval mode, with the head attached as
        ``model.head``.

    Notes:
        A missing ``model.safetensors`` or ``head.pt`` is reported with a printed
        warning only; loading then continues with the base weights and/or a
        freshly initialised head, so read the log before trusting predictions.
    """
    print("Loading classifier...")
    torch.cuda.empty_cache()
    gc.collect()

    base_model_id = "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"
    # Single source of truth for the device every component is placed on.
    device_name = "cuda:0"
    target_device = torch.device(device_name)

    # Ensure offload directory exists on persistent volume
    os.makedirs(OFFLOAD_PATH, exist_ok=True)

    print("Step 1: Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        token=True,
    )
    print("‚úì Tokenizer loaded")

    print("Step 2: Loading base model from HuggingFace Hub...")
    print("  (Loading with memory optimization)")

    # Load base model with optimized settings
    base_model = AutoModel.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        device_map=device_name,  # Direct to GPU
        offload_folder=OFFLOAD_PATH,  # Use persistent volume for offload
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        token=True,
    )
    print("‚úì Base model loaded")

    # Clear any CPU memory used during loading
    gc.collect()
    torch.cuda.empty_cache()

    print("Step 3: Loading fine-tuned weights from checkpoint...")
    checkpoint_file = os.path.join(checkpoint_path, "model.safetensors")
    if os.path.exists(checkpoint_file):
        from safetensors.torch import load_file

        # CRITICAL FIX: Load weights directly to GPU to avoid CPU RAM spike
        # This prevents the "double load" that exhausts System RAM
        print("  Loading weights directly to GPU (memory-optimized)...")
        state_dict = load_file(checkpoint_file, device=device_name)
        checkpoint_tensor_count = len(state_dict)

        # strict=False silently skips tensors whose names do not exist in the
        # model, so keep the result and report how much was actually applied.
        load_result = base_model.load_state_dict(state_dict, strict=False)

        # Immediately free the state_dict to release VRAM
        del state_dict
        gc.collect()
        torch.cuda.empty_cache()

        unexpected_count = len(load_result.unexpected_keys)
        missing_count = len(load_result.missing_keys)
        applied_count = checkpoint_tensor_count - unexpected_count
        if applied_count == 0:
            print(
                f"WARNING: none of the {checkpoint_tensor_count} checkpoint tensors "
                "matched a model parameter name; the base weights are unchanged"
            )
        else:
            print("‚úì Checkpoint weights loaded (directly to GPU)")
        if unexpected_count or missing_count:
            print(
                f"  Applied {applied_count}/{checkpoint_tensor_count} checkpoint tensors "
                f"({unexpected_count} not found in model, "
                f"{missing_count} model tensors absent from checkpoint)"
            )
    else:
        print(f"‚ö†Ô∏è  Checkpoint not found at {checkpoint_file}, using base model")

    print("Step 4: Wrapping with LLM2Vec...")
    model = LLM2Vec(base_model, tokenizer, pooling_mode="mean", max_length=512)

    print("Step 5: Adding classification head...")
    hidden_size = base_model.config.hidden_size
    model.head = torch.nn.Linear(hidden_size, num_labels, dtype=torch.bfloat16)

    head_file = os.path.join(checkpoint_path, "head.pt")
    if os.path.exists(head_file):
        # weights_only=True restricts unpickling to tensor data.
        head_state = torch.load(head_file, map_location=target_device, weights_only=True)
        model.head.load_state_dict(head_state)
        print("‚úì Classification head loaded")
    else:
        print(f"‚ö†Ô∏è  Head not found at {head_file}")

    # Place the head next to the encoder in every case; previously it stayed on
    # the CPU whenever head.pt was missing.
    model.head = model.head.to(target_device)

    model.eval()

    # Final cleanup
    gc.collect()
    torch.cuda.empty_cache()

    print("\n‚úÖ Model fully loaded on GPU!")
    print(f"   VRAM used: {torch.cuda.memory_allocated()/1e9:.1f} GB")
    return model

# Load once at notebook scope: the prediction cell passes this `model` to
# predict_text and the Gradio callback reads it as a global.
model = load_classifier()

Loading classifier...
Step 1: Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

‚úì Tokenizer loaded
Step 2: Loading base model from HuggingFace Hub...
  (Loading with memory optimization)


config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

‚úì Base model loaded
Step 3: Loading fine-tuned weights from checkpoint...
  Loading weights directly to GPU (memory-optimized)...
‚úì Checkpoint weights loaded (directly to GPU)
Step 4: Wrapping with LLM2Vec...
Step 5: Adding classification head...
‚úì Classification head loaded

‚úÖ Model fully loaded on GPU!
   VRAM used: 15.1 GB


In [7]:
def predict_text(model, text):
    """
    Classify ``text`` and print the predicted source LLM with all probabilities.

    Args:
        model: Encoder exposing ``prepare_for_tokenization``, ``tokenize`` and
            ``forward``, with a classification head attached as ``model.head``.
        text: The passage to classify.

    Returns:
        None. The top prediction and a per-class bar listing are printed.

    Raises:
        AttributeError: If ``model`` has no ``head`` attribute.
    """
    label_names = ["ChatGPT", "Claude", "Grok", "Gemini", "DeepSeek"]

    # Prepare & Tokenize
    prepared_text = model.prepare_for_tokenization(text)
    inputs = model.tokenize([prepared_text])

    # Device handling: follow the model's first parameter; fall back only when
    # the model exposes no parameters, without swallowing unrelated errors
    # (the previous bare `except:` also caught KeyboardInterrupt).
    try:
        device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Non-tensor entries of the tokenizer output are passed through untouched.
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    # The head dictates where and in which dtype the embeddings must arrive,
    # so read both from its parameters instead of hard-coding bfloat16.
    head_param = next(model.head.parameters())

    # Inference
    with torch.no_grad():
        embeddings = model.forward(inputs)
        embeddings = embeddings.to(device=head_param.device, dtype=head_param.dtype)
        probs = torch.nn.functional.softmax(model.head(embeddings), dim=-1)

    # Results (a single text was tokenized, so row 0 is the only row)
    pred_idx = torch.argmax(probs, dim=-1).item()
    all_probs = probs[0].float().cpu().numpy()

    print(f"\nPrediction: {label_names[pred_idx]} ({all_probs[pred_idx]*100:.2f}%)")
    sorted_idxs = np.argsort(all_probs)[::-1]
    for i in sorted_idxs:
        # One bar character per 5 percentage points.
        print(f"{label_names[i]:10} {all_probs[i]*100:6.2f}% {'‚ñà' * int(all_probs[i]*20)}")

In [8]:
# Smoke test: classify a short hand-written sample with the loaded model and
# print the per-class probabilities.
text = """
Hello! I'd be happy to help you with that question. Let me break this down into a few key points:
1. First, it's important to understand the context
2. Second, we should consider the implications
"""
predict_text(model, text)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)



Prediction: ChatGPT (36.13%)
ChatGPT     36.13% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Grok        29.30% ‚ñà‚ñà‚ñà‚ñà‚ñà
Claude      26.37% ‚ñà‚ñà‚ñà‚ñà‚ñà
DeepSeek     4.27% 
Gemini       3.91% 


In [None]:
import gradio as gr
import plotly.graph_objects as go
import io

def predict_gradio(text):
    """Predict for Gradio interface with detailed logs.

    Uses the notebook-level ``model`` to classify ``text``.

    Args:
        text: Content of the input textbox; ``None`` and whitespace-only values
            are treated as "no input".

    Returns:
        A 3-tuple ``(result_markdown, figure, log_text)``. ``figure`` is a
        plotly bar chart of the class probabilities, ``None`` when no text was
        given, or an empty figure when prediction failed. Failures are caught
        and reported through the first and third elements instead of raising.
    """
    # Reject missing input up front; calling .strip() on None would raise
    # before the error handling below is even entered.
    if text is None or not text.strip():
        return "Enter text to analyze", None, "‚ö†Ô∏è No text provided"

    log_capture = io.StringIO()

    try:
        label_names = ["ChatGPT", "Claude", "Grok", "Gemini", "DeepSeek"]

        log_capture.write("üîÑ Starting prediction...\n")
        log_capture.write(f"üìù Text length: {len(text)} characters\n")

        log_capture.write("\nüî§ Tokenizing input...\n")
        prepared_text = model.prepare_for_tokenization(text)
        inputs = model.tokenize([prepared_text])
        log_capture.write("‚úì Tokenization complete\n")

        # Dynamic device detection: fall back only when the model exposes no
        # parameters (the previous bare `except:` hid every other error too).
        try:
            target_device = next(model.parameters()).device
        except (StopIteration, AttributeError):
            target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        log_capture.write(f"\nüñ•Ô∏è  Device: {target_device}\n")

        inputs = {k: v.to(target_device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        log_capture.write("\nüß† Running model inference...\n")
        with torch.no_grad():
            embeddings = model.forward(inputs)
            log_capture.write(f"‚úì Generated embeddings: {embeddings.shape}\n")

            # Match the head's device and dtype instead of assuming bfloat16.
            head_param = next(model.head.parameters())
            embeddings = embeddings.to(device=head_param.device, dtype=head_param.dtype)

            logits = model.head(embeddings)
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            log_capture.write("‚úì Computed probabilities\n")

        # A single text was tokenized, so row 0 is the only row.
        pred_label = torch.argmax(probabilities, dim=-1).item()
        all_probs = probabilities[0].float().cpu().numpy()

        log_capture.write(f"\n{'='*40}\n")
        log_capture.write(f"üéØ Prediction: {label_names[pred_label]}\n")
        log_capture.write(f"üíØ Confidence: {all_probs[pred_label]*100:.1f}%\n")
        log_capture.write(f"{'='*40}\n\n")

        # Highest probability first, for both the text log and the chart.
        sorted_indices = np.argsort(all_probs)[::-1]
        log_capture.write("üìä All probabilities:\n")
        for idx in sorted_indices:
            bar = "‚ñà" * int(all_probs[idx] * 30)
            log_capture.write(f"  {label_names[idx]:12} {all_probs[idx]*100:5.1f}% {bar}\n")

        log_capture.write("\n‚úÖ Analysis complete!\n")

        # Result text with clear formatting
        result_text = f"## Detected LLM: **{label_names[pred_label]}**\n\n### Confidence: **{all_probs[pred_label]*100:.1f}%**"

        # Bar chart
        sorted_labels = [label_names[i] for i in sorted_indices]
        sorted_probs = [float(all_probs[i]) for i in sorted_indices]

        # The first (top-ranked) bar is drawn darker than the rest.
        colors = ['#1f77b4' if i == 0 else '#aec7e8' for i in range(len(sorted_labels))]

        fig = go.Figure(data=[
            go.Bar(
                x=sorted_labels,
                y=sorted_probs,
                text=[f'{p*100:.1f}%' for p in sorted_probs],
                textposition='outside',
                marker_color=colors,
                marker_line_width=0,
            )
        ])

        fig.update_layout(
            xaxis_title=None,
            yaxis_title=None,
            # 15% headroom so the 'outside' value labels are not clipped.
            yaxis=dict(range=[0, max(sorted_probs) * 1.15], showticklabels=False, showgrid=False),
            xaxis=dict(showgrid=False),
            height=200,
            margin=dict(l=10, r=10, t=10, b=30),
            showlegend=False,
            plot_bgcolor='white',
            paper_bgcolor='white',
        )

        return result_text, fig, log_capture.getvalue()

    except Exception as e:
        import traceback
        # NOTE(review): the full traceback is returned to the UI log box;
        # reconsider this if the demo is exposed publicly.
        error_msg = f"‚ùå Error: {str(e)}\n\n{traceback.format_exc()}"
        log_capture.write(error_msg)

        empty_fig = go.Figure()
        empty_fig.update_layout(height=200)
        return f"Error: {str(e)}", empty_fig, log_capture.getvalue()


# Single viewport UI with logs
# The title and heading name all five classes the classifier can predict
# (DeepSeek was previously missing from both).
with gr.Blocks(title="Which LLM Wrote This? ChatGPT, Claude, Grok, Gemini, or DeepSeek?") as demo:
    gr.Markdown("# Which LLM Wrote This? ChatGPT, Claude, Grok, Gemini, or DeepSeek?")
    gr.Markdown("**[Research Paper](https://eric-mingjie.github.io/llm-idiosyncrasies/index.html)** (97% accuracy) ‚Ä¢ **[GitHub](https://github.com/syedamaann/llm-idiosyncrasies)** ‚Ä¢ **[syedamaan.com](https://syedamaan.com)**")

    with gr.Row():
        # Left: Input
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Paste text here...",
                lines=8,
                max_lines=8,
            )
            submit_btn = gr.Button("Analyze", variant="primary", size="lg")

        # Right: Results and Chart
        with gr.Column(scale=1):
            result_output = gr.Markdown(value="**Results will appear here**")
            plot_output = gr.Plot()

    # Bottom: Processing logs (compact)
    logs_output = gr.Textbox(
        label="Processing Log",
        lines=8,
        max_lines=8,
        interactive=False,
    )

    # Clicking the button and submitting the textbox both run the same
    # callback and fill the same three outputs.
    submit_btn.click(
        fn=predict_gradio,
        inputs=text_input,
        outputs=[result_output, plot_output, logs_output]
    )

    text_input.submit(
        fn=predict_gradio,
        inputs=text_input,
        outputs=[result_output, plot_output, logs_output]
    )

# NOTE(review): share=True requests a public share link for this GPU-backed
# demo with no authentication configured here; confirm that is intended.
demo.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://42865431bbb7ee278e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
