# Classifier (Low Bandwidth Mode)

Lightweight version optimized for low disk usage (downloads only 42KB head, streams model).

In [None]:
# Install dependencies
%pip install -q transformers==4.44.2 peft==0.13.2 huggingface-hub accelerate safetensors
%pip install -q llm2vec==0.2.3 gradio plotly hf_transfer

print("‚úì Dependencies installed")
print("‚ö†Ô∏è  IMPORTANT: Restart the kernel before continuing!")

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
‚úì Dependencies installed
‚ö†Ô∏è  IMPORTANT: Restart the kernel before continuing!


In [1]:
import transformers
print(f"Transformers version: {transformers.__version__}")
assert transformers.__version__ == "4.44.2", f"Need transformers==4.44.2 for llm2vec compatibility, got {transformers.__version__}"
print("‚úì Version check passed")

  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.44.2
‚úì Version check passed


## Download Classification Head

This notebook only downloads the classification head (42KB) and streams the base model, saving significant disk space.

In [1]:
# Download classification head only (Low Bandwidth mode)
import os

!rm -rf ./classifier_chat
print("Downloading classification head (42 KB)...")
# Only downloads the small head file; base model is fetched via from_pretrained
!huggingface-cli download Yida/classifier_chat head.pt --local-dir ./classifier_chat
print("‚úì Download complete")

Downloading classification head (42 KB)...
Downloading 'head.pt' to 'classifier_chat/.cache/huggingface/download/YRa3C5umg44TBDeunKxqMu5V1K4=.c0858f64786ece9aee3b49091464a82fb78981c292dd87775d1d004a5ece5795.incomplete'
head.pt: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 42.4k/42.4k [00:00<00:00, 110kB/s]
Download complete. Moving file to classifier_chat/head.pt
classifier_chat/head.pt
‚úì Download complete


In [3]:
import shutil
import os

# CRITICAL: Clear ALL HuggingFace cache for this model
cache_locations = [
    "/workspace/.cache/huggingface/modules/transformers_modules/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    os.path.expanduser("~/.cache/huggingface/modules/transformers_modules/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"),
    "/workspace/.cache/huggingface/hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    os.path.expanduser("~/.cache/huggingface/hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"),
]

print("üóëÔ∏è  Clearing HuggingFace cache...")
cleared_any = False
for cache_dir in cache_locations:
    if os.path.exists(cache_dir):
        print(f"  Deleting: {cache_dir}")
        try:
            shutil.rmtree(cache_dir)
            cleared_any = True
            print(f"  ‚úì Cleared: {cache_dir}")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Failed to clear {cache_dir}: {e}")

if cleared_any:
    print("\n‚úÖ Cache cleared! OLD model code will be downloaded on next load.")
else:
    print("\n‚úì No cache found. OLD model code will be downloaded on first load.")

print("\n‚ö†Ô∏è  IMPORTANT: Run this cell, THEN immediately run the model loading cell!")

üóëÔ∏è  Clearing HuggingFace cache...
  Deleting: /workspace/.cache/huggingface/modules/transformers_modules/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp
  ‚úì Cleared: /workspace/.cache/huggingface/modules/transformers_modules/McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp
  Deleting: /workspace/.cache/huggingface/hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp
  ‚úì Cleared: /workspace/.cache/huggingface/hub/models--McGill-NLP--LLM2Vec-Meta-Llama-3-8B-Instruct-mntp

‚úÖ Cache cleared! OLD model code will be downloaded on next load.

‚ö†Ô∏è  IMPORTANT: Run this cell, THEN immediately run the model loading cell!


In [5]:
import torch
import numpy as np
from transformers import AutoConfig, AutoModel, AutoTokenizer
from peft import PeftModel
from llm2vec import LLM2Vec
import gc
import os
from accelerate import dispatch_model, infer_auto_device_map

def load_classifier_low_bandwidth(checkpoint_path="./classifier_chat", num_labels=5):
    print("Loading classifier (Low Bandwidth)...")
    torch.cuda.empty_cache()
    gc.collect()

    base_model_id = "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"
    max_memory = {0: "14GiB", "cpu": "30GiB"}
    
    # Load tokenizer with authentication
    print("Step 1: Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        token=True  # Use stored HF token for authentication
    )
    print("‚úì Tokenizer loaded")

    # Load base model with authentication
    print("Step 2: Loading base model...")
    config = AutoConfig.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        token=True
    )
    model = AutoModel.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        device_map={"": 0},
        max_memory=max_memory,
        offload_folder="./offload",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        token=True  # Use stored HF token for authentication
    )
    print("‚úì Base model loaded")
    
    # Merge MNTP Adapter
    print("Step 3: Merging MNTP adapter...")
    model = PeftModel.from_pretrained(
        model,
        base_model_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        token=True
    )
    model = model.merge_and_unload()
    print("‚úì MNTP adapter merged")

    # Load Supervised Adapter
    print("Step 4: Loading supervised adapter...")
    model = PeftModel.from_pretrained(
        model,
        f"{base_model_id}-supervised",
        is_trainable=True,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        token=True
    )
    print("‚úì Supervised adapter loaded")

    # Wrap with LLM2Vec
    print("Step 5: Wrapping with LLM2Vec...")
    model = LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=512)
    
    # Get hidden size from config (correct way)
    print("Step 6: Adding classification head...")
    hidden_size = config.hidden_size
    model.head = torch.nn.Linear(hidden_size, num_labels, dtype=torch.bfloat16)

    # Load Head Weights
    head_file = os.path.join(checkpoint_path, "head.pt")
    if os.path.exists(head_file):
        try:
            target_device = next(model.parameters()).device
        except:
            target_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model.head.load_state_dict(torch.load(head_file, map_location=target_device))
        model.head = model.head.to(target_device)
        print("‚úì Classification head loaded")
    else:
        print(f"‚ö†Ô∏è  Head not found at {head_file}")
    
    model.eval()
    print(f"\n‚úÖ Model fully loaded on {'GPU' if torch.cuda.is_available() else 'CPU'}!")
    return model

model = load_classifier_low_bandwidth()

Loading classifier (Low Bandwidth)...
Step 1: Loading tokenizer...
‚úì Tokenizer loaded
Step 2: Loading base model...


Downloading shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 18.21it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:57<00:00, 29.46s/it]


‚úì Base model loaded
Step 3: Merging MNTP adapter...
‚úì MNTP adapter merged
Step 4: Loading supervised adapter...
‚úì Supervised adapter loaded
Step 5: Wrapping with LLM2Vec...
Step 6: Adding classification head...
‚úì Classification head loaded

‚úÖ Model fully loaded on GPU!


In [6]:
def predict_text(model, text):
    label_names = ["ChatGPT", "Claude", "Grok", "Gemini", "DeepSeek"]
    
    # Prepare & Tokenize
    prepared_text = model.prepare_for_tokenization(text)
    inputs = model.tokenize([prepared_text])
    
    # Device handling
    try:
        device = next(model.parameters()).device
    except:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    
    # Inference
    with torch.no_grad():
        embeddings = model.forward(inputs)
        if hasattr(model, 'head'):
            embeddings = embeddings.to(next(model.head.parameters()).device)
        
        embeddings = embeddings.to(torch.bfloat16)
        probs = torch.nn.functional.softmax(model.head(embeddings), dim=-1)
    
    # Results
    pred_idx = torch.argmax(probs, dim=-1).item()
    all_probs = probs[0].float().cpu().numpy()
    
    print(f"\nPrediction: {label_names[pred_idx]} ({all_probs[pred_idx]*100:.2f}%)")
    sorted_idxs = np.argsort(all_probs)[::-1]
    for i in sorted_idxs:
        print(f"{label_names[i]:10} {all_probs[i]*100:6.2f}% {'‚ñà' * int(all_probs[i]*20)}")

In [7]:
text = """
Hello! I'd be happy to help you with that question. Let me break this down into a few key points:
1. First, it's important to understand the context
2. Second, we should consider the implications
"""
predict_text(model, text)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)



Prediction: Grok (56.25%)
Grok        56.25% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
ChatGPT     30.08% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
DeepSeek     5.03% ‚ñà
Claude       4.52% 
Gemini       3.96% 


In [None]:
%pip install -q gradio plotly

import gradio as gr
import plotly.graph_objects as go
import io

def predict_gradio(text):
    """Predict for Gradio interface with detailed logs."""
    if not text.strip():
        return "Enter text to analyze", None, "‚ö†Ô∏è No text provided"
    
    log_capture = io.StringIO()
    
    try:
        label_names = ["ChatGPT", "Claude", "Grok", "Gemini", "DeepSeek"]
        
        log_capture.write("üîÑ Starting prediction...\n")
        log_capture.write(f"üìù Text length: {len(text)} characters\n")
        
        log_capture.write("\nüî§ Tokenizing input...\n")
        prepared_text = model.prepare_for_tokenization(text)
        inputs = model.tokenize([prepared_text])
        log_capture.write("‚úì Tokenization complete\n")
        
        # Dynamic device detection
        try:
            target_device = next(model.parameters()).device
        except:
            target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        log_capture.write(f"\nüñ•Ô∏è  Device: {target_device}\n")
        
        inputs = {k: v.to(target_device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        
        log_capture.write("\nüß† Running model inference...\n")
        with torch.no_grad():
            embeddings = model.forward(inputs)
            log_capture.write(f"‚úì Generated embeddings: {embeddings.shape}\n")
            
            if hasattr(model, 'head'):
                head_device = next(model.head.parameters()).device
                embeddings = embeddings.to(head_device)
            
            embeddings = embeddings.to(torch.bfloat16)
            logits = model.head(embeddings)
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            log_capture.write("‚úì Computed probabilities\n")
        
        pred_label = torch.argmax(probabilities, dim=-1).item()
        all_probs = probabilities[0].float().cpu().numpy()
        
        log_capture.write(f"\n{'='*40}\n")
        log_capture.write(f"üéØ Prediction: {label_names[pred_label]}\n")
        log_capture.write(f"üíØ Confidence: {all_probs[pred_label]*100:.1f}%\n")
        log_capture.write(f"{'='*40}\n\n")
        
        sorted_indices = np.argsort(all_probs)[::-1]
        log_capture.write("üìä All probabilities:\n")
        for idx in sorted_indices:
            bar = "‚ñà" * int(all_probs[idx] * 30)
            log_capture.write(f"  {label_names[idx]:12} {all_probs[idx]*100:5.1f}% {bar}\n")
        
        log_capture.write("\n‚úÖ Analysis complete!\n")
        
        # Result text with clear formatting
        result_text = f"## Detected LLM: **{label_names[pred_label]}**\n\n### Confidence: **{all_probs[pred_label]*100:.1f}%**"
        
        # Bar chart
        sorted_labels = [label_names[i] for i in sorted_indices]
        sorted_probs = [float(all_probs[i]) for i in sorted_indices]
        
        colors = ['#1f77b4' if i == 0 else '#aec7e8' for i in range(len(sorted_labels))]
        
        fig = go.Figure(data=[
            go.Bar(
                x=sorted_labels,
                y=sorted_probs,
                text=[f'{p*100:.1f}%' for p in sorted_probs],
                textposition='outside',
                marker_color=colors,
                marker_line_width=0,
            )
        ])
        
        fig.update_layout(
            xaxis_title=None,
            yaxis_title=None,
            yaxis=dict(range=[0, max(sorted_probs) * 1.15], showticklabels=False, showgrid=False),
            xaxis=dict(showgrid=False),
            height=200,
            margin=dict(l=10, r=10, t=10, b=30),
            showlegend=False,
            plot_bgcolor='white',
            paper_bgcolor='white',
        )
        
        return result_text, fig, log_capture.getvalue()
        
    except Exception as e:
        import traceback
        error_msg = f"‚ùå Error: {str(e)}\n\n{traceback.format_exc()}"
        log_capture.write(error_msg)
        
        empty_fig = go.Figure()
        empty_fig.update_layout(height=200)
        return f"Error: {str(e)}", empty_fig, log_capture.getvalue()


# Single viewport UI with logs
with gr.Blocks(title="Which LLM Wrote This? ChatGPT, Claude, Gemini, or Grok?") as demo:
    gr.Markdown("# Which LLM Wrote This? ChatGPT, Claude, Gemini, or Grok?")
    gr.Markdown("**[Research Paper](https://eric-mingjie.github.io/llm-idiosyncrasies/index.html)** (97% accuracy) ‚Ä¢ **[GitHub](https://github.com/syedamaann/llm-idiosyncrasies)** ‚Ä¢ **[syedamaan.com](https://syedamaan.com)**")
    
    with gr.Row():
        # Left: Input
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Paste text here...",
                lines=8,
                max_lines=8,
            )
            submit_btn = gr.Button("Analyze", variant="primary", size="lg")
        
        # Right: Results and Chart
        with gr.Column(scale=1):
            result_output = gr.Markdown(value="**Results will appear here**")
            plot_output = gr.Plot()
    
    # Bottom: Processing logs (compact)
    logs_output = gr.Textbox(
        label="Processing Log",
        lines=8,
        max_lines=8,
        interactive=False,
    )
    
    submit_btn.click(
        fn=predict_gradio,
        inputs=text_input,
        outputs=[result_output, plot_output, logs_output]
    )
    
    text_input.submit(
        fn=predict_gradio,
        inputs=text_input,
        outputs=[result_output, plot_output, logs_output]
    )

demo.launch(share=True, debug=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


TypeError: Textbox.__init__() got an unexpected keyword argument 'show_copy_button'