EAI-Distill
====

**ESSENTIAL-WEB V1.0: 24T tokens of organized web data**

 * Paper: https://arxiv.org/abs/2506.14111

![EssentialAI overview](../assets/essentialai_overview.png)

In [1]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Run on the GPU when CUDA reports itself available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the causal-LM weights from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EssentialAI/EAI-Distill-0.5b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("EssentialAI/EAI-Distill-0.5b")

# Switch to evaluation mode, then move the model onto the selected device.
# The trailing ';' is kept from the original notebook cell (it suppresses the
# cell's echo of the final expression).
model.eval()
model.to(device);

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def chunk_text(text, max_char_per_doc=30000):
    """Shorten an over-long document to a beginning/middle/end excerpt.

    Text of at most ``max_char_per_doc`` characters is returned unchanged.
    Longer text is reduced to three excerpts of about
    ``max_char_per_doc // 3`` characters each: the head of the text, a window
    at a randomly chosen position in the interior, and the tail. The excerpts
    are joined under ``[beginning]``, ``[middle]`` and ``[end]`` marker lines.

    The middle window is positioned with the module-level ``random``
    generator, so repeated calls on the same long text can differ unless the
    generator is seeded.

    Args:
        text: The document to shorten.
        max_char_per_doc: Length threshold (in characters) above which the
            text is excerpted; also sets the per-excerpt size.

    Returns:
        ``text`` itself when it is short enough, otherwise the marked-up
        excerpt string. The marker lines are added on top of the excerpts, so
        the result can be slightly longer than ``max_char_per_doc``.
    """
    if len(text) <= max_char_per_doc:
        return text

    chunk_size = max_char_per_doc // 3
    half = chunk_size // 2

    # Pick the centre of the middle window so that the whole window lies
    # between the head excerpt and the tail excerpt.
    mid_point = random.randint(
        chunk_size + half,
        len(text) - chunk_size - half,
    )

    start = text[:chunk_size]
    middle = text[mid_point - half:mid_point + half]
    # Slice from an explicit index: ``text[-chunk_size:]`` would return the
    # entire text when chunk_size is 0 (max_char_per_doc < 3).
    end = text[len(text) - chunk_size:]
    return f"[beginning]\n{start}\n[middle]\n{middle}\n[end]\n{end}"


def classify_document(text):
    """Run the loaded model on a document and return the decoded output.

    The document is first passed through ``chunk_text``, then wrapped in a
    two-message chat (system message ``"taxonomy"``, user message holding the
    chunked text). The chat is rendered to a prompt string with the
    tokenizer's chat template, tokenized, moved to ``device``, and fed to
    ``model.generate`` with at most 100 new tokens.

    Args:
        text: The raw document text to classify.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    conversation = [
        {"role": "system", "content": "taxonomy"},
        {"role": "user", "content": chunk_text(text)},
    ]

    # Render to plain text (not token ids) and append the generation prompt
    # so the model continues with its own turn.
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

    encoded = tokenizer(rendered, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_new_tokens=100)

    # NOTE(review): the whole first sequence is decoded; this presumably
    # includes the prompt tokens as well as the new ones -- confirm whether
    # callers expect that.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
