In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import time

print("🚀 EMERGENCY SPEED MODE - TINY EVERYTHING!")

# --- Step 1: GPU housekeeping ---
# Release cached CUDA memory left behind by earlier cells, then switch cuDNN
# into benchmark mode.
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

# --- Step 2: carve a 3000-row sample out of AG News ---
dataset = load_dataset("ag_news")
tiny_subset = dataset["train"].shuffle(seed=42).select(range(3000))
split = tiny_subset.train_test_split(test_size=0.2, seed=42)

# 80/20 split of the 3000 rows -> 2400 training rows, 600 validation rows
train_dataset, val_dataset = split["train"], split["test"]

print(f"Train: {len(train_dataset)} samples")
print(f"Val: {len(val_dataset)} samples")

# --- Step 3: load a very small BERT checkpoint with a 4-way classification head ---
MODEL_NAME = "prajjwal1/bert-tiny"
print(f"Loading ultra-light model: {MODEL_NAME}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=4,
)

# Place the model on the GPU when one is present
if torch.cuda.is_available():
    model = model.cuda()
    print("Model moved to GPU")

# 4. MINIMAL TOKENIZATION
def preprocess(batch):
    """Tokenize the ``text`` entries of ``batch``.

    Every sequence is truncated or padded to exactly 32 tokens, so all
    encoded rows have the same length.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        max_length=32,
        padding="max_length",
        truncation=True,
    )

print("Tokenizing...")
start = time.time()

# Encode each split in batches, drop the raw text and expose the target
# column under the name "labels".
train_enc = train_dataset.map(preprocess, batched=True, remove_columns=['text'])
train_enc = train_enc.rename_column("label", "labels")
val_enc = val_dataset.map(preprocess, batched=True, remove_columns=['text'])
val_enc = val_enc.rename_column("label", "labels")

print(f"Tokenization: {time.time() - start:.1f} seconds")

# Return the three model-facing columns as torch tensors
for encoded_split in (train_enc, val_enc):
    encoded_split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 5. ULTRA-AGGRESSIVE TRAINING ARGS
def compute_accuracy(eval_pred):
    """Return ``{"accuracy": ...}`` for a Trainer evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max over the last logits axis. Without a metrics callback the Trainer
    only reports the loss, so ``eval_accuracy`` would be missing from the
    evaluation results.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    eval_strategy="no",                 # no evaluation during training
    save_strategy="no",                 # no checkpoints during training
    learning_rate=1e-3,                 # very high learning rate
    per_device_train_batch_size=128,
    # max_steps takes precedence over num_train_epochs: with 2400 rows and a
    # batch of 128 (about 19 steps per epoch on a single device) the 50-step
    # budget below is what actually ends training, not the epoch count.
    num_train_epochs=1,
    max_steps=50,
    fp16=torch.cuda.is_available(),     # mixed precision needs a CUDA device
    dataloader_num_workers=0,           # no parallel loading
    remove_unused_columns=True,
    report_to="none",                   # None does not disable integrations such as wandb; "none" does
    disable_tqdm=False,
    logging_steps=10,
    gradient_accumulation_steps=1,
)

print(f"Max steps: {training_args.max_steps}")
print(f"Batch size: {training_args.per_device_train_batch_size}")

# 6. MINIMAL TRAINER
# No tokenizer is handed to the Trainer: the inputs were already padded to a
# fixed length during tokenization, so no padding is needed at batch time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_enc,
    compute_metrics=compute_accuracy,
)

# 7. START TRAINING WITH TIMER
print("\n🔥 STARTING ULTRA-FAST TRAINING...")
start_time = time.time()

# Tracks which stage is running so a failure is attributed to the right one
# (an evaluation error used to be reported as a training error).
phase = "training"
try:
    trainer.train()
    training_time = time.time() - start_time
    print(f"✅ TRAINING COMPLETED in {training_time/60:.1f} minutes!")

    # Quick evaluation on the held-out validation split
    phase = "evaluation"
    print("Quick evaluation...")
    eval_start = time.time()
    results = trainer.evaluate(val_enc)
    eval_time = time.time() - eval_start
    print(f"Evaluation time: {eval_time:.1f} seconds")

    # eval_accuracy is only present when the Trainer was built with a metrics callback
    if 'eval_accuracy' in results:
        print(f"Accuracy: {results['eval_accuracy']:.3f}")
    else:
        print("Accuracy: N/A")

except Exception as e:
    # Best-effort cell: report the failure (with its type) instead of aborting the notebook
    print(f"An error occurred during {phase}: {type(e).__name__}: {e}")

🚀 EMERGENCY SPEED MODE - TINY EVERYTHING!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Train: 2400 samples
Val: 600 samples
Loading ultra-light model: prajjwal1/bert-tiny


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to GPU
Tokenizing...


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Tokenization: 0.8 seconds
Max steps: 50
Batch size: 128

🔥 STARTING ULTRA-FAST TRAINING...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33miamyash09[0m ([33miamyash09-amity-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.221
20,0.7843
30,0.4995
40,0.4168
50,0.311


✅ TRAINING COMPLETED in 1.3 minutes!
Quick evaluation...


Evaluation time: 0.5 seconds
Accuracy: N/A


In [None]:
# Optimized AG News Training Code
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

# Report whether a CUDA device is visible, and which one
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name()}")

# AG News from the Hugging Face hub
dataset = load_dataset("ag_news")

# Train on a shuffled 22.5K-row sample of the training split to keep runs short
SUBSET_SIZE = 22500
dataset_subset = dataset["train"].shuffle(seed=42).select(range(SUBSET_SIZE))
dataset_split = dataset_subset.train_test_split(test_size=0.1, seed=42)

# 90/10 split of the sample: 20,250 training rows and 2,250 validation rows
train_dataset, val_dataset = dataset_split["train"], dataset_split["test"]
# The official test split is kept whole (7,600 rows)
test_dataset = dataset["test"]

print(f"Train size: {len(train_dataset)}")
print(f"Val size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")

# Class ids and their human-readable names
NUM_LABELS = 4
id2label = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))
label2id = {name: idx for idx, name in id2label.items()}

# Checkpoint, tokenizer and the fixed sequence length used for every split
MODEL_NAME = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
MAX_LEN = 64

def preprocess(batch):
    """Tokenize the ``text`` entries of ``batch``.

    Sequences are truncated or padded to exactly ``MAX_LEN`` tokens, so every
    encoded row has the same length.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=MAX_LEN, padding="max_length", truncation=True)

# Encode each split in batches, drop the raw text and expose the target
# column under the name "labels".
train_enc = train_dataset.map(preprocess, batched=True, remove_columns=['text']).rename_column("label", "labels")
val_enc = val_dataset.map(preprocess, batched=True, remove_columns=['text']).rename_column("label", "labels")
test_enc = test_dataset.map(preprocess, batched=True, remove_columns=['text']).rename_column("label", "labels")

# Return the three model-facing columns as torch tensors
for encoded_split in (train_enc, val_enc, test_enc):
    encoded_split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Sequence-classification model configured with the label mappings
model_kwargs = {
    "num_labels": NUM_LABELS,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_kwargs)

def compute_metrics(eval_pred):
    """Compute accuracy plus weighted precision, recall and F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max over the last logits axis.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    # The fourth value returned by scikit-learn is not reported
    weighted = precision_recall_fscore_support(labels, preds, average="weighted")
    scores = dict(zip(("precision", "recall", "f1"), weighted[:3]))

    return {"accuracy": accuracy_score(labels, preds), **scores}

# Training configuration: evaluate and checkpoint every 200 steps, then reload
# the checkpoint with the best validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",           # step-based evaluation for faster feedback
    eval_steps=200,
    save_strategy="steps",           # checkpoint on the same schedule as evaluation
    save_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
    seed=42,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    dataloader_num_workers=2,
    remove_unused_columns=True,
    report_to="none",                # None does not disable integrations such as wandb; "none" does
    disable_tqdm=False,              # keep progress bars
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
print("Starting training...")
trainer.train()

# Evaluate on test set
print("Evaluating on test set...")
test_results = trainer.evaluate(test_enc)
print(f"Test Results: {test_results}")

# Save the model
# `drive` was never imported in this notebook, so the cell only worked when an
# earlier session had left it in the kernel. Import it here, and fall back to
# a local directory when the notebook is not running on Colab.
try:
    from google.colab import drive
except ImportError:
    save_dir = "./ag-news-model"
else:
    drive.mount('/content/drive')
    save_dir = "/content/drive/MyDrive/ag-news-model"

# Model and tokenizer go into the SAME directory so that a single
# from_pretrained(save_dir) call can restore both of them.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Model saved successfully!")

CUDA available: True
GPU: Tesla T4
Train size: 20250
Val size: 2250
Test size: 7600


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/20250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
200,0.3068,0.315764,0.896889,0.896794,0.896889,0.896595
400,0.2781,0.279858,0.904444,0.905283,0.904444,0.904332
600,0.2641,0.29512,0.895556,0.900219,0.895556,0.895601
800,0.218,0.254168,0.916,0.91607,0.916,0.915893
1000,0.2001,0.254537,0.914667,0.914927,0.914667,0.914535
1200,0.2124,0.240746,0.919556,0.919488,0.919556,0.919409


Evaluating on test set...


Test Results: {'eval_loss': 0.23299522697925568, 'eval_accuracy': 0.9247368421052632, 'eval_precision': 0.9247054881783472, 'eval_recall': 0.9247368421052632, 'eval_f1': 0.9246051933012254, 'eval_runtime': 3.1308, 'eval_samples_per_second': 2427.522, 'eval_steps_per_second': 38.01, 'epoch': 2.0}
Model saved successfully!


In [None]:
# Simple Model Testing and Saving

import torch
from sklearn.metrics import classification_report
import pandas as pd

# Class id -> class name, in id order
id2label = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))

# 2. TEST SINGLE SAMPLE
def test_sample(index):
    """Classify one test example and print the truth next to the prediction.

    ``index`` selects the same row from ``test_enc`` (encoded inputs and label)
    and ``test_dataset`` (raw text). Nothing is returned; the result is printed.
    """
    encoded = test_enc[index]
    text = test_dataset[index]['text']
    gold_id = encoded['labels'].item()

    # Put the model in eval mode on whichever device is available
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model.to(target)

    with torch.no_grad():
        # unsqueeze(0) adds the leading batch axis to the single example
        logits = model(
            input_ids=encoded['input_ids'].unsqueeze(0).to(target),
            attention_mask=encoded['attention_mask'].unsqueeze(0).to(target),
        ).logits
        probs = torch.softmax(logits, dim=-1).squeeze()
        predicted_id = torch.argmax(logits, dim=-1).item()

    verdict = '✅' if gold_id == predicted_id else '❌'

    print(f"\n--- SAMPLE {index} ---")
    print(f"Text: {text[:100]}...")
    print(f"True: {id2label[gold_id]}")
    print(f"Predicted: {id2label[predicted_id]}")
    print(f"Correct: {verdict}")
    print(f"Confidence: {probs[predicted_id]:.3f}")

# 3. FULL CLASSIFICATION REPORT
def get_report(batch_size=64):
    """Print a per-class classification report for the whole encoded test set.

    Predictions are computed in mini-batches rather than one example at a
    time, which cuts the number of forward passes by a factor of
    ``batch_size``.

    Args:
        batch_size: Number of test examples per forward pass (must be >= 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("Getting predictions...")

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for start in range(0, len(test_enc), batch_size):
            # Slicing the torch-formatted dataset yields a dict of columns.
            # Every row was padded to the same length during tokenization, so
            # each column is expected to come back as one stacked 2-D tensor.
            batch = test_enc[start:start + batch_size]
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            all_preds.extend(logits.argmax(dim=-1).tolist())
            all_labels.extend(batch['labels'].tolist())

    # Pass the label ids explicitly so each name is tied to its id rather than
    # to dict order, and so the report still works if a class never appears.
    label_ids = sorted(id2label)

    # Print report
    print("\n📊 CLASSIFICATION REPORT:")
    print(classification_report(
        all_labels,
        all_preds,
        labels=label_ids,
        target_names=[id2label[i] for i in label_ids]
    ))

# 4. USAGE EXAMPLES

# Spot-check a few individual test rows
for sample_index in (0, 50, 100):
    test_sample(sample_index)

# Then score the complete test set
get_report()


--- SAMPLE 0 ---
Text: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disap...
True: Business
Predicted: Business
Correct: ✅
Confidence: 0.940

--- SAMPLE 50 ---
Text: US fighter squadron to be deployed in South Korea next month (AFP) AFP - A squadron of US Air Force ...
True: World
Predicted: World
Correct: ✅
Confidence: 0.992

--- SAMPLE 100 ---
Text: Olympic history for India, UAE An Indian army major shot his way to his country #39;s first ever ind...
True: Sports
Predicted: Sports
Correct: ✅
Confidence: 0.508
Getting predictions...

📊 CLASSIFICATION REPORT:
              precision    recall  f1-score   support

       World       0.94      0.92      0.93      1900
      Sports       0.97      0.99      0.98      1900
    Business       0.90      0.88      0.89      1900
    Sci/Tech       0.89      0.91      0.90      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
we

In [None]:
# Snapshot every package installed in the current environment into requirements.txt.
# NOTE(review): a later cell in this notebook opens requirements.txt in "w" mode and
# replaces this snapshot with a hand-written package list — confirm which of the two
# files is the intended one.
!pip freeze > requirements.txt

In [None]:
!pip install numpy pandas scikit-learn matplotlib seaborn torch transformers datasets evaluate sentencepiece sacremoses tqdm accelerate flask fastapi uvicorn




In [None]:
import os, sys, re, json, logging

# --- Version report --------------------------------------------------------
# Versions are read from the installed distribution metadata instead of
# importing each package and reading `__version__`:
#   * a missing package (e.g. twilio) no longer aborts the whole cell,
#   * packages that do not expose `__version__`, or deprecate it (flask emits
#     a warning for it), are still reported correctly,
#   * heavy libraries are not imported just to print a version string.
from importlib import metadata


def installed_version(dist_name):
    """Return the installed version of a distribution, or "not installed".

    Args:
        dist_name: Distribution name as used by pip (e.g. "scikit-learn").

    Returns:
        The version string from the package metadata, or the literal string
        "not installed" when no such distribution is found.
    """
    try:
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        return "not installed"


# --- System / Utility (built-in) ---
print("Python:", sys.version)
for stdlib_name in ("os", "sys", "re", "json", "logging"):
    print(f"{stdlib_name}: built-in")

# Distribution names grouped the same way as the original report
VERSION_GROUPS = (
    # Data handling & preprocessing
    ("numpy", "pandas", "scikit-learn", "matplotlib", "seaborn"),
    # Core ML / DL
    ("torch",),
    # Hugging Face NLP
    ("transformers", "datasets", "evaluate", "sentencepiece", "sacremoses"),
    # Training utils
    ("tqdm", "accelerate"),
    # Deployment
    ("flask", "fastapi", "uvicorn"),
    # WhatsApp / API
    ("twilio", "requests"),
    # Optional tools
    ("jupyter", "python-dotenv", "pytest"),
)

for version_group in VERSION_GROUPS:
    for dist_name in version_group:
        print(f"{dist_name}:", installed_version(dist_name))


Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
os: built-in
sys: built-in
re: built-in
json: built-in
logging: built-in
numpy: 2.0.2
pandas: 2.2.2
scikit-learn: 1.6.1
matplotlib: 3.10.0
seaborn: 0.13.2
torch: 2.8.0+cu126
transformers: 4.55.4
datasets: 4.0.0
evaluate: 0.4.5
sentencepiece: 0.2.1
sacremoses: 0.1.1
tqdm: 4.67.1
accelerate: 1.10.1
flask: 3.1.2
fastapi: 0.116.1
uvicorn: 0.35.0


  print("flask:", flask.__version__)


ModuleNotFoundError: No module named 'twilio'

In [None]:
# Pinned dependency list for this project.
# torch is pinned to a CUDA-specific local version ("+cu126"). Those builds are
# served from the PyTorch wheel index rather than the default package index, so
# the extra index line is needed for `pip install -r requirements.txt` to
# resolve the pin.
requirements = """
--extra-index-url https://download.pytorch.org/whl/cu126
numpy==2.0.2
pandas==2.2.2
scikit-learn==1.6.1
matplotlib==3.10.0
seaborn==0.13.2
torch==2.8.0+cu126
transformers==4.55.4
datasets==4.0.0
evaluate==0.4.5
sentencepiece==0.2.1
sacremoses==0.1.1
tqdm==4.67.1
accelerate==1.10.1
flask==3.1.2
fastapi==0.116.1
uvicorn==0.35.0
"""

# Write with an explicit encoding and finish the file with a newline so the
# last requirement is a complete line.
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(requirements.strip() + "\n")

print("✅ requirements.txt has been created!")


✅ requirements.txt has been created!
