# NLP Starter Notebook

Explore the AG News dataset, tokenization, and a quick text-classification fine-tune of DistilBERT with 🤗 Transformers.

In [None]:
# Install dependencies (only needed when running in a fresh environment).
# Uncomment the line below to run it; `%pip` installs into the environment of the running kernel.
# %pip install -r ../requirements.txt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

# Fetch AG News, then keep the first 1,000 training rows as a DataFrame
# for quick, cheap inspection.
ds = load_dataset("ag_news")
train_head = ds["train"][:1000]
df = pd.DataFrame(train_head)
df.head()

In [None]:
# Bar chart of how many sampled rows carry each label.
label_counts = df['label'].value_counts()
ax = label_counts.plot(kind='bar')
# Annotate through the Axes object returned by pandas.
ax.set_title('Label Distribution (sample)')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
from transformers import AutoTokenizer
# Checkpoint whose tokenizer is loaded here; later cells reuse `model_name` and `tokenizer`.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the text of the row labelled 0 and show at most the first 50 tokens.
example = df.loc[0, 'text']
all_tokens = tokenizer.tokenize(example)
tokens = all_tokens[:50]
print(tokens)

In [None]:
# Quick train with Trainer (1 epoch on small subsets to smoke-test the pipeline)
import inspect

from datasets import DatasetDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
import evaluate

# Subset sizes for the smoke test; raise them for a more meaningful run.
N_TRAIN = 2000
N_EVAL = 1000

accuracy = evaluate.load("accuracy")


def _accepts_kwarg(target, name):
    """Return True when the call signature of `target` declares a parameter called `name`."""
    return name in inspect.signature(target).parameters


def compute_metrics(eval_pred):
    """Turn the (logits, labels) pair handed over by Trainer into an accuracy score.

    Args:
        eval_pred: Pair unpackable as ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": <float>}`` where the prediction is the argmax over the last axis.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy.compute(predictions=preds, references=labels)['accuracy']}


# Cut the splits down *before* tokenizing so that only the rows actually used for
# training/evaluation are tokenized, instead of the whole corpus.
subsets = DatasetDict({
    "train": ds["train"].select(range(min(N_TRAIN, len(ds["train"])))),
    "test": ds["test"].select(range(min(N_EVAL, len(ds["test"])))),
})
tokenized = subsets.map(lambda b: tokenizer(b['text'], truncation=True), batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Read the number of classes from the dataset schema rather than hard-coding it.
num_labels = ds["train"].features["label"].num_classes
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases;
# pass whichever name the installed version accepts.
eval_strategy_kwarg = "eval_strategy" if _accepts_kwarg(TrainingArguments, "eval_strategy") else "evaluation_strategy"
args = TrainingArguments(
    "checkpoints-notebook",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    **{eval_strategy_kwarg: "epoch"},
)

# Likewise, newer releases take the tokenizer as `processing_class` instead of `tokenizer`.
tokenizer_kwarg = "processing_class" if _accepts_kwarg(Trainer, "processing_class") else "tokenizer"
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    data_collator=DataCollatorWithPadding(tokenizer),
    # Without this hook, evaluation reports the loss only and never computes accuracy.
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)
trainer.train()
trainer.evaluate()

## Optional: spaCy quick demo

In [None]:
# Optional demo, disabled by default: uncomment the lines below to run it.
# NOTE(review): presumably requires spaCy and the `en_core_web_sm` model to be installed
# (e.g. `python -m spacy download en_core_web_sm`) — confirm for your environment.
# import spacy
# nlp = spacy.load('en_core_web_sm')
# doc = nlp('Apple is looking at buying U.K. startup for $1 billion')
# [(t.text, t.pos_, t.dep_) for t in doc]