In [None]:
# Uncomment and run this cell if you're on Colab or Kaggle
# !git clone https://github.com/nlp-with-transformers/notebooks.git
# %cd notebooks
# from install import *
# install_requirements(is_chapter2=True)

In [None]:
# hide
# from utils import *
# setup_chapter()

In [None]:
from datasets import list_datasets

# Query the Hub for the available datasets and report how many there are.
all_datasets = list_datasets()
print(f"There are {len(all_datasets)} datasets currently available on the Hub")
# Fixed: the slice now matches the "first 10" wording of the message (was [:50]).
print(f"The first 10 are: {all_datasets[:10]}")

In [None]:
from datasets import load_dataset

# Load the "ag_news" dataset; later cells index the result by split name
# ("train" and "test").
news = load_dataset("ag_news")

In [None]:
# Display the loaded dataset object (the cell's last expression is rendered).
news

In [None]:
# Keep a handle on the training split and display its summary.
train_ds = news["train"]
train_ds

In [None]:
# Number of examples in the training split.
len(train_ds)

In [None]:
# Look at a single training example (index 10).
train_ds[10]

In [None]:
# Names of the columns in the training split.
train_ds.column_names

In [None]:
# Print the feature schema attached to the training split.
print(train_ds.features)

In [None]:
# Slice out the first five training examples and print them.
print(train_ds[:5])

In [None]:
# Select the "text" column first, then take its first five entries.
print(train_ds["text"][:5])

In [None]:
import pandas as pd

# Switch the dataset's output format to pandas, pull the whole training split
# with [:], and preview the resulting frame.
news.set_format(type="pandas")
df = news["train"][:]
df.head()

In [None]:
def label_int2str(row):
    """Translate an integer class id into its label name.

    The lookup goes through the ``"label"`` feature of the training split of
    the global ``news`` dataset.
    """
    label_feature = news["train"].features["label"]
    return label_feature.int2str(row)

# Add a readable class-name column next to the integer "label" column.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of how many examples carry each label value, with the
# counts sorted in ascending order.
df["label"].value_counts(ascending=True).plot.barh()
# Fixed: the title was misspelled "Freqeuncy".
plt.title("Frequency of classes")
plt.show()

In [None]:
# Word count per text: split on whitespace and take the length of each list.
df["Words per post"] = df["text"].str.split().apply(len)
# One box per label value; outliers are not drawn (showfliers=False).
df.boxplot("Words per post", by="label", grid=False, showfliers=False, color="black")
# Clear the figure super-title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Drop the pandas output format that was set for the exploratory plots.
news.reset_format()

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the models loaded later.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Tokenize one test-split text (index 10) and print the resulting encoding.
text = news['test'][10]['text']
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the token ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the tokens back into a single string.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Field names listed by the tokenizer; later cells use this list to pick
# which batch columns are passed to the model.
tokenizer.model_input_names

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the global ``tokenizer``.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Sanity check: tokenize the first two training examples.
print(tokenize(news["train"][:2]))

In [None]:
# Tokenize every split, 100 examples per call to tokenize().
# NOTE(review): with padding=True each call presumably pads only to the longest
# sequence among its own 100 examples, so padded lengths can differ between
# batches — confirm that later batched steps tolerate this.
news_encoded = news.map(tokenize, batched=True, batch_size=100)

In [None]:
# Columns available on the training split after tokenization.
print(news_encoded["train"].column_names)

In [None]:
from transformers import AutoModel
import torch

# Load the base model from the same checkpoint name as the tokenizer and place
# it on the GPU when one is available, otherwise on the CPU.
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
# Encode a short sample sentence, asking for PyTorch tensors ("pt"), and
# print the shape of its input_ids.
text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}")

In [None]:
# Move every input tensor to the model's device, then run a forward pass with
# gradient tracking disabled.
inputs = {k:v.to(device) for k,v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Shape of the last hidden state returned by the model.
outputs.last_hidden_state.size()

In [None]:
# Shape after selecting index 0 along the second axis of the last hidden state.
outputs.last_hidden_state[:,0].size()

In [None]:
def extract_hidden_states(batch):
    """Run the global ``model`` on a batch and return one vector per example.

    Only the entries of ``batch`` whose key appears in
    ``tokenizer.model_input_names`` are forwarded to the model, after being
    moved to ``device``. The result is a dict with the single key
    ``"hidden_state"``: index 0 along the second axis of the model's last
    hidden state, as a NumPy array on the CPU.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        model_output = model(**model_inputs)

    first_position = model_output.last_hidden_state[:, 0]
    return {"hidden_state": first_position.cpu().numpy()}

In [None]:
# Return the listed columns as PyTorch tensors so they can be fed to the model.
news_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Compute a hidden-state vector for every example, 100 examples per call.
news_hidden = news_encoded.map(extract_hidden_states, batched=True, batch_size=100)

In [None]:
# List the training-split columns to confirm "hidden_state" was added.
# Fixed: the attribute is `column_names` (as used elsewhere in this notebook);
# `columns_names` does not exist and raised AttributeError.
news_hidden["train"].column_names

In [None]:
import numpy as np

# Build NumPy feature matrices (hidden states) and label vectors.
# The "test" split plays the role of the validation set here.
X_train = np.array(news_hidden["train"]["hidden_state"])
X_valid = np.array(news_hidden["test"]["hidden_state"])
y_train = np.array(news_hidden["train"]["label"])
y_valid = np.array(news_hidden["test"]["label"])
X_train.shape, X_valid.shape

In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

# Rescale the features with MinMaxScaler before projecting.
X_scaled = MinMaxScaler().fit_transform(X_train)
# Fit a 2-D UMAP projection.
# Fixed: the metric was misspelled "cosinde", which is not a metric name UMAP
# accepts; the intended metric is "cosine".
mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
# Collect the 2-D coordinates together with the class ids for plotting.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train
df_emb.head()

In [None]:
# One density panel per class, laid out on a 2x3 grid (up to six classes).
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = news["train"].features["label"].names

for i, (label, cmap) in enumerate(zip(labels, cmaps)):
    # Rows of the embedding that belong to class id i.
    df_emb_sub = df_emb.query(f"label == {i}")
    axes[i].hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
                   gridsize=20, linewidths=(0,))
    axes[i].set_title(label)
    # Two plain statements instead of a throwaway tuple expression.
    axes[i].set_xticks([])
    axes[i].set_yticks([])

# Hide grid cells that have no class to show, so that no empty frames are
# drawn when there are fewer classes than panels.
for unused_ax in axes[len(labels):]:
    unused_ax.axis("off")

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a normalized confusion matrix for a set of predictions.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Names handed to the display as ``display_labels``.
    """
    # normalize="true" is forwarded to scikit-learn's confusion_matrix.
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    cm_display = ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    )
    fig, ax = plt.subplots(figsize=(6, 6))
    cm_display.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the dataset's label feature instead of
# hard-coding it. The previous literal 6 was not derived from this dataset; a
# head larger than the label set can predict class ids that have no label name.
num_labels = news["train"].features["label"].num_classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    ``pred`` must expose ``label_ids`` (true class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Number of full batches in one pass over the training split, so logging
# happens about once per epoch.
logging_steps = len(news_encoded["train"]) // batch_size
# Fixed: the run was labelled "sst-2-english" although the model is fine-tuned
# on ag_news. This name is used as the output directory.
# NOTE(review): with push_to_hub=True it presumably also names the Hub repo,
# and pushing presumably needs Hub credentials — no login step is visible in
# this part of the notebook; confirm.
model_name = f"{model_ckpt}-finetuned-ag-news"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")

In [None]:
from transformers import Trainer

# Fine-tune the classifier on the training split, evaluating on the test split.
# The trailing semicolon suppresses the cell's return-value output.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=news_encoded["train"],
                  eval_dataset=news_encoded["test"],
                  tokenizer=tokenizer)
trainer.train();

In [None]:
# Run the trained model over the test split.
preds_output = trainer.predict(news_encoded["test"])

In [None]:
# Metrics reported for the test-split predictions.
preds_output.metrics

In [None]:
# Predicted class per example: argmax over axis 1 of the predictions.
y_preds = np.argmax(preds_output.predictions, axis=1)

In [None]:
# Confusion matrix of the predictions against the test-split labels.
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    """Return the per-example loss and predicted class for one batch.

    Entries of ``batch`` whose key appears in ``tokenizer.model_input_names``
    are passed to the global ``model``; ``batch["label"]`` supplies the
    targets for the cross-entropy loss.

    Returns:
        A dict with ``"loss"`` and ``"predicted_label"``, both NumPy arrays
        on the CPU.
    """
    # Keep only the fields the model accepts and move them to its device.
    model_inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    targets = batch["label"].to(device)

    with torch.no_grad():
        logits = model(**model_inputs).logits
        predicted = torch.argmax(logits, axis=-1)
        # reduction="none" keeps one loss value per example.
        per_example_loss = cross_entropy(logits, targets, reduction="none")

    # Return CPU arrays so the values can sit next to the other dataset columns.
    return {"loss": per_example_loss.cpu().numpy(),
            "predicted_label": predicted.cpu().numpy()}

In [None]:
#hide_output
# Convert our dataset back to PyTorch tensors
news_encoded.set_format("torch",
                        columns=["input_ids", "attention_mask", "label"])
# Compute loss values.
# Fixed: batch_size was 16, which does not line up with the tokenization batch
# size of 100. Tokenization padded each group of 100 examples to its own
# length, so a batch of 16 could straddle two groups and mix sequence lengths,
# which cannot be stacked into one tensor. Using 100 here keeps every batch
# inside a single uniformly padded group.
news_encoded["test"] = news_encoded["test"].map(
    forward_pass_with_label, batched=True, batch_size=100)

In [None]:
# Switch to pandas output and build a per-example table for error analysis.
news_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = news_encoded["test"][:][cols]
# Replace integer class ids with label names, for true and predicted labels.
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"]
                              .apply(label_int2str))

In [None]:
# The ten test examples with the highest loss.
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
# The ten test examples with the lowest loss.
df_test.sort_values("loss", ascending=True).head(10)