<a href="https://colab.research.google.com/github/waelrash1/llms/blob/main/transformers/Text_Classification_HF_ch2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
# List the datasets published on the Hugging Face Hub.
# `datasets.list_datasets` was deprecated and is no longer shipped by recent
# `datasets` releases, so fall back to the `huggingface_hub` client when the
# import fails.
try:
  from datasets import list_datasets
  all_datasets = list_datasets()
except ImportError:
  from huggingface_hub import list_datasets
  # The Hub client yields dataset-info objects; keep only their ids so that
  # `all_datasets` remains a list of dataset names.
  all_datasets = [dataset_info.id for dataset_info in list_datasets()]
print(f"There are {len(all_datasets)} datasets currently available on the Hub")
print(f"The first 10 are: {all_datasets[:10]}")


In [None]:
# Load the "emotion" dataset by name; the result is kept in `emotions` and is
# indexed by split name in the cells below.
from datasets import load_dataset
emotions = load_dataset("emotion")


In [None]:
# Show the loaded dataset object, then keep a handle on its "train" split.
print(emotions)
train_ds = emotions["train"]
print(train_ds)


In [None]:
# Inspect the training split: number of rows, first example, and the
# feature description of its columns.
print(len(train_ds))
print(train_ds[0])
print(train_ds.features)

In [None]:
import pandas as pd
# Switch the output format of `emotions` to pandas so that slicing a split
# yields a DataFrame. The format stays in effect for later cells until it is
# reset or changed again.
emotions.set_format(type="pandas")
# `[:]` selects every row of the training split.
df = emotions["train"][:]
df.head()

In [None]:
def label_int2str(row):
  """Translate an integer class id into its class name.

  Delegates to the `int2str` method of the "label" feature of the training
  split of `emotions`.
  """
  label_feature = emotions["train"].features["label"]
  return label_feature.int2str(row)

# Add a readable class-name column next to the integer labels.
label_names = df["label"].apply(label_int2str)
df["label_name"] = label_names
df

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the number of examples per class, smallest first.
class_counts = df["label_name"].value_counts(ascending=True)
class_counts.plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Number of whitespace-separated words in every tweet.
tweet_words = df["text"].str.split()
df["Words Per Tweet"] = tweet_words.apply(len)

# One box per class, with outliers hidden.
df.boxplot(
    "Words Per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)
# Blank out the figure-level title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer and the models loaded later on.
model_ckpt = "distilbert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Sample sentence used to demonstrate the tokenizer.
text="This notebook is open with private outputs. Outputs will not be saved. You can disable this in Notebook settings."
# Encode the text; the result exposes the token ids as `input_ids`.
encoded_text = tokenizer(text)
print(encoded_text)

# Map the ids back to token strings, then join the tokens into one string.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

In [None]:
# Maximum input length reported by the tokenizer.
tokenizer.model_max_length

In [None]:
def tokenize(batch):
  """Tokenize the "text" entry of `batch` with padding and truncation enabled."""
  texts = batch["text"]
  return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Actually call the method: a bare attribute reference does nothing, which would
# leave `emotions` in the pandas output format set earlier. Resetting the format
# is needed before the splits are passed to `tokenize`.
emotions.reset_format()

In [None]:
# Try `tokenize` on the first two training examples.
print(tokenize(emotions["train"][:2]))

In [None]:
# Apply `tokenize` to every split in batched mode.
# NOTE(review): `batch_size=None` presumably hands each split to `tokenize` as a
# single batch, so padding is uniform within a split — confirm against the
# `datasets` `map` documentation.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
from transformers import AutoModel
import torch
# Same checkpoint name as the one used for the tokenizer.
model_ckpt = "distilbert-base-uncased"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained model and move it to the selected device.
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
# NOTE(review): `text` is a list holding one inner list of two strings.
# Presumably the tokenizer treats that inner list as a single (text, text_pair)
# example rather than as a batch of two sentences — confirm this is intended.
text = [["this is a test","Hello to the world of transformers."]]
# Request PyTorch tensors so the encoding can be passed to the model.
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}")

In [None]:
# Display the encoded inputs.
inputs

In [None]:
# Rebuild `inputs` as a plain dict with every value moved to `device`.
inputs = {k:v.to(device) for k,v in inputs.items()}


In [None]:
# Run the model on the inputs with gradient tracking disabled.
with torch.no_grad():
  outputs = model(**inputs)
print(outputs)

In [None]:
# Size of the last hidden state tensor.
outputs.last_hidden_state.size()

In [None]:
# Hidden state at index 0 of the second dimension for every item of the first
# dimension (the [CLS] position, per the comment in `extract_hidden_states`).
outputs.last_hidden_state[:,0]

In [None]:
def extract_hidden_states(batch):
  """Return the [CLS] hidden state of the model for every example in `batch`.

  Only the entries of `batch` whose key is one of the tokenizer's model input
  names are passed to the model. The result is a dict with a single
  "hidden_state" entry holding a NumPy array.
  """
  # Place model inputs on the same device as the model
  model_inputs = {}
  for name, tensor in batch.items():
    if name in tokenizer.model_input_names:
      model_inputs[name] = tensor.to(device)
  # Extract last hidden states without tracking gradients
  with torch.no_grad():
    model_output = model(**model_inputs)
  # Return vector for [CLS] token (index 0 along the second dimension)
  cls_vectors = model_output.last_hidden_state[:, 0]
  return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
# Return PyTorch tensors for the columns the model consumes.
emotions_encoded.set_format(
    "torch", columns=["input_ids", "attention_mask", "label"]
)
# Extract the hidden states across all splits in one go.
# Note that `batch_size=None` is not set here, so `map` uses its default
# batch size (batch_size=1000) instead of treating a split as a single batch.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)
# Applying `extract_hidden_states()` adds a new `hidden_state` column:
emotions_hidden["train"].column_names

In [None]:
# Display the tokenized dataset.
emotions_encoded

In [None]:
import numpy as np

# Turn the hidden states and labels of the train / validation splits into
# NumPy arrays for scikit-learn.
hidden_train = emotions_hidden["train"]
hidden_valid = emotions_hidden["validation"]
X_train = np.array(hidden_train["hidden_state"])
X_valid = np.array(hidden_valid["hidden_state"])
y_train = np.array(hidden_train["label"])
y_valid = np.array(hidden_valid["label"])
X_train.shape, X_valid.shape

In [None]:
# Install the package that provides the `umap` module imported in the next cell.
!pip install umap-learn

In [None]:
import umap.umap_ as UMAP
from sklearn.preprocessing import MinMaxScaler

# Scale features to [0,1] range
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)
# Initialize and fit UMAP (2 output dimensions, cosine metric)
mapper = UMAP.UMAP(n_components=2, metric="cosine")
mapper.fit(X_scaled)
# Create a DataFrame of 2D embeddings and attach the training labels
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train
df_emb.head()

In [None]:
import matplotlib.pyplot as plt

# One hexbin density panel per class, each drawn with its own colour map.
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = emotions["train"].features["label"].names
for class_id, (ax, class_name, cmap) in enumerate(zip(axes, labels, cmaps)):
  # Keep only the embedded points that belong to the current class.
  class_points = df_emb[df_emb["label"] == class_id]
  ax.hexbin(
      class_points["X"],
      class_points["Y"],
      cmap=cmap,
      gridsize=20,
      linewidths=(0,),
  )
  ax.set_title(class_name)
  # Hide the tick marks on both axes.
  ax.set_xticks([])
  ax.set_yticks([])
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the extracted hidden states.
# We increase `max_iter` to guarantee convergence
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)
# Score the fitted classifier on the validation features.
lr_clf.score(X_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline for comparison, using the "most_frequent" strategy.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
# Score the baseline on the same validation data as the classifier above.
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
  """Draw a confusion matrix of `y_preds` against `y_true`.

  The matrix is computed with `normalize="true"` and rendered on a 6x6 inch
  figure with a blue colour map, two-decimal cell values and no colour bar.
  `labels` supplies the display names of the classes.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
  fig, ax = plt.subplots(figsize=(6, 6))
  cm_display = ConfusionMatrixDisplay(
      confusion_matrix=normalized_cm, display_labels=labels
  )
  cm_display.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
  ax.set_title("Normalized confusion matrix")
  plt.show()

# Class names, used as the display labels of the confusion matrix.
labels = emotions["train"].features["label"].names
# Predictions of the logistic-regression classifier on the validation features.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)

## Loading a pretrained model
The first thing we need is a pretrained DistilBERT model like the one we used in the
feature-based approach. The only slight modification is that we use the
`AutoModelForSequenceClassification` model instead of `AutoModel`. The difference is that the
`AutoModelForSequenceClassification` model has a classification head on top of the
pretrained model outputs, which can be easily trained with the base model. We just
need to specify how many labels the model has to predict (six in our case), since this
dictates the number of outputs the classification head has:


In [None]:
from transformers import AutoModelForSequenceClassification

# Number of classes the classification head has to predict.
num_labels = 6
# Load the checkpoint with a classification head, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
  """Compute accuracy and weighted F1 from a prediction object.

  `pred` must expose `label_ids` (the true labels) and `predictions`; the
  predicted class is the argmax of `predictions` over the last axis.
  Returns a dict with the keys "accuracy" and "f1".
  """
  true_labels = pred.label_ids
  predicted_labels = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(true_labels, predicted_labels),
      "f1": f1_score(true_labels, predicted_labels, average="weighted"),
  }

In [None]:
# Log in to the Hugging Face Hub from the notebook; the training arguments
# below set `push_to_hub=True`.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Install transformers together with its "torch" extra.
# NOTE(review): transformers has already been imported by earlier cells at this
# point; presumably the runtime has to be restarted before a newly installed
# package is picked up — confirm.
!pip install transformers[torch]

In [None]:
# Report the installed versions of transformers and accelerate.
import accelerate
import transformers

print(transformers.__version__)
print(accelerate.__version__)

In [None]:
from transformers import Trainer, TrainingArguments
import accelerate

batch_size = 32
# Number of full batches of size `batch_size` in the training split.
logging_steps = len(emotions_encoded["train"]) // batch_size
# Output directory name, derived from the checkpoint name.
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
)

In [None]:
from transformers import Trainer

# Bundle the model, training arguments, metric function, tokenizer and the
# train / validation splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Run the training; the trailing `;` suppresses the cell's output value.
trainer.train();

In [None]:
# Run the trainer's prediction on the validation split.
preds_output = trainer.predict(emotions_encoded["validation"])

In [None]:
# Metrics reported for the validation predictions.
preds_output.metrics

In [None]:
import numpy as np
# Predicted class = index of the largest value along axis 1 of the predictions.
y_preds = np.argmax(preds_output.predictions, axis=1)

In [None]:
# Class names and true validation labels, taken from the original dataset.
labels = emotions["train"].features["label"].names
y_valid = np.array(emotions["validation"]["label"])

# Confusion matrix of the fine-tuned model's validation predictions.
plot_confusion_matrix(y_preds, y_valid, labels)


## Error analysis
Before moving on, we should investigate our model’s predictions a little bit further. A
simple yet powerful technique is to sort the validation samples by the model loss.
When we pass the label during the forward pass, the loss is automatically calculated
and returned. Here’s a function that returns the loss along with the predicted label:

In [None]:

from torch.nn.functional import cross_entropy
def forward_pass_with_label(batch):
  """Run the model on a labelled batch and return per-example loss and prediction.

  Only the entries of `batch` whose key is one of the tokenizer's model input
  names are passed to the model; `batch["label"]` supplies the targets. The
  result is a dict with "loss" (cross-entropy, one value per example) and
  "predicted_label" (argmax of the logits), both as NumPy arrays.
  """
  # Place all input tensors on the same device as the model
  model_inputs = {}
  for name, tensor in batch.items():
    if name in tokenizer.model_input_names:
      model_inputs[name] = tensor.to(device)
  with torch.no_grad():
    logits = model(**model_inputs).logits
    predicted = torch.argmax(logits, axis=-1)
    # reduction="none" keeps one loss value per example instead of averaging
    targets = batch["label"].to(device)
    per_example_loss = cross_entropy(logits, targets, reduction="none")
  # Place outputs on CPU for compatibility with other dataset columns
  return {
      "loss": per_example_loss.cpu().numpy(),
      "predicted_label": predicted.cpu().numpy(),
  }

In [None]:
# Convert our dataset back to PyTorch tensors
emotions_encoded.set_format(
    "torch", columns=["input_ids", "attention_mask", "label"]
)
# Compute loss values and predicted labels for the validation split, in
# batches of 16, and store the enriched split back into the dataset.
validation_with_loss = emotions_encoded["validation"].map(
    forward_pass_with_label, batched=True, batch_size=16
)
emotions_encoded["validation"] = validation_with_loss

In [None]:
# Switch to pandas output and keep the columns needed for error analysis.
emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
validation_df = emotions_encoded["validation"][:]
df_test = validation_df[cols]
# Replace the integer class ids by class names in both label columns.
for column in ("label", "predicted_label"):
  df_test[column] = df_test[column].apply(label_int2str)

In [None]:
# The ten validation examples with the highest loss.
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
# The ten validation examples with the lowest loss.
df_test.sort_values("loss", ascending=True).head(10)

In [None]:
# Push the trainer's results to the Hub with the given commit message.
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
from transformers import pipeline
# Change the `waelrash1` prefix to your own Hub username
model_id = "waelrash1/distilbert-base-uncased-finetuned-emotion"
# Text-classification pipeline backed by the model identified by `model_id`.
classifier = pipeline("text-classification", model=model_id)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): these assignments replace the `tokenizer` and `model` globals
# that earlier cells use, and the model loaded here is not moved to `device`.
# NOTE(review): the repo id is spelled "WaelRash1/..." here but "waelrash1/..."
# in `model_id` — confirm which spelling is intended.
tokenizer = AutoTokenizer.from_pretrained("WaelRash1/distilbert-base-uncased-finetuned-emotion")
model = AutoModelForSequenceClassification.from_pretrained("WaelRash1/distilbert-base-uncased-finetuned-emotion")

In [None]:
import pandas as pd

custom_tweet = "I saw a movie today and it was really good."
# Ask the pipeline for the score of every class.
preds = classifier(custom_tweet, return_all_scores=True)
# Finally, we can plot the probability for each class in a bar plot. Clearly, the model estimates
# that the most likely class is joy, which appears to be reasonable given the tweet:
preds_df = pd.DataFrame(preds[0])
# Scores are scaled by 100 to be shown as percentages.
class_probabilities = 100 * preds_df["score"]
plt.bar(labels, class_probabilities, color='C0')
plt.title(f'"{custom_tweet}"')
plt.ylabel("Class probability (%)")
plt.show()

In [None]:
# Display the raw pipeline output for the custom tweet.
preds