In [None]:
import numpy as np
from datasets import load_dataset, get_dataset_split_names
from transformers import AutoTokenizer, AutoModel
from transformers import pipeline
import torch
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoFeatureExtractor
from transformers import DistilBertModel, DistilBertTokenizer

import evaluate
import wandb

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

from tdc.multi_pred.anndata_dataset import DataLoader
from tdc import tdc_hf_interface
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer
import torch

from src.data.create_dataset import create_dataset


## Working with Transformers in the HuggingFace Ecosystem

In this laboratory exercise we will learn how to work with the HuggingFace ecosystem to adapt models to new tasks. As you will see, much of what is required is *investigation* into the inner-workings of the HuggingFace abstractions. With a little work, a little trial-and-error, it is fairly easy to get a working adaptation pipeline up and running.

### Exercise 1: Sentiment Analysis (warm up)

In this first exercise we will start from a pre-trained BERT transformer and build up a model able to perform text sentiment analysis. Transformers are complex beasts, so we will build up our pipeline in several explorative and incremental steps.

#### Exercise 1.1: Dataset Splits and Pre-trained model
There are a many sentiment analysis datasets, but we will use one of the smallest ones available: the [Cornell Rotten Tomatoes movie review dataset](cornell-movie-review-data/rotten_tomatoes), which consists of 5,331 positive and 5,331 negative processed sentences from the Rotten Tomatoes movie reviews.

**Your first task**: Load the dataset and figure out what splits are available and how to get them. Spend some time exploring the dataset to see how it is organized. Note that we will be using the [HuggingFace Datasets](https://huggingface.co/docs/datasets/en/index) library for downloading, accessing, splitting, and batching data for training and evaluation.

In [2]:
dataset_id = "cornell-movie-review-data/rotten_tomatoes"
ds_train = load_dataset(dataset_id, split="train")
ds_test = load_dataset(dataset_id, split="test")
ds_validation = load_dataset(dataset_id, split="validation")

In [3]:
ds_train

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [4]:
print(f"Text example: {ds_train['text'][0]}")
print(f"Label example: {ds_train['label'][0]}")

Text example: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
Label example: 1


In [5]:
print(f"Labels in the dataset: {np.unique(ds_train['label'])}")

Labels in the dataset: [0 1]


#### Exercise 1.2: A Pre-trained BERT and Tokenizer

The model we will use is a *very* small BERT transformer called [Distilbert](https://huggingface.co/distilbert/distilbert-base-uncased) this model was trained (using self-supervised learning) on the same corpus as BERT but using the full BERT base model as a *teacher*.

**Your next task**: Load the Distilbert model and corresponding tokenizer. Use the tokenizer on a few samples from the dataset and pass the tokens through the model to see what outputs are provided. I suggest you use the [`AutoModel`](https://huggingface.co/transformers/v3.0.2/model_doc/auto.html) class (and the `from_pretrained()` method) to load the model and `AutoTokenizer` to load the tokenizer).

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased")

In [7]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [8]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [9]:
inputs = tokenizer(ds_train[:2]['text'], padding=True, return_tensors="pt")

In [10]:
outputs = model(**inputs) # model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

In [11]:
outputs.keys()

odict_keys(['last_hidden_state'])

In [12]:
hidden_states = outputs['last_hidden_state']
hidden_states.shape # (batch_size, sequence_length, hidden_size)

torch.Size([2, 52, 768])

#### Exercise 1.3: A Stable Baseline

In this exercise I want you to:
1. Use Distilbert as a *feature extractor* to extract representations of the text strings from the dataset splits;
2. Train a classifier (your choice, by an SVM from Scikit-learn is an easy choice).
3. Evaluate performance on the validation and test splits.

These results are our *stable baseline* -- the **starting** point on which we will (hopefully) improve in the next exercise.

**Hint**: There are a number of ways to implement the feature extractor, but probably the best is to use a [feature extraction `pipeline`](https://huggingface.co/tasks/feature-extraction). You will need to interpret the output of the pipeline and extract only the `[CLS]` token from the *last* transformer layer. *How can you figure out which output that is?*

In [13]:
extractor = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

Device set to use cuda:0


To verify which output embedding corresponds to the `[CLS]` token, we checked both the tokenizer and the pipeline output

In [14]:
text = "Hello world!"
encoding = tokenizer(text, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

print("Tokens:", tokens)

Tokens: ['[CLS]', 'hello', 'world', '!', '[SEP]']


In [15]:
features = extractor(text, return_tensors="pt")
print("Feature shape:", features.shape)

Feature shape: torch.Size([1, 5, 768])


This confirms that features[0][0] corresponds to the [CLS] token, which we use as the embedding representing the entire sentence.

In [16]:
svc = LinearSVC()

def get_cls_embeddings(texts):
    embeddings = extractor(texts)
    # Extract the CLS token embedding (first token) from each sequence
    cls_embeddings = np.array([emb[0][0] for emb in embeddings]) 
    return cls_embeddings


train_embeddings = get_cls_embeddings(ds_train['text'])
validation_embeddings = get_cls_embeddings(ds_validation['text'])
test_embeddings = get_cls_embeddings(ds_test['text'])

svc.fit(train_embeddings, ds_train['label'])

print("Validation set metrics:")
print(classification_report(ds_validation['label'], svc.predict(validation_embeddings)))
print("\nTest set metrics:")
print(classification_report(ds_test['label'], svc.predict(test_embeddings)))

Validation set metrics:
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       533
           1       0.84      0.80      0.82       533

    accuracy                           0.82      1066
   macro avg       0.82      0.82      0.82      1066
weighted avg       0.82      0.82      0.82      1066


Test set metrics:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       533
           1       0.81      0.78      0.80       533

    accuracy                           0.80      1066
   macro avg       0.80      0.80      0.80      1066
weighted avg       0.80      0.80      0.80      1066



-----
### Exercise 2: Fine-tuning Distilbert

In this exercise we will fine-tune the Distilbert model to (hopefully) improve sentiment analysis performance.

#### Exercise 2.1: Token Preprocessing

The first thing we need to do is *tokenize* our dataset splits. Our current datasets return a dictionary with *strings*, but we want *input token ids* (i.e. the output of the tokenizer). This is easy enough to do my hand, but the HugginFace `Dataset` class provides convenient, efficient, and *lazy* methods. See the documentation for [`Dataset.map`](https://huggingface.co/docs/datasets/v3.5.0/en/package_reference/main_classes#datasets.Dataset.map).

**Tip**: Verify that your new datasets are returning for every element: `text`, `label`, `intput_ids`, and `attention_mask`.

In [17]:
def tokenize_function(data):
    return tokenizer(data["text"], truncation=True)

tokenized_ds_train = ds_train.map(tokenize_function, batched=True)
tokenized_ds_validation = ds_validation.map(tokenize_function, batched=True)
tokenized_ds_test = ds_test.map(tokenize_function, batched=True)

Map: 100%|██████████| 1066/1066 [00:00<00:00, 5386.89 examples/s]


In [18]:
def validate_tokenized_dataset(dataset, name="dataset"):
    required_fields = {"text", "label", "input_ids", "attention_mask"}
    missing_fields = required_fields - set(dataset.features.keys())
    if missing_fields:
        raise ValueError(
            f"{name} is missing the following required fields: {', '.join(missing_fields)}"
        )
    print(f"{name} is valid")

validate_tokenized_dataset(tokenized_ds_train, name="train_dataset")
validate_tokenized_dataset(tokenized_ds_validation, name="validation_dataset")
validate_tokenized_dataset(tokenized_ds_test, name="test_dataset")



train_dataset is valid
validation_dataset is valid
test_dataset is valid


#### Exercise 2.2: Setting up the Model to be Fine-tuned

In this exercise we need to prepare the base Distilbert model for fine-tuning for a *sequence classification task*. This means, at the very least, appending a new, randomly-initialized classification head connected to the `[CLS]` token of the last transformer layer. Luckily, HuggingFace already provides an `AutoModel` for just this type of instantiation: [`AutoModelForSequenceClassification`](https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#automodelforsequenceclassification). You will want you instantiate one of these for fine-tuning.

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Exercise 2.3: Fine-tuning Distilbert

Finally. In this exercise you should use a HuggingFace [`Trainer`](https://huggingface.co/docs/transformers/main/en/trainer) to fine-tune your model on the Rotten Tomatoes training split. Setting up the trainer will involve (at least):


1. Instantiating a [`DataCollatorWithPadding`](https://huggingface.co/docs/transformers/en/main_classes/data_collator) object which is what *actually* does your batch construction (by padding all sequences to the same length).
2. Writing an *evaluation function* that will measure the classification accuracy. This function takes a single argument which is a tuple containing `(logits, labels)` which you should use to compute classification accuracy (and maybe other metrics like F1 score, precision, recall) and return a `dict` with these metrics.  
3. Instantiating a [`TrainingArguments`](https://huggingface.co/docs/transformers/v4.51.1/en/main_classes/trainer#transformers.TrainingArguments) object using some reasonable defaults.
4. Instantiating a `Trainer` object using your train and validation splits, you data collator, and function to compute performance metrics.
5. Calling `trainer.train()`, waiting, waiting some more, and then calling `trainer.evaluate()` to see how it did.

**Tip**: When prototyping this laboratory I discovered the HuggingFace [Evaluate library](https://huggingface.co/docs/evaluate/en/index) which provides evaluation metrics. However I found it to have insufferable layers of abstraction and getting actual metrics computed. I suggest just using the Scikit-learn metrics...

In [20]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average='weighted')
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=100,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",

)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()
trainer.evaluate()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0757,0.562579,0.845216,0.845156
2,0.0506,0.72417,0.845216,0.845101
3,0.0602,0.760153,0.848968,0.848938
4,0.0509,0.805423,0.84803,0.847925
5,0.0364,0.914386,0.84334,0.843323
6,0.0331,0.902946,0.847092,0.847069
7,0.022,1.062121,0.844278,0.844185
8,0.0213,1.153478,0.848968,0.848594
9,0.0163,1.285319,0.826454,0.825636
10,0.0179,1.194522,0.841463,0.841229




-----
### Exercise 3: Choose at Least One


#### Exercise 3.3: Choose your Own Adventure

There are a *ton* of interesting and fun models on the HuggingFace hub. Pick one that does something interesting and adapt it in some way to a new task. Or, combine two or more models into something more interesting or fun. The sky's the limit.

**Note**: Reach out to me by email or on the Discord if you are unsure about anything.

In [None]:
scgpt = tdc_hf_interface("scGPT")
model = scgpt.load()
tokenizer = scGPTTokenizer()

In [None]:
print(f"Number of transformer blocks: {len(model.transformer.layers)}")

In [None]:
from src.data.download_spatial_data import setup_huggingface, get_metadata, download_cancer_type, download_specific_sample, download_first_n_samples

# Prima di tutto, configura il tuo token Hugging Face
setup_huggingface("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")

# Per scaricare i metadati
metadata = get_metadata()

# Per scaricare un tipo specifico di cancro con un tipo specifico di vetrino
# Opzioni per cancer_type: 'TCGA_KIRC', 'TCGA_SKCM', 'TCGA_LUSC', 'TCGA_LUAD'
# Opzioni per slide_type: 'FF' (Fresh Frozen) o 'FFPE' (Formalin-Fixed Paraffin-Embedded)
# download_specific_sample
# download_cancer_type("TCGA_KIRC", "FF")
# download_first_n_samples(30)

In [None]:


data_fold_path = '/data2/jupyterhub/vcivale/prova/Esercitazione_3/data/raw/TCGA_SKCM/FFPE'

dataset = create_dataset(data_fold_path, tokenizer, model)

#### Processing Data

In [None]:
from datasets import load_from_disk

dataset = load_from_disk("/home/vcivale/prova/Esercitazione_3/data/interim/embeddings_dataset")


In [None]:



new_dataset = predict_expr_per_layer(dataset, model)


In [None]:
new_dataset