In [1]:
import comet_ml
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm
2024-09-20 04:34:15.647354: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-20 04:34:15.671179: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-20 04:34:15.671202: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-20 04:34:15.686979: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler f

In [2]:
# Hub id of the checkpoint whose tokenizer is used for the student model.
student_id = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(student_id)

# IMDB dataset; later cells read its "train" and "test" splits.
dataset = load_dataset("imdb")

def pre_process(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw reviews
            (a list of strings when called through ``Dataset.map(batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and sequences capped at 512 tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)

# Tokenize every split of the dataset in batches.
tokenized_data = dataset.map(pre_process, batched=True)

# Class names are read from the "label" feature of the train split; the two
# dictionaries map between a class name and its integer id.
labels = tokenized_data['train'].features['label'].names
num_labels = len(labels)
label2id = {name: index for index, name in enumerate(labels)}
id2label = {index: name for index, name in enumerate(labels)}

# First 1000 rows of each split.
# NOTE(review): the Trainer cell visible in this notebook is given the full
# splits, not these subsets — confirm whether they are still needed.
train_subset = tokenized_data["train"].select(range(1000))
val_subset = tokenized_data["test"].select(range(1000))

In [3]:
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification, DistilBertConfig, DataCollatorWithPadding
from iDistilbert import iDistilBertForSequenceClassification
# Configuration for the student model.
# The classification-head fields (num_labels / label maps) are standard
# Hugging Face config fields; the remaining keywords are not stock DistilBERT
# options and are presumably consumed by the iDistilbert implementation —
# TODO confirm their meaning against that module.
student_config = DistilBertConfig(
    # --- classification head ---
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # --- model outputs ---
    output_hidden_states=False,
    # --- attention variant (custom fields) ---
    distance_metric="manhattan_distance",
    activation_function="relu",
    signed_inhibitor=True,
    alpha=0,
    center=False,
)

# Alternative: resume from a fine-tuned checkpoint instead of the initialised weights.
#student_model = iDistilBertForSequenceClassification.from_pretrained('/mnt/tony/MSc2024/results/checkpoint-2582', config = student_config)
student_model = iDistilBertForSequenceClassification(
        config=student_config,
    )

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load onto the CPU first so a checkpoint saved from a GPU still loads on a
# CPU-only host; the model is moved to `device` afterwards.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
initialized_weights = torch.load(
    '/mnt/tony/MSc2024/distilbert_init/models/distilbert_init.pth',
    map_location='cpu',
)

# strict=False tolerates key mismatches; report them instead of silently
# leaving some parameters at their random initialisation.
load_result = student_model.load_state_dict(initialized_weights, strict=False)
if load_result.missing_keys:
    print(f"Parameters not found in checkpoint ({len(load_result.missing_keys)}): {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"Checkpoint entries not used by the model ({len(load_result.unexpected_keys)}): {load_result.unexpected_keys}")

# Last expression of the cell: moves the model and displays its repr.
student_model.to(device)

iDistilBertForSequenceClassification(
  (distilbert): iDistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): iTransformer(
      (layer): ModuleList(
        (0-5): 6 x iTransformerBlock(
          (attention): iMultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=Fal

In [None]:
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification, DistilBertConfig, DataCollatorWithPadding
from iDistilbert import iDistilBertForSequenceClassification
student_id = "distilbert/distilbert-base-uncased"
# Cosine-distance / softmax variant of the student configuration.
# The attention-related keywords are not stock DistilBERT options and are
# presumably consumed by the iDistilbert implementation — TODO confirm.
student_config = DistilBertConfig(
    distance_metric = "cosine_distance",
    activation_function = "softmax",
    signed_inhibitor =  False,
    alpha = 0,
    center = False,
    output_contexts = False,
    # Classification-head metadata, kept in line with the other student
    # configuration in this notebook so both variants carry the dataset's
    # label names rather than the generic defaults.
    num_labels = num_labels,
    label2id = label2id,
    id2label = id2label,
)
    
student_model = iDistilBertForSequenceClassification(
        config=student_config,
    )

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load onto the CPU first so a checkpoint saved from a GPU still loads on a
# CPU-only host; the model is moved to `device` afterwards.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
initialized_weights = torch.load(
    '/mnt/tony/MSc2024/distilbert_init/models/q_k_distilbert_layerwise.pth',
    map_location='cpu',
)

# strict=False tolerates key mismatches; report them instead of silently
# leaving some parameters at their random initialisation.
load_result = student_model.load_state_dict(initialized_weights, strict=False)
if load_result.missing_keys:
    print(f"Parameters not found in checkpoint ({len(load_result.missing_keys)}): {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"Checkpoint entries not used by the model ({len(load_result.unexpected_keys)}): {load_result.unexpected_keys}")

# Last expression of the cell: moves the model and displays its repr.
student_model.to(device)


In [4]:
import evaluate
import numpy as np

# Comet experiment lookup, currently disabled.
#experiment = comet_ml.get_global_experiment()

# Metric object used by compute_metrics(), which calls accuracy.compute(...).
accuracy = evaluate.load("accuracy")

def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to predicted class ids before metrics are computed.

    Args:
        logits: Either the logits tensor itself or a tuple whose first element
            is the logits tensor.
        labels: Reference labels; returned untouched.

    Returns:
        A ``(pred_ids, labels)`` pair, where ``pred_ids`` is the argmax of the
        logits over the last dimension.
    """
    # When the model returns a tuple, the logits are its first element.
    scores = logits[0] if isinstance(logits, tuple) else logits
    return torch.argmax(scores, dim=-1), labels
    
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is itself a
            sequence whose first element holds the predicted class ids (the
            pair returned by ``preprocess_logits_for_metrics``).

    Returns:
        The result of ``accuracy.compute`` for those ids against ``labels``.
    """
    processed, references = eval_pred
    # Only the first element (the predicted ids) is scored.
    predicted_ids = processed[0]
    return accuracy.compute(predictions=predicted_ids, references=references)


In [5]:
# Core hyper-parameters for the fine-tuning run.
EPOCHS = 4
BATCH_SIZE = 4  # per device, for both training and evaluation
LEARNING_RATE = 4e-5
# Evaluation and checkpointing share one interval: with
# load_best_model_at_end=True the save interval has to line up with the
# evaluation interval, so both are driven by a single constant.
EVAL_STEPS = 390
# Mixed precision needs a CUDA device; TrainingArguments rejects fp16=True on
# CPU, which would break the CPU fallback used when the model is created.
USE_FP16 = torch.cuda.is_available()

# Pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    learning_rate = LEARNING_RATE,
    logging_dir = './logs',
    load_best_model_at_end= True,
    metric_for_best_model="accuracy",
    eval_strategy="steps",
    eval_steps = EVAL_STEPS,
    save_steps = EVAL_STEPS,
    logging_steps = 20,
    save_strategy="steps",
    save_total_limit=2,
    seed = 42,
    #report_to=['comet_ml', 'tensorboard'],
    report_to=['tensorboard'],
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=4,
    fp16 = USE_FP16,
    weight_decay = 0.01,
)

# Wire the model, the training arguments, the data and the metric hooks into a Trainer.
trainer = Trainer(
    model=student_model,
    args=training_args,
    # Full tokenized splits are used for both training and evaluation.
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Logits are first turned into predicted ids, then scored for accuracy.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)


In [6]:
# Run fine-tuning with the Trainer built above. As the cell's last expression,
# the value returned by train() is displayed below the progress table.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
390,0.4046,0.367775,0.83736
780,0.3019,0.305658,0.87472
1170,0.2206,0.29791,0.88044
1560,0.1808,0.315318,0.88068




TrainOutput(global_step=1560, training_loss=0.32507384022076924, metrics={'train_runtime': 10386.978, 'train_samples_per_second': 9.627, 'train_steps_per_second': 0.15, 'total_flos': 1.308404299261008e+16, 'train_loss': 0.32507384022076924, 'epoch': 3.9923224568138194})

In [7]:
# Evaluate with the Trainer's eval_dataset and display the returned metrics dict.
# NOTE(review): the saved output of this cell reports epoch ~2.0 and accuracy
# 0.904, while the saved training table covers ~4 epochs with a best accuracy
# of 0.88068 — the two outputs appear to come from different runs. Re-run the
# notebook top to bottom before quoting either number.
trainer.evaluate()



{'eval_loss': 0.2468193769454956,
 'eval_accuracy': 0.904,
 'eval_runtime': 168.4103,
 'eval_samples_per_second': 148.447,
 'eval_steps_per_second': 9.281,
 'epoch': 1.9961612284069097}

In [4]:
from torchinfo import summary

# Size the dummy batch from the model configuration instead of hard-coding the
# vocabulary size and sequence length, so the cell stays valid if the
# configuration changes.
summary_batch_size = 4
vocab_size = student_config.vocab_size
seq_len = student_config.max_position_embeddings

# Random token ids in [0, vocab_size), created directly on the model's device.
input_ids = torch.randint(
    0, vocab_size, (summary_batch_size, seq_len), dtype=torch.long, device=device
)

# All-ones attention mask: every position is attended to.
attention_mask = torch.ones((summary_batch_size, seq_len), dtype=torch.long, device=device)

# Last expression of the cell: the per-layer summary is displayed.
summary(student_model, input_data={'input_ids': input_ids, 'attention_mask': attention_mask})

Layer (type:depth-idx)                                  Output Shape              Param #
iDistilBertForSequenceClassification                    [4, 512, 768]             --
├─iDistilBertModel: 1-1                                 [4, 512, 768]             --
│    └─Embeddings: 2-1                                  [4, 512, 768]             --
│    │    └─Embedding: 3-1                              [4, 512, 768]             23,440,896
│    │    └─Embedding: 3-2                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-3                              [4, 512, 768]             1,536
│    │    └─Dropout: 3-4                                [4, 512, 768]             --
│    └─iTransformer: 2-2                                [4, 512, 768]             --
│    │    └─ModuleList: 3-5                             --                        42,527,232
├─Linear: 1-2                                           [4, 768]                  590,592
├─Dropout: 1-3                 