In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    TrainingArguments,
    Trainer,PretrainedConfig,
    DataCollatorWithPadding,
    default_data_collator

)
from datasets import load_dataset
import evaluate
from distillation import DistilModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
teacher=AutoModelForSequenceClassification.from_pretrained("/scratch/pb2276/GLUE-pretrain/out_l/cola")

In [3]:
student=AutoModelForSequenceClassification.from_pretrained("/scratch/pb2276/GLUE-pretrain/out/cola")

In [362]:
student2=AutoModelForSequenceClassification.from_pretrained("/scratch/pb2276/GLUE-pretrain/out_p2/rte/checkpoint-100")

In [366]:
task_name="rte"

In [368]:
tokenizer = AutoTokenizer.from_pretrained("/scratch/pb2276/GLUE-pretrain/out_p2/rte")

In [369]:
raw_datasets = load_dataset(
            "nyu-mll/glue",
            task_name)

In [370]:
is_regression = task_name == "stsb"
if not is_regression:
    label_list = raw_datasets["train"].features["label"].names
    num_labels = len(label_list)
else:
    num_labels = 1

In [371]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys[task_name]

In [372]:
padding = "max_length"

In [373]:
max_seq_length = 128

In [374]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [375]:
def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True,return_tensors="pt")
        return result

In [376]:
raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            desc="Running tokenizer on dataset")
raw_datasets.set_format("pt", columns=["input_ids","token_type_ids","attention_mask", "label"], output_all_columns=False)

Running tokenizer on dataset: 100%|██████████| 2490/2490 [00:00<00:00, 7444.08 examples/s]
Running tokenizer on dataset: 100%|██████████| 277/277 [00:00<00:00, 6110.96 examples/s]
Running tokenizer on dataset: 100%|██████████| 3000/3000 [00:00<00:00, 7807.18 examples/s]


In [15]:
from distillation import DistillationLoss, DistilTrainer

In [16]:
l=DistillationLoss(similarity_measure="cka", align_match=[[3, 6], [6, 12]])

In [21]:
data_collator = default_data_collator

In [18]:
training_args=TrainingArguments(output_dir="tmp", remove_unused_columns=False, 
                                do_train=True, do_eval=True, logging_strategy="epoch",eval_strategy ="epoch",  
                                label_names=["labels"], resume_from_checkpoint=True, 
                               )

In [19]:
if task_name is not None:
    metric = evaluate.load("glue",task_name)
elif is_regression:
    metric = evaluate.load("mse")
else:
    metric = evaluate.load("accuracy")

import numpy as np

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    result = metric.compute(predictions=preds, references=p.label_ids)
    if len(result) > 1:
        result["combined_score"] = np.mean(list(result.values())).item()
    return result

In [20]:
trainer=DistilTrainer(student_model=student, teacher_model=teacher, loss_fn=l, 
                      train_dataset= raw_datasets["train"].select(range(10)), eval_dataset = raw_datasets["validation"].select(range(5)), 
                      data_collator=default_data_collator, args=training_args,compute_metrics=compute_metrics)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
s=trainer.predict(raw_datasets["validation"].select(range(10)))

In [None]:
s

In [None]:
trainer.train()

In [43]:
s2=trainer.predict(raw_datasets["validation"].select(range(10)))

In [None]:
s2

In [None]:
s2.predictions[1]

In [None]:
s.predictions[1]

In [38]:
f=raw_datasets["validation"].select(range(10))

In [None]:
f[:10]

In [47]:
ff=raw_datasets.remove_columns(["label"])

In [50]:
student2=AutoModelForSequenceClassification.from_pretrained("/scratch/pb2276/GLUE-pretrain/out/cola")

In [51]:
test_out = student2(**ff["validation"][:10])

In [None]:
test_out

In [64]:
s.predictions[0]

array([[-3.1297874 ,  1.8954954 ],
       [-2.9468465 ,  1.7077515 ],
       [-3.102969  ,  1.9307866 ],
       [-2.1478105 ,  0.9368223 ],
       [ 2.755     , -1.5207777 ],
       [-2.515923  ,  1.3795925 ],
       [-1.239342  ,  0.9594818 ],
       [-0.19918849,  0.29507843],
       [-3.1980803 ,  1.8948785 ],
       [-3.3853219 ,  2.239891  ]], dtype=float32)

In [55]:
s2.predictions[0]

array([[-3.2961469,  2.099805 ],
       [-3.0494585,  1.8269999],
       [-3.253298 ,  2.065838 ],
       [-3.048266 ,  1.8305788],
       [-2.209058 ,  1.7699184],
       [-2.8450418,  1.6309398],
       [-3.53449  ,  2.3524985],
       [-3.3218148,  2.1642723],
       [-3.0117486,  1.7694576],
       [-3.5312912,  2.374811 ]], dtype=float32)

In [57]:
updated_student= trainer.model.student_model

In [60]:

updated_student=updated_student.to(torch.device("cpu"))

In [62]:
s3=updated_student(**ff["validation"][:10])

In [63]:
s3

SequenceClassifierOutput(loss=None, logits=tensor([[-3.2961,  2.0998],
        [-3.0495,  1.8270],
        [-3.2533,  2.0658],
        [-3.0483,  1.8306],
        [-2.2091,  1.7699],
        [-2.8450,  1.6309],
        [-3.5345,  2.3525],
        [-3.3218,  2.1643],
        [-3.0117,  1.7695],
        [-3.5313,  2.3748]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [65]:
trainer.save_model()

In [66]:
trainer.save_state()

# Significance

In [178]:
model = DistilModel(student, teacher)

In [179]:
import safetensors

In [549]:
safetensors.torch.load_model(model,"linear_middle_layer/rte/model.safetensors")

(set(), [])

In [531]:
model2=model.student_model

In [532]:
i=0
for p1, p2 in zip(student2.parameters(), model2.parameters()):
    i+=1
    if i == 10:
        break

In [None]:
p1

In [None]:
p2

In [550]:
labels=raw_datasets["validation"]["label"].numpy()

In [551]:
len(labels)

277

In [552]:
ff=raw_datasets.remove_columns(["label"])

In [553]:
cka_outputs=[]
from tqdm import tqdm
for i in tqdm(range(3)):
    preds=model2(**ff["validation"][i*100:(i+1)*100])
    output=torch.argmax(preds.logits.detach(), dim=1).numpy()
    cka_outputs+= list(output)

100%|██████████| 3/3 [00:05<00:00,  1.98s/it]


In [382]:
ft_output=[]
from tqdm import tqdm
for i in tqdm(range(3)):
    preds=student2(**ff["validation"][i*100:(i+1)*100])
    output=torch.argmax(preds.logits.detach(), dim=1).numpy()
    ft_output+= list(output)

100%|██████████| 3/3 [00:06<00:00,  2.19s/it]


In [517]:
import numpy as np
len(cka_outputs)

277

In [554]:
n_cka_correct= np.array(cka_outputs) == labels

In [555]:
n_ft_correct=np.array(ft_output)==labels

In [556]:
both_correct = np.sum( (n_cka_correct * n_cka_correct).astype(int) )

In [557]:
cka_correct = np.sum( (n_cka_correct * (~n_ft_correct)).astype(int))

In [558]:
ft_correct = np.sum( (n_ft_correct  * (~n_cka_correct)).astype(int))

In [559]:
both_incorrect = np.sum( (~n_cka_correct * ~n_cka_correct).astype(int) )

In [560]:
both_incorrect + both_correct + cka_correct +ft_correct

311

In [561]:
table=[[both_correct, cka_correct], [ft_correct, both_incorrect]]

In [562]:
print(table)

[[183, 23], [11, 94]]


In [563]:
from statsmodels.stats.contingency_tables import mcnemar

In [564]:
result = mcnemar(table, exact=True)

In [565]:
result.pvalue

0.05761267291381955

# Ranks etc


In [25]:
model = DistilModel(student, teacher)

In [27]:
student_output, teacher_output = model(**filtered_raw["validation"][:20])

In [29]:
align_student, align_teacher=l(student_output.logits, teacher_output.logits, student_output.hidden_states, teacher_output.hidden_states)

In [34]:
align_student[0]

tensor([[ 0.6397,  0.2883, -0.3506,  ...,  0.1039,  0.2825,  0.0324],
        [-0.3096,  0.2155,  0.3793,  ..., -0.9359,  0.8348,  0.3450],
        [ 0.3856, -0.3995,  0.7752,  ..., -0.3891,  1.5308, -0.1334],
        ...,
        [ 0.0819, -0.1230, -0.2317,  ..., -0.5294, -0.0580, -0.2894],
        [-0.0236, -0.2535, -0.2139,  ..., -0.5686,  0.0900, -0.1302],
        [ 0.0441, -0.1422, -0.2268,  ..., -0.7129,  0.0716, -0.1383]],
       grad_fn=<SelectBackward0>)

In [40]:
import torch


tensor([128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128])

In [39]:
align_student=torch.nn.functional.pad(align_student, (0, 1024-768))

In [43]:
wxy= torch.bmm(align_student.transpose(1,2), align_teacher)

In [50]:
U, _, Vt = torch.linalg.svd(wxy, full_matrices=False)

In [52]:
Vt.shape

torch.Size([40, 1024, 1024])