## Premier test avec PyTorch et premier hook

In [1]:
import requests
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch.nn.functional as F
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Environment diagnostics, each printed once and labelled so the cell output
# can be read without looking back at the code.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version of this PyTorch build: {torch.version.cuda}")
print(f"cuDNN enabled: {torch.backends.cudnn.enabled}")

# Allow cuDNN to benchmark its algorithms and keep the fastest one.
# NOTE(review): this mainly affects convolution kernels; confirm it brings
# anything for this model before relying on it.
torch.backends.cudnn.benchmark = True

Using device: cuda
True
12.6
True


In [5]:
# Load the pretrained "gpt2" checkpoint: first the fast tokenizer, then the
# language model, which is moved onto the selected device.
model_name = "gpt2"

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
# The end-of-sequence token doubles as the padding token so that several
# prompts can be padded into one batch. The padding side is left at the
# tokenizer default (an override to "left" was tried and disabled).
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

In [6]:
# Show the model's module tree; as the last expression of the cell it is
# rendered as the cell output.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
def output_hook(module, input, output):
    """Forward hook that prints what a module produced.

    Prints three lines: the module's repr tagged with ``output``, the shape of
    ``output``, and ``output`` itself. ``input`` is ignored. Nothing is
    returned, so the hook only observes.
    """
    print(f'{module} : output', output.shape, output, sep="\n")
def input_hook(module, input, output):
    """Forward hook that prints what a module received.

    Prints two lines: the module's repr tagged with ``input``, then ``input``
    as passed to the hook. ``output`` is ignored. Nothing is returned, so the
    hook only observes.
    """
    print(f'{module} : input', input, sep="\n")

def naive_noise_hook(module, input, output):
    """Forward hook that perturbs a module's output with standard normal noise.

    An independent N(0, 1) sample is added to every element of ``output`` and
    the perturbed tensor is returned.

    TODO: restrict the noise to the relevant tokens only, and scale it with an
    appropriate variance (the original note suggested ``sqrt(3 * variance)``).
    """
    return output + torch.randn_like(output)

In [8]:
# Observation hooks: for the token embedding (wte), the position embedding
# (wpe) and the embedding dropout (drop), print the input first and the output
# second. The handles are kept so the hooks can be removed later.
embedding_stack = model.transformer
observers = (input_hook, output_hook)

hook1, hook2 = (embedding_stack.wte.register_forward_hook(fn) for fn in observers)
hook3, hook4 = (embedding_stack.wpe.register_forward_hook(fn) for fn in observers)
hook5, hook6 = (embedding_stack.drop.register_forward_hook(fn) for fn in observers)


In [9]:
# Hook that adds noise: naive_noise_hook is attached to the embedding dropout
# and its return value is used as that module's output.
# NOTE(review): if hooks fire in registration order, any printing hook already
# attached to `drop` shows the output *before* the noise is added; confirm.
noise_hook1 = model.transformer.drop.register_forward_hook(naive_noise_hook)

In [10]:
# Second run, this time with the noise hook active.
input = tokenizer("Audible.com is owned by", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(
        **input,
        labels=input.input_ids,
        output_hidden_states=True,
        output_attentions=True,
    )

# Next-token distribution at the final position of the (single) prompt.
probs = torch.softmax(outputs.logits[0, -1, :], dim=-1)
top_probs, top_indices = probs.topk(4)

# Decode the four most likely tokens and print each with its probability.
top_words = [tokenizer.decode([idx]) for idx in top_indices]
for word, prob in zip(top_words, top_probs):
    print(f"{word}: {prob.item():.4f}")



Embedding(50257, 768) : input
(tensor([[16353,   856,    13,   785,   318,  6898,   416]], device='cuda:0'),)
Embedding(50257, 768) : output
torch.Size([1, 7, 768])
tensor([[[ 0.1093, -0.0582,  0.1388,  ...,  0.2905,  0.1858, -0.1798],
         [-0.1931,  0.1698,  0.1210,  ...,  0.1461, -0.2375, -0.0535],
         [ 0.0466, -0.0113,  0.0283,  ..., -0.0735,  0.0496,  0.0963],
         ...,
         [-0.0097,  0.0101,  0.0556,  ...,  0.1145, -0.0380, -0.0254],
         [ 0.0211,  0.1182,  0.0958,  ..., -0.1856, -0.1424,  0.1010],
         [ 0.0040,  0.0265,  0.0364,  ..., -0.0668, -0.0158,  0.1041]]],
       device='cuda:0')
Embedding(1024, 768) : input
(tensor([[0, 1, 2, 3, 4, 5, 6]], device='cuda:0'),)
Embedding(1024, 768) : output
torch.Size([1, 7, 768])
tensor([[[-1.8821e-02, -1.9742e-01,  4.0267e-03,  ..., -4.3044e-02,
           2.8267e-02,  5.4490e-02],
         [ 2.3959e-02, -5.3792e-02, -9.4879e-02,  ...,  3.4170e-02,
           1.0172e-02, -1.5573e-04],
         [ 4.2161e-03, -

In [11]:
# Detach the six observation hooks and the naive noise hook, in the order in
# which they were registered.
for handle in (hook1, hook2, hook3, hook4, hook5, hook6, noise_hook1):
    handle.remove()

## Détermine si un token se réfère au sujet

In [12]:
# Download the prompt dataset (a JSON list of records) from rome.baulab.info.
url = 'https://rome.baulab.info/data/dsets/known_1000.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail with the HTTP error itself rather than an opaque JSON decoding error
# when the server answers with an error page.
response.raise_for_status()
data = response.json()

In [13]:
# Pull the prompt text and the subject string out of every dataset record.
prompts = [record['prompt'] for record in data]
subjects = [record['subject'] for record in data]

# Tokenize all prompts as one padded batch. The offset mapping (character
# offsets of each token) is requested so that tokens can later be matched
# against the subject's position in the prompt.
input = tokenizer(
    prompts,
    padding=True,
    return_offsets_mapping=True,
    return_tensors="pt",
).to(device)

In [14]:
def _subject_token_mask(prompts, subjects, offset_mapping):
    """Flag, for every prompt, the tokens whose character span lies in the subject.

    A token is flagged when its start offset is at most one character before
    the first occurrence of the subject in the prompt and its end offset does
    not pass the end of the subject. The one character of slack presumably
    admits a token that carries the preceding space; TODO confirm.

    Args:
        prompts: list of prompt strings.
        subjects: list of subject strings, aligned with ``prompts``.
        offset_mapping: integer tensor indexed as ``[prompt, token, 0 or 1]``
            holding the (start, end) character offsets of each token.

    Returns:
        ``torch.int`` tensor with one row per prompt and one column per token,
        1 for subject tokens and 0 otherwise. Prompts that do not contain
        their subject get an all-zero row, and a warning is emitted.
    """
    import warnings

    # One `find` per prompt instead of two per token.
    starts = [prompt.find(subject) for prompt, subject in zip(prompts, subjects)]
    missing = [row for row, start in enumerate(starts) if start < 0]
    if missing:
        warnings.warn(
            f"{len(missing)} prompt(s) do not contain their subject "
            f"(first rows: {missing[:5]}); their mask is left empty."
        )

    where = offset_mapping.device
    subject_start = torch.tensor(starts, dtype=torch.long, device=where).unsqueeze(-1)
    subject_length = torch.tensor(
        [len(subject) for subject in subjects], dtype=torch.long, device=where
    ).unsqueeze(-1)
    subject_end = subject_start + subject_length

    token_start = offset_mapping[..., 0]
    token_end = offset_mapping[..., 1]
    inside = (token_start >= subject_start - 1) & (token_end <= subject_end)
    # `find` returned -1: without this, the leading tokens would be flagged.
    inside &= subject_start >= 0
    return inside.int()


subject_mask = _subject_token_mask(prompts, subjects, input.offset_mapping)
# Per-prompt rows, kept under the original name for cells that expect the list.
mask = list(subject_mask)
# Padding positions can satisfy the offset test, so they are cleared with the
# attention mask.
masks_tensor = torch.logical_and(subject_mask, input.attention_mask).int()
masks_tensor

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0]], device='cuda:0', dtype=torch.int32)

Pour le prompt i, `masks_tensor[i]` est un masque qui indique, pour chaque token, s'il se réfère au sujet (1 si c'est le cas, 0 sinon).

## Ajoute le bruit sur les tokens du sujet

In [15]:
def noise_hook(module, input, output):
    """Forward hook that adds Gaussian noise to the masked positions only.

    The noise is drawn with a standard deviation of three times the standard
    deviation taken over every element of ``output``. It is multiplied by the
    module-level ``masks_tensor`` (with a trailing axis added), so positions
    whose mask entry is 0 come back unchanged.

    NOTE(review): ``masks_tensor`` is read from the enclosing namespace at call
    time and must broadcast against ``output``; make sure it describes the
    batch currently going through the model.
    """
    sigma = output.flatten().std()
    scaled_noise = torch.randn_like(output) * 3 * sigma
    selected = masks_tensor.unsqueeze(-1).float()
    return output + scaled_noise * selected

In [16]:
def last_non_padding_token_logits(logits, attention_mask):
    """Return, for each sequence, the logits at its last non-padding position.

    Args:
        logits: tensor indexed as ``[batch, position, ...]``.
        attention_mask: tensor indexed as ``[batch, position]`` whose entries
            equal 1 at real tokens; every other value counts as padding.

    Returns:
        Tensor with one row per sequence holding ``logits[i, last_i]``, where
        ``last_i`` is the highest position at which ``attention_mask[i]`` is 1.
        An empty batch yields an empty tensor.

    Raises:
        IndexError: if a sequence has no position with attention mask 1.
    """
    if logits.size(0) == 0:
        return logits.new_empty((0, *logits.shape[2:]))

    is_token = attention_mask == 1
    if not bool(is_token.any(dim=1).all()):
        raise IndexError(
            "every row of attention_mask must contain at least one non-padding token"
        )

    # Multiplying by the position index zeroes the padding, so the row-wise
    # maximum is the index of the last real token. This replaces a Python loop
    # that called `nonzero` once per sequence.
    positions = torch.arange(is_token.size(1), device=is_token.device)
    last_index = (is_token * positions).max(dim=1).values

    rows = torch.arange(logits.size(0), device=logits.device)
    return logits[rows, last_index.to(logits.device)]

Séparation des prompts en batchs pour optimiser les performances GPU

In [20]:
# Cut the prompt list into consecutive chunks of `batch_size` prompts (the
# last chunk may be shorter), and prepare the list that collects the
# noise-free logits of each chunk.
batch_size = 100
batch_starts = range(0, len(prompts), batch_size)
batchedInput = [prompts[start:start + batch_size] for start in batch_starts]
logits_utile_batched_no_noise = []

sans le bruit

In [21]:
# Clean (noise-free) pass over every batch. The result list is reset here so
# that re-running this cell does not append a second copy of every batch.
logits_utile_batched_no_noise = []
for batch in batchedInput:
    print("in progress")
    input = tokenizer(batch, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        # Only the logits are used, so the loss, hidden states and attention
        # maps are not requested; they would be discarded immediately.
        batch_logits = model(**input).logits
    logits_utile_batched_no_noise.append(
        last_non_padding_token_logits(batch_logits, input.attention_mask)
    )
    # Drop the per-batch tensors before releasing cached GPU memory.
    del input, batch_logits
    torch.cuda.empty_cache()

in progress
in progress
in progress
in progress
in progress
in progress
in progress
in progress
in progress
in progress
in progress
in progress
in progress


avec le bruit cette fois : 

In [None]:
masked_noise_hook = model.transformer.drop.register_forward_hook(noise_hook)
logits_utile_batched_with_noise = []

# noise_hook multiplies the noise by the module-level `masks_tensor`, which was
# built for ALL prompts padded to one common length. A batch here holds only
# some of the prompts, padded to its own longest prompt, so the full mask does
# not line up with the hooked output. For each forward pass `masks_tensor` is
# therefore pointed at the rows of the current batch, cut to the batch's
# sequence length. This relies on the batches being consecutive slices of
# `prompts` and on padding being appended on the right.
all_prompts_mask = masks_tensor
first_row = 0
try:
    for batch in batchedInput:
        input = tokenizer(batch, return_tensors="pt", padding=True).to(device)
        last_row = first_row + len(batch)
        masks_tensor = all_prompts_mask[first_row:last_row, :input.input_ids.shape[1]]
        with torch.no_grad():
            # Only the logits are used; loss, hidden states and attention maps
            # are not requested.
            batch_logits = model(**input).logits
        logits_utile_batched_with_noise.append(
            last_non_padding_token_logits(batch_logits, input.attention_mask)
        )
        first_row = last_row
finally:
    # Put the full mask back, even if a batch failed, so other cells still see
    # one row per prompt.
    masks_tensor = all_prompts_mask



In [12]:
# Join the per-batch logits of the noisy run into one tensor, one row per prompt.
logits_utile = torch.cat(logits_utile_batched_with_noise)

# Next-token distribution for every prompt; the four most likely tokens are
# reported for the first prompt only.
probs = torch.softmax(logits_utile, dim=-1)
top_probs, top_indices = probs[0].topk(4)

# Decode the tokens and print each with its probability.
top_words = [tokenizer.decode([idx]) for idx in top_indices]
for word, prob in zip(top_words, top_probs):
    print(f"{word}: {prob.item():.4f}")

 the: 0.5994
 Europe: 0.0154
 Africa: 0.0143
 a: 0.0143


In [None]:
# Detach the masked noise hook so that later forward passes are not perturbed.
masked_noise_hook.remove()

## Rectified corrupted run 