In [None]:
import csv
import math
import os
import pickle
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from accelerate import Accelerator
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import TrainingArguments
from transformers import default_data_collator
from transformers import get_scheduler
from transformers import pipeline

import comprehension_model
import preprocessing

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Identifier of the baseline checkpoint; it is reused below to build the baseline model.
model_checkpoint = "KBLab/bert-base-swedish-cased"
# The tokenizer is built by the project-local `preprocessing` helper from that identifier.
tokenizer =preprocessing.create_tokenizer(model_checkpoint)

In [None]:
# Baseline masked-LM built from `model_checkpoint`, placed on the selected device.
model_kb = preprocessing.create_model_MLM(model_checkpoint).to(device)

In [None]:
# Load the pre-tokenised LM dataset from disk.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("lm_dataset.pkl", "rb") as pickled_file:
    lm_datasets = pickle.load(pickled_file)

In [None]:
# Load the validation dataset from disk.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("valid_dataset.pkl", "rb") as pickled_file:
    valid_dataset = pickle.load(pickled_file)

# Bare expression: displays the dataset summary as the cell output.
valid_dataset

In [None]:
# Prepare the evaluation datasets.
# NOTE(review): this cell overwrites `valid_dataset` twice (column removal, then the
# deterministic-eval helper), so it is not idempotent; running it a second time without
# reloading the pickle operates on the already-processed dataset.
valid_dataset=valid_dataset.remove_columns(["word_ids"])
# Masking collator from the project helper; 0.15 is presumably the masking probability - confirm in `preprocessing`.
data_collator = preprocessing.data_collector_masking(tokenizer,0.15)
lm_dataset_bis = lm_datasets.remove_columns(["word_ids","token_type_ids"])

print(lm_dataset_bis["test"])
# Both evaluation sets go through the same project helper with the masking collator.
eval_dataset = preprocessing.create_deterministic_eval_dataset(lm_dataset_bis["test"],data_collator)
valid_dataset=preprocessing.create_deterministic_eval_dataset(valid_dataset,data_collator)

In [None]:
from datasets import Dataset
#valid_dataset=valid_dataset.remove_columns(["word_ids"])
# Rebuild the masking collator (same arguments as in the previous cell).
data_collator = preprocessing.data_collector_masking(tokenizer,0.15)
# Smaller evaluation set built from the first 10000 rows of `valid_dataset`.
# NOTE(review): `valid_dataset` has already been passed through
# create_deterministic_eval_dataset in the previous cell, so the helper is applied a
# second time here - confirm that this is intended.
small_valid_dataset = preprocessing.create_deterministic_eval_dataset(valid_dataset.select(range(10000)),data_collator)
small_valid_dataloader=preprocessing.create_dataloader(small_valid_dataset,64,default_data_collator)

In [None]:
# Batch size shared by the train, eval and validation dataloaders built in this cell.
batch_size = 64
# The training loader uses the masking collator (unlike the evaluation loaders below).
train_dataloader = preprocessing.create_dataloader(lm_dataset_bis["train"],batch_size,data_collator)
def to_device(batch, target_device=None):
    """Return a copy of ``batch`` with every value moved to a device.

    Args:
        batch: Mapping whose values expose a ``.to(device)`` method.
        target_device: Device to move the values to. When omitted, the
            notebook-level ``device`` variable is used, which keeps existing
            ``to_device(batch)`` calls working unchanged.

    Returns:
        A new dict with the same keys and the moved values.
    """
    if target_device is None:
        # Fall back to the notebook-wide device selected at the top of the notebook.
        target_device = device
    return {key: value.to(target_device) for key, value in batch.items()}

print("ok")
# Evaluation loaders use `default_data_collator` rather than the masking collator.
eval_dataloader = preprocessing.create_dataloader(eval_dataset,batch_size,default_data_collator)
valid_dataloader=preprocessing.create_dataloader(valid_dataset,batch_size,default_data_collator)

In [None]:
# Fine-tuned masked-LM restored from a local checkpoint and placed on the selected device.
model_hugging_face = AutoModelForMaskedLM.from_pretrained(
    "finetuning_hugging_whitespace_bis-finetuned-imdb/checkpoint-2061000"
).to(device)

In [None]:
# Second fine-tuned masked-LM restored from a local checkpoint and placed on the selected device.
model_exbert = AutoModelForMaskedLM.from_pretrained(
    "exbert-finetuned-imdb/checkpoint-1271340"
).to(device)

In [None]:
import transformers
# Load the configuration stored with the from-scratch checkpoint, then the model itself.
config = transformers.BertConfig.from_pretrained("pretraining_from_scratch/checkpoint-3944175")
# NOTE: trust_remote_code=True allows code shipped with the checkpoint to run; only use trusted checkpoints.
# NOTE(review): unlike the other models in this notebook, this one is not moved to `device` - confirm.
mosaicBert = AutoModelForMaskedLM.from_pretrained("pretraining_from_scratch/checkpoint-3944175",config=config,trust_remote_code=True)

In [None]:
def get_most_frequent_tokens(valid_dataset, top_n=1000):
    """Return the ``top_n`` most frequent label ids found in ``valid_dataset``.

    Args:
        valid_dataset: Iterable of examples, each with a ``'labels'`` sequence.
        top_n: Maximum number of ids to return.

    Returns:
        List of label ids ordered from most to least frequent. Labels equal to
        ``-100`` are not counted.
    """
    label_counter = Counter(
        label_id
        for row in valid_dataset
        for label_id in row['labels']
        if label_id != -100
    )
    return [label_id for label_id, _ in label_counter.most_common(top_n)]

# Most frequent label ids in the validation set (default top_n=1000).
# Only `valid_dataset` has to be loaded beforehand; no tokenizer is needed for this call.
most_frequent_tokens = get_most_frequent_tokens(valid_dataset)


def filter_and_process_dataset(valid_dataset, valid_token_list, tokenizer, preprocessing, max_examples=10000, max_filtered=1000):
    """Build one processed dataset per token id.

    For every id in ``valid_token_list``, the first ``max_examples`` rows of
    ``valid_dataset`` are filtered with ``special_token(token_id, example)``,
    mapped through ``preprocessing.get_context_with_mask`` and truncated to at
    most ``max_filtered`` rows. Token ids with no matching row are skipped.

    Args:
        valid_dataset: Dataset supporting ``len``, ``select``, ``filter`` and ``map``.
        valid_token_list: Iterable of token ids to process.
        tokenizer: Forwarded to ``preprocessing.get_context_with_mask``.
        preprocessing: Object exposing ``get_context_with_mask(example, token_id, tokenizer)``.
        max_examples: Upper bound on the number of leading rows that are scanned.
        max_filtered: Upper bound on the number of rows kept per token.

    Returns:
        Dict mapping token id to its processed dataset, in the iteration order
        of ``valid_token_list``.
    """
    # NOTE(review): `special_token` is looked up in the notebook's global namespace and
    # is not defined in this notebook as shown - make sure it is defined or imported first.

    # Clamp so a dataset shorter than `max_examples` does not make `select` fail, and
    # take the subset once because it does not depend on the token.
    subset_size = min(len(valid_dataset), max_examples)
    candidate_rows = valid_dataset.select(range(subset_size))

    filtered_datasets = {}

    # Iterate over the caller-supplied tokens (previously a global list was used and
    # the `valid_token_list` argument was silently ignored).
    for token_id in valid_token_list:
        # Filter the dataset for the current token
        valid_filtered_dataset = candidate_rows.filter(lambda example: special_token(token_id, example))

        # Skip if the filtered dataset is empty
        if len(valid_filtered_dataset) == 0:
            continue

        # Process the filtered dataset with the context mask
        valid_sentence_filtered = valid_filtered_dataset.map(lambda example: preprocessing.get_context_with_mask(example, token_id, tokenizer))

        # Limit to max_filtered examples for efficiency
        limited_dataset = valid_sentence_filtered.select(range(min(len(valid_sentence_filtered), max_filtered)))

        filtered_datasets[token_id] = limited_dataset
        print(f"Processed token ID: {token_id} with {len(limited_dataset)} examples")

    return filtered_datasets


# Build the per-token datasets from the most frequent label ids computed above.
# (The previous argument, `valid_tokens_list`, was never defined and raised NameError.)
filtered_datasets = filter_and_process_dataset(valid_dataset, most_frequent_tokens, tokenizer, preprocessing)

# Start a fresh results file containing only the header row; the analysis loop appends
# one row per token. newline="" is what the csv module expects for files it writes.
with open("word_embedding_distance_layer12.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["indices", "token","euclidean_distances","cosine similarity" ])




In [None]:
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Per-token aggregates; the three lists stay index-aligned because each iteration
# appends exactly one entry to each of them.
euclidean_measures =[]
cosine_measures = []
tokens=[]

# `row_index` is the position of the token in the lists above and is written to the
# "indices" column (the previous code referenced an undefined name `i`).
for row_index, token in enumerate(filtered_datasets):
    valid_sentence_filtered = filtered_datasets[token]
    dataloader = preprocessing.create_dataloader(valid_sentence_filtered, 1, default_data_collator)

    # The same dataloader feeds both models so their embeddings can be compared.
    baseline_embeddings = comprehension_model.get_embeddings(model_kb, dataloader, tokenizer)
    finetuned_embeddings = comprehension_model.get_embeddings(model_hugging_face, dataloader, tokenizer)

    # Only the last entry returned by get_embeddings is compared here
    # (presumably the final layer, per the output file name - confirm in comprehension_model).
    euclidean_distances = [euclidean(baseline, finetuned) for baseline, finetuned in zip(baseline_embeddings[-1], finetuned_embeddings[-1])]
    avg_euclidean = np.mean(euclidean_distances)
    euclidean_measures.append(avg_euclidean)

    # NOTE(review): cosine_similarity returns the full pairwise matrix, so this mean is
    # taken over ALL baseline/fine-tuned pairs, whereas the Euclidean figure above only
    # uses matched pairs. Use np.diag(cosine_measure) if matched pairs are intended.
    cosine_measure = cosine_similarity(baseline_embeddings[-1], finetuned_embeddings[-1])
    avg_cosine = np.mean(cosine_measure)
    cosine_measures.append(avg_cosine)
    tokens.append(token)

    # Append immediately so partial results survive an interrupted run.
    with open("word_embedding_distance_layer12.csv", "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([row_index, token, avg_euclidean, avg_cosine])
    print(tokenizer.decode(token))
    
# Rank tokens by their mean Euclidean distance and keep the 50 largest / 50 smallest.
_positions = range(len(euclidean_measures))
high_separated_indices12 = sorted(_positions, key=euclidean_measures.__getitem__, reverse=True)[:50]
least_separated_indices12 = sorted(_positions, key=euclidean_measures.__getitem__)[:50]

# Distances for the selected positions.
high_separated_values12 = [euclidean_measures[pos] for pos in high_separated_indices12]
least_separated_values12 = [euclidean_measures[pos] for pos in least_separated_indices12]
print(high_separated_values12)
print(least_separated_values12)

# Token ids for the selected positions, then their decoded text.
high_separated_tokens12 = [tokens[pos] for pos in high_separated_indices12]
least_separated_tokens12 = [tokens[pos] for pos in least_separated_indices12]
high_separated_words12 = [tokenizer.decode([token_idx]) for token_idx in high_separated_tokens12]
least_separated_words12 = [tokenizer.decode([token_idx]) for token_idx in least_separated_tokens12]
print(high_separated_words12)
print(least_separated_words12)

In [None]:
# Similarity metrics on the first entry returned by get_embeddings.
# NOTE(review): `baseline_embeddings` / `finetuned_embeddings` are whatever the last
# iteration of the per-token loop left behind, so these numbers describe that token only.
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

first_baseline = baseline_embeddings[0]
first_finetuned = finetuned_embeddings[0]

cos_similarities = cosine_similarity(first_baseline, first_finetuned)
euclidean_distances = [
    euclidean(baseline_vec, finetuned_vec)
    for baseline_vec, finetuned_vec in zip(first_baseline, first_finetuned)
]

print("Average Cosine Similarity:", np.mean(cos_similarities))
print("Average Euclidean Distance:", np.mean(euclidean_distances))

In [None]:
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
import seaborn as sns
import matplotlib.pyplot as plt

# Define the kernelized correlation coefficient function
def kernelized_corr_coef(x, y, K1, K2, K12, emb_mean1, emb_mean2, n):
    """Correlation matrix between the rows of ``x`` and the rows of ``y``.

    Entry ``(i, j)`` is
    ``(x_i K12 y_j^T / n - mu1_i * mu2_j) / sqrt(var1_i * var2_j)`` with
    ``mu1 = x @ emb_mean1``, ``mu2 = y @ emb_mean2`` and
    ``var_k = (rows K rows^T)_diag / n - mu_k**2``.

    Args:
        x, y: 2-D tensors whose rows are the vectors to compare.
        K1, K2: Kernel matrices used for the variances of ``x`` and ``y``.
        K12: Cross kernel matrix used for the covariance term.
        emb_mean1, emb_mean2: Mean vectors used to centre ``x`` and ``y``.
        n: Normalisation constant applied to every kernel product.

    Returns:
        Tensor of shape ``(x.shape[0], y.shape[0])``. The shapes of both mean
        projections are printed as a side effect.
    """
    def _variance(rows, kernel, mean_proj):
        # Second moment under the kernel minus the squared mean.
        return (1/n * (rows @ kernel) * rows).sum(dim=-1) - mean_proj**2

    mu1 = x @ emb_mean1
    mu2 = y @ emb_mean2
    print(mu1.shape)
    print(mu2.shape)

    var_x = _variance(x, K1, mu1)
    var_y = _variance(y, K2, mu2)
    cross_moment = 1 / n * x @ K12 @ y.T
    return (cross_moment - torch.outer(mu1, mu2)) / torch.sqrt(torch.outer(var_x, var_y))


# Collect parameters from models
num_layers = 12


def _stack_layer_weights(state, key_template, transpose):
    """Concatenate one weight matrix per encoder layer along dim 0.

    ``key_template`` is formatted with the layer index; each matrix is
    transposed first when ``transpose`` is true.
    """
    per_layer = [state[key_template.format(j)] for j in range(num_layers)]
    if transpose:
        per_layer = [weight.T for weight in per_layer]
    return torch.cat(per_layer)


_LAYER_PREFIX = "bert.encoder.layer.{}."
# Read each state dict once instead of rebuilding it for every layer and matrix.
_kb_state = model_kb.state_dict()
_ft_state = model_hugging_face.state_dict()

# NOTE(review): the attention matrices are square, so their orientation cannot be
# checked from shapes alone; the original transposition choices are kept as they were.
multi_WQ_kb = _stack_layer_weights(_kb_state, _LAYER_PREFIX + "attention.self.query.weight", True)
multi_WV_kb = _stack_layer_weights(_kb_state, _LAYER_PREFIX + "attention.self.value.weight", True)
multi_WK_kb = _stack_layer_weights(_kb_state, _LAYER_PREFIX + "attention.self.key.weight", True)
# nn.Linear stores its weight as (out_features, in_features):
#   intermediate.dense.weight is (intermediate, hidden) -> rows already live in hidden space;
#   output.dense.weight is (hidden, intermediate)       -> must be transposed to get such rows.
# The previous code had these two the other way round, producing rows of size
# `intermediate` that cannot be multiplied with the hidden-size kernels below.
multi_K_kb = _stack_layer_weights(_kb_state, _LAYER_PREFIX + "intermediate.dense.weight", False)
multi_V_kb = _stack_layer_weights(_kb_state, _LAYER_PREFIX + "output.dense.weight", True)
multi_WO_kb = _stack_layer_weights(_kb_state, _LAYER_PREFIX + "attention.output.dense.weight", False)
multi_E_kb = _kb_state['bert.embeddings.word_embeddings.weight'].T

multi_WQ_ft = _stack_layer_weights(_ft_state, _LAYER_PREFIX + "attention.self.query.weight", True)
multi_WV_ft = _stack_layer_weights(_ft_state, _LAYER_PREFIX + "attention.self.value.weight", True)
multi_WK_ft = _stack_layer_weights(_ft_state, _LAYER_PREFIX + "attention.self.key.weight", True)
multi_K_ft = _stack_layer_weights(_ft_state, _LAYER_PREFIX + "intermediate.dense.weight", False)
multi_V_ft = _stack_layer_weights(_ft_state, _LAYER_PREFIX + "output.dense.weight", True)
multi_WO_ft = _stack_layer_weights(_ft_state, _LAYER_PREFIX + "attention.output.dense.weight", False)
multi_E_ft = _ft_state['bert.embeddings.word_embeddings.weight'].T

# Ensure correct kernel matrices calculation
# multi_E_* are (hidden, vocab) after the transpose, so each kernel is (hidden, hidden).
# NOTE(review): kernel12 requires both models to have the same vocabulary size, and `n`
# below should equal that size - confirm len(tokenizer) matches the embedding matrices.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
kernel11 = (multi_E_kb @ multi_E_kb.T).to(device)
kernel22 = (multi_E_ft @ multi_E_ft.T).to(device)
kernel12 = (multi_E_kb @ multi_E_ft.T).to(device)

# Mean embedding over the vocabulary axis.
emb_mean1 = torch.mean(multi_E_kb, dim=1).to(device)
emb_mean2 = torch.mean(multi_E_ft, dim=1).to(device)

# Matrices being compared: feed-forward output ("value") vectors of both models.
param1, param2 = multi_V_kb, multi_V_ft

# Calculate the kernelized correlation coefficient
# NOTE: S is (rows of param1) x (rows of param2); with every layer stacked this is a
# very large dense matrix, so expect a high memory footprint.
S = kernelized_corr_coef(param1.to(device), param2.to(device), kernel11, kernel22, kernel12, emb_mean1, emb_mean2, n=len(tokenizer))
print(S)
layer_size = param1.shape[0] // num_layers

# Average |correlation| inside each (baseline layer, fine-tuned layer) block.
S_agg = S.view(num_layers, layer_size, num_layers, layer_size).abs().mean([-1, -3]).cpu().numpy()
print(S_agg.shape)
plt.figure(figsize=(10,6))
sns.heatmap(S_agg)
plt.title('Pearson Correlation between feed forward output layers of finetuned model and baseline model ')
plt.show()



In [None]:
# Count how often every token id occurs in the training split.
token_frequencies = Counter(
    token
    for train_row in lm_datasets["train"]
    for token in train_row['input_ids']
)

In [None]:
# Keep the most frequent 80% of the distinct token ids seen in training.
n = int(len(token_frequencies) * 0.8)
most_common_tokens = token_frequencies.most_common(n)

# Split the (id, count) pairs into parallel tuples; the ids are also kept as a tensor
# so they can index embedding matrices later.
most_common_ids, most_common_freqs = zip(*most_common_tokens)
most_common_ids_tensor = torch.tensor(most_common_ids)

print("Les IDs des tokens les plus fréquents et leurs fréquences :")
# NOTE: the loop variable `token_id` stays defined after this cell and is read by a
# later cell, so it must keep this name.
for token_id, freq in most_common_tokens:
    print(f"Token ID: {token_id}, Fréquence: {freq}")

In [None]:
# Compare word embeddings between the baseline and the fine-tuned model via the project helper.
# NOTE(review): the meaning of the final argument (25) is defined in comprehension_model - document it there or name it here.
comprehension_model.change_embedding_word(model_kb,model_hugging_face,valid_dataset,tokenizer,25)

In [None]:
# Plot, for each saved checkpoint, the word whose embedding moved furthest from the baseline.

checkpoint_directory = 'finetuning/finetuning_hugging_whitespace-finetuned-imdb'
checkpoint_pattern = re.compile(r'checkpoint-(\d+)')

# Keep only entries named like "checkpoint-<step>" (anything else in the directory
# used to crash the sort) and order them by step number.
checkpoint_files = sorted(
    (name for name in os.listdir(checkpoint_directory) if checkpoint_pattern.search(name)),
    key=lambda name: int(checkpoint_pattern.search(name).group(1)),
)

epoch_data = {}          # epoch -> list of (word, change) pairs
word_color = {}          # word -> plot colour
total_frequencies = {}   # word -> number of checkpoints where it was a top mover

# Baseline embeddings restricted to the most frequent training tokens.
initial_embeddings = model_kb.bert.embeddings.word_embeddings.weight.detach()[most_common_ids_tensor]
colors = plt.cm.viridis(np.linspace(0, 1, 5))
top_k = 1  # number of most-changed embeddings recorded per checkpoint

# Checkpoint loop
# NOTE(review): epoch numbering starts at 5, as in the original code - confirm this
# offset matches the checkpoints stored in `checkpoint_directory`.
for epoch, checkpoint in enumerate(checkpoint_files, start=5):
    # Build the path from `checkpoint_directory` instead of repeating the literal.
    model = AutoModelForMaskedLM.from_pretrained(os.path.join(checkpoint_directory, checkpoint))
    model = model.to(device)
    embeddings = model.bert.embeddings.word_embeddings.weight.detach()[most_common_ids_tensor]

    # L2 distance between each token's fine-tuned and baseline embedding.
    embedding_changes = torch.norm(embeddings - initial_embeddings, dim=1)

    # Get the top_k largest changes
    _, top_indices = torch.topk(embedding_changes, top_k)
    # NOTE(review): `top_indices` are positions inside `most_common_ids_tensor`, not
    # vocabulary ids; they are converted as-is, exactly like the original code. If the
    # vocabulary token is wanted, map them through `most_common_ids_tensor` first.
    top_words = [tokenizer.convert_ids_to_tokens(idx.item()) for idx in top_indices]
    top_changes = embedding_changes[top_indices].tolist()
    for word in top_words:
        if word not in word_color:
            word_color[word] = colors[len(word_color) % len(colors)]
        # Count each appearance once (the first appearance used to be counted twice).
        total_frequencies[word] = total_frequencies.get(word, 0) + 1

    # Store data
    epoch_data[epoch] = list(zip(top_words, top_changes))

# Plotting
fig, ax = plt.subplots(figsize=(10, 7))
already_labeled = set()
for epoch, word_changes in epoch_data.items():
    for word, change in word_changes:
        plot_kwargs = dict(marker='o', linestyle='', markersize=10, color=word_color[word])
        # Label each word only once so the legend has no duplicate entries.
        if word not in already_labeled:
            plot_kwargs['label'] = word
            already_labeled.add(word)
        ax.plot([epoch], [change], **plot_kwargs)

ax.set_xlabel('Epoch')
ax.set_ylabel('Change in Norm of Embeddings')
ax.set_title('Most changed word embedding by Epoch')
ax.legend()
plt.show()

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
# Saliency scores: gradient of the top [MASK] logit with respect to the input word embeddings.

model_hugging_face.eval()

# Prepare the text and encode it
text = "skrift om ersättning för resekostnad till och från [MASK] samt ar"
encoded_input = tokenizer(text, return_tensors="pt")
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

# Look up ONLY the word embeddings. `inputs_embeds` replaces the word-embedding lookup
# inside the model, which then adds position/token-type embeddings and LayerNorm itself.
# Passing the output of the whole `bert.embeddings` module (as was done before) applied
# those steps twice and changed the model's predictions.
embeddings = model_hugging_face.bert.embeddings.word_embeddings(input_ids)
# Non-leaf tensor: ask autograd to keep its gradient after backward().
embeddings.retain_grad()
output = model_hugging_face(inputs_embeds=embeddings, attention_mask=attention_mask)
logits = output.logits

# Position of the [MASK] token; .item() requires exactly one mask in `text`.
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1].item()

# Extract the logit corresponding to the masked token
masked_token_logit = logits[0, mask_token_index, :]

# Choose the target logit (e.g., the highest probability logit)
target_logit = masked_token_logit.max()

# Backward pass to compute the gradients
model_hugging_face.zero_grad()
target_logit.backward()
# Gradient of the target logit with respect to the input embeddings
grads = embeddings.grad
print(grads)
# Compute the saliency map as the L2 norm of the gradients
saliency = torch.norm(grads, dim=-1).squeeze()

# Detach and move the saliency map to the CPU
saliency = saliency.detach().cpu().numpy()

# Convert the input ids back to tokens to pair them with the saliency scores
tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().cpu().numpy())

# Display the tokens and their saliency scores
for token, score in zip(tokens, saliency):
    print(f"Token: {token}, Saliency: {score}")


In [None]:

import torch.nn as nn
import torch.nn.functional as F

# Predict the token at a chosen label position from a short window of context.
# NOTE(review): `valid_filtered_dataset` is only a local variable of
# filter_and_process_dataset, and `token_id` is whatever the last loop that used that
# name left behind; define both explicitly before running this cell.
inputs = valid_filtered_dataset[0]
index = inputs["labels"].index(token_id)
print(index)
criterion =nn.CrossEntropyLoss(reduction='none')

# Window of up to 9 tokens before and 2 tokens after the target. The start is clamped
# at 0: a negative start would be read as "from the end" and give an empty/wrong slice.
window_start = max(index - 9, 0)
window_end = index + 3
# Position of the target inside the window (9 for a full window, i.e. the old `[-3]`,
# but also correct when the window is cut short at either end of the sequence).
target_position = index - window_start

input_ids = torch.tensor(inputs['input_ids'][window_start:window_end], dtype=torch.long).unsqueeze(0).to(device)
attention_mask = torch.tensor(inputs["attention_mask"][window_start:window_end], dtype=torch.long).unsqueeze(0).to(device)
labels = torch.tensor(inputs["labels"][window_start:window_end], dtype=torch.long).unsqueeze(0).to(device)
print(tokenizer.decode(inputs['input_ids'][window_start:window_end]))
print(labels)
with torch.no_grad():
    outputs = model_hugging_face(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        output_attentions=True
    )
# squeeze(0) drops only the batch axis, so a one-token window keeps its sequence axis.
softmax_probs = F.softmax(outputs.logits.squeeze(0)[target_position], dim=-1)
sorted_probs, sorted_indices = torch.sort(softmax_probs, descending=True)
sorted_tokens = [tokenizer.decode([idx]) for idx in sorted_indices[:10]]
# Show the results
plt.figure(figsize=(10, 5))
plt.bar(sorted_tokens, sorted_probs[:10].cpu().numpy())
plt.xlabel('Token')
plt.ylabel('Probability')
plt.title('Top 10 Predicted Tokens and their Probabilities')
plt.show()