In [1]:
# Quietly upgrade transformers to the latest release available to pip.
%pip -q install --upgrade transformers
# Ensure the three libraries imported by the cells below are installed in the kernel's environment.
%pip install transformers scikit-learn torch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Test de la dimension et des embeddings de BERT

In [7]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained BERT tokenizer and encoder from the same checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Encode the sentence as PyTorch tensors and keep only the token-id tensor.
encoded = tokenizer("Hello world", return_tensors='pt')
input_ids = encoded['input_ids']

# Run the forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(input_ids)
last_hidden_states = outputs.last_hidden_state

# last_hidden_states holds one contextual vector per token (not per word);
# it is indexed here as (batch, tokens, hidden_dim).
size = last_hidden_states.size()
n_tokens, hidden_dim = size[1], size[2]

print(last_hidden_states)
print("\nThe size of the sentence is :", n_tokens)
print("The dimension of each vector is :", hidden_dim)

tensor([[[-0.1689,  0.1361, -0.1394,  ..., -0.6251,  0.0522,  0.3671],
         [-0.3633,  0.1412,  0.8800,  ...,  0.1043,  0.2888,  0.3727],
         [-0.6986, -0.6988,  0.0645,  ..., -0.2210,  0.0099, -0.5940],
         [ 0.8310,  0.1237, -0.1512,  ...,  0.1031, -0.6779, -0.2629]]])

The size of the sentence is : 4
The dimension of each vector is : 768


## On remarque que la dimension est de 768

# Test de la dimension et des embeddings de GPT-2 (GPT-3 et GPT-4 sont accessibles uniquement grâce à une API et non directement avec Hugging Face, mais cela se fait facilement je pense)

In [25]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load the pretrained GPT-2 tokenizer and base model from the same checkpoint.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2Model.from_pretrained(checkpoint)

# Encode the sentence as PyTorch tensors; the whole encoding is passed to the model.
inputs = tokenizer("Hello world", return_tensors='pt')

# Run the forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

# last_hidden_states holds one contextual vector per token;
# it is indexed here as (batch, tokens, hidden_dim).
size = last_hidden_states.size()
n_tokens, hidden_dim = size[1], size[2]

print(last_hidden_states)
print("\nThe size of the sentence is :", n_tokens)
print("The dimension of each vector is :", hidden_dim)


tensor([[[-9.1303e-06, -1.4021e-01, -2.0845e-01,  ..., -1.5329e-01,
          -6.7827e-02, -1.9630e-01],
         [-1.6633e-01,  2.1910e-01,  4.4472e-02,  ..., -1.7681e-01,
          -1.6563e-01,  4.3342e-01]]])

The size of the sentence is : 2
The dimension of each vector is : 768


## On remarque que la dimension est de 768

# Test et dimension avec Phi-3

In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load the pretrained Phi-3 mini (4k, instruct) tokenizer and causal-LM model.
checkpoint = 'microsoft/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Encode the sentence as PyTorch tensors.
inputs = tokenizer("Hello world", return_tensors='pt')

# Run the forward pass with gradient tracking disabled, asking the model to
# return its per-layer hidden states in addition to its usual outputs.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
# Keep the last entry of the returned hidden states.
last_hidden_states = outputs.hidden_states[-1]

# last_hidden_states holds one contextual vector per token;
# it is indexed here as (batch, tokens, hidden_dim).
size = last_hidden_states.size()
n_tokens, hidden_dim = size[1], size[2]

print(last_hidden_states)
print("\nThe size of the sentence is :", n_tokens)
print("The dimension of each vector is :", hidden_dim)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tensor([[[-0.1166,  2.4228, -0.2542,  ...,  0.7847,  1.3498,  0.1613],
         [-0.2182,  1.6816,  0.4248,  ..., -0.6850,  0.6120, -0.7779]]])

The size of the sentence is : 2
The dimension of each vector is : 3072


## Cette fois-ci, la dimension est beaucoup plus élevée (3072)...

# Réduction de dimension avec GPT2

In [30]:
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.decomposition import PCA
import torch

# Load the pretrained GPT-2 tokenizer and base model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Encode the sentence as PyTorch tensors.
inputs = tokenizer("Hello world", return_tensors='pt')

# Run the forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

# Show the embeddings and their size, indexed as (batch, tokens, hidden_dim).
size = last_hidden_states.size()
print(last_hidden_states)
print("\nThe size of the sentence is :", size[1])
print("The dimension of each vector is :", size[2])

# Convert to a NumPy array, dropping ONLY the batch axis so the result is
# always 2-D (tokens, hidden_dim). A bare squeeze() would also drop the token
# axis for a one-token sentence and hand PCA a 1-D array.
embeddings = last_hidden_states.squeeze(0).numpy()

# Project the token vectors onto 2 principal components.
# PCA cannot produce more components than min(n_samples, n_features), so with
# the 2 tokens of this sentence 2 is the maximum (a larger target such as 100
# needs at least that many token vectors). With only 2 samples the centred
# data has rank 1, so the second component carries only numerical noise.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Show the reduced embeddings.
print("\nReduced embeddings:\n", reduced_embeddings)
print("The reduced dimension of each vector is :", reduced_embeddings.shape[1])


tensor([[[-9.1303e-06, -1.4021e-01, -2.0845e-01,  ..., -1.5329e-01,
          -6.7827e-02, -1.9630e-01],
         [-1.6633e-01,  2.1910e-01,  4.4472e-02,  ..., -1.7681e-01,
          -1.6563e-01,  4.3342e-01]]])

The size of the sentence is : 2
The dimension of each vector is : 768

Reduced embeddings:
 [[-6.0051922e+01  1.7141889e-05]
 [ 6.0051891e+01  1.7141898e-05]]
The reduced dimension of each vector is : 2


# Obtenir la matrice de tous les mots du vocabulaire (avec GPT-2)

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load the pretrained GPT-2 tokenizer and base model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

print("fin initialize model")

# Token-string -> token-id mapping of the tokenizer.
vocab = tokenizer.get_vocab()
print("len vocab : ", len(vocab))

# Feed the token ids themselves instead of re-tokenizing the vocabulary keys:
# the keys are the tokenizer's internal byte-level token strings (a leading
# space is stored as 'Ġ', for instance), so running them through the tokenizer
# again can split one vocabulary entry into several tokens and the resulting
# vectors would no longer correspond to that entry.
token_ids = sorted(vocab.values())

# Number of one-token sequences sent through the model per forward pass.
# Batching replaces one forward pass per vocabulary entry, which is what made
# the full-vocabulary run impractically slow.
BATCH_SIZE = 512

# One hidden-state vector per vocabulary entry, in token-id order.
embeddings_list = []

print("start matrix vocab")
for start in range(0, len(token_ids), BATCH_SIZE):
    # Shape (batch, 1): every row is an independent sequence holding one token.
    input_ids = torch.tensor(token_ids[start:start + BATCH_SIZE]).unsqueeze(1)

    # Run the forward pass with gradient tracking disabled (inference only).
    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state

    # Drop the length-1 sequence axis and store each row as its own 1-D array.
    embeddings_list.extend(last_hidden_state[:, 0, :].numpy())

print(len(embeddings_list))


## Traiter tout le vocabulaire est beaucoup trop long à faire tourner, donc on s'intéresse uniquement aux 10 premiers tokens

In [4]:
from transformers import GPT2Tokenizer, GPT2Model
import torch
import numpy as np

# Load the pretrained GPT-2 tokenizer and base model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Token-string -> token-id mapping of the tokenizer.
vocab = tokenizer.get_vocab()

# Number of vocabulary entries to process (the full vocabulary is too slow
# to run one entry at a time).
N_TOKENS = 10
vocab_keys = list(vocab.keys())[:N_TOKENS]

# Parallel lists: embeddings_list[i] is the vector computed for tokens_list[i].
embeddings_list = []
tokens_list = []

for word in vocab_keys:
    # Look the id up directly instead of re-tokenizing the key: vocabulary keys
    # are the tokenizer's internal byte-level token strings, and re-tokenizing
    # them can split one entry into several tokens.
    input_ids = torch.tensor([[vocab[word]]])

    # Run the forward pass with gradient tracking disabled (inference only).
    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state

    # Exactly one token was fed, so take its single vector as a 1-D array.
    embeddings_list.append(last_hidden_state[0, 0].numpy())
    tokens_list.append(word)  # keep the matching token string

# Sanity check: length of the vectors stored for the first three tokens.
print(len(embeddings_list[1]))

print(len(embeddings_list[0]))

print(len(embeddings_list[2]))

# Show each processed token next to its embedding.
print("\nTokens and their embeddings:")
for i, token in enumerate(tokens_list):
    print(f"Token: '{token}', Embedding: {embeddings_list[i]}")


768
768
768

Tokens and their embeddings:
Token: '!', Embedding: [-2.05749497e-01  7.20290393e-02 -1.85100347e-01  7.44467303e-02
 -6.33523567e-04 -8.87924060e-02  6.35696554e+00 -1.21937543e-01
 -2.03270286e-01  6.79208264e-02  2.84822732e-01 -6.58521876e-02
 -9.78591479e-03  1.98075618e-03 -1.97383061e-01  1.54231777e-02
  3.40801291e-02 -1.88278183e-01  1.03258528e-01 -6.01651907e-01
  9.49988291e-02 -1.61643445e-01 -2.06805781e-01 -4.95842174e-02
 -2.12001149e-02  6.69921041e-02 -1.61529884e-01 -1.14246227e-01
 -5.99225275e-02 -1.07664518e-01 -1.11224480e-01 -1.04296908e-01
 -8.97682160e-02 -3.61772090e-01  3.95202972e-02 -8.89691636e-02
  2.43044586e+01  9.83325019e-02  4.90854196e-02 -1.64520983e-02
 -2.39705779e-02 -5.57230823e-02 -2.59870123e-02 -2.88786769e-01
 -8.43879208e-02  8.63149911e-02 -1.33140117e-01 -4.68699075e-02
 -1.84854597e-01  1.01497635e-01  4.05560024e-02  2.77262889e-02
 -1.43159270e-01 -1.60134345e-01  2.13729739e-01  1.36295363e-01
  2.18680166e-02 -4.30199