## Installing packages

In [None]:
!pip install transformers

In [None]:
!pip install huggingface-hub

In [None]:
!pip install matplotlib

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
!pip3 install umap

In [None]:
!pip3 install scikit-learn

## Code

In [10]:
from datasets import load_dataset

# Fetch the "emotion" dataset; later cells index the result by split name
# ("train", "validation").
emotions = load_dataset(path="emotion")

In [11]:
from transformers import AutoTokenizer

# Checkpoint name; reused further down when the model itself is loaded so that
# tokenizer and model always come from the same checkpoint.
model_ckpt = "distilbert-base-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [12]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook's ``tokenizer``.

    Padding and truncation are both switched on. The tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [13]:
# Run `tokenize` over the dataset in batched mode (no explicit batch size).
emotions_encoded = emotions.map(
    tokenize,
    batched=True,
    batch_size=None,
)

# Show which columns the training split has after tokenization.
emotions_encoded["train"].column_names

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map: 100%|██████████| 2000/2000 [00:00<00:00, 15033.05 examples/s]


['text', 'label', 'input_ids', 'attention_mask']

In [15]:
import torch
from transformers import AutoModel

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained model for the chosen checkpoint and move it to that device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

# Display the selected device as the cell output.
device

'cuda'

In [16]:
def extract_hidden_states(batch):
    """Return the model's last hidden state at the first token position.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are forwarded to the model, after being
    moved to ``device``. The forward pass runs without gradient tracking.

    Returns:
        A dict with the single key ``"hidden_state"`` holding a NumPy array
        (position 0 along the second axis of ``last_hidden_state``).
    """
    accepted_names = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted_names:
            inputs[name] = tensor.to(device)

    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    # Keep only the first position of each sequence and hand it back on the CPU.
    first_token_state = last_hidden_state[:, 0]
    return {"hidden_state": first_token_state.cpu().numpy()}

In [20]:
# Expose the model inputs and the label as torch tensors
# (`extract_hidden_states` calls `.to(device)` on them).
tensor_columns = ["input_ids", "attention_mask", "label"]
emotions_encoded.set_format(type="torch", columns=tensor_columns)

# Add a "hidden_state" column to every split.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

# Show the columns of the training split after the extraction.
emotions_hidden["train"].column_names

Map: 100%|██████████| 16000/16000 [00:08<00:00, 1855.95 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2477.76 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2677.16 examples/s]


['text', 'label', 'input_ids', 'attention_mask', 'hidden_state']

In [21]:
import numpy as np

train_split = emotions_hidden["train"]
valid_split = emotions_hidden["validation"]

# Features are the extracted hidden states; targets are the labels.
X_train = np.array(train_split["hidden_state"])
y_train = np.array(train_split["label"])
X_valid = np.array(valid_split["hidden_state"])
y_valid = np.array(valid_split["label"])

# Display the four shapes as a sanity check.
(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)


((16000, 768), (16000,), (2000, 768), (2000,))

In [26]:
# NOTE: `umap.UMAP` comes from the `umap-learn` distribution. Installing the PyPI
# package named `umap` instead produces
# "ImportError: cannot import name 'UMAP' from 'umap'".
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Rescale every feature before fitting UMAP.
X_scaled = MinMaxScaler().fit_transform(X_train)

# Project the features to 2-D with the cosine metric. UMAP is stochastic, so the
# seed is fixed to make the embedding reproducible across runs.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)

# Collect the 2-D coordinates together with their labels for plotting/inspection.
df_emb = pd.DataFrame(mapper.embedding_, columns=["x", "y"])
df_emb["label"] = y_train
df_emb.head()

ImportError: cannot import name 'UMAP' from 'umap' (c:\Users\vloba\git\Transformers\.venv\lib\site-packages\umap\__init__.py)