In [1]:
# Install required libraries
%pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-16.0.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.5-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.5 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs>=17.3.0 (from aiohttp->datasets)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecti

In [2]:
# Import required libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set device to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Download IMDB dataset
imdb_dataset = load_dataset("imdb")

Device: cpu


Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<00:00, 8.05MB/s]
Downloading data: 100%|██████████| 21.0M/21.0M [01:00<00:00, 344kB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:53<00:00, 380kB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [01:44<00:00, 403kB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 214519.14 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 286060.83 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 242297.42 examples/s]


In [4]:
imdb_dataset['train'] = imdb_dataset['train'].select(range(10))

imdb_dataset['train']


Dataset({
    features: ['text', 'label'],
    num_rows: 10
})

In [5]:
# Create a tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Create a BERT model
model = BertModel.from_pretrained("bert-base-uncased")

tokenizer, model

(BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 },
 BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(30522, 768, padding_idx=0)
     (position_e

In [6]:
# Move model to GPU if available
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [7]:
import numpy as np
# Define a function to generate word embeddings
def generate_word_embeddings(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])
    embeddings = outputs.last_hidden_state[:, 0, :]
    word_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].flatten())
    word_tokens = [word for word in word_tokens if word not in ["[CLS]", "[SEP]", "[PAD]"]]  # Remove special tokens
    return {word: embedding for word, embedding in zip(word_tokens, embeddings.detach().cpu().numpy())}

# Generate word embeddings for the reduced IMDB dataset
embeddings_list = []
for i, review in enumerate(imdb_dataset["train"]["text"]):
    embeddings_dict = generate_word_embeddings(review)
    embeddings_dict["review_id"] = i
    embeddings_list.append(embeddings_dict)

# Convert embeddings to a Pandas DataFrame
embeddings_array = np.array(embeddings_list).squeeze()  # Remove extra dimension
embeddings_df = pd.DataFrame(embeddings_list).set_index("review_id")

# Replace missing values with an empty string
embeddings_df = embeddings_df.fillna("")

# Save the embeddings to a CSV file
embeddings_df.to_csv("imdb_word_embeddings.csv", index=False)

embeddings_df

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Unnamed: 0_level_0,i,"""",if,this,oh,whoever,when,who
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,"[-0.44797438, -0.17710215, -0.32687828, 0.0088...",,,,,,,
1,,"[-0.05165513, -0.19957957, -0.36138225, 0.1945...",,,,,,
2,,,"[0.15993415, -0.20449525, 0.2851096, -0.370434...",,,,,
3,,,,"[0.31823394, -0.16095433, 0.047124166, -0.0748...",,,,
4,,,,,"[0.10148057, -0.0166932, 0.29143655, 0.0186476...",,,
5,"[0.29642564, -0.09262828, 0.03899841, -0.14921...",,,,,,,
6,,,,,,"[0.15262482, -0.46020463, -0.33316272, -0.1757...",,
7,,,,,,,"[-0.13163166, -0.7564179, -0.47083983, 0.17071...",
8,,,,,,,,"[-0.22594354, 0.05046604, 0.17580913, -0.15455..."
9,,,,"[-0.2680882, -0.4476783, -0.28530872, 0.084038...",,,,
