# Obtaining embeddings for symbols

## 0. Setup

In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset

class SymbolDataset(Dataset):
  """Dataset that serves the values of one CSV column as plain strings."""

  def __init__(self, csv_file, text_column):
    """Read ``csv_file`` and collect the values of ``text_column``.

    Args:
      csv_file: Path (or any input accepted by ``pd.read_csv``) of the CSV.
      text_column: Name of the column whose values become the dataset items.
    """
    # Full parsed CSV, kept so the remaining columns stay reachable.
    self.data = pd.read_csv(csv_file)
    # read_csv yields NaN for empty cells and numbers for all-numeric
    # columns. Coerce every cell to str so each item is valid text, and map
    # missing cells to '' instead of dropping them so that item i still
    # corresponds to row i of the CSV.
    self.texts = [
      '' if pd.isna(value) else str(value)
      for value in self.data[text_column].tolist()
    ]

  def __len__(self):
    """Return the number of rows read from the CSV."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the text stored at position ``idx``."""
    return self.texts[idx]

def compute_embeddings(dataset, batch_size=32, max_length=128, model_name='klue/bert-base'):
  """Compute one mean-pooled BERT embedding per item of ``dataset``.

  Each batch is tokenized (padded, and truncated to ``max_length`` tokens),
  run through the model, and the final-layer token vectors are averaged over
  the positions marked by the attention mask.

  Args:
    dataset: Dataset whose items are strings.
    batch_size: Number of items encoded per forward pass.
    max_length: Maximum number of tokens kept per item.
    model_name: Name or path passed to ``from_pretrained`` for both the
      tokenizer and the model.

  Returns:
    A CPU tensor with one row per dataset item, in dataset order. An empty
    dataset yields a tensor of shape ``(0, hidden_size)``.
  """
  tokenizer = BertTokenizer.from_pretrained(model_name)
  model = BertModel.from_pretrained(model_name)
  model.eval()

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  # shuffle=False keeps the output rows in the same order as the dataset.
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
  sentence_embeddings_list = []  # One pooled tensor per batch

  with torch.no_grad():
    for batch in dataloader:
      inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length, return_attention_mask=True)
      input_ids = inputs['input_ids'].to(device)
      attention_mask = inputs['attention_mask'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      token_embeddings = outputs.last_hidden_state

      # Masked mean: zero out padded positions, then divide by the number of
      # real tokens. The clamp guards against a division by zero should a
      # row ever have an all-zero mask.
      mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
      token_counts = mask.sum(dim=1).clamp(min=1)
      sentence_embeddings = (token_embeddings * mask).sum(dim=1) / token_counts
      sentence_embeddings_list.append(sentence_embeddings.cpu())

  if not sentence_embeddings_list:
    # torch.cat rejects an empty list; return a well-formed empty result so
    # callers can treat an empty dataset like any other.
    return torch.empty((0, model.config.hidden_size))

  # Stack the per-batch results into one tensor covering the whole dataset.
  return torch.cat(sentence_embeddings_list, dim=0)

## 1. Read symbols data and obtain embeddings

In [24]:
# Single definition of the input path, used for both reads below.
SYMBOLS_CSV = 'symbols.csv'

dataset = SymbolDataset(csv_file=SYMBOLS_CSV, text_column='Symbol')

sentence_embeddings = compute_embeddings(dataset)
sentence_embeddings_list = sentence_embeddings.numpy().tolist()

symbol_df = pd.read_csv(SYMBOLS_CSV)
# One row per symbol; each cell holds that symbol's embedding as a list of floats.
sentence_df = pd.DataFrame({'Embeddings': sentence_embeddings_list})

# The two frames are combined by row position, so they must have equal length.
assert len(symbol_df) == len(sentence_df), (
  f'row mismatch: {len(symbol_df)} symbols vs {len(sentence_df)} embeddings'
)

## 2. Save the embeddings

In [25]:
# Put the embedding column next to the original symbol columns.
result_df = pd.concat(objs=[symbol_df, sentence_df], axis='columns')

# Write the combined table to disk without the row index.
result_df.to_csv(path_or_buf='symbol_with_embeddings.csv', index=False)

In [26]:
# Show the combined frame as the cell output to inspect what was saved.
result_df

Unnamed: 0,Symbol,Embeddings
0,73,"[0.13746967911720276, -0.96739262342453, 0.744..."
1,실수,"[-0.8837306499481201, -1.5502004623413086, 0.0..."
2,따끔거려요,"[1.5650123357772827, -0.7263830304145813, 0.40..."
3,교통사고가-났어요,"[0.4038792550563812, -1.2150514125823975, 0.04..."
4,정리해주세요,"[1.094342827796936, -0.37941300868988037, -0.5..."
...,...,...
7265,동식물,"[-0.225558802485466, 0.20258350670337677, 0.70..."
7266,복어,"[-0.8924968838691711, -0.39833080768585205, 0...."
7267,평면도형,"[0.4722371995449066, -0.9736231565475464, 0.23..."
7268,페브리즈,"[0.20655889809131622, -1.4036520719528198, 0.2..."
