In [16]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset

In [3]:
import torch
from transformers import BertTokenizer, BertModel

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Custom dataset with the word "apple" in different contexts
sentences = [
    "I love apple pies.",
    "He bought an apple from the store.",
    "Apple announced a new product yesterday.",
    "The apple tree in my garden is blooming."
]

# Tokenize sentences and prepare input
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Get the outputs (last hidden states)
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings for the word "apple" from each sentence
apple_embeddings = []
for i, sentence in enumerate(sentences):
    # Find the index of the word "apple" in each sentence
    apple_index = inputs['input_ids'][i].tolist().index(tokenizer.encode('apple', add_special_tokens=False)[0])
    # Extract the embedding for this index
    print(f"For the {i}-th sentence, the index of Apple is: {apple_index}")
    apple_embedding = outputs.last_hidden_state[i, apple_index]
    apple_embeddings.append(apple_embedding)

# Now, apple_embeddings contains the contextual embeddings for the word "apple" in each sentence
print("Contextual embeddings for 'apple' extracted from each sentence.")


For the 0-th sentence, the index of Apple is: 3
For the 1-th sentence, the index of Apple is: 4
For the 2-th sentence, the index of Apple is: 1
For the 3-th sentence, the index of Apple is: 2
Contextual embeddings for 'apple' extracted from each sentence.


In [4]:
inputs['input_ids'][0].tolist()

[101, 1045, 2293, 6207, 11345, 2015, 1012, 102, 0, 0, 0, 0]

In [5]:
tokenizer.encode('apple', add_special_tokens=False)

[6207]

In [None]:

# Load a sample dataset and take a small subset for quick training
dataset = load_dataset('glue', 'mrpc')
train_dataset = dataset['train'].select(range(100))  # Use only the first 100 samples

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)

# Print some details about the tokenized dataset
print("Sample tokenized input:", train_dataset[0])

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
model.train()
for epoch in range(3):  # Looping for 3 epochs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Epoch {epoch + 1}, Training loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_bert")

In [11]:
# Sample data
texts = ["The bank will not approve my loan.", "We sat on the river bank."]
target_words = ["bank", "bank"]
contexts = [(text, word) for text, word in zip(texts, target_words)]

# Custom dataset class
class ContextDataset(Dataset):
    def __init__(self, contexts):
        self.contexts = contexts
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        text, word = self.contexts[idx]
        inputs = self.tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
        word_tokens = self.tokenizer.tokenize(word)
        word_index = inputs["input_ids"].squeeze().tolist().index(self.tokenizer.convert_tokens_to_ids(word_tokens)[0])
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'word_index': word_index
        }

# Custom collate function
def custom_collate(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    word_indices = torch.tensor([item['word_index'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'word_index': word_indices
    }

dataset = ContextDataset(contexts)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=custom_collate)

In [13]:
import numpy as np

# Example sense embeddings for the word "bank"
sense_embeddings = {
    "financial_institution": np.random.rand(768).astype(np.float32),
    "river_side": np.random.rand(768).astype(np.float32)
}
sense_labels = list(sense_embeddings.keys())
sense_tensor = torch.tensor([sense_embeddings[label] for label in sense_labels])


In [14]:
import torch.nn as nn
import torch.optim as optim

# Load BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Define the loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
for epoch in range(3):  # Train for 3 epochs
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        word_index = batch['word_index']

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # Get the word embeddings
        word_embeddings = hidden_states[range(hidden_states.size(0)), word_index]

        # Calculate the distance to each sense embedding
        distances = torch.cdist(word_embeddings.unsqueeze(0), sense_tensor.unsqueeze(0), p=2).squeeze()

        # Calculate the loss
        target = torch.zeros(word_embeddings.size(0), dtype=torch.long)
        loss = loss_fn(distances, target.float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 486.03436279296875
Epoch 2, Loss: 419.296142578125
Epoch 3, Loss: 366.17523193359375


In [15]:
def match_sense(text, word):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
    outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    # Get the word index
    word_tokens = tokenizer.tokenize(word)
    word_index = inputs["input_ids"].squeeze().tolist().index(tokenizer.convert_tokens_to_ids(word_tokens)[0])
    
    # Get the word embedding
    word_embedding = hidden_states[0, word_index, :].unsqueeze(0).float()
    
    # Calculate distances to sense embeddings
    distances = torch.cdist(word_embedding, sense_tensor.unsqueeze(0), p=2).squeeze()
    closest_sense_index = torch.argmin(distances).item()

    return sense_labels[closest_sense_index]

# Example usage
text = "The bank will not approve my loan."
word = "bank"
sense = match_sense(text, word)
print(f"The sense of '{word}' in the context is: {sense}")

The sense of 'bank' in the context is: river_side
