In [1]:
# Import the package
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW
import numpy as np
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET
import nltk
from nltk.corpus import wordnet as wn

In [2]:
# Settings for loading training data
semcor_training_xml_path = 'WSD_Evaluation_Framework/Training_Corpora/Semcor/semcor.data.xml'
semcor_training_gk_path = 'WSD_Evaluation_Framework/Training_Corpora/Semcor/semcor.gold.key.txt'

In [3]:
def load_xml_data(xml_file_path=''):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()
    data = []
    for text in root.findall('text'):
        for sentence in text.findall('sentence'):
            sentence_id = sentence.get('id')
            sentence_text = ' '.join([element.text for element in sentence])
            for instance in sentence.findall('instance'):
                instance_id = instance.get('id')
                lemma = instance.get('lemma')
                pos = instance.get('pos')
                word = instance.text
                data.append([sentence_id, instance_id, lemma, pos, word, sentence_text])
    columns = ['sentence_id', 'instance_id', 'lemma', 'pos', 'word', 'sentence_text']
    xml_data = pd.DataFrame(data, columns=columns)
    return xml_data


# Load xml training data from semcor
semcor_training_xml = load_xml_data(semcor_training_xml_path)
display(semcor_training_xml.head())

Unnamed: 0,sentence_id,instance_id,lemma,pos,word,sentence_text
0,d000.s000,d000.s000.t000,long,ADJ,long,How long has it been since you reviewed the ob...
1,d000.s000,d000.s000.t001,be,VERB,been,How long has it been since you reviewed the ob...
2,d000.s000,d000.s000.t002,review,VERB,reviewed,How long has it been since you reviewed the ob...
3,d000.s000,d000.s000.t003,objective,NOUN,objectives,How long has it been since you reviewed the ob...
4,d000.s000,d000.s000.t004,benefit,NOUN,benefit,How long has it been since you reviewed the ob...


In [4]:
def load_gold_keys(gold_key_file_path=''):
    gold_key_data = []
    with open(gold_key_file_path, 'r') as file:
        for line in file:
            parts = line.strip().split()
            instance_id = parts[0]
            sense_id = parts[1]
            gold_key_data.append([instance_id, sense_id])

    # Create a DataFrame
    gold_key_columns = ['instance_id', 'sense_id']
    gold_key_df = pd.DataFrame(gold_key_data, columns=gold_key_columns)
    return gold_key_df

# Load gold key training data from semcor
semcor_training_gk = load_gold_keys(semcor_training_gk_path)
display(semcor_training_gk.head())

Unnamed: 0,instance_id,sense_id
0,d000.s000.t000,long%3:00:02::
1,d000.s000.t001,be%2:42:03::
2,d000.s000.t002,review%2:31:00::
3,d000.s000.t003,objective%1:09:00::
4,d000.s000.t004,benefit%1:21:00::


In [5]:
# Merge data from two file
semcor_training_merged = pd.merge(semcor_training_xml, semcor_training_gk, on='instance_id', how='inner')
display(semcor_training_merged.head())

Unnamed: 0,sentence_id,instance_id,lemma,pos,word,sentence_text,sense_id
0,d000.s000,d000.s000.t000,long,ADJ,long,How long has it been since you reviewed the ob...,long%3:00:02::
1,d000.s000,d000.s000.t001,be,VERB,been,How long has it been since you reviewed the ob...,be%2:42:03::
2,d000.s000,d000.s000.t002,review,VERB,reviewed,How long has it been since you reviewed the ob...,review%2:31:00::
3,d000.s000,d000.s000.t003,objective,NOUN,objectives,How long has it been since you reviewed the ob...,objective%1:09:00::
4,d000.s000,d000.s000.t004,benefit,NOUN,benefit,How long has it been since you reviewed the ob...,benefit%1:21:00::


In [6]:
sense_id_cache = {}
def format_sense_id(sense_id):
    if sense_id not in sense_id_cache:
        sense_id_cache[sense_id] = wn.lemma_from_key(sense_id).synset().name()
    return sense_id_cache[sense_id]
    
semcor_training_merged['formatted_sense_id'] = semcor_training_merged['sense_id'].apply(format_sense_id)

# We keep those columns for now
keys_to_keep = ['lemma', 'word', 'sentence_text', 'formatted_sense_id']
semcor_training_merged = semcor_training_merged[keys_to_keep]

display(semcor_training_merged.head())

Unnamed: 0,lemma,word,sentence_text,formatted_sense_id
0,long,long,How long has it been since you reviewed the ob...,long.a.01
1,be,been,How long has it been since you reviewed the ob...,be.v.01
2,review,reviewed,How long has it been since you reviewed the ob...,review.v.01
3,objective,objectives,How long has it been since you reviewed the ob...,aim.n.02
4,benefit,benefit,How long has it been since you reviewed the ob...,benefit.n.01


In [7]:
# Setting to load nball embeddings
nball_small_path = 'training_set/nballSmall.txt'

def load_ball_embeddings(bFile):
    print("loading balls....")
    bdic=dict()
    with open(bFile, 'r') as w2v:
        for line in w2v.readlines():
            wlst = line.strip().split()
            bdic[wlst[0]] = list(map(float, wlst[1:]))
    print(len(bdic),' balls are loaded\n')
    return bdic


# Load ball embeddings
nball_small = load_ball_embeddings(nball_small_path)

# Check the length of the nball embeddings
nball_len =[]
for i, (key, value) in enumerate(nball_small.items()):
    value_len = len(nball_small[key])
    if value_len not in nball_len:
        nball_len.append(value_len)
print(f'The length of nball embeddings: {nball_len}')

loading balls....
8691  balls are loaded

The length of nball embeddings: [159]


In [8]:
# For now, we train on exist nball

print(f'Original length of the training set:{len(semcor_training_merged)}')
semcor_training = semcor_training_merged[semcor_training_merged['formatted_sense_id'].isin(nball_small.keys())].copy()
print(f'Actual data used of the training set:{len(semcor_training)}')

display(semcor_training.head())

Original length of the training set:226036
Actual data used of the training set:20032


Unnamed: 0,lemma,word,sentence_text,formatted_sense_id
6,program,program,How long has it been since you reviewed the ob...,program.n.02
10,program,program,Have you permitted it to become a giveaway pro...,program.n.02
24,program,program,What effort do you make to assess results of y...,program.n.02
46,use,using,Are you using the most economical printing met...,use.v.01
80,eating,eating,When improvements are recommended in working c...,eating.n.01


In [9]:
# Setting for the model choice
models = {
    "BERT-Base": "bert-base-uncased",
    "BERT-Large": "bert-large-uncased",
    "BERT-Medium": "google/bert_uncased_L-8_H-512_A-8",
    "BERT-Small": "google/bert_uncased_L-4_H-256_A-4",
    "BERT-Mini": "google/bert_uncased_L-4_H-128_A-2",
    "BERT-Tiny": "google/bert_uncased_L-2_H-128_A-2"
}

# With our nball dimention 159, we choose bert small with 256 dimentions
model_name = models["BERT-Small"]


In [10]:
# Get sense index
sense_labels = list(nball_small.keys())
sense_index = {sense: idx for idx, sense in enumerate(sense_labels)}
semcor_training.loc[:,'sense_idx'] = semcor_training['formatted_sense_id'].map(sense_index)

# Padding the embeddings
original_dim = len(nball_small[sense_labels[0]])
target_dim = 256  # Dimension of BERT-Small
padding_size = target_dim - original_dim

# Pad each embedding to match the target dimension
padded_embeddings = [np.pad(nball_small[label], (0, padding_size), 'constant', constant_values=0) for label in sense_labels]
# Convert to tensor
# The last one is too large 
sense_embeddings = torch.tensor(np.array(padded_embeddings), dtype=torch.float64)

print(f'Total data of nball embeddings:{len(sense_embeddings)}')
print(f'The length after padding: {len(sense_embeddings[0])}')

Total data of nball embeddings:8691
The length after padding: 256


In [11]:
# Tokenize the sentence
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)

# Could be problem here, as we always fid
def find_word_index(sentence_ids, word):
    word_tokens = tokenizer.tokenize(word)
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)
    for i in range(len(sentence_ids) - len(word_tokens) + 1):
        if sentence_ids[i:i+len(word_tokens)].tolist() == word_ids:
            return i
    return -1

def tokenize_data(df):
    # Tokenize all sentences
    print("Tokenizing sentences...")
    tokenized_data = tokenizer(list(df['sentence_text']), padding=True, truncation=True, return_tensors="pt", max_length=512)
    input_ids = tokenized_data['input_ids']
    attention_masks = tokenized_data['attention_mask']

    # Progress bar for calculating word indices
    print("Calculating word indices...")
    # pbar = tqdm(total=df.shape[0], desc="Calculating word indices")
    word_indices = []
    for sentence_ids, word in zip(input_ids, df['word']):
        word_indices.append(find_word_index(sentence_ids, word))
        # pbar.update(1)  # Update progress for each word index found

    # print(f"Length input_ids:{len(input_ids)}\n Length attention_mask:{len(attention_masks)}\n Length \
    # word_index:{len(word_indices)}\n Length dataframe:{len(df)}")
    df.loc[:, 'input_ids'] = input_ids.tolist()
    df.loc[:, 'attention_mask'] = attention_masks.tolist()
    df.loc[:, 'word_index'] = word_indices

    print('Tokenizing finished!')
    # pbar.close()  # Close the progress bar after completion


tokenize_data(semcor_training)
display(semcor_training.head())

Tokenizing sentences...
Calculating word indices...
Tokenizing finished!


Unnamed: 0,lemma,word,sentence_text,formatted_sense_id,sense_idx,input_ids,attention_mask,word_index
6,program,program,How long has it been since you reviewed the ob...,program.n.02,2382,"[101, 2129, 2146, 2038, 2009, 2042, 2144, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",16
10,program,program,Have you permitted it to become a giveaway pro...,program.n.02,2382,"[101, 2031, 2017, 7936, 2009, 2000, 2468, 1037...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",10
24,program,program,What effort do you make to assess results of y...,program.n.02,2382,"[101, 2054, 3947, 2079, 2017, 2191, 2000, 1435...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",11
46,use,using,Are you using the most economical printing met...,use.v.01,7420,"[101, 2024, 2017, 2478, 1996, 2087, 21791, 802...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3
80,eating,eating,When improvements are recommended in working c...,eating.n.01,3194,"[101, 2043, 8377, 2024, 6749, 1999, 2551, 3785...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",16


In [12]:
# Setting, small batch size here so that do not out of cuda memory
batch_size = 2

# Create tensor dataset to speed learning
# Convert lists of lists into tensors
all_input_ids = torch.stack([torch.tensor(ids, dtype=torch.long) for ids in semcor_training['input_ids']])
all_attention_masks = torch.stack([torch.tensor(mask, dtype=torch.long) for mask in semcor_training['attention_mask']])
all_word_indices = torch.tensor(semcor_training['word_index'].tolist(), dtype=torch.long)
all_senses = torch.tensor(semcor_training['sense_idx'].tolist(), dtype=torch.long)

# Create a TensorDataset
dataset = TensorDataset(all_input_ids, all_attention_masks, all_word_indices, all_senses)

# Use DataLoader to handle batching
dataloader = DataLoader(dataset, batch_size, shuffle=True)

In [13]:
# Custom Cosine Similarity Loss
def cosine_similarity_loss(embeddings1, embeddings2):
    # Cosine similarity returns a value between -1 and 1, where 1 means identical
    cosine_sim = F.cosine_similarity(embeddings1, embeddings2, dim=1)
    print(cosine_sim)
    # We subtract from 1 to convert similarity to loss: 0 means identical, 2 means totally opposite
    return (1 - cosine_sim).mean()

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Model setting
model = BertModel.from_pretrained(model_name).to(device)
model.train() 
sense_embeddings = sense_embeddings.to(device)  # Move sense embeddings to GPU
loss_fn = cosine_similarity_loss
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Fetch one batch of data
data_iter = iter(dataloader)
batch = next(data_iter)
batch_input_ids, batch_attention_masks, batch_word_indices, batch_sense_indices = [b.to(device) for b in batch]

# Forward pass to get outputs
model.eval()  # Set the model to evaluation mode to disable dropout
with torch.no_grad():
    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
    hidden_states = outputs.last_hidden_state

# Retrieve embeddings for specific word indices
word_embeddings = torch.stack([hidden_states[i, idx, :] for i, idx in enumerate(batch_word_indices)])

# Retrieve the corresponding sense embeddings
target_embeddings = sense_embeddings[batch_sense_indices]

# Select one pair of embeddings
example_word_embedding = word_embeddings[0]
example_target_embedding = target_embeddings[0]

# Calculate cosine similarity manually
cosine_similarity = F.cosine_similarity(example_word_embedding.unsqueeze(0), example_target_embedding.unsqueeze(0))

print(f'Cosine similarity: {cosine_similarity.item()}')
print(f'Word embedding norm: {torch.norm(example_word_embedding)}')
print(f'Target embedding norm: {torch.norm(example_target_embedding)}')

# Optionally check for any NaN values
print("Any NaN in word embeddings?", torch.isnan(word_embeddings).any())
print("Any NaN in target embeddings?", torch.isnan(target_embeddings).any())

Using device: cuda
Cosine similarity: 0.0
Word embedding norm: 14.00044059753418
Target embedding norm: inf
Any NaN in word embeddings? tensor(False, device='cuda:0')
Any NaN in target embeddings? tensor(False, device='cuda:0')


In [40]:
print(batch_sense_indices)

tensor([7704, 7213], device='cuda:0')


In [43]:
comparison = torch.equal(sense_embeddings[7704], example_target_embedding)
print("Are tensors exactly equal?", comparison)
differences = sense_embeddings[7704] != example_target_embedding

# Find indices where the values differ
diff_indices = differences.nonzero(as_tuple=True)

# Print out the differing elements
for idx in diff_indices[0]:
    print(f"Index: {idx}, sense_embeddings[7226]: {sense_embeddings[7226][idx]}, example_target_embedding: {example_target_embedding[idx]}")

Are tensors exactly equal? True


In [44]:
print(sense_embeddings[7704])

tensor([-2.5823e-03,  6.9978e-04, -3.1382e-03,  1.2778e-03, -7.1062e-04,
        -1.2048e-04, -2.2161e-03,  2.6910e-03, -3.3114e-03,  2.3125e-03,
         1.7366e-04,  1.9518e-03, -4.4959e-03,  1.5646e-03,  2.5900e-03,
         5.1694e-05,  2.8415e-03, -1.2466e-03, -3.5970e-03, -4.8595e-04,
        -1.6719e-03,  1.1232e-03,  1.2293e-03,  2.7305e-03,  2.1141e-03,
        -4.3246e-03,  1.5143e-04, -1.4513e-03,  1.2121e-03, -4.5956e-03,
         1.2173e-02,  3.4685e-03,  1.3427e-04, -1.1819e-03,  1.0880e-03,
         2.2221e-03,  1.0810e-03,  2.4053e-03, -7.7180e-04, -2.7204e-03,
        -1.2946e-04,  1.8160e-04, -8.9300e-04, -2.2255e-04, -7.2533e-04,
         3.2039e-03,  2.8126e-03, -2.6422e-04, -1.5601e-03,  1.6650e-03,
         2.1219e-03,  1.5529e-01,  1.9290e-03,  1.9290e-03,  1.9290e-03,
         1.9290e-03,  1.9290e-03,  9.8767e-02, -9.8767e-02, -9.8767e-02,
        -9.8767e-02, -9.8767e-02, -9.8767e-02, -9.8767e-02, -9.8767e-02,
        -9.8767e-02, -9.8767e-02, -9.8767e-02, -9.8

In [52]:
print(torch.norm(sense_embeddings[7226]))
print(torch.tensor([1.1838e+174, 1.1838e-126], dtype=torch.float64))
print(torch.norm(torch.tensor([1.1838e+174, 1.1838e-126], dtype=torch.float64)))

tensor(inf, device='cuda:0', dtype=torch.float64)
tensor([1.1838e+174, 1.1838e-126], dtype=torch.float64)
tensor(inf, dtype=torch.float64)


In [31]:
comparison = torch.equal(sense_embeddings[7226], example_target_embedding)
print("Are tensors exactly equal?", comparison)

Are tensors exactly equal? False


In [17]:
print(sense_labels[7367])
print(sense_index['pass.v.05'])
print(nball_small['pass.v.05'][157])

pass.v.05
7367
4.074568017658608e+159


In [18]:
# Training the model
# Set the device to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Model setting
model = BertModel.from_pretrained(model_name).to(device)
model.train() 
sense_embeddings = sense_embeddings.to(device)  # Move sense embeddings to GPU
loss_fn = nn.CosineEmbeddingLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)


# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=True, position=0)
    for batch in progress_bar:
        # Send batch data to the device (GPU)
        batch_input_ids, batch_attention_masks, batch_word_indices, batch_sense_indices = [b.to(device) for b in batch]

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
        hidden_states = outputs.last_hidden_state
        
        # Retrieve embeddings for specific word indices
        word_embeddings = torch.stack([hidden_states[i, idx, :] for i, idx in enumerate(batch_word_indices)])
        
        # Retrieve the corresponding sense embeddings
        target_embeddings = sense_embeddings[batch_sense_indices]

        # Labels tensor indicating that embeddings should be similar
        labels = torch.ones(word_embeddings.size(0), device=device)

        # Calculate loss
        loss = loss_fn(word_embeddings, target_embeddings, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}')

Using device: cuda


Epoch 1/3:   8%|████▎                                                    | 755/10016 [00:17<03:37, 42.67it/s, loss=nan]


KeyboardInterrupt: 

In [None]:
# Setting for the model
# Could be problem here, as we always find the first one

In [None]:
# Compare time
import time
# Function to time the execution
def time_function(func, file_path):
    start_time = time.time()
    func(file_path)
    end_time = time.time()
    return end_time - start_time

In [None]:

# Original function
def load_ball_embeddings_original(bFile):
    print("loading balls....")
    bdic = {}
    with open(bFile, 'r') as w2v:
        for line in w2v.readlines():
            wlst = line.strip().split()
            bdic[wlst[0]] = [decimal.Decimal(ele) for ele in wlst[1:]]
    print(len(bdic), ' balls are loaded\n')
    return bdic

# Optimized function
def load_ball_embeddings_optimized(bFile):
    print("loading balls....")
    bdic = {}
    with open(bFile, 'r') as w2v:
        for line in w2v:
            wlst = line.split()
            bdic[wlst[0]] = list(map(float, wlst[1:]))
    print(len(bdic), ' balls are loaded\n')
    return bdic

# Path to the embeddings file
file_path = nball_small_path  # Update this to your actual file path

# Time both functions
original_time = time_function(load_ball_embeddings_original, file_path)
optimized_time = time_function(load_ball_embeddings_optimized, file_path)

# Print the results
print(f"Original function time: {original_time} seconds")
print(f"Optimized function time: {optimized_time} seconds")

In [None]:
import swifter
# Function to apply formatting without caching
def process_without_cache(df):
    df['formatted_sense_id'] = df['sense_id'].apply(lambda x: wn.lemma_from_key(x).synset().name())

# Function to apply formatting with caching
sense_id_cache = {}
def process_with_cache(df):
    def format_sense_id(sense_id):
        if sense_id not in sense_id_cache:
            sense_id_cache[sense_id] = wn.lemma_from_key(sense_id).synset().name()
        return sense_id_cache[sense_id]
    df['formatted_sense_id'] = df['sense_id'].apply(format_sense_id)

# Function to apply formatting with swifter
def process_with_swifter(df):
    df['formatted_sense_id'] = df['sense_id'].swifter.apply(lambda x: wn.lemma_from_key(x).synset().name())

# Timing each method
time_no_cache = time_function(process_without_cache, semcor_training_merged.copy())
time_cache = time_function(process_with_cache, semcor_training_merged.copy())
time_swifter = time_function(process_with_swifter, semcor_training_merged.copy())

print("Time without caching:", time_no_cache)
print("Time with caching:", time_cache)
print("Time with swifter:", time_swifter)

In [None]:
# Initialize the tokenizer once, to be used across functions
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the original method as a function
def original_method(df):
    def tokenize_and_find_index(row):
        sentence = str(row['sentence_text'])
        word = str(row['word'])
        tokens = tokenizer(sentence, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
        input_ids = tokens['input_ids'][0]
        word_tokens = tokenizer.tokenize(word)
        for i in range(len(input_ids) - len(word_tokens) + 1):
            if input_ids[i:i+len(word_tokens)].tolist() == tokenizer.convert_tokens_to_ids(word_tokens):
                return tokens['input_ids'], tokens['attention_mask'], i
        return tokens['input_ids'], tokens['attention_mask'], -1
    df[['input_ids', 'attention_mask', 'word_index']] = df.apply(tokenize_and_find_index, axis=1, result_type='expand')

# Define the optimized method as a function
def optimized_method(df):
    tokenized_data = tokenizer(list(df['sentence_text']), padding=True, truncation=True, return_tensors="pt", max_length=512)
    input_ids = tokenized_data['input_ids']
    attention_masks = tokenized_data['attention_mask']
    word_indices = [
        find_word_index(sentence_ids, word)
        for sentence_ids, word in zip(input_ids, df['word'])
    ]
    df['input_ids'] = input_ids
    df['attention_mask'] = attention_masks
    df['word_index'] = word_indices

# Function to find word index used in optimized method
def find_word_index(sentence_ids, word):
    word_tokens = tokenizer.tokenize(word)
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)
    for i in range(len(sentence_ids) - len(word_tokens) + 1):
        if sentence_ids[i:i+len(word_tokens)].tolist() == word_ids:
            return i
    return -1

# Function to time the execution
def time_function(func, df):
    start_time = time.time()
    func(df)
    end_time = time.time()
    return end_time - start_time

# Make copies of the DataFrame and measure execution time
df_copy_for_original = semcor_training_merged.copy()
df_copy_for_optimized = semcor_training_merged.copy()
time_original = time_function(original_method, df_copy_for_original)
time_optimized = time_function(optimized_method, df_copy_for_optimized)

print(f"Original Method Time: {time_original} seconds")
print(f"Optimized Method Time: {time_optimized} seconds")
