In [None]:
# NLTK plus the data packages needed by lesk_algorithm below.
import nltk
# WordNet supplies the synsets, glosses and example sentences.
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Tokenizer data (unpacked under tokenizers/punkt) for word_tokenize.
nltk.download('punkt')
# Tagger data (unpacked under taggers/averaged_perceptron_tagger) for nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')



def _lesk_best_sense(context_words, word_synsets, extra_signature=()):
    """Pick the synset whose gloss and examples overlap most with the context.

    Args:
        context_words: set of lower-cased context tokens.
        word_synsets: candidate synsets, in the order WordNet returned them.
        extra_signature: extra lower-cased tokens added to every synset
            signature.

    Returns:
        The first synset with the strictly largest overlap, or None when no
        synset shares a single token with the context.
    """
    best_sense = None
    max_overlap = 0
    for synset in word_synsets:
        # Signature = tokens of the definition plus tokens of all examples.
        signature = set(word_tokenize(synset.definition().lower()))
        signature |= set(word_tokenize(' '.join(synset.examples()).lower()))
        signature.update(extra_signature)
        overlap = len(context_words & signature)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset
    return best_sense


def lesk_algorithm(word, sentence, window_size=10):
    """Disambiguate `word` in `sentence` with a simplified Lesk overlap count.

    If the first occurrence of `word` in the sentence is tagged as a noun,
    only the nouns within `window_size` tokens on either side are used as
    context and the target word itself is added to every synset signature.
    Otherwise every token of the sentence is used as context.

    All comparisons are case-insensitive: both the context tokens and the
    synset signatures are lower-cased, so capitalised context words
    (sentence-initial words, proper nouns) can match gloss tokens.

    Args:
        word: the target word to disambiguate.
        sentence: the sentence containing the target word.
        window_size: number of tokens on each side of the target considered
            as context in the noun branch (default 10).

    Returns:
        The best-scoring WordNet synset, or None when the word has no synsets
        or no synset overlaps with the context.
    """
    target = word.lower()
    word_synsets = wordnet.synsets(word)

    tokens = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    # Locate the first occurrence of the target word (case-insensitive).
    target_index = None
    for index, (token, _tag) in enumerate(pos_tags):
        if token.lower() == target:
            target_index = index
            break

    if target_index is not None and pos_tags[target_index][1].startswith('N'):
        # Noun target: restrict the context to nouns inside the window.
        start = max(0, target_index - window_size)
        stop = min(len(pos_tags), target_index + window_size + 1)
        context_words = {
            token.lower()
            for token, tag in pos_tags[start:stop]
            if tag.startswith('N')
        }
        # The target is part of both sets, so a noun with synsets always gets
        # at least its first listed sense back instead of None.
        return _lesk_best_sense(context_words, word_synsets, {target})

    # Non-noun (or missing) target: use the whole sentence as context.
    context_words = {token.lower() for token in tokens}
    return _lesk_best_sense(context_words, word_synsets)

# Example usage: disambiguate "bank" in a short sentence and report the gloss.
word = "bank"
sentence ="I deposite money to bank "
sense = lesk_algorithm(word, sentence)
if not sense:
    print("No sense found for the given word.")
else:
    print("Word:", word)
    print("Sense:", sense.definition())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


gloss: sloping land (especially the slope beside a body of water)
context: {'the', 'they', 'body', 'on', 'canoe', 'currents', 'sloping', ')', 'river', 'slope', 'up', 'bank', 'pulled', '(', 'especially', 'of', 'he', 'water', 'and', 'a', 'land', 'sat', 'beside', 'watched'}
gloss: a financial institution that accepts deposits and channels the money into lending activities
context: {'at', 'holds', 'the', 'that', 'into', 'check', 'on', 'my', 'financial', 'bank', 'money', 'channels', 'mortgage', 'institution', 'cashed', 'lending', 'he', 'deposits', 'a', 'and', 'activities', 'home', 'accepts'}
gloss: a long ridge or pile
context: {'or', 'pile', 'of', 'earth', 'long', 'ridge', 'a', 'huge', 'bank'}
gloss: an arrangement of similar objects in a row or in tiers
context: {'similar', 'or', 'tiers', 'of', 'arrangement', 'an', 'in', 'he', 'row', 'a', 'objects', 'operated', 'switches', 'bank'}
gloss: a supply or stock held in reserve for future use (especially in emergencies)
context: {'for', 'or', 'u

In [None]:
# Dependencies for turning a model prediction into a WordNet definition.
import torch
from nltk.corpus import wordnet as wn

# Download WordNet data if not already downloaded
import nltk
nltk.download('wordnet')

def get_wordnet_definition(lemma, pos, sense_num):
    """Return the WordNet definition of the synset ``lemma.pos.sense_num``.

    The three arguments are joined with dots into a synset name and looked up
    through ``wn.synset``. If the lookup (or reading the definition) raises,
    the error is printed and the string "Definition not found." is returned.
    """
    synset_id = f"{lemma}.{pos}.{sense_num}"
    try:
        definition = wn.synset(synset_id).definition()
    except Exception as e:
        print(f"Error retrieving definition for {synset_id}: {e}")
        return "Definition not found."
    return definition

def _definition_for_predicted_sense(predicted_sense):
    """Resolve a predicted sense label to a WordNet definition string.

    The label is first treated as a WordNet sense key and resolved with
    ``wn.lemma_from_key``. Only when that fails (e.g. for '<unk>' or a
    non-standard label) is the older '%'-splitting heuristic used, which
    defaults to the first noun sense of the lemma.
    """
    try:
        lemma_entry = wn.lemma_from_key(predicted_sense)
    except Exception:
        # Unknown or malformed key: fall back to the legacy parsing below.
        lemma_entry = None
    if lemma_entry is not None:
        return lemma_entry.synset().definition()

    sense_components = predicted_sense.split('%')
    if len(sense_components) >= 3:
        lemma, pos, sense_num = sense_components[:3]
    else:
        lemma = sense_components[0]
        pos = "n"  # Default to noun if part of speech is not available
        sense_num = "01"  # Default sense number
    return get_wordnet_definition(lemma, pos, sense_num)


def predict_sense_with_definition(model, sentence, target_word):
    """Predict the sense of `target_word` in `sentence` and fetch its gloss.

    Relies on the module-level `preProcessDataset` object for the vocabulary
    (`word2idx`, `target2idx`, `idx2sense`, `max_len`).

    Args:
        model: the trained network; called as ``model(input, target_word)``.
        sentence: raw sentence string, split on whitespace.
        target_word: the word to disambiguate.

    Returns:
        ``(predicted_sense, definition)`` where `predicted_sense` is the label
        stored in `idx2sense` and `definition` is its WordNet gloss, or
        "Definition not found." when no gloss can be resolved.
    """
    vocab = preProcessDataset
    unk_idx = vocab.word2idx['<unk>']
    pad_idx = vocab.word2idx['<pad>']
    # Same punctuation tokens that the dataset preprocessing skips.
    punctuation = {'.', ',', '?', '!', ';', ':', '(', ')', '[', ']', '{', '}', "'", '"'}

    # Tokenize and index words
    sentence_idx = [
        vocab.word2idx.get(token.lower(), unk_idx)
        for token in sentence.split()
        if token.lower() not in punctuation
    ]

    # Pad the sequence
    sentence_idx.extend([pad_idx] * (vocab.max_len - len(sentence_idx)))

    # Convert target word to index
    target_word_idx = vocab.target2idx.get(target_word.lower(), vocab.target2idx['<unk>'])

    # Build the tensors on whatever device the model lives on (CPU if the
    # model exposes no parameters) instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    input_tensor = torch.tensor([sentence_idx], device=device)
    target_word_tensor = torch.tensor([target_word_idx], device=device)

    # Forward pass without gradient tracking, in eval mode; the previous
    # training/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_tensor, target_word_tensor)
    finally:
        model.train(was_training)

    # Interpret Output
    predicted_sense_idx = torch.argmax(output, dim=1).item()
    predicted_sense = vocab.idx2sense[predicted_sense_idx]

    # Get WordNet definition
    definition = _definition_for_predicted_sense(predicted_sense)

    return predicted_sense, definition


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
#pip install -U scikit-learn pandas torch
import nltk
from nltk.corpus import semcor
from nltk.stem import WordNetLemmatizer
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
# from transformers import BertTokenizer
import pandas as pd
# Load the annotated examples; getNewData below reads the 'sentence',
# 'sense_keys', 'glosses' and 'target' columns of each row.
df = pd.read_csv("semcor_copy.csv")
#df
from sklearn.model_selection import train_test_split

# Seed the shuffle so the train/test partition, and every vocabulary and
# metric derived from it, is the same on each Restart & Run All.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
from tqdm import tqdm
def getNewData(data):
    """Expand each annotated sentence into one row per gold sense.

    Every input row is expected to provide string columns 'sentence' (with the
    target word wrapped as ``[TGT] word [TGT]``), 'sense_keys', 'glosses' and
    'target' (bracketed, comma-separated lists). For each index listed in
    'target' one output row is produced.

    Rows are collected in a list and the DataFrame is built once at the end;
    growing a DataFrame with ``pd.concat`` per row is quadratic.

    Args:
        data: DataFrame with the columns described above.

    Returns:
        DataFrame with columns ['sentence', 'target_word', 'sense', 'gloss']
        and a fresh 0..n-1 index. Quote characters are stripped from 'sense'.
    """
    columns = ['sentence', 'target_word', 'sense', 'gloss']
    records = []

    for i in tqdm(range(0, len(data))):
        row = data.iloc[i]
        sentence = row['sentence']

        # The target word sits between the two '[TGT]' markers, separated
        # from each marker by one space.
        first_marker = sentence.find('[TGT]')
        second_marker = sentence.find('[TGT]', first_marker + 1)
        target_word = sentence[first_marker + 6:second_marker - 1]
        sentence = sentence.replace('[TGT]', '')

        # NOTE(review): splitting on ',' will misalign entries if a gloss
        # itself contains a comma — confirm against the CSV format.
        sense_keys = row['sense_keys'].strip('[]').split(',')
        glosses = row['glosses'].strip('[]').split(',')
        targets = row['target'].strip('[]').split(',')

        # For every target value add the corresponding sense key and gloss.
        for raw_target in targets:
            tgt = int(raw_target)
            records.append({
                'sentence': sentence,
                'target_word': target_word,
                'sense': sense_keys[tgt].replace('"', '').replace("'", ''),
                'gloss': glosses[tgt],
            })

    return pd.DataFrame(records, columns=columns)

train_data = getNewData(train_data[:30000])
test_data = getNewData(test_data)
train_data.head(10)

# Lookup tables built from the training rows:
#   target_word_idx : lemma -> integer id
#   idx_to_target   : integer id -> lemma
#   lemma_2_sense   : lemma -> sense keys seen for that lemma (first-seen order)
target_word_idx = {}
idx_to_target = {}
sense_labels = []
lemma_2_sense = {}
for i in range(0,len(train_data)):
    sense_label = train_data.iloc[i]['sense'].replace(' ','')
    key_parts = sense_label.split('%')
    lemma = key_parts[0]
    # The fields after '%' are still parsed so that a malformed key fails
    # here (IndexError / ValueError) rather than later in the pipeline.
    sense_fields = key_parts[1].split(':')
    pos, wnsn, wnsn2 = int(sense_fields[0]), sense_fields[1], sense_fields[2]
    if lemma not in lemma_2_sense:
        lemma_2_sense[lemma] = []
        target_word_idx[lemma] = len(target_word_idx)
        idx_to_target[len(idx_to_target)] = lemma
    if sense_label not in lemma_2_sense[lemma]:
        lemma_2_sense[lemma].append(sense_label)

# Catch-all entries for lemmas that never occur in the training rows.
target_word_idx['<unk>'] = len(target_word_idx)
idx_to_target[len(idx_to_target)] = '<unk>'
lemma_2_sense['<unk>'] = ['<unk>']
# Spot check only: .get() avoids a KeyError when the sampled training slice
# happens to contain no example for "long".
lemma_2_sense.get('long')
new_data = pd.read_csv('semcor_lstm.csv')
new_data
# find the max length of the sentence

class preProcessDataset():
    """Builds the word and sense vocabularies from a frame of examples.

    `data` must provide 'sentence' and 'sense' columns. After construction the
    instance exposes:
      - word2idx / idx2word / vocab: word vocabulary, with '<pad>' at index 0
        and '<unk>' at index 1;
      - sense2idx / idx2sense / vocab_sense: sense vocabulary, with '<unk>' at
        index 0;
      - wordFreq: lower-cased token counts (punctuation included);
      - max_len: largest whitespace-token count of any sentence;
      - target2idx: the module-level `target_word_idx` mapping.
    """

    def __init__(self,data,min_freq):
        self.data = data
        self.min_freq = min_freq
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.lemma_2_sense = {}
        self.wordFreq = {}
        self.target2idx = target_word_idx
        self.max_len = 0

        # Reserved entries come first so their indices are fixed.
        self.vocab = ['<pad>', '<unk>']
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.vocab_sense = ['<unk>']
        self.sense2idx = {'<unk>': 0}
        self.idx2sense = {0: '<unk>'}

        # Counted in wordFreq, but never given a vocabulary slot.
        punctuation = frozenset(
            ['.',',','?','!',';',':','(',')','[',']','{','}',"'",'"'])

        for row_number in tqdm(range(len(data))):
            record = data.iloc[row_number]
            tokens = [token.lower() for token in record['sentence'].split()]
            sense_key = record['sense'].replace(' ','')

            # Update the running counts with this sentence first ...
            for token in tokens:
                self.wordFreq[token] = self.wordFreq.get(token, 0) + 1

            # ... then admit every token whose running count has reached
            # min_freq; rarer tokens are treated as '<unk>' for now.
            for token in tokens:
                if token in punctuation:
                    continue
                entry = token if self.wordFreq[token] >= self.min_freq else '<unk>'
                if entry not in self.word2idx:
                    slot = len(self.word2idx)
                    self.word2idx[entry] = slot
                    self.idx2word[slot] = entry
                    self.vocab.append(entry)

            self.max_len = max(self.max_len, len(tokens))

            if sense_key not in self.sense2idx:
                slot = len(self.sense2idx)
                self.sense2idx[sense_key] = slot
                self.idx2sense[slot] = sense_key
                self.vocab_sense.append(sense_key)


class getDataset(Dataset):
    """Torch dataset that turns example rows into fixed-length index sequences.

    Each row of `data` ('sentence' and 'sense' columns) becomes:
      - a list of exactly `max_len` word indices (punctuation dropped, unknown
        words mapped to '<unk>', right-padded with '<pad>', and truncated when
        the sentence is longer than `max_len`);
      - the index of its sense key ('<unk>' when the key is not in `sense2idx`);
      - the index of its target lemma, i.e. the lower-cased part of the sense
        key before '%' ('<unk>' when not in `target2idx`).

    Truncation guarantees that every sample has the same length, so batches
    can always be stacked even when `data` contains sentences longer than the
    `max_len` computed on another split.
    """

    def __init__(self, data, word2idx, sense2idx, max_len, target2idx,idx2word,wordFreq,vocab):
        self.data = data
        self.word2idx = word2idx
        self.sense2idx = sense2idx
        self.max_len = max_len
        self.target2idx = target2idx
        self.idx2word = idx2word
        self.wordFreq = wordFreq
        self.vocab = vocab
        self.input_data = []
        self.sense_data = []
        self.target2word = []

        unk_idx = self.word2idx['<unk>']
        pad_idx = self.word2idx['<pad>']
        punctuation = {'.', ',', '?', '!', ';', ':', '(', ')', '[', ']', '{', '}', "'", '"'}

        for i in tqdm(range(len(data))):
            row = data.iloc[i]
            sense_key = row['sense'].replace(' ','')
            target_word = sense_key.split('%')[0].lower()

            sentence_idx = []
            for word in row['sentence'].split():
                word = word.lower()
                if word in punctuation:
                    continue
                sentence_idx.append(self.word2idx.get(word, unk_idx))

            # Force every sequence to exactly max_len entries.
            sentence_idx = sentence_idx[:self.max_len]
            sentence_idx.extend([pad_idx] * (self.max_len - len(sentence_idx)))
            self.input_data.append(sentence_idx)

            if sense_key not in self.sense2idx:
                sense_key = '<unk>'
            self.sense_data.append(self.sense2idx[sense_key])

            if target_word not in self.target2idx:
                target_word = '<unk>'
            self.target2word.append(self.target2idx[target_word])

    def __len__(self):
        """Number of examples (rows of the source frame)."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return (word indices, sense index, target-lemma index) as tensors."""
        return torch.tensor(self.input_data[idx]),torch.tensor(self.sense_data[idx]),torch.tensor(self.target2word[idx])


# From here on the name `preProcessDataset` refers to the fitted instance,
# not the class; later code reads the vocabularies from it.
preProcessDataset = preProcessDataset(data=train_data, min_freq=2)
trainData = getDataset(
    data=train_data,
    word2idx=preProcessDataset.word2idx,
    sense2idx=preProcessDataset.sense2idx,
    max_len=preProcessDataset.max_len,
    target2idx=preProcessDataset.target2idx,
    idx2word=preProcessDataset.idx2word,
    wordFreq=preProcessDataset.wordFreq,
    vocab=preProcessDataset.vocab,
)
# The test split is indexed with the vocabularies fitted on the training split.
testData = getDataset(
    data=test_data,
    word2idx=preProcessDataset.word2idx,
    sense2idx=preProcessDataset.sense2idx,
    max_len=preProcessDataset.max_len,
    target2idx=preProcessDataset.target2idx,
    idx2word=preProcessDataset.idx2word,
    wordFreq=preProcessDataset.wordFreq,
    vocab=preProcessDataset.vocab,
)

from sklearn.model_selection import train_test_split

# Hold out 20% of the training examples for validation.
train_dataset, valid_dataset = train_test_split(trainData, test_size=0.2, random_state=42)
# dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(dataset=testData, batch_size=8, shuffle=True)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class biLSTMModel(nn.Module):
    """Bidirectional-LSTM sentence encoder with a linear layer over all senses.

    Besides its constructor arguments the model reads two module-level
    mappings at forward time: `idx_to_target` (target index -> lemma) and
    `lemma_2_sense` (lemma -> list of sense keys).
    """

    def __init__(self,input_size,hidden_size,sense_vocab,embedding_dim,dataset):
        """Create the embedding, LSTM and output layers.

        Args:
            input_size: number of rows of the embedding table.
            hidden_size: LSTM hidden size per direction.
            sense_vocab: collection of sense labels; its length sets the
                number of output units.
            embedding_dim: width of the word embeddings.
            dataset: object exposing a `sense2idx` mapping.
        """
        super(biLSTMModel,self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.sense_vocab = sense_vocab
        self.embedding_dim = embedding_dim
        self.dataset = dataset
        self.embedding = nn.Embedding(self.input_size,self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim,self.hidden_size,bidirectional=True)
        # Input width is 2 * hidden_size because forward() concatenates two
        # hidden-state slices before this layer.
        self.linear = nn.Linear(self.hidden_size*2,len(self.sense_vocab))
        self.sense2idx = self.dataset.sense2idx
        # Stored for reference; forward() reads the global idx_to_target directly.
        self.idx2word = idx_to_target

    def forward(self,x,target_word):
        """Score every sense for each example in the batch.

        Args:
            x: tensor of word indices — assumed (batch, seq_len); TODO confirm.
            target_word: iterable of scalar tensors holding target-lemma indices.

        Returns:
            Tensor with one row per example and one column per sense. Columns
            belonging to the example's candidate senses hold softmax
            probabilities; all other columns hold raw linear outputs.
        """
        x = self.embedding(x)
        # Swap the first two axes — presumably (batch, seq, emb) -> (seq, batch, emb),
        # the layout nn.LSTM uses when batch_first is not set.
        x = x.permute(1,0,2)
        output,(hidden,cell) = self.lstm(x)
        # Concatenate the last two slices of the final hidden state (for this
        # single-layer bidirectional LSTM, presumably the two directions).
        hidden = torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)
        out = self.linear(hidden)
        for i,target_wo in enumerate(target_word):
            # Map the target index back to its lemma, then to that lemma's sense columns.
            target_wo = idx_to_target[target_wo.item()]
            target_word_sense = lemma_2_sense[target_wo]
            target_word_sense_idx = [self.sense2idx[sense] for sense in target_word_sense]
            # NOTE(review): this overwrites the candidate columns in place with
            # probabilities while the remaining columns stay unnormalised, and the
            # result is later fed to CrossEntropyLoss. Masking non-candidate
            # columns may be what was intended — confirm before changing.
            out[i,target_word_sense_idx] = F.softmax(out[i,target_word_sense_idx],dim=0)

        return out

from torch.nn import CrossEntropyLoss
# 128 hidden units per direction, 300-dimensional embeddings.
model = biLSTMModel(len(preProcessDataset.word2idx),128,preProcessDataset.vocab_sense,300,preProcessDataset)
model = model.cuda()
# The loss targets are sense indices, so the ignored label must come from the
# sense vocabulary: examples whose gold sense is '<unk>' are skipped. (The
# word-level '<pad>' index used before only worked because both are 0.)
criterion = CrossEntropyLoss(ignore_index=preProcessDataset.sense2idx['<unk>'])
optimizer = optim.Adam(model.parameters(),lr=0.001)
print(model)
num_epochs = 2
for epoch in range(num_epochs):
    # ---- training pass ----
    total_correct = 0
    total_loss = 0
    model.train()
    for i,(sentence,sense,target_word) in enumerate(train_dataloader):
        sentence = sentence.cuda()
        sense = sense.cuda()
        target_word = target_word.cuda()
        optimizer.zero_grad()
        output = model(sentence,target_word)
        loss = criterion(output,sense)
        pred_sense = torch.argmax(output,dim=1)
        correct = torch.sum(pred_sense == sense)
        total_correct += correct.item()
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
    # Loss is averaged per batch, accuracy per example.
    print('Epoch : {}/{} | Loss : {:.4f} | Accuracy : {:.4f}'.format(epoch+1,num_epochs,total_loss/len(train_dataloader),total_correct/len(train_dataset)))

    # ---- validation pass on the held-out part of the training data ----
    total_correct = 0
    total_loss = 0
    model.eval()  # model.train() at the top of the loop switches back
    with torch.no_grad():
        for i,(sentence,sense,target_word) in enumerate(valid_dataloader):
            sentence = sentence.cuda()
            sense = sense.cuda()
            target_word = target_word.cuda()
            output = model(sentence,target_word)
            loss = criterion(output,sense)
            pred_sense = torch.argmax(output,dim=1)
            correct = torch.sum(pred_sense == sense)
            total_correct += correct.item()
            total_loss += loss.item()
        print('Epoch : {}/{} | Validation Loss : {:.4f} | Validation Accuracy : {:.4f}'.format(epoch+1,num_epochs,total_loss/len(valid_dataloader),total_correct/len(valid_dataset)))

# Persist only the weights; loading requires rebuilding biLSTMModel first.
torch.save(model.state_dict(),'biLSTM_model.pth')

# Final evaluation on the test split.
# Switch to eval mode explicitly so the result does not depend on whatever
# mode the training loop left the model in.
model.eval()
total_correct = 0
total_loss = 0
with torch.no_grad():
    for i, (sentence, sense, target_word) in enumerate(test_dataloader):
        sentence = sentence.cuda()
        sense = sense.cuda()
        target_word = target_word.cuda()
        output = model(sentence, target_word)
        loss = criterion(output, sense)
        pred_sense = torch.argmax(output, dim=1)
        correct = torch.sum(pred_sense == sense)
        total_correct += correct.item()
        total_loss += loss.item()
    # Loss is averaged per batch, accuracy per example.
    print('| Testing Loss : {:.4f} | Testing Accuracy : {:.4f}'.format(
        total_loss/len(test_dataloader), total_correct/len(testData)))
# NOTE(review): nltk is already imported by earlier cells, so on a fresh kernel
# this install runs too late to help — consider moving it to the top of the
# notebook (and pinning a version).
! pip install nltk
# Demo: predict the sense of "can" in a short sentence and print the
# definition returned by predict_sense_with_definition.
sentence = "he can play music"
target_word = "can"
predicted_sense, definition = predict_sense_with_definition(model, sentence, target_word)
print("Predicted Sense:", predicted_sense)
print("Definition:", definition)

100%|██████████| 8018/8018 [00:43<00:00, 184.56it/s]
100%|██████████| 2005/2005 [00:06<00:00, 311.30it/s]
100%|██████████| 8112/8112 [00:01<00:00, 5957.61it/s]
100%|██████████| 8112/8112 [00:01<00:00, 7933.58it/s]
100%|██████████| 2029/2029 [00:00<00:00, 7728.10it/s]


biLSTMModel(
  (embedding): Embedding(4736, 300)
  (lstm): LSTM(300, 128, bidirectional=True)
  (linear): Linear(in_features=256, out_features=4157, bias=True)
)
Epoch : 1/2 | Loss : 0.7630 | Accuracy : 0.7425
Epoch : 1/2 | Validation Loss : 0.5725 | Validation Accuracy : 0.7246
Epoch : 2/2 | Loss : 0.4765 | Accuracy : 0.8063
Epoch : 2/2 | Validation Loss : 0.5734 | Validation Accuracy : 0.7215
| Testing Loss : 0.5277 | Testing Accuracy : 0.7610
Error retrieving definition for <unk>.n.01: No lemma '<unk>' with part of speech 'n'
Predicted Sense: <unk>
Definition: Definition not found.
