In [62]:
%pip install pywsd nltk transformers datasets evaluate accelerate torch "tensorflow>=2.0.0" --upgrade tensorflow-hub scikit-learn gensim

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [63]:
import nltk
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package reuters to /Users/vivekk/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vivekk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/vivekk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vivekk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/vivekk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [64]:
# datasets
from datasets import load_dataset, DatasetDict, Dataset

# Transformers
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding


In [65]:
# Sklearn for data splitting
from sklearn.model_selection import train_test_split

In [66]:
# NLP and evaluation tools
from nltk.corpus import stopwords, reuters, wordnet

In [67]:
# General utilities
import pandas as pd
import numpy as np
import math
from collections import Counter

In [68]:
# Evaluation library
import evaluate
import torch

In [69]:
accuracy = evaluate.load("accuracy")

In [70]:
pun_dataset = load_dataset("CreativeLang/pun_detection_semeval2017_task7")

In [71]:
filtered_dataset = pun_dataset['train'].filter(lambda example: example['type'] == 'homographic')
df = pd.DataFrame(filtered_dataset)
df.head()

Unnamed: 0,id,label,type,text
0,hom_1,1,homographic,They hid from the gunman in a sauna where they...
1,hom_2,1,homographic,Wal - Mart isn't the only saving place !
2,hom_3,1,homographic,Can honeybee abuse lead to a sting operation ?
3,hom_4,1,homographic,A ditch digger was entrenched in his career .
4,hom_5,1,homographic,"She was only a Blacksmith's daughter , but she..."


In [72]:
traindf, testdf = train_test_split(df, test_size=0.2)
trainds = Dataset.from_pandas(traindf)
testds = Dataset.from_pandas(testdf)

ds = DatasetDict()
ds['train'] = trainds
ds['test'] = testds

In [73]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(input_data):
  return tokenizer(input_data["text"], truncation=True)



In [74]:
tokenized_data = ds.map(preprocess_function, batched = True)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [75]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [76]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [77]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [78]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
training_args = TrainingArguments(
    output_dir="./distilbert_finetuned_puns",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [80]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [81]:
trainer.train()

  0%|          | 0/226 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.2571030557155609, 'eval_accuracy': 0.9022222222222223, 'eval_runtime': 3.9296, 'eval_samples_per_second': 114.516, 'eval_steps_per_second': 7.38, 'epoch': 1.0}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.22389210760593414, 'eval_accuracy': 0.9177777777777778, 'eval_runtime': 3.16, 'eval_samples_per_second': 142.407, 'eval_steps_per_second': 9.177, 'epoch': 2.0}
{'train_runtime': 164.8126, 'train_samples_per_second': 21.843, 'train_steps_per_second': 1.371, 'train_loss': 0.28668486333526344, 'epoch': 2.0}


TrainOutput(global_step=226, training_loss=0.28668486333526344, metrics={'train_runtime': 164.8126, 'train_samples_per_second': 21.843, 'train_steps_per_second': 1.371, 'total_flos': 24663773787264.0, 'train_loss': 0.28668486333526344, 'epoch': 2.0})

In [82]:
def pun_detection(input_sentence):
      inputs = tokenizer(input_sentence, return_tensors="pt")

      # Check if GPU (CUDA) is available
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

      # Move your model to the device
      model.to(device)

      # Move your input tensor to the same device
      inputs = inputs.to(device)
      # Step 2: Pass the tokenized input through the model
      with torch.no_grad():
          outputs = model(**inputs)

      # The 'outputs' variable contains the model's predictions
      logits = outputs.logits
      # If you have binary classification, you can apply a softmax to get probabilities
      # If you have more than two labels, you can use a softmax for multiclass classification
      # For binary classification:
      probs = torch.softmax(logits, dim=1)

      # To get the predicted class labels for binary classification (0 or 1), you can use argmax
      predicted_class = torch.argmax(probs, dim=1).item()

      # For multiclass classification, you can get the class labels directly
      predicted_labels = torch.argmax(logits, dim=1).tolist()

      # If you have label-to-class mapping (id2label), you can map the labels back to their original labels
      predicted_labels = [id2label[label] for label in predicted_labels]

      return predicted_class


In [83]:
pun_detection("I used to be a baker, but I couldn't make enough dough.")

1

In [84]:
pun_detection("Why do elephants have a trunk? Because they donâ€™t have pockets to put stuff in.")

1

### Pun Location

In [144]:
import json
import random
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

In [145]:
stop_words = set(stopwords.words('english'))
stop_words.update('.', '?', '-', "'", ':', ',', '!', '<', '>', '"', '/', '(', ')',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 's', 't', 're', 'm')

In [146]:
def get_random_pun_word(entry):
    sentence = [word for word in entry.split() if word.lower() not in stop_words]
    rand_pun_number = random.randint(0, len(sentence) - 1)
    predicted_word = sentence[rand_pun_number]
    return predicted_word

In [147]:
def get_last_word(entry):
    sentence = [word for word in entry.split() if word.lower() not in stop_words]
    if sentence:
        last_word = sentence[-1]
    return last_word

In [148]:
def get_word_with_most_senses(entry):
    senses_number = 0
    prev_word = ""
    sentence = [word for word in entry.split() if word.lower() not in stop_words]
    for word in sentence:
        word_senses_num = len(wn.synsets(word.lower()))  # Count number of senses in WordNet
        if word_senses_num >= senses_number:  # Track word with most senses
            senses_number = word_senses_num
            prev_word = word
    # Return the word with the most senses, or an empty string if no word was found
    return prev_word if prev_word else None

In [149]:
def calculate_pun_detection_accuracy(json_file_path, find_pun_function):
    sentences = []
    puns = []
    predicted = []
    
    # Load the JSON file
    with open(json_file_path, 'r') as json_file:
        data = json.load(json_file)

    # Extract sentences and puns
    for item in data:
        sentence = item.get('sentence')
        pun = item.get('src')  # Assuming 'src' contains the pun word
        sentences.append(sentence)
        puns.append(pun)

    # Use the provided function to predict pun words
    for sentence in sentences:
        pun_word = find_pun_function(sentence)
        predicted.append(pun_word)

    # Check if both lists have the same length
    total_puns = len(puns)
    total_predicted = len(predicted)

    if total_puns != total_predicted:
        print("Error: Different length between actual and predicted puns.")
        return None

    # Calculate accuracy
    correct_count = sum(1 for i in range(total_puns) if puns[i] == predicted[i])
    accuracy = (correct_count / total_puns) * 100

    # Print the function name used for prediction
    print(f"Function used for prediction: {find_pun_function.__name__}")
    
    # Print the accuracy
    print(f"Accuracy: {accuracy:.2f}%")

# Example of using the function with a sample pun prediction function
def sample_find_pun_function(sentence):
    # Placeholder pun detection logic (you can replace this with your actual logic)
    words = sentence.split()
    return words[-1]  # Example: Just returning the last word as a pun (replace with your logic)

In [150]:
import gensim

In [151]:
w2vmodel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [152]:
def clean_sentence(sentence):
    words = sentence.lower().split()
    return [word for word in words if word not in stop_words and word in w2vmodel.key_to_index]

In [153]:
def get_possible_pun_words(sent):
    scores = {}
    if len(sent) <= 1:
        return set(sent[0])
    else:
        for i in range(len(sent)-1):
            for j in range(i+1, len(sent)):
                sim_score = w2vmodel.similarity(sent[i], sent[j])
                scores['{0}-{1}'.format(sent[i], sent[j])] = sim_score

        if len(scores) >= 5:
            top3 = sorted(zip(scores.values(), scores.keys()), reverse=True)[:3]
            poss = [tup[1].split(sep='-') for tup in top3]
            possible_pun_words = set(poss[0] + poss[1] + poss[2])
        else:
            poss = [pair.split(sep='-') for pair in scores.keys()]
            possible_pun_words = set()
            for i in range(len(poss)):
                possible_pun_words = possible_pun_words.union(set(poss[i]))
            
        return possible_pun_words

In [154]:
def find_pun_w2v(sentence):
    # Clean the sentence
    clean_sent = clean_sentence(sentence)
    
    if not clean_sent:
        return None  # No valid words to process

    # Get possible pun words
    possible_pun_words = get_possible_pun_words(clean_sent)

    # Tokenize the original sentence
    original_words = sentence.lower().split()

    # Filter possible pun words to include only those that exist in the original sentence
    possible_pun_words_in_original = [w for w in possible_pun_words if w in original_words]

    # Check if there are valid pun words in the original sentence
    if not possible_pun_words_in_original:
        return None  # No pun word found

    # Find the pun word with the highest index (appearing last in the sentence)
    pun_word = max(possible_pun_words_in_original, key=lambda w: original_words.index(w))
    
    return pun_word

In [155]:
documents = reuters.fileids()
word_frequencies = Counter()

for doc_id in documents:
    words = reuters.words(doc_id)
    word_frequencies.update(words)

In [156]:
# word frequency
def calculate_frequency(word):
    freq = word_frequencies[word]
    return freq

In [157]:
# NPMI
def calculate_npmi(sentence, word, content_word, total_words):
    words = nltk.word_tokenize(sentence.lower())
    # Calculate the number of appearances of word and content_word
    word_count = sum(1 for w in words if w == word)
    content_word_count = sum(1 for w in words if w == content_word)
    co_occurrence_count = sum(1 for i in range(len(words) - 1) if words[i] == word and words[i + 1] == content_word)

    # Calculate the probability of each word
    p_word = word_count / total_words
    p_content_word = content_word_count / total_words
    p_co_occurrence = co_occurrence_count / (total_words - 1)  # 1 less than the length of the sentence

    # Calculate NPMI
    if p_word == 0.0 or p_content_word == 0.0 or p_co_occurrence == 0.0:
        return 0.0

    pmi = math.log2(p_co_occurrence / (p_word * p_content_word))
    npmi = pmi / (-math.log2(p_co_occurrence))

    return npmi

In [158]:
# Defines the word class tagging function
def get_word_pos(word):
    pos = nltk.pos_tag([word])[0][1]
    if pos.startswith('N'):  # Noun
        return 'n'
    elif pos.startswith('J'):  # Adjective
        return 'a'
    elif pos.startswith('R'):  # Adverb
        return 'r'
    elif pos.startswith('V'):  # Verb
        return 'v'
    else:
        return None

In [159]:
# Function to get word frequency information to use in a sentence
def get_word_frequencies(sentence):
    words = nltk.word_tokenize(sentence.lower())
    word_freq = Counter(words)
    return words, word_freq

In [160]:
# Function to find pun in a sentence
def find_pun(sentence):
    words, word_freq = get_word_frequencies(sentence)  # Get word list and frequency information
    total_words = len(words)

    highest_score = -1
    pun_word = None

    # Calculate and evaluate scores for each word in a sentence
    for i, word in enumerate(word_freq.keys()):
        score = 0
        #only consider the words that are noun, adj, adverb or verb
        if get_word_pos(word) in ['n', 'a', 'r', 'v']:
          # 1. +1 points for words that are not in the corpus
          freq = calculate_frequency(word)
          if freq==0:
            score += 1

          # 2. Calculate the number of content words with NPMI(x, y) > 0.3
          content_words = [w for w in word_freq.keys() if w != word and w not in stopwords.words('english')]
          nmpi_count = sum(1 for content_word in content_words if calculate_npmi(sentence, word, content_word, total_words) > 0.3 and len(wordnet.synsets(word))>1)
          score += nmpi_count

          # 3. Give points to words located in 3/4 and 4/4
          if i >= total_words * 3 / 4:
              score += 2
          elif i >= total_words * 1 / 2:
              score += 1

          # 4. Score the words that appear after the first comma in a sentence
          if ',' in sentence[:i]:
              score += 1

          # 5. Score words that appear after first "then" in a sentence
          if 'then' in sentence[:i]:
              score += 1

          # 6. Score words that appear after first "but" in a sentence
          if 'but' in sentence[:i]:
              score += 1

          # 7. Bonus for having the highest IDF among words located in the second half (after three quarters)
          if i >= total_words * 3 / 4:
              second_half_words = list(word_freq.keys())[i:]
              idf_scores = {}
              for w in second_half_words:
                  idf = math.log(total_words / (word_freq.get(w, 1) + 1))
                  idf_scores[w] = idf
              max_idf_word = max(idf_scores, key=idf_scores.get)
              if word == max_idf_word:
                  score += 1

          # Update highest score and pun word
          if score > highest_score:
              highest_score = score
              pun_word = word
        else: continue
    return pun_word

In [207]:
find_pun("I used to be a baker, but I couldnâ€™t make enough dough.")

'dough'

In [211]:
find_pun("I used to be a banker, but I lost interest.")

'interest'

In [206]:
find_pun("I used to be a math teacher, but I found it too difficult to solve my problems.")

'problems'

In [210]:
find_pun("I gave all my dead batteries away todayâ€¦ free of charge.")

'charge'

In [213]:
find_pun("He had a photographic memory but never developed it.")

'never'

In [216]:
find_pun("The band is going to wind up their tour in Chicago.")

'chicago'

In [161]:
json_file_path = 'semeval-task3-homo.json'
calculate_pun_detection_accuracy(json_file_path, get_random_pun_word)
calculate_pun_detection_accuracy(json_file_path, get_word_with_most_senses)
calculate_pun_detection_accuracy(json_file_path, get_last_word)
calculate_pun_detection_accuracy(json_file_path, find_pun_w2v)
calculate_pun_detection_accuracy(json_file_path, find_pun)

Function used for prediction: get_random_pun_word
Accuracy: 17.87%
Function used for prediction: get_word_with_most_senses
Accuracy: 26.58%
Function used for prediction: get_last_word
Accuracy: 61.17%
Function used for prediction: find_pun_w2v
Accuracy: 48.00%
Function used for prediction: find_pun
Accuracy: 50.08%


### Pun Interpretation

In [162]:
import tensorflow as tf
import tensorflow_hub as hub
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import string

In [163]:
# Load pre-trained ELMo model
elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)

In [164]:
def find_second_closest_number(numbers, target):
    if len(numbers) < 2:
        raise ValueError("The list must contain at least two numbers.")
    
    closest_number = numbers[0]
    second_closest_number = numbers[1]

    for number in numbers:
        diff1 = abs(number - target)
        diff2 = abs(second_closest_number - target)
        diff3 = abs(closest_number - target)

        if diff1 < diff3:
            second_closest_number = closest_number
            closest_number = number
        elif diff1 < diff2 and number != closest_number:
            second_closest_number = number

    return second_closest_number

In [165]:
def find_numbers(numbers, target):
    vicinity_values = [number for number in numbers if int(number * 10) == int(target * 10)]
    if not vicinity_values:
        target = (target * 10 - 10) / 10
        return find_numbers(numbers, target)
    return vicinity_values

In [166]:

def find_farthest_value(numbers, target):
    farthest_value = None
    max_distance = float('-inf')  # Initialize with negative infinity

    for number in numbers:
        distance = abs(number - target)
        if distance > max_distance:
            max_distance = distance
            farthest_value = number

    return farthest_value

In [167]:
from pywsd.lesk import adapted_lesk


In [168]:
def pun_interpretation(sentence, pun):
    # Find the index of the pun word
    sentence_words = sentence.split(" ")
    index = sentence_words.index(pun)

    # Use the adapted_lesk function from pywsd for word sense disambiguation
    sense1 = adapted_lesk(sentence, pun)
    
    if sense1:
        senses = wordnet.synsets(pun)
        
        # Get the definitions of the synsets
        sen_def = [i.definition() for i in senses]

        # Building word vectors for the target word from the test dataset
        ind_embeddings = elmo(tf.constant([sentence]))
        target_vector = ind_embeddings[0][index].numpy()

        ind_synset_embeddings = elmo(tf.constant(sen_def))
        synset_vectors = [tf.reduce_mean(ind_synset_embeddings[i], axis=0).numpy() for i in range(len(senses))]

        cosine = []
        for i in range(len(senses)):
            dot_product = tf.reduce_sum(tf.multiply(target_vector, synset_vectors[i]))
            magnitude1 = tf.sqrt(tf.reduce_sum(tf.square(target_vector)))
            magnitude2 = tf.sqrt(tf.reduce_sum(tf.square(synset_vectors[i])))
            cosine_similarity = dot_product / (magnitude1 * magnitude2)
            cosine.append(cosine_similarity.numpy())

        # Find the index of sense1 in the list of synsets, not just the definitions
        if sense1 in senses:
            k = senses.index(sense1)
        else:
            print(f"Warning: Sense from adapted_lesk not found in WordNet synsets for '{pun}'")
            return None

        if len(cosine) < 2:
            return None
        
        # Find second closest sense
        p = find_second_closest_number(cosine, cosine[k])
        sense2 = senses[cosine.index(p)]

        # Find farthest sense (optional, if required)
        vic = find_farthest_value(find_numbers(cosine, cosine[k]), cosine[k])
        farthest_sense = senses[cosine.index(vic)]

        # Return the two senses (sense1 and sense2)
        return {"sense1": sense1.definition(), "sense2": sense2.definition()}
    else:
        return None


In [169]:
def preprocess_sentence(sentence):
    # Remove punctuation
    sentence = ''.join([char for char in sentence if char not in string.punctuation])

    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)

    # Remove stopwords
    words = [word for word in words if word.lower() not in stopwords.words("english")]

    # Rejoin the words to form a cleaned sentence
    return ' '.join(words)

In [170]:
def pun_complete_detection(sentence):
    if pun_detection(sentence) == 1:
        print("Pun detected!")
        cleaned_sentence = preprocess_sentence(sentence)
        pun_word = find_pun(cleaned_sentence)
        print("Pun word:", pun_word)
        print(pun_interpretation(cleaned_sentence, pun_word))
    else:
        print("No pun detected in the sentence")

In [197]:
sentences = ["They hid from the gunman in a sauna where they could sweat it out.","",
                     "I've been to the dentist so many times, I really know the drill.",
                     "A contest held by fire fighters is called a ' match '."
                     ]
for sentence in sentences:
  pun_complete_detection(sentence)
  print("-----------------------------------")
  

Pun detected!
Pun word: sweat
{'sense1': 'excrete perspiration through the pores in the skin', 'sense2': 'salty fluid secreted by sweat glands'}
-----------------------------------
No pun detected in the sentence
-----------------------------------
Pun detected!
Pun word: drill
{'sense1': 'systematic training by multiple repetitions', 'sense2': 'a tool with a sharp point and cutting edges for making holes in hard materials (usually rotating rapidly or by repeated blows)'}
-----------------------------------
Pun detected!
Pun word: match
{'sense1': 'a formal contest in which two or more persons or teams compete', 'sense2': 'lighter consisting of a thin piece of wood or cardboard tipped with combustible chemical; ignites with friction'}
-----------------------------------


In [172]:
%pip install sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [173]:
from sentence_transformers import SentenceTransformer, util
import torch

In [174]:
stmodel = SentenceTransformer('all-mpnet-base-v2')



In [175]:
def select_top_senses(sentence, pun_word, search_size):
    # Generate sentence embedding
    sentence_embedding = stmodel.encode(sentence, convert_to_tensor=True)

    # Retrieve WordNet senses for the pun word
    synsets = wn.synsets(pun_word)

    # Compute embeddings for each sense's definition or lemma name
    sense_embeddings = []
    sense_data = []  # Store both lemma and synset

    for synset in synsets:
        # Get all synonyms (lemma names) for each sense
        for lemma in synset.lemma_names():
            # Encode the lemma or definition of the synset
            lemma_embedding = stmodel.encode(lemma, convert_to_tensor=True)
            sense_embeddings.append(lemma_embedding)
            sense_data.append((lemma, synset))  # Store lemma and synset together

    # Compute cosine similarities between sentence embedding and sense embeddings
    similarities = util.pytorch_cos_sim(sentence_embedding, torch.stack(sense_embeddings))

    # Sort the similarities in descending order and extract the top two scores
    sorted_indices = similarities.argsort(descending=True)[0]
    top_n_similarities = similarities[0][sorted_indices[:search_size]]  # Top two similarities

    # Collect senses with the same similarity as the second highest score
    selected_senses = []
    nth_highest_score = top_n_similarities[search_size-1].item()  # Second highest similarity

    for idx in sorted_indices:
        sim_score = similarities[0][idx].item()
        if sim_score >= nth_highest_score:
            selected_senses.append((sense_data[idx][0], sense_data[idx][1].definition(), sim_score))
    return selected_senses

In [176]:
def interpret_pun(sentence, pun_word, search_size = 2):
    selected_senses = select_top_senses(sentence, pun_word, search_size)
    # Display senses and similarity scores
    print(f"Selected senses for the word '{pun_word}':")
    for lemma, definition, similarity in selected_senses:
        print(f"{lemma} (Similarity score: {similarity:.4f}) : {definition}")

In [177]:
def pun_complete_detection_st(sentence):
    if pun_detection(sentence) == 1:
        print("Pun detected!")
        cleaned_sentence = preprocess_sentence(sentence)
        pun_word = find_pun(cleaned_sentence)
        print("Pun word:", pun_word)
        print(interpret_pun(cleaned_sentence, pun_word,2))
    else:
        print("No pun detected in the sentence")

In [196]:
sentences = ["They hid from the gunman in a sauna where they could sweat it out.",
                     "I've been to the dentist so many times, I really know the drill.",
                     "A contest held by fire fighters is called a ' match '."
                     ]
for sentence in sentences:
  pun_complete_detection_st(sentence)
  print("-----------------------------------")

Pun detected!
Pun word: sweat
Selected senses for the word 'sweat':
sweat (Similarity score: 0.5356) : salty fluid secreted by sweat glands
sweat (Similarity score: 0.5356) : agitation resulting from active worry
sweat (Similarity score: 0.5356) : condensation of moisture on a cold surface
sweat (Similarity score: 0.5356) : use of physical or mental energy; hard work
sweat (Similarity score: 0.5356) : excrete perspiration through the pores in the skin
None
-----------------------------------
Pun detected!
Pun word: drill
Selected senses for the word 'drill':
drill (Similarity score: 0.6715) : a tool with a sharp point and cutting edges for making holes in hard materials (usually rotating rapidly or by repeated blows)
drill (Similarity score: 0.6715) : similar to the mandrill but smaller and less brightly colored
drill (Similarity score: 0.6715) : systematic training by multiple repetitions
drill (Similarity score: 0.6715) : (military) the training of soldiers to march (as in ceremonial

In [179]:
%pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [181]:
def verify_senses_with_llm(sentence, pun_word, senses):
    prompt = f"""
    The word "{pun_word}" in the sentence "{sentence}" has multiple meanings.
    The following interpretations were chosen based on context:
    """

    for i, (lemma, definition, score) in enumerate(senses):
        prompt += f"{i+1}. {lemma} - {definition}\n"

    prompt += "Select the best two interpretations that fit the context of the sentence."

    completion = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )
    return completion.choices[0].message.content.strip()

In [182]:
def interpret_pun_with_gpt(sentence, pun_word, search_size = 2):
    selected_senses = select_top_senses(sentence, pun_word, search_size)
    # Display senses and similarity scores
    print(f"Selected senses for the word '{pun_word}':")
    for lemma, definition, similarity in selected_senses:
        print(f"{lemma} (Similarity score: {similarity:.4f}) : {definition}")

    # Check if we need to use the LLM
    if len(selected_senses) > 2:
        print("\nSending multiple senses to LLM for verification...")
        llm_response = verify_senses_with_llm(sentence, pun_word, selected_senses)
        print("\nLLM's feedback on the chosen senses:")
        print(llm_response)
    else:
        print("\nNo need for LLM verification. Only two senses selected.")

In [183]:
def pun_complete_detection_st_gpt(sentence):
    if pun_detection(sentence) == 1:
        print("Pun detected!")
        cleaned_sentence = preprocess_sentence(sentence)
        pun_word = find_pun(cleaned_sentence)
        print("Pun word:", pun_word)
        print(interpret_pun_with_gpt(cleaned_sentence, pun_word,2))
    else:
        print("No pun detected in the sentence")

In [195]:
sentences = ["They hid from the gunman in a sauna where they could sweat it out.",
                     "I've been to the dentist so many times, I really know the drill.",
                     "A contest held by fire fighters is called a ' match '."
                     ]
for sentence in sentences:
  pun_complete_detection_st_gpt(sentence)
  print("-----------------------------------")

Pun detected!
Pun word: sweat
Selected senses for the word 'sweat':
sweat (Similarity score: 0.5356) : salty fluid secreted by sweat glands
sweat (Similarity score: 0.5356) : agitation resulting from active worry
sweat (Similarity score: 0.5356) : condensation of moisture on a cold surface
sweat (Similarity score: 0.5356) : use of physical or mental energy; hard work
sweat (Similarity score: 0.5356) : excrete perspiration through the pores in the skin

Sending multiple senses to LLM for verification...

LLM's feedback on the chosen senses:
1. sweat - salty fluid secreted by sweat glands
3. sweat - condensation of moisture on a cold surface
None
-----------------------------------
Pun detected!
Pun word: drill
Selected senses for the word 'drill':
drill (Similarity score: 0.6715) : a tool with a sharp point and cutting edges for making holes in hard materials (usually rotating rapidly or by repeated blows)
drill (Similarity score: 0.6715) : similar to the mandrill but smaller and less b

In [185]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import pandas as pd

In [186]:
lemmatizer = WordNetLemmatizer()

In [187]:
kuperman_aoa = pd.read_csv("AOA_adjusted_dataset.csv")

In [188]:
age_of_acquisition = {str(row['Word']).lower(): row['Rating.Mean'] for index, row in kuperman_aoa.iterrows() if isinstance(row['Word'], str)}

In [189]:
def get_age_of_acquisition(word, age):
    '''
    This function checks if the word exists in the age of acquisition dataset
    and if its Rating.Mean is equal to or less than the given age.
    Returns True if the word is appropriate for the given age, False otherwise.
    '''
    word = word.lower()  # Ensure the word is in lowercase to match the dataset

    # Try to get the AoA rating from the dictionary for the exact word
    if word in age_of_acquisition:
        rating_mean = age_of_acquisition[word]
        return rating_mean <= age

    # If not found, try lemmatizing the word
    lemma = lemmatizer.lemmatize(word)
    
    # Check if the lemma has an AoA rating
    if lemma in age_of_acquisition:
        rating_mean = age_of_acquisition[lemma]
        return rating_mean <= age
    
    # If word or its lemma is not found, return False
    return False

In [190]:
# Main function to check AoA for a word and its two senses
def analyze_word_for_aoa(word, age):
    '''
    This function analyzes the input word, identifies its first two senses,
    and checks if they are appropriate for the given age.
    Returns a list containing the word, its senses, and confirmation of appropriateness.
    '''
    word = word.lower()
    lemma = lemmatizer.lemmatize(word)
    
    # Get the first two senses of the word using WordNet
    senses = wn.synsets(lemma)[:2]
    
    if senses:
        # Initialize a list to store information about the word
        word_info = {
            'word': word,
            'senses': [],
        }
        
        # For each of the two senses, check the age of acquisition
        for sense in senses:
            sense_definition = sense.definition()  # Get the sense definition
            aoa_appropriate = get_age_of_acquisition(lemma, age)  # Fetch AoA rating for the lemma
            
            # Add the sense and its AoA status to the word info
            word_info['senses'].append({
                'sense_definition': sense_definition,
                'appropriate_for_age': aoa_appropriate
            })
        
        # Return the results for the word
        return word_info
    else:
        return None  # No senses found

In [191]:
def pun_full_workflow(sentence,age):
    if pun_detection(sentence) == 1:
        print("Pun detected!")
        cleaned_sentence = preprocess_sentence(sentence)
        pun_word = find_pun(cleaned_sentence)
        print("Pun word:", pun_word)
        print(interpret_pun_with_gpt(cleaned_sentence, pun_word,2))
        print('Age Appropriate for ' + str(age) + ': ' + str(get_age_of_acquisition(pun_word, age)))

    else:
        print("No pun detected in the sentence")

In [218]:
sentences = ["They hid from the gunman in a sauna where they could sweat it out.",
                     "I've been to the dentist so many times, I really know the drill.",
                     "A contest held by fire fighters is called a match.", "We are students of computer and information technology."
                     ]
for sentence in sentences:
  pun_full_workflow(sentence,7)
  print("-----------------------------------")

Pun detected!
Pun word: sweat
Selected senses for the word 'sweat':
sweat (Similarity score: 0.5356) : salty fluid secreted by sweat glands
sweat (Similarity score: 0.5356) : agitation resulting from active worry
sweat (Similarity score: 0.5356) : condensation of moisture on a cold surface
sweat (Similarity score: 0.5356) : use of physical or mental energy; hard work
sweat (Similarity score: 0.5356) : excrete perspiration through the pores in the skin

Sending multiple senses to LLM for verification...

LLM's feedback on the chosen senses:
The sentence is quite unclear, but the possible interpretations could be:
1. sweat - salty fluid secreted by sweat glands
5. sweat - excrete perspiration through the pores in the skin
None
Age Appropriate for 7: False
-----------------------------------
Pun detected!
Pun word: drill
Selected senses for the word 'drill':
drill (Similarity score: 0.6715) : a tool with a sharp point and cutting edges for making holes in hard materials (usually rotating 

In [None]:
pun_full_workflow("This is where I keep my arrows, said Tom, quivering",15)

In [None]:
pun_full_workflow("Why do elephants have a trunk? Because they donâ€™t have pockets to put stuff in",5)

In [223]:
pun_full_workflow("The teddy bear was stuffed, so he said no to the dessert",6)

Pun detected!
Pun word: dessert
Selected senses for the word 'dessert':
dessert (Similarity score: 0.6268) : a dish served as the last course of a meal
sweet (Similarity score: 0.1915) : a dish served as the last course of a meal

No need for LLM verification. Only two senses selected.
None
Age Appropriate for 6: True


In [225]:
pun_full_workflow("Why did the imagination go to school? To become a little more thoughtful.",4)

Pun detected!
Pun word: thoughtful
Selected senses for the word 'thoughtful':
thoughtful (Similarity score: 0.3732) : having intellectual depth
thoughtful (Similarity score: 0.3732) : exhibiting or characterized by careful thought
thoughtful (Similarity score: 0.3732) : acting with or showing thought and good sense
thoughtful (Similarity score: 0.3732) : taking heed; giving close and thoughtful attention
thoughtful (Similarity score: 0.3732) : considerate of the feelings or well-being of others

Sending multiple senses to LLM for verification...

LLM's feedback on the chosen senses:
1. thoughtful - having intellectual depth
2. thoughtful - exhibiting or characterized by careful thought
None
Age Appropriate for 4: False
