In [96]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from IPython import display
import matplotlib.pyplot as plt

In [97]:
# Information-content table handed to lin_similarity() in lin_common and
# lin_average below; loaded once here because it is reused for every word pair.
# NOTE(review): the file name suggests Brown-corpus counts with add-1
# smoothing -- confirm against the NLTK wordnet_ic data package.
ic = wordnet_ic.ic("ic-brown-resnik-add1.dat")

In [98]:
# Load the word-pair dataset, discard the "Unnamed: 0" column that came with
# the CSV, and shorten the similarity column names used throughout the notebook.
# Written as one chain so re-running the cell always rebuilds df from the file.
df = (
    pd.read_csv("data/464_complete_data.csv")
    .drop(columns="Unnamed: 0")
    .rename(
        columns={
            'human_similarity': 'human',
            'gpt_similarity_left': 'gpt_left',
            'gpt_similarity_right': 'gpt_right',
            'gpt_similarity': 'gpt',
        }
    )
)

In [99]:
# Preview the first rows to confirm the load and the renamed columns.
df.head()

Unnamed: 0,word1,word2,human,gpt_left,gpt_right,gpt
0,scholar,academician,5.42,0.850105,0.853856,0.851981
1,review,critique,5.41,0.822096,0.82807,0.825083
2,refund,reimbursement,5.41,0.812315,0.810235,0.811275
3,haven,refuge,5.41,0.669938,0.703791,0.686865
4,rebel,revolutionary,5.4,0.808106,0.810149,0.809128


### Most Common (first) Synset Similarity using WUP

In [100]:
def wup_common(word1, word2):
    """Wu-Palmer similarity between the first noun synsets of two words.

    Only the first synset returned by ``wn.synsets(..., pos=wn.NOUN)`` is
    used for each word (the notebook treats it as the most common sense).

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        The value of ``wup_similarity`` for the two first synsets, or the
        sentinel ``-1`` when either word has no noun synset or the measure
        yields ``None``.

    Unlike a blanket ``except:``, unrelated failures (including a kernel
    interrupt) are not converted into ``-1``; they propagate to the caller.
    """
    synsets1 = wn.synsets(word1, pos=wn.NOUN)
    synsets2 = wn.synsets(word2, pos=wn.NOUN)

    # A missing noun sense is the one expected failure: report it with the
    # same sentinel the rest of the notebook filters on.
    if not synsets1 or not synsets2:
        return -1

    score = synsets1[0].wup_similarity(synsets2[0])

    # Normalise "no score" to the sentinel, matching wup_average, which also
    # treats None as a missing value.
    return -1 if score is None else score

### Average Similarity using WUP

In [101]:
def wup_average(word1, word2):
    """Mean Wu-Palmer similarity over all noun-synset pairs of two words.

    Every noun synset of ``word1`` is compared with every noun synset of
    ``word2``; pairs for which ``wup_similarity`` returns ``None`` are ignored.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        The arithmetic mean of the collected scores, or ``-1`` when no score
        could be collected (e.g. one word has no noun synsets).
    """
    senses_a = wn.synsets(word1, pos=wn.NOUN)
    senses_b = wn.synsets(word2, pos=wn.NOUN)

    # Full cross product, in the same order as a nested for-loop.
    raw = [a.wup_similarity(b) for a in senses_a for b in senses_b]
    valid = [value for value in raw if value is not None]

    if not valid:
        return -1
    return sum(valid) / len(valid)

### Most Common (first) Synset Similarity using LIN

In [102]:
def lin_common(word1, word2):
    """Lin similarity between the first noun synsets of two words.

    Only the first synset returned by ``wn.synsets(..., pos=wn.NOUN)`` is
    used for each word (the notebook treats it as the most common sense).
    The module-level ``ic`` table is passed to ``lin_similarity``.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        The value of ``lin_similarity`` for the two first synsets, or the
        sentinel ``-1`` when either word has no noun synset or the measure
        yields ``None``.

    Unlike a blanket ``except:``, unrelated failures (including a kernel
    interrupt) are not converted into ``-1``; they propagate to the caller,
    just as they do in lin_average.
    """
    synsets1 = wn.synsets(word1, pos=wn.NOUN)
    synsets2 = wn.synsets(word2, pos=wn.NOUN)

    # A missing noun sense is the one expected failure: report it with the
    # same sentinel the rest of the notebook filters on.
    if not synsets1 or not synsets2:
        return -1

    score = synsets1[0].lin_similarity(synsets2[0], ic)

    # Normalise "no score" to the sentinel, matching lin_average, which also
    # treats None as a missing value.
    return -1 if score is None else score

### Average Similarity using LIN

In [103]:
def lin_average(word1, word2):
    """Mean Lin similarity over all noun-synset pairs of two words.

    Every noun synset of ``word1`` is compared with every noun synset of
    ``word2`` using the module-level ``ic`` table; pairs for which
    ``lin_similarity`` returns ``None`` are ignored.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        The arithmetic mean of the collected scores, or ``-1`` when no score
        could be collected (e.g. one word has no noun synsets).
    """
    senses_a = wn.synsets(word1, pos=wn.NOUN)
    senses_b = wn.synsets(word2, pos=wn.NOUN)

    # Keep a running sum and count instead of materialising a score list;
    # the additions happen in the same order as summing that list would.
    running_total = 0
    counted = 0
    for sense_a in senses_a:
        for sense_b in senses_b:
            value = sense_a.lin_similarity(sense_b, ic)
            if value is None:
                continue
            running_total += value
            counted += 1

    if counted == 0:
        return -1
    return running_total / counted

### Run: compute all four similarity measures for every word pair

In [104]:
# Score every word pair with all four WordNet measures.
#
# Scores are gathered in plain lists and attached as float64 columns in one
# step. Pre-filling the columns with None and writing cell by cell through
# df.loc leaves them as object dtype, which gets in the way of later numeric
# work, and is slow because every assignment goes through label lookup.
measures = {
    "wup_common": wup_common,
    "wup_average": wup_average,
    "lin_common": lin_common,
    "lin_average": lin_average,
}
scores = {name: [] for name in measures}

# Progress is measured against the actual number of rows rather than a
# hard-coded 464, and counted by position so it ends at 100% whatever the
# index labels are.
total_pairs = len(df)

for done, (word1, word2) in enumerate(zip(df["word1"], df["word2"]), start=1):
    for name, measure in measures.items():
        scores[name].append(measure(word1, word2))

    # wait=True defers the clear until the next output arrives, so the
    # progress line is replaced in place instead of flickering.
    print('{:.2%}'.format(done / total_pairs))
    display.clear_output(wait=True)

# Align on df.index so the columns line up even if the index is not 0..n-1.
for name, values in scores.items():
    df[name] = pd.Series(values, index=df.index, dtype="float64")

99.78%


In [105]:
# Drop pairs for which wup_common returned its -1 "no result" sentinel.
# Only wup_common is checked here; the other three score columns are not.
df = df.loc[df["wup_common"] != -1]

In [106]:
# Row/column count after filtering -- a sanity check before saving.
df.shape

(456, 10)

In [108]:
# Persist the filtered frame with the four new score columns.
# to_csv writes the index by default, so the file carries an extra unnamed
# leading column (the loading cell above drops "Unnamed: 0" from its input).
# NOTE(review): the "456" in the file name mirrors the row count shown by
# df.shape above; it will be stale if the input data changes.
df.to_csv("data/456_complete_data.csv")