# Finding Candidate Words for Lexical Sample WSD

We now inspect the SemCor dataset for "candidate words" for Lexical Sample WSD (LSWSD). Here, we restrict ourselves to the following criteria:

1. We only consider 20-30ish possible lemmas (we keep the top 30).
2. Each considered lemma should have around 3 possible senses (in practice, between 3 and 6).
3. Each considered lemma should have multiple POS tags (at least 2).
4. There should be at least a hundred samples per lemma.
5. For each lemma, the distribution of senses should be as flat as possible (measured below by the entropy of its sense counts).
    - For example, if a word has senses 1, 2, and 3, there should be roughly
      the same number of examples for each sense of the word.

In [1]:
import datasets
import pandas as pd

In [2]:
from collections import Counter

In [3]:
from typing import List

In [4]:
from scipy.stats import entropy

In [5]:
from IPython.display import display


In [6]:
# first, download and gather the SemCor dataset
semcor = datasets.load_dataset("thesofakillers/SemCor")

# each split gets the same treatment: convert to a pandas frame, then turn
# the literal string "None" into a real missing value
brown1, brown2, brownv = (
    semcor[split_name].to_pandas().replace(to_replace="None", value=None)
    for split_name in ("brown1", "brown2", "brownv")
)

# we now have all the data in the raganato train set in one pd df
semcor_df = pd.concat([brown1, brown2, brownv])

Using custom data configuration thesofakillers--SemCor-d701cc7e0f131929
Reusing dataset csv (/Users/thesofakillers/.cache/huggingface/datasets/thesofakillers___csv/thesofakillers--SemCor-d701cc7e0f131929/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# preview the concatenated frame (pandas elides the middle rows of a long frame)
semcor_df

Unnamed: 0,tagfile,pnum,snum,tag,lemma,lexsn,wnsn,value,cmd,dc,ot,pn,pos,rdf,sep
0,br-j54,1,1,wf,,,,Whenever,ignore,,,,WRB,,
1,br-j54,1,1,wf,artist,1:18:00::,1,artists,done,,,,NN,,
2,br-j54,1,1,punc,,,,",",,,,,,,
3,br-j54,1,1,wf,,,,indeed,done,,notag,,RB,,
4,br-j54,1,1,punc,,,,",",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364294,br-b22,26,88,wf,,,,by,ignore,,,,IN,,
364295,br-b22,26,88,wf,,,,the,ignore,,,,DT,,
364296,br-b22,26,88,wf,,,,Christian,tag,,,,NNP,,
364297,br-b22,26,88,wf,,,,imperative,tag,,,,JJ,,


In [8]:
# number of rows per lemma, as a Series named `lemma_count`
lemma_counts = semcor_df["lemma"].value_counts().rename("lemma_count")

In [9]:
def flatness(string_list: List[str], normalize: bool = False) -> float:
    """Measure how evenly the labels in ``string_list`` are distributed.

    The labels are tallied and the Shannon entropy (natural log) of the
    resulting counts is returned: 0 when a single label accounts for every
    item, larger when the items are spread more evenly over the labels.

    The raw entropy of ``k`` distinct labels is bounded by ``ln(k)``, so raw
    scores of inputs with different numbers of distinct labels are not
    directly comparable. Pass ``normalize=True`` to divide by that bound and
    obtain a score in ``[0, 1]``, where 1 means perfectly uniform.

    Parameters
    ----------
    string_list
        Iterable of hashable labels (e.g. the sense keys observed for a lemma).
    normalize
        If True, return the entropy relative to its maximum ``ln(k)``.
        Defaults to False, which returns the raw entropy in nats.

    Returns
    -------
    float
        The (optionally normalized) entropy of the label distribution.
    """
    label_frequencies = list(Counter(string_list).values())
    if not normalize:
        return entropy(label_frequencies)

    n_labels = len(label_frequencies)
    if n_labels < 2:
        # with fewer than two distinct labels the maximum entropy ln(k) is 0,
        # so the ratio is undefined; treat it as "not flat at all"
        return 0.0
    # taking the logarithm in base k is the same as dividing by ln(k)
    return entropy(label_frequencies, base=n_labels)


In [10]:
# per-lemma statistics; each keyword is an output column built from a
# (source column, aggregation) pair
lemma_sense_pos = semcor_df.groupby("lemma").agg(
    sense_count=("lexsn", "nunique"),
    pos_count=("pos", "nunique"),
    sense_flatness=("lexsn", flatness),
    senses=("lexsn", Counter),
)

In [11]:
# put the per-lemma row counts next to the per-lemma sense/POS statistics,
# aligned on the lemma index
candidate_data = pd.concat(
    objs=[lemma_counts, lemma_sense_pos],
    axis="columns",
)

In [12]:
candidate_data = candidate_data.loc[
    # criterion 2: between 3 and 6 distinct senses (both ends included)
    candidate_data["sense_count"].between(3, 6)
    # criterion 3: at least two distinct POS tags
    & (candidate_data["pos_count"] >= 2)
    # criterion 4: at least 100 occurrences of the lemma
    & (candidate_data["lemma_count"] >= 100)
    # criterion 5: flattest sense distributions first
].sort_values(by="sense_flatness", ascending=False)

In [13]:
# criterion 1: keep only the 30 best-ranked lemmas, and temporarily raise the
# row limit so that all of them are rendered
with pd.option_context("display.max_rows", 30):
    display(candidate_data.head(30))

Unnamed: 0,lemma_count,sense_count,pos_count,sense_flatness,senses
result,214,5,2,1.498573,"{'1:19:00::': 82, '2:42:00::': 45, '1:11:00::'..."
just,327,6,2,1.394321,"{'4:02:01::': 50, '4:02:05::': 85, '4:02:00::'..."
individual,114,6,2,1.362867,"{'5:00:00:independent:00': 8, '3:00:00::': 38,..."
most,350,6,3,1.2981,"{'4:02:00::': 177, '4:02:01::': 60, '3:00:02::..."
local,116,5,2,1.227815,"{'3:00:01::': 47, '3:00:03::': 39, '5:00:00:na..."
public,109,5,2,1.207541,"{'3:00:00::': 48, '1:14:00::': 35, '5:00:00:co..."
same,243,5,2,1.167289,"{'3:00:02::': 124, '3:00:00::': 78, '3:00:04::..."
cost,136,5,2,1.16613,"{'2:42:00::': 35, '1:21:00::': 75, '1:07:01::'..."
express,100,6,3,1.147828,"{'2:32:01::': 53, '4:02:00::': 1, '2:32:00::':..."
now,459,6,2,1.121687,"{'4:02:00::': 182, '4:02:05::': 223, '1:28:00:..."


In [14]:
# from here on, render at most 10 rows of any frame (this setting persists
# for the rest of the session), then show the whole ranking
pd.options.display.max_rows = 10
candidate_data

Unnamed: 0,lemma_count,sense_count,pos_count,sense_flatness,senses
result,214,5,2,1.498573,"{'1:19:00::': 82, '2:42:00::': 45, '1:11:00::'..."
just,327,6,2,1.394321,"{'4:02:01::': 50, '4:02:05::': 85, '4:02:00::'..."
individual,114,6,2,1.362867,"{'5:00:00:independent:00': 8, '3:00:00::': 38,..."
most,350,6,3,1.298100,"{'4:02:00::': 177, '4:02:01::': 60, '3:00:02::..."
local,116,5,2,1.227815,"{'3:00:01::': 47, '3:00:03::': 39, '5:00:00:na..."
...,...,...,...,...,...
die,137,5,2,0.308226,"{'2:30:00::': 128, '1:06:00::': 6, '2:37:00::'..."
young,103,3,2,0.302305,"{'3:00:00::': 95, '5:00:00:early:02': 1, '1:05..."
kind,123,4,2,0.237229,"{'1:09:00::': 117, '3:00:00::': 4, '5:00:00:be..."
group,1334,5,4,0.034383,"{'1:03:00::': 1328, '2:31:00::': 3, '1:27:00::..."


In [15]:
# and we save to disk, to use later: one lemma per line, top 30 only
# (explicit UTF-8 so the bytes written do not depend on the platform's
# default locale encoding)
with open("swsd_lemmas.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(candidate_data[:30].index.tolist()))