In [11]:
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import random

### Read Data

In [12]:
# Load the "1 lemmas" sheet, indexed by its "rank" column, and keep only the
# three columns the rest of the notebook uses.
df = pd.read_excel(
    "data/wordFrequency.xlsx",
    sheet_name="1 lemmas",
    index_col="rank",
)[["lemma", "PoS", "freq"]]

### Filter Stop Words and PoS

In [13]:
# Drop English stop words, keep only the four parts of speech handled below,
# and translate the frequency list's PoS codes into WordNet's PoS constants.
stop_words = set(stopwords.words('english'))
pos_to_wordnet = {"n": wn.NOUN, "v": wn.VERB, "j": wn.ADJ, "r": wn.ADV}
keep_rows = ~df["lemma"].isin(stop_words) & df["PoS"].isin(list(pos_to_wordnet))
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below is not a write to a slice and does not
# raise SettingWithCopyWarning.
df = df.loc[keep_rows].copy()
df["PoS"] = df["PoS"].replace(pos_to_wordnet)

### Generate Synonyms

In [14]:
def get_synonym(word, pos):
    """Pick a random single-token WordNet synonym for ``word``.

    The synsets returned by ``wn.synsets(word, pos)`` are visited in random
    order, and the lemmas of each synset are visited in random order. The
    first lemma name that consists only of letters and differs from ``word``
    (ignoring case) is returned.

    Args:
        word: The word to look up.
        pos: Part-of-speech tag, passed through to ``wn.synsets``.

    Returns:
        A ``(synonym, synset)`` tuple, or ``(None, None)`` when no lemma
        qualifies or the lookup raises an ordinary exception.
    """
    try:
        word_folded = word.casefold()
        synsets = wn.synsets(word, pos)
        random.shuffle(synsets)
        for synset in synsets:
            lemmas = synset.lemmas()
            random.shuffle(lemmas)
            for lemma in lemmas:
                name = lemma.name()
                # isalpha() rejects names containing anything but letters
                # (e.g. underscores, hyphens, digits). The comparison ignores
                # case so a mere case variant of ``word`` is never reported
                # as its synonym.
                if name.isalpha() and name.casefold() != word_folded:
                    return name, synset
    except Exception:
        # Best-effort lookup: any ordinary failure means "no synonym".
        # KeyboardInterrupt / SystemExit are deliberately not caught so a
        # long-running loop over many words can still be interrupted.
        pass
    return None, None

In [15]:
# Look up one random synonym (and the synset it came from) for every row.
# The columns are built in a single pass over the lemma/PoS pairs rather than
# with per-cell .loc reads and writes, which also keeps the lookup correct if
# the index ever contains duplicate labels.
synonym_pairs = [
    get_synonym(lemma, pos) for lemma, pos in zip(df["lemma"], df["PoS"])
]
df = df.assign(
    synonym=[pair[0] for pair in synonym_pairs],
    synset=[pair[1] for pair in synonym_pairs],
)

In [16]:
# Discard every row that has a missing value in any column (dropna's default
# is row-wise, "any"), which removes the words no synonym was found for.
df = df.dropna()

### Generate Random Pairs

In [17]:
# Materialise the full list of WordNet synsets once per part of speech, keyed
# by the WordNet PoS constant, so get_random_word_by_pos can sample from it.
random_word_dic = {
    pos_key: list(wn.all_synsets(pos_tag))
    for pos_key, pos_tag in (
        (wn.NOUN, "n"),
        (wn.VERB, "v"),
        (wn.ADJ, "a"),
        (wn.ADV, "r"),
    )
}

In [18]:
def get_random_word_by_pos(pos, target_word):
    """Draw a random single-token lemma with the given part of speech.

    A synset is chosen uniformly from ``random_word_dic[pos]``, then a lemma
    is chosen uniformly from that synset. Draws are repeated until the lemma
    name consists only of letters and differs from ``target_word`` (ignoring
    case).

    Args:
        pos: Key into ``random_word_dic``.
        target_word: Word the result must differ from.

    Returns:
        The name of the selected lemma.

    Raises:
        KeyError: If ``pos`` is not a key of ``random_word_dic``.
        IndexError: If the synset list for ``pos`` is empty.

    Note:
        The loop has no retry limit; it does not terminate if no lemma under
        ``pos`` can satisfy the filter.
    """
    synsets = random_word_dic[pos]
    # Non-string targets cannot equal any lemma name, so they exclude nothing.
    target_folded = target_word.casefold() if isinstance(target_word, str) else None
    while True:
        random_synset = random.choice(synsets)
        candidate = random.choice(random_synset.lemmas()).name()
        # Same filter as the synonym lookup: letters only, and not merely a
        # case variant of the target word.
        if candidate.isalpha() and candidate.casefold() != target_folded:
            return candidate

In [19]:
# Pair every lemma with a random word of the same part of speech. The column
# is built in a single pass over the lemma/PoS pairs rather than with per-cell
# .loc reads and writes, which also keeps it correct if the index ever
# contains duplicate labels.
df = df.assign(
    random=[
        get_random_word_by_pos(pos, lemma)
        for lemma, pos in zip(df["lemma"], df["PoS"])
    ]
)

In [20]:
# Show the first five rows as a quick check of the generated columns.
df.head(n=5)

Unnamed: 0_level_0,lemma,PoS,freq,synonym,synset,random
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26,say,v,4096416,suppose,Synset('suppose.v.01'),achromatize
31,go,v,3546732,depart,Synset('go.v.03'),malt
34,get,v,3347615,drive,Synset('drive.v.11'),metalize
39,know,v,2761628,recognize,Synset('acknowledge.v.06'),retire
50,make,v,2290830,have,Synset('hold.v.03'),sweeten


In [21]:
# Write the final table of word pairs to disk.
output_path = "data/4000_word_pairs.csv"
df.to_csv(output_path)