In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import random

### Download data

In [10]:
# Download the 'wordnet' package so the `wn` corpus reader imported above has data to read.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tianyuwu/nltk_data...


True

### Load target words

In [3]:
# Read the target-word table; the next cell uses its 'lemma' and 'PoS' columns.
words = pd.read_csv('data/5000_synonym.csv')

In [31]:
# Pair every lemma with its part-of-speech label, row by row.
lemma_column = words['lemma']
pos_column = words['PoS']
target_words = [(lemma, pos_label) for lemma, pos_label in zip(lemma_column, pos_column)]

# Peek at the first (lemma, PoS) pair.
target_words[0]

('say', 'v')

In [27]:
# Number of (lemma, PoS) pairs built from the `words` table.
len(target_words)

4702

### Collect words from WordNet by PoS

In [34]:
# First-lemma names of every synset, keyed by WordNet PoS tag. Filled lazily on
# first use so the (slow) scan over all synsets happens once per PoS instead of
# once per call.
_FIRST_LEMMA_NAMES_BY_POS = {}


def get_random_word_by_pos(pos, target_word):
    """Return a random WordNet word of the given PoS that differs from ``target_word``.

    A synset is drawn uniformly at random from all synsets of ``pos`` and the
    name of its first lemma is returned; draws equal to ``target_word`` are
    rejected and retried.

    Args:
        pos: WordNet part-of-speech tag passed to ``wn.all_synsets``
            (e.g. 'n', 'v', 'a', 'r').
        target_word: Word that must not be returned.

    Returns:
        A lemma name (multi-word lemmas use underscores, as stored in WordNet),
        or ``None`` when there are no synsets for ``pos`` or every candidate
        equals ``target_word``.
    """
    names = _FIRST_LEMMA_NAMES_BY_POS.get(pos)
    if names is None:
        # One entry per synset (duplicates kept) so sampling stays uniform
        # over synsets, exactly as when choosing a synset directly.
        names = [synset.lemmas()[0].name() for synset in wn.all_synsets(pos)]
        _FIRST_LEMMA_NAMES_BY_POS[pos] = names

    # Guard against an endless retry loop: bail out if nothing but the target
    # word is available. `all` stops at the first differing name, so this is
    # cheap in the normal case (and is True for an empty list).
    if all(name == target_word for name in names):
        return None

    while True:
        random_word = random.choice(names)
        if random_word != target_word:
            return random_word

In [36]:
# Smoke test: draw one random verb ('v') word that is not 'say' and display it.
w = get_random_word_by_pos('v', 'say')
w

'minister'

In [37]:
# Translate the PoS labels used in the data (keys) into the PoS tags WordNet
# expects (values). Labels that are absent here have no WordNet counterpart.
pos_map = dict(n='n', v='v', j='a', r='r')

### Pairing

In [49]:
# For each target (word, PoS) pair, record a (random_word, PoS) pair where
# random_word is a random WordNet word of the mapped PoS, or None when the
# label has no mapping or no word could be drawn.
generated_words = []

for word, pos in target_words[:99]:
    wordnet_pos = pos_map.get(pos)
    # Only query WordNet when the data label maps to a WordNet tag.
    random_word = get_random_word_by_pos(wordnet_pos, word) if wordnet_pos else None
    # Any falsy result is stored as None, keeping the original data label.
    generated_words.append((random_word or None, pos))

In [50]:
# Show the generated (random_word, PoS) pairs; random_word is None where no word was produced
# (for example when the PoS label is missing from pos_map).
generated_words

[('buoy', 'v'),
 ('guesstimate', 'v'),
 ('spool', 'v'),
 ('foil', 'v'),
 ('draw', 'v'),
 ('pull_up_short', 'v'),
 ('turn', 'v'),
 ('Ovibos', 'n'),
 ('cinch', 'v'),
 ('azure', 'v'),
 ('Sri_Lanka', 'n'),
 ('ring_up', 'v'),
 ('Yiddish', 'n'),
 ('mark_down', 'v'),
 ('agonize', 'v'),
 ('resile', 'v'),
 ('Acherontia', 'n'),
 ('franchise_tax', 'n'),
 ('inexorably', 'r'),
 ('possibly', 'r'),
 ('harmonize', 'v'),
 ('goose', 'v'),
 (None, 'a'),
 ('night-blooming_cereus', 'n'),
 ('exposure', 'n'),
 ('hype_up', 'v'),
 ('consume', 'v'),
 (None, 'a'),
 ('nasalize', 'v'),
 ('at_a_loss', 'r'),
 ('unevenly', 'r'),
 ('apologetically', 'r'),
 ('bereave', 'v'),
 ('bomblet', 'n'),
 ('phylogenetically', 'r'),
 ('sit', 'v'),
 ('cut', 'v'),
 ('picture', 'v'),
 ('grand_slam', 'n'),
 ('fancy', 'v'),
 ('backstage', 'r'),
 ('charge', 'v'),
 ('desperately', 'r'),
 ('American_water_shrew', 'n'),
 ('allegorize', 'v'),
 (None, 'a'),
 ('nightshade', 'n'),
 ('forbear', 'v'),
 ('capsicum', 'n'),
 ('piece_of_leather', 'n