In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import random

### Download data

In [10]:
# Fetch the WordNet corpus (a no-op apart from the log lines if it is already
# present); the `wn` reader imported above needs it before any synset lookup.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tianyuwu/nltk_data...


True

### Load target words

In [3]:
# Target word list. Later cells read the `lemma` and `PoS` columns of this frame.
words = pd.read_csv('data/5000_synonym.csv')

In [75]:
# One (lemma, PoS) tuple per row of `words`, in file order.
target_words = list(
    words[['lemma', 'PoS']].itertuples(index=False, name=None)
)

In [27]:
# Sanity check: number of (lemma, PoS) pairs that will each receive a random partner.
len(target_words)

4702

### Collect words from WordNet by PoS

In [61]:
# Candidate words per WordNet PoS tag, filled lazily on first use so the
# (slow) scan over all synsets happens once per tag rather than once per call.
_CANDIDATE_WORDS_BY_POS = {}


def get_random_word_by_pos(pos, target_word):
    """Return a random single-word WordNet lemma with the given part of speech.

    A candidate is the first lemma name of a synset returned by
    ``wn.all_synsets(pos)``; names containing ``'_'`` (multi-word
    expressions) are excluded. One candidate is kept per synset, so a word
    that heads several synsets is proportionally more likely to be drawn.

    Args:
        pos: PoS tag passed straight to ``wn.all_synsets`` (e.g. ``'v'``, ``'a'``).
        target_word: Word that must not be returned.

    Returns:
        A candidate word different from ``target_word``, or ``None`` when no
        such candidate exists for ``pos``.
    """
    if pos not in _CANDIDATE_WORDS_BY_POS:
        first_lemmas = (
            synset.lemmas()[0].name() for synset in wn.all_synsets(pos)
        )
        _CANDIDATE_WORDS_BY_POS[pos] = [
            name for name in first_lemmas if '_' not in name
        ]
    candidates = _CANDIDATE_WORDS_BY_POS[pos]

    # Without this guard the sampling loop below would spin forever when the
    # pool is empty or consists solely of the target word.
    if not any(name != target_word for name in candidates):
        return None

    while True:
        random_word = random.choice(candidates)
        if random_word != target_word:
            return random_word

In [70]:
# Smoke test: draw one random adjective ('a') that is not 'say'.
# The result changes from run to run because the draw is random.
w = get_random_word_by_pos('a', 'say')
w

'bizarre'

In [37]:
# Map data labels to WordNet labels
# NOTE(review): this mapping is disabled and the pairing loop passes the CSV's
# PoS value to WordNet unchanged. That presumably means the CSV already uses
# WordNet tags ('n', 'v', 'a', 'r') rather than 'j' for adjectives -- confirm
# against the data before deleting this cell.
# pos_map = {
#     'n': 'n',
#     'v': 'v',
#     'j': 'a',
#     'r': 'r',
# }

### Pair each target word with a random word of the same PoS

In [73]:
# Seed the RNG here so that re-running this cell regenerates the same pairs,
# regardless of how many random draws earlier cells made.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

generated_words = []

for word, pos in target_words:
    # A missing PoS is read from the CSV as NaN, which is truthy, so a bare
    # `if pos:` would let it through to WordNet. Test for it explicitly.
    has_pos = pd.notna(pos) and bool(pos)
    random_word = get_random_word_by_pos(pos, word) if has_pos else None
    # Always append exactly one row per target word (None when nothing could
    # be drawn) so the result stays aligned with `words` row by row.
    generated_words.append((random_word, pos))

### Store the result

In [79]:
# One row per target word, in the same order as `target_words`;
# `word` is None where no random word was produced.
generated_df = pd.DataFrame(generated_words, columns=['word', 'pos'])
generated_df.head()

Unnamed: 0,word,pos
0,erupt,v
1,overleap,v
2,chaw,v
3,dull,v
4,meet,v


In [83]:
# Put each original lemma next to its random counterpart. Concatenating along
# axis=1 lines rows up by index label.
res_df = (
    pd.concat([words['lemma'], generated_df], axis=1)
    .rename(columns={'lemma': 'original', 'word': 'random'})
)
res_df.head()

Unnamed: 0,original,random,pos
0,say,erupt,v
1,go,overleap,v
2,get,chaw,v
3,know,dull,v
4,would,meet,v


In [84]:
# Write the pairs to the working directory. With the default `index=True` the
# row index is saved as a leading column with an empty header; readers of this
# file should expect it.
res_df.to_csv('random_pairs.csv')