In [1]:
import csv
import inflect
import json
import nltk
import pyinflect
import spacy

from collections import defaultdict
from ordered_set import OrderedSet
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet2021 as wn2
from wordfreq import word_frequency

In [2]:
# Shared inflect engine; is_plural() calls its singular_noun() method.
inflector = inflect.engine()
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced in the cells visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [3]:
def is_plural(word):
    """Report whether the shared inflect engine can singularize `word`.

    Returns True unless `inflector.singular_noun(word)` returns the literal
    False, which is how that call signals that it found no singular form.
    """
    singular_form = inflector.singular_noun(word)
    return singular_form is not False

In [2]:
# Directory holding the gqa_entities CSV files. The value already ends with "/",
# so file names are appended directly to avoid a doubled "//" in the paths.
DIR = "../data/gqa_entities/"
scene_nouns_path = f"{DIR}scene_nouns.csv"
question_nouns_path = f"{DIR}question_nouns.csv"
overlapping_nouns_path = f"{DIR}overlapping_nouns.csv"

In [4]:
def read_csv(path):
    """Read a CSV file that has a header row into a list of row dicts.

    Args:
        path: Location of the CSV file.

    Returns:
        A list with one dict per data row, keyed by the header names, exactly
        as produced by csv.DictReader.
    """
    # newline="" leaves newline handling to the csv module, so line breaks
    # inside quoted fields come back unaltered.
    with open(path, "r", newline="") as f:
        return list(csv.DictReader(f))

def unique_words(lst):
    """Collect the 'word' value of every entry in `lst` into an OrderedSet.

    Args:
        lst: Iterable of mappings that each have a 'word' key
            (e.g. the rows returned by read_csv).

    Returns:
        An OrderedSet built from the 'word' values, in iteration order.
    """
    words = (entry['word'] for entry in lst)
    return OrderedSet(words)

In [6]:
# Load the 'word' column of each noun CSV as an OrderedSet.
scene_nouns = unique_words(read_csv(scene_nouns_path))
question_nouns = unique_words(read_csv(question_nouns_path))
# NOTE(review): overlapping_nouns is not used in the cells visible here.
overlapping_nouns = unique_words(read_csv(overlapping_nouns_path))

# Every noun that occurs in the scene list or in the question list.
unique_nouns = scene_nouns.union(question_nouns)

In [7]:
# Number of distinct nouns across the scene and question lists.
len(unique_nouns)

2041

In [8]:
# Pick a default WordNet sense for every noun: the name of the first synset NLTK
# returns for it, or "NF" when WordNet returns no synsets at all.
senses = []
for word in unique_nouns:
    # Multi-word nouns are looked up with underscores in place of spaces.
    word = word.replace(" ", "_")
    synsets = wn.synsets(word)
    # Test for an empty result explicitly. The previous bare `except:` also turned
    # unrelated failures (e.g. a missing corpus) into silent "NF" entries.
    default_sense_name = synsets[0].name() if synsets else "NF"

    senses.append((word, default_sense_name))

In [None]:
# # save senses to dir
# with open(f"{DIR}/noun-senses.csv", "w") as f: 
#     writer = csv.writer(f)
#     writer.writerow(["noun", "sense"])
#     writer.writerows(senses)

In [394]:
# read annotated csv from gdrive
annotated_senses_raw = read_csv(f"{DIR}/noun-senses-annotated.csv")
annotated_senses, manual, leftover = [], [], []

# Route every annotated row into exactly one of three buckets:
#   leftover         - 'marked' is anything other than '' or '4'
#   annotated_senses - (noun, sense) pairs, using the replacement sense when one applies
#   manual           - no applicable replacement and the notes mention "manual" or "toy"
for entry in annotated_senses_raw:
    word = entry['noun']
    sense = entry['sense']

    if entry['marked'] not in ('', '4'):
        leftover.append(entry)
        continue

    if entry['replacement'] != '' and entry['marked'] != '4':
        # A "wn2" note overrides the replacement text with the name of the first
        # wordnet2021 synset found for the noun.
        if "wn2" in entry['notes']:
            sense = wn2.synsets(word)[0].name()
        else:
            sense = entry['replacement']
        annotated_senses.append((word, sense))
    elif "manual" in entry['notes'] or "toy" in entry['notes']:
        manual.append(entry)
    else:
        annotated_senses.append((word, sense))

In [395]:
# Sanity check: the three buckets together should account for as many rows as `senses`.
len(leftover) + len(manual) + len(annotated_senses), len(senses)

(2041, 2041)

In [396]:
'''
Current filtration pipeline:
1. Get the hypernym path for each noun sense.
2. Count how often each hypernym appears across all paths, as a fraction of the annotated nouns.
3. Remove the overly common hypernyms (after manual inspection),
    e.g., entity, whole, physical entity, artifact, instrumentality.
4. Sample the remaining hypernyms based on word frequency
    (at most 4 items per chain, counting the noun itself).
'''
def hypernym_path_lemmas(sense):
    """Return the lemmas along the first hypernym path of a WordNet sense.

    The sense is looked up in the wordnet2021 corpus (`wn2`) first; if that
    lookup fails, the default WordNet corpus (`wn`) is used instead.

    Args:
        sense: Synset name, e.g. "chair.n.01".

    Returns:
        A list of strings, one per synset on the first path returned by
        `hypernym_paths()`. Each string is that synset's first lemma name
        with underscores replaced by spaces.

    Raises:
        Whatever `wn.synset` raises when the sense cannot be resolved in
        either corpus.
    """
    try:
        synset = wn2.synset(sense)
    except Exception:
        # Previously a bare `except:`; narrowed so KeyboardInterrupt and
        # SystemExit are no longer swallowed by the fallback.
        synset = wn.synset(sense)
    # Only the first hypernym path is used, even if the synset has several.
    hp = synset.hypernym_paths()[0]
    return [s.lemma_names()[0].replace("_", " ") for s in hp]

# # e.g.
# hypernym_path_lemmas("chair.n.01")

In [397]:
# For each hypernym lemma, accumulate the fraction of annotated nouns whose
# hypernym path contains it.
lemma_counts = defaultdict(float)
# Loop-invariant contribution of one noun; guarded so an empty list cannot divide by zero.
lemma_weight = 1 / len(annotated_senses) if annotated_senses else 0.0
for word, sense in annotated_senses:
    try:
        hypernym_lemmas = hypernym_path_lemmas(sense)
    except Exception:
        # Report the unresolved sense and skip it. Without `continue`, the loop below
        # would re-count the previous noun's lemmas (or raise NameError on the
        # first iteration).
        print(word, sense)
        continue
    for hl in hypernym_lemmas:
        lemma_counts[hl] += lemma_weight
lemma_counts = dict(lemma_counts)

In [399]:
# Ten hypernym lemmas with the highest share, largest first.
sorted(lemma_counts.items(), key=lambda x: x[1], reverse=True)[:10]

[('entity', 0.9975845410628059),
 ('physical entity', 0.9243156199677975),
 ('object', 0.5636070853462178),
 ('whole', 0.5346215780998409),
 ('artifact', 0.49355877616747357),
 ('instrumentality', 0.24879227053140165),
 ('food', 0.21175523349436445),
 ('causal agent', 0.1698872785829311),
 ('matter', 0.16908212560386507),
 ('organism', 0.166666666666667)]

In [434]:
# Hypernym lemmas that are excluded when candidate hypernyms are collected for sampling.
avoid = ["entity", "physical entity", "whole", "artifact", "instrumentality", "organism", 
         "matter", "solid", "abstraction", "commodity", "plant part", "foodstuff", "plant organ",
         "furnishing", "communication", "flavorer", "part", "act", "activity", "place of business", 
         "collection", "mercantile establishment", "unit", "visual communication", "binary compound", 
         "administrative district", "animal material", "causal agent", "phenomenon", "compound", 
         "natural phenomenon", "material", "natural object", "reproductive structure", "object", 
         "edible fruit", "course", "substance", "living thing", "nutriment", "consumer goods",
         "chordate", "big cat", "self-propelled vehicle", "physical phenomenon", "process",
        "obstruction", "angiosperm", "vascular plant", "way", "craft", "conveyance"]

# Sample up to N_SAMPLES hypernyms per noun based on word frequency, then re-sort
# them by rank in the hierarchy to obtain the final hypernymy chain.

N_SAMPLES = 3
sampled_paths = {}

for word, sense in annotated_senses:
    word = word.replace("_", " ")
    try:
        hypernym_lemmas = list(OrderedSet(hypernym_path_lemmas(sense)))
    except Exception:
        # Report and skip. Falling through would pair this noun with the previous
        # noun's hypernyms (or raise NameError on the first iteration).
        print(word, sense)
        continue

    # Candidates are (lemma, position in path, word frequency). The last path
    # element (normally the sense's own lemma) is left out, as is anything in `avoid`.
    freq_ranks = [
        (hl, i, word_frequency(hl, 'en'))
        for i, hl in enumerate(hypernym_lemmas[:-1])
        if hl not in avoid
    ]

    # Keep the N_SAMPLES most frequent candidates; slicing copes with shorter lists.
    sampled = sorted(freq_ranks, key=lambda x: x[-1], reverse=True)[:N_SAMPLES]

    # Put the survivors back in hierarchy order.
    rank_sorted = sorted(sampled, key=lambda x: x[1])

    # The chain starts at the noun, followed by the kept hypernyms in reverse path order.
    sampled_paths[word] = [word] + [hypernym for hypernym, _, _ in reversed(rank_sorted)]

In [435]:
# Spot-check the sampled chain for one noun.
sampled_paths['boat']

['boat', 'vessel', 'vehicle']

In [515]:
# Inspect all wordnet2021 hypernym paths of a single sense.
wn2.synset('ring.n.08').hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('decoration.n.01'),
  Synset('adornment.n.01'),
  Synset('jewelry.n.01'),
  Synset('ring.n.08')]]

In [875]:
# Inspect the lemma path returned for a single sense.
hypernym_path_lemmas("basketball.n.02")

['entity',
 'physical entity',
 'object',
 'whole',
 'artifact',
 'instrumentality',
 'equipment',
 'game equipment',
 'ball',
 'basketball']

In [561]:
# Exploration: print each lemma on the first hypernym path of wolf.n.03 and record
# its word_frequency(..., "en") value in `freqs`, keyed by lemma.
freqs = defaultdict(float)
for s in wn2.synset("wolf.n.03").hypernym_paths()[0]:
    # First lemma name of the synset, with underscores turned into spaces.
    ln = " ".join(s.lemma_names()[0].split("_"))
    freqs[ln] = word_frequency(ln, "en")
    print(ln)
# sorted(freqs.items(), key=lambda x: x[1], reverse=True)

entity
physical entity
causal agent
organism
animal
chordate
vertebrate
mammal
placental
carnivore
canine
wolf


In [876]:
# save to csv
# with open(f"{DIR}/noun-hypernymy-paths.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerow(['noun', 'hyp-1', 'hyp-2', 'hyp-3'])
#     for word, chain in sampled_paths.items():
#         if len(chain) > 1:
#             writer.writerow(chain)

In [5]:
# Read the annotated hypernymy-path CSV; read_csv returns one dict per row.
annotated_hypernym_paths_raw = read_csv(f"{DIR}/noun-hypernymy-paths-annotated.csv")

In [6]:
# Map each unflagged noun to its hypernyms, taken from the columns whose name
# contains 'hyp', in column order.
annotated_hypernym_paths = {}
for entry in annotated_hypernym_paths_raw:
    # Rows flagged '1' are excluded from the output.
    if entry['flag'] == '1':
        continue
    # csv.DictReader yields None values for short rows and a None key for surplus
    # fields; skip both (and empty cells) so only real hypernym strings are kept.
    annotated_hypernym_paths[entry['noun']] = [
        v for k, v in entry.items()
        if k is not None and 'hyp' in k and v
    ]

In [7]:
# Write annotated_hypernym_paths (noun -> list of hypernyms) to noun-hypernyms.json.
with open(f"{DIR}/noun-hypernyms.json", "w") as f:
    json.dump(annotated_hypernym_paths, f, indent=2)