# WordNet Selector

In [96]:
from pathlib import Path
from itertools import chain
from more_itertools import collapse
import pandas as pd
from nltk.corpus import wordnet as wn
import re

In [3]:
dataset_v1 = Path('/s/babbage/b/nobackup/nblancha/public-datasets/bold5000')
dataset_v2 = Path('/s/babbage/b/nobackup/nblancha/public-datasets/bold5000r2')

In [4]:
def get_dataset(image_name):
    if re.match(r'^n[0-9]+', image_name):
        return 'ImageNet'
    elif re.match(r'^COCO', image_name):
        return 'COCO'
    else:
        return 'Scene'

def get_synset(image_name):
    offset = re.search(r'(?<=^n)[0-9]+', image_name).group(0)
    synset = wn.synset_from_pos_and_offset('n', int(offset))
    return synset.name()

def get_hypernym_paths(synset):
    paths = wn.synset(synset).hypernym_paths()
    # Convert to list of strings
    paths = [[hypernym.name() for hypernym in path] for path in paths]
    return paths

def get_ranks(paths):
    # Deep copy of paths
    paths = [path.copy() for path in paths]
    path_sets = []
    while True:
        rank = set([path.pop(0) for path in paths if path])
        
        if rank:
            path_sets.append(rank)
        else:
            break

    return path_sets

In [5]:
df = pd.read_pickle(dataset_v2 / 'CSI1_dataframe.pkl')
df = df[df.dataset == 'ImageNet'].copy()
df

Unnamed: 0,image,dataset
0,n01930112_19568.JPEG,ImageNet
1,n03733281_29214.JPEG,ImageNet
2,n07695742_5848.JPEG,ImageNet
5,n01917289_1429.JPEG,ImageNet
6,n02108551_26574.JPEG,ImageNet
...,...,...
5243,n09421951_5433.JPEG,ImageNet
5245,n02018207_3567.JPEG,ImageNet
5247,n01692333_12353.JPEG,ImageNet
5248,n03240683_13368.JPEG,ImageNet


In [10]:
# Process synsets
df['synset'] = df.image.map(get_synset)
df['hypernym_paths'] = df.synset.map(get_hypernym_paths)
df['num_paths'] = df.hypernym_paths.map(lambda p: len(p))
df['ranks'] = df.hypernym_paths.map(get_ranks)
df

Unnamed: 0,image,dataset,synset,hypernym_paths,num_paths,ranks
0,n01930112_19568.JPEG,ImageNet,nematode.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",1,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
1,n03733281_29214.JPEG,ImageNet,maze.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",1,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
2,n07695742_5848.JPEG,ImageNet,pretzel.n.01,"[[entity.n.01, physical_entity.n.01, matter.n....",2,"[{entity.n.01}, {physical_entity.n.01}, {matte..."
5,n01917289_1429.JPEG,ImageNet,brain_coral.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",1,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
6,n02108551_26574.JPEG,ImageNet,tibetan_mastiff.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",2,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
...,...,...,...,...,...,...
5243,n09421951_5433.JPEG,ImageNet,sandbar.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",2,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
5245,n02018207_3567.JPEG,ImageNet,american_coot.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",1,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
5247,n01692333_12353.JPEG,ImageNet,gila_monster.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",1,"[{entity.n.01}, {physical_entity.n.01}, {objec..."
5248,n03240683_13368.JPEG,ImageNet,drilling_platform.n.01,"[[entity.n.01, physical_entity.n.01, object.n....",1,"[{entity.n.01}, {physical_entity.n.01}, {objec..."


In [7]:
len(images)

1916

## New Sorting Approach

In [6]:
df['synset'] = df.image.map(get_synset)
df

Unnamed: 0,image,dataset,synset
0,n01930112_19568.JPEG,ImageNet,nematode.n.01
1,n03733281_29214.JPEG,ImageNet,maze.n.01
2,n07695742_5848.JPEG,ImageNet,pretzel.n.01
5,n01917289_1429.JPEG,ImageNet,brain_coral.n.01
6,n02108551_26574.JPEG,ImageNet,tibetan_mastiff.n.01
...,...,...,...
5243,n09421951_5433.JPEG,ImageNet,sandbar.n.01
5245,n02018207_3567.JPEG,ImageNet,american_coot.n.01
5247,n01692333_12353.JPEG,ImageNet,gila_monster.n.01
5248,n03240683_13368.JPEG,ImageNet,drilling_platform.n.01


In [8]:
wn.synsets('nematode')

[Synset('nematode.n.01')]

In [9]:
ss_nematode = wn.synsets('nematode')[0]

In [19]:
ss_nematode.lemma_names()

['nematode', 'nematode_worm', 'roundworm']

In [22]:
ss = wn.synsets('traffic_light')[0]

In [23]:
ss.lemmas()

[Lemma('traffic_light.n.01.traffic_light'),
 Lemma('traffic_light.n.01.traffic_signal'),
 Lemma('traffic_light.n.01.stoplight')]

In [26]:
wn.synsets('stoplight')[0].lemmas()

[Lemma('stoplight.n.01.stoplight'), Lemma('stoplight.n.01.brake_light')]

In [27]:
ss.hypernyms()

[Synset('light.n.14')]

In [30]:
list(ss.closure(lambda s: s.hypernyms()))

[Synset('light.n.14'),
 Synset('visual_signal.n.01'),
 Synset('signal.n.01'),
 Synset('communication.n.02'),
 Synset('abstraction.n.06'),
 Synset('entity.n.01')]

In [31]:
list(set([w for s in ss.closure(lambda s:s.hypernyms()) for w in s.lemma_names()]))

['abstraction',
 'signal',
 'sign',
 'signaling',
 'visual_signal',
 'entity',
 'light',
 'abstract_entity',
 'communication']

In [129]:
def categorize(synset):
    category = None
    ss = wn.synset(synset)
    hypernyms = list(set([w for s in ss.closure(lambda s:s.hypernyms()) for w in s.lemma_names()]))
    
    if 'artifact' in hypernyms:
        category = 'artifact'
        
    # if 'person' in hypernyms:
    #     category = 'person'
    
    if 'organism' in hypernyms:
        if category:
            print(f'ERROR: Collision, {ss}, {category}, organism')
        else:
            category = 'organism'
            
    if 'food' in hypernyms:
        if category:
            print(f'ERROR: Collision, {ss}, {category}, food')
        else:
            category = 'food'
            
    if 'geological_formation' in hypernyms:
        if category:
            print(f'ERROR: Collision, {ss}, {category}, place')
        else:
            category = 'place'
    
    # Special cases
    if not category:
        # Plants that didn't fall under food
        if 'plant_part' in hypernyms:
            category = 'organism'
        else:
        # traffic_light, street_sign, bubble, toilet_tissue
            category = 'artifact'
            
    return category

In [130]:
df['category'] = df.synset.map(categorize)

  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth

In [131]:
df

Unnamed: 0,image,dataset,synset,category
0,n01930112_19568.JPEG,ImageNet,nematode.n.01,organism
1,n03733281_29214.JPEG,ImageNet,maze.n.01,artifact
2,n07695742_5848.JPEG,ImageNet,pretzel.n.01,food
5,n01917289_1429.JPEG,ImageNet,brain_coral.n.01,organism
6,n02108551_26574.JPEG,ImageNet,tibetan_mastiff.n.01,organism
...,...,...,...,...
5243,n09421951_5433.JPEG,ImageNet,sandbar.n.01,place
5245,n02018207_3567.JPEG,ImageNet,american_coot.n.01,organism
5247,n01692333_12353.JPEG,ImageNet,gila_monster.n.01,organism
5248,n03240683_13368.JPEG,ImageNet,drilling_platform.n.01,artifact


In [132]:
df[df.category.isna()]

Unnamed: 0,image,dataset,synset,category


In [133]:
df.category.value_counts()

artifact    1109
organism     823
food          99
place         20
Name: category, dtype: int64

In [118]:
df[df.category == 'person']

Unnamed: 0,image,dataset,synset,category
2213,n09835506_3109.JPEG,ImageNet,ballplayer.n.01,person
2843,n09835506_1446.JPEG,ImageNet,ballplayer.n.01,person
3221,n10565667_5063.JPEG,ImageNet,scuba_diver.n.01,person
4802,n10148035_8180.JPEG,ImageNet,groom.n.01,person
4867,n10148035_12439.JPEG,ImageNet,groom.n.01,person
5016,n10565667_501.JPEG,ImageNet,scuba_diver.n.01,person


In [70]:
list(set([w for s in wn.synset('plate.n.08').closure(lambda s:s.hypernyms()) for w in s.lemma_names()]))

['entree',
 'aliment',
 'nutrition',
 'physical_entity',
 'food',
 'course',
 'entity',
 'alimentation',
 'substance',
 'matter',
 'main_course',
 'nutrient',
 'victuals',
 'sustenance',
 'nourishment',
 'nutriment']

In [61]:
ss.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('abstraction.n.06'),
  Synset('communication.n.02'),
  Synset('signal.n.01'),
  Synset('visual_signal.n.01'),
  Synset('light.n.14'),
  Synset('traffic_light.n.01')]]

In [89]:
list(set(chain.from_iterable(wn.synset('acorn.n.01').hypernym_paths())))

[Synset('plant_organ.n.01'),
 Synset('entity.n.01'),
 Synset('acorn.n.01'),
 Synset('reproductive_structure.n.01'),
 Synset('physical_entity.n.01'),
 Synset('plant_part.n.01'),
 Synset('natural_object.n.01'),
 Synset('object.n.01'),
 Synset('fruit.n.01'),
 Synset('whole.n.02')]

In [91]:
list(set(chain.from_iterable([ss.hypernym_paths() for ss in wn.synsets('acorn')])))

TypeError: unhashable type: 'list'

In [92]:
[ss.hypernym_paths() for ss in wn.synsets('acorn')]

[[[Synset('entity.n.01'),
   Synset('physical_entity.n.01'),
   Synset('object.n.01'),
   Synset('whole.n.02'),
   Synset('natural_object.n.01'),
   Synset('plant_part.n.01'),
   Synset('plant_organ.n.01'),
   Synset('reproductive_structure.n.01'),
   Synset('fruit.n.01'),
   Synset('acorn.n.01')]]]

In [100]:
list(collapse([ss.hypernym_paths() for ss in wn.synsets('groom')]))

[Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('causal_agent.n.01'),
 Synset('person.n.01'),
 Synset('peer.n.01'),
 Synset('associate.n.01'),
 Synset('participant.n.01'),
 Synset('groom.n.01'),
 Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('living_thing.n.01'),
 Synset('organism.n.01'),
 Synset('person.n.01'),
 Synset('peer.n.01'),
 Synset('associate.n.01'),
 Synset('participant.n.01'),
 Synset('groom.n.01'),
 Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('causal_agent.n.01'),
 Synset('person.n.01'),
 Synset('worker.n.01'),
 Synset('employee.n.01'),
 Synset('workman.n.01'),
 Synset('laborer.n.01'),
 Synset('hired_hand.n.01'),
 Synset('stableman.n.01'),
 Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('living_thing.n.01'),
 Synset('organism.n.01'),
 Synset('person.n.01'),
 Synset('worker.n.01'),
 Synset('employee.n.01'),
 Sy

In [102]:
wn.synset('buckeye.n.01').hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('natural_object.n.01'),
  Synset('plant_part.n.01'),
  Synset('plant_organ.n.01'),
  Synset('reproductive_structure.n.01'),
  Synset('fruit.n.01'),
  Synset('seed.n.01'),
  Synset('buckeye.n.01')]]

In [108]:
wn.synset_from_pos_and_offset('n', 12144580)

Synset('corn.n.02')

In [106]:
wn.synset_from_pos_and_offset('n', 1440764).hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('living_thing.n.01'),
  Synset('organism.n.01'),
  Synset('animal.n.01'),
  Synset('chordate.n.01'),
  Synset('vertebrate.n.01'),
  Synset('aquatic_vertebrate.n.01'),
  Synset('fish.n.01'),
  Synset('bony_fish.n.01'),
  Synset('teleost_fish.n.01'),
  Synset('soft-finned_fish.n.01'),
  Synset('cypriniform_fish.n.01'),
  Synset('cyprinid.n.01'),
  Synset('tench.n.01')]]

In [109]:
wn.synset_from_pos_and_offset('n', 13133613)

Synset('ear.n.05')

In [110]:
wn.synset_from_pos_and_offset('n', 13133613).hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('natural_object.n.01'),
  Synset('plant_part.n.01'),
  Synset('plant_organ.n.01'),
  Synset('reproductive_structure.n.01'),
  Synset('fruit.n.01'),
  Synset('ear.n.05')]]

In [111]:
wn.synset_from_pos_and_offset('n', 7615774)

Synset('ice_lolly.n.01')