In [1]:
# %matplotlib inline
import numpy as np
# import xml.etree.cElementTree as etree
from lxml import etree
from collections import Counter
import pprint

In [2]:
# Parse structure_released.xml. The hierarchy root used by the rest of the
# notebook is the second child of the document's root element.
tree = etree.parse('structure_released.xml')
struct_root = tree.getroot()[1]

# One summary line per top-level branch: wnid, words, number of direct children.
for child in struct_root:
    fields = (child.get('wnid'), child.get('words'), len(child))
    print('%s: %s(%d childs)' % fields)

# Same summary for the direct children of the last top-level branch.
print("\nmisc")
misc = struct_root[-1]
for child in misc:
    fields = (child.get('wnid'), child.get('words'), len(child))
    print('%s: %s(%d childs)' % fields)

n00017222: plant, flora, plant life(32 childs)
n09287968: geological formation, formation(28 childs)
n00019128: natural object(20 childs)
n00523513: sport, athletics(20 childs)
n00021939: artifact, artefact(45 childs)
n12992868: fungus(68 childs)
n00007846: person, individual, someone, somebody, mortal, soul(402 childs)
n00015388: animal, animate being, beast, brute, creature, fauna(47 childs)
fa11misc: Misc(2350 childs)

misc
n07708124: julienne, julienne vegetable(0 childs)
n07708398: raw vegetable, rabbit food(0 childs)
n07709046: pulse(0 childs)
n07725158: goa bean(0 childs)
n07727048: kidney bean(0 childs)
n07727140: navy bean, pea bean, white bean(0 childs)
n07727252: pinto bean(0 childs)
n07727377: frijole(0 childs)
n07727458: black bean, turtle bean(0 childs)
n07728053: snap bean, snap(0 childs)
n07728181: string bean(0 childs)
n07728284: Kentucky wonder, Kentucky wonder bean(0 childs)
n07728391: scarlet runner, scarlet runner bean, runner bean, English runner bean(0 childs)
n0

In [3]:
def get_wnids_recursive(node, attr='wnid'):
    """Collect one attribute from ``node`` and all of its descendants.

    The traversal is pre-order: the node's own value comes first, followed by
    the values of each child subtree in document order.

    Args:
        node: An element supporting ``node.get(name)`` and iteration over its
            direct children.
        attr: Name of the attribute to read from every visited node.

    Returns:
        A list with one entry per visited node; an entry is ``None`` when the
        node lacks the attribute (that is what ``node.get`` returns).
    """
    ret = [node.get(attr)]
    for child in node:
        # Forward ``attr`` so that a non-default attribute is read on the
        # whole subtree, not just on the starting node.
        ret.extend(get_wnids_recursive(child, attr))
    return ret

# wnids of the major branches: every top-level child except the last one.
# Slicing with [:-1] (rather than a fixed count) keeps this the exact
# complement of the misc branch taken with [-1] below.
wordnet_wnids = []
for child in struct_root[:-1]:
    wordnet_wnids.extend(get_wnids_recursive(child))
wordnet_wnid_dict = dict.fromkeys(wordnet_wnids, True)
wordnet_wnid_set = set(wordnet_wnids)

# wnids found under the last top-level child (the misc branch). The branch
# node itself is not included, only the subtrees of its children.
misc_wnids = []
for child in struct_root[-1]:
    misc_wnids.extend(get_wnids_recursive(child))
misc_wnid_dict = dict.fromkeys(misc_wnids, True)
misc_wnid_set = set(misc_wnids)

# Everything collected above, duplicates preserved in the list.
struct_wnids = wordnet_wnids + misc_wnids
struct_wnid_dict = dict.fromkeys(struct_wnids, True)
struct_wnid_set = set(struct_wnids)

print('in wordnet')
print('Total  : %d nodes(%d unique)' % (len(struct_wnids), len(struct_wnid_set)))
print('WordNet: %d nodes(%d unique)' % (len(wordnet_wnids), len(wordnet_wnid_set)))
print('MISC   : %d nodes(%d unique)' % (len(misc_wnids), len(misc_wnid_set)))

in wordnet
Total  : 60940 nodes(32295 unique)
WordNet: 30881 nodes(27713 unique)
MISC   : 30059 nodes(19876 unique)


In [4]:
# Read 22k classes of ImageNet: the wnid is the first whitespace-separated
# token of each line. Blank lines are skipped so they cannot raise IndexError.
with open('22k_label.txt') as fd:
    imagenet_wnids = [line.split()[0] for line in fd if line.strip()]
imagenet_wnid_dict = {wnid:True for wnid in imagenet_wnids}
imagenet_wnid_set = set(imagenet_wnids)
print('22k_label   : %d nodes(%d unique)' % (len(imagenet_wnids), len(imagenet_wnid_set)))

22k_label   : 21541 nodes(21541 unique)


In [5]:
# Make string output of the tree
def generate_strs_recursive(node, parent_str):
    """Flatten a subtree into ``"<dotted wnid path>,<label>"`` rows.

    Args:
        node: Element with ``wnid`` and ``words`` attributes; iterating it
            yields its direct children.
        parent_str: Prefix prepended to this node's wnid. Callers pass ``''``
            for the top node; recursion passes the parent's path plus ``'.'``.

    Returns:
        A list of rows in pre-order (node first, then each child subtree).
        The label is the part of ``words`` before the first ``', '``.
    """
    label = node.get('words').split(', ')[0]
    node_str = parent_str + node.get('wnid')
    child_prefix = node_str + '.'

    rows = [node_str + ',' + label]
    for child in node:
        rows += generate_strs_recursive(child, child_prefix)
    return rows

# Flatten the whole hierarchy, starting at struct_root with an empty prefix.
tree_strs = generate_strs_recursive(struct_root, '')

print('Total %d nodes in the tree structure of wordnet' % (len(tree_strs)))

# Dump the rows to flare.csv under an "id,value" header line.
csv_body = '\n'.join(tree_strs)
with open('flare.csv', 'w') as fd:
    fd.write('id,value\n')
    fd.write(csv_body)

Total 60942 nodes in the tree structure of wordnet


In [6]:
# Check if all 22k classes are in the structure.
# print(...) with a single argument behaves the same on Python 2 and 3,
# matching the call form used by the other cells.
print('Check whether 22k classes included in structure_released.xml')
cnt = 0
for wnid in imagenet_wnids:
    if wnid not in struct_wnid_dict:
        cnt += 1
print('done!\n%d wnid not included' % cnt)

# Check if all 22k classes are in the wordnet nodes(not for MISC nodes).
print('Check whether 22k classes included in the wordnet nodes(not for MISC nodes)')
cnt = 0
for wnid in imagenet_wnids:
    if wnid not in wordnet_wnid_dict:
        cnt += 1
print('done!\n%d wnid not included' % cnt)

Check whether 22k classes included in structure_released.xml
done!
0 wnid not included
Check whether 22k classes included in the wordnet nodes(not for MISC nodes)
done!
2325 wnid not included


In [7]:
# Occurrences of 22k wnids across all collected tree nodes (struct_wnids).
struct_imagenet_wnids = []
for wnid in struct_wnids:
    if wnid in imagenet_wnid_dict:
        struct_imagenet_wnids.append(wnid)

# wnid -> number of times it appears.
occurance_counter = Counter(struct_imagenet_wnids)
# number of appearances -> how many wnids appear that often.
occur_stat_counter = Counter(occurance_counter.values())
print('number of occurance in whole wordnet')
occur_stat_counter

number of occurance in whole wordnet


Counter({1: 7980,
         2: 10795,
         3: 452,
         4: 1044,
         5: 482,
         6: 463,
         7: 234,
         8: 67,
         9: 12,
         10: 4,
         11: 7,
         12: 1})

In [8]:
# Occurrences of 22k wnids within the major (non-misc) branches only (wordnet_wnids).
wordnet_imagenet_wnids = []
for wnid in wordnet_wnids:
    if wnid in imagenet_wnid_dict:
        wordnet_imagenet_wnids.append(wnid)

# wnid -> number of times it appears.
occurance_counter = Counter(wordnet_imagenet_wnids)
# number of appearances -> how many wnids appear that often.
occur_stat_counter = Counter(occurance_counter.values())
print('number of occurance in major branches of wordnet')
occur_stat_counter

number of occurance in major branches of wordnet


Counter({1: 17009, 2: 2090, 3: 85, 4: 30, 5: 2})

In [9]:
# 22k wnids that occur both in the major branches and in the misc branch.
# The intersection is computed once, outside the comprehension, instead of
# being rebuilt for every wnid tested.
wnids_in_both_wordnet_and_misc = wordnet_wnid_set.intersection(misc_wnid_set)
imagenet_wnid_in_both_wordnet_and_misc = [wnid for wnid in imagenet_wnid_set if wnid in wnids_in_both_wordnet_and_misc]
print('%d wnids of imagenet are in both major branches and misc' % len(set(imagenet_wnid_in_both_wordnet_and_misc)))

10176 wnids of imagenet are in both major branches and misc


In [29]:
# Number of wnids shared by the major branches and the misc branch (both operands are sets).
print('size of intersection of major branches and misc: %d' % len(wordnet_wnid_set & misc_wnid_set))

size of intersection of major branches and misc: 15294


In [36]:
# Occurrences of 22k wnids within the misc branch only (misc_wnids).
misc_imagenet_wnids = []
for wnid in misc_wnids:
    if wnid in imagenet_wnid_dict:
        misc_imagenet_wnids.append(wnid)

# wnid -> number of times it appears.
occurance_counter = Counter(misc_imagenet_wnids)
# number of appearances -> how many wnids appear that often.
occur_stat_counter = Counter(occurance_counter.values())
occur_stat_counter

Counter({1: 9759,
         2: 862,
         3: 261,
         4: 407,
         5: 471,
         6: 432,
         7: 231,
         8: 54,
         9: 12,
         10: 4,
         11: 7,
         12: 1})

In [44]:
# wnids counted exactly 11 times in occurance_counter.
# NOTE(review): occurance_counter is reassigned by several cells; this reads
# whichever one ran last (presumably the misc tally) - confirm.
[wnid for wnid, times in occurance_counter.items() if times == 11]

['n07894102',
 'n07875835',
 'n07894551',
 'n07909593',
 'n07895435',
 'n07894703',
 'n07894298']

In [83]:
# Per-wnid tally of leaf vs. non-leaf appearances in the 22k tree.
# Maps wnid -> [times seen without children, times seen with children].
leaf_occurance_dict = {}
def count_leaf_nodes(node, imagenet_wnid_dict, leaf_occurance_dict):
    """Update ``leaf_occurance_dict`` for ``node`` and all of its descendants.

    Children are processed before the node itself (post-order). Only wnids
    present in ``imagenet_wnid_dict`` are tallied.

    Args:
        node: Element with a ``wnid`` attribute; iterating it yields children
            and ``len(node)`` is its number of direct children.
        imagenet_wnid_dict: Container used for ``wnid in ...`` membership tests.
        leaf_occurance_dict: Dict updated in place; each value is a two-item
            list ``[leaf_count, nonleaf_count]``.

    Returns:
        None. Results are accumulated in ``leaf_occurance_dict``.
    """
    for child in node:
        count_leaf_nodes(child, imagenet_wnid_dict, leaf_occurance_dict)

    wnid = node.get('wnid')
    if wnid not in imagenet_wnid_dict:
        return

    counts = leaf_occurance_dict.setdefault(wnid, [0, 0])
    # Slot 0 counts childless appearances, slot 1 appearances with children.
    slot = 0 if len(node) == 0 else 1
    counts[slot] += 1

# Fill leaf_occurance_dict from the whole tree, then report how many distinct
# 22k wnids were seen at least once as a leaf / as a non-leaf node.
count_leaf_nodes(struct_root, imagenet_wnid_dict, leaf_occurance_dict)
num_leaf_wnids = sum(1 for counts in leaf_occurance_dict.values() if counts[0] > 0)
num_nonleaf_wnids = sum(1 for counts in leaf_occurance_dict.values() if counts[1] > 0)
print('Number of leaf 22k wnids    : %d' % num_leaf_wnids)
print('Number of non-leaf 22k wnids: %d' % num_nonleaf_wnids)

Number of leaf 22k wnids    : 16644
Number of non-leaf 22k wnids: 5019


In [24]:
# Number of distinct 22k wnids seen as a leaf and/or as a non-leaf node.
# NOTE(review): leaf_counter / nonleaf_counter are not defined by any visible
# cell, so they are rebuilt here from leaf_occurance_dict, whose values are
# [leaf_count, nonleaf_count] pairs. Presumably this matches what the
# missing counters held - confirm.
leaf_counter = Counter({wnid: counts[0] for wnid, counts in leaf_occurance_dict.items() if counts[0] > 0})
nonleaf_counter = Counter({wnid: counts[1] for wnid, counts in leaf_occurance_dict.items() if counts[1] > 0})
total_counter = leaf_counter + nonleaf_counter
len(total_counter)

21541

In [91]:
# print(...) with a single argument behaves the same on Python 2 and 3.
print('22k wnids which appear both as leaf node and non-leaf node')
# Keep only wnids that have at least one leaf and one non-leaf appearance.
[(wnid, (leaf_cnt, nonleaf_cnt)) for (wnid, (leaf_cnt, nonleaf_cnt)) in leaf_occurance_dict.items()
         if (leaf_cnt > 0 and nonleaf_cnt > 0)]

22k wnids which appear both as leaf node and non-leaf node


[('n02566109', (1, 1)),
 ('n02517938', (1, 1)),
 ('n02656670', (1, 1)),
 ('n02102605', (1, 2)),
 ('n01715888', (1, 1)),
 ('n01444783', (1, 1)),
 ('n01755740', (1, 1)),
 ('n02568959', (1, 2)),
 ('n01712752', (1, 1)),
 ('n02104523', (1, 3)),
 ('n02650050', (1, 1)),
 ('n02522399', (1, 1)),
 ('n02109811', (1, 3)),
 ('n02621258', (1, 1)),
 ('n02376918', (1, 3)),
 ('n02653786', (1, 1)),
 ('n02101861', (1, 2)),
 ('n02586543', (1, 3)),
 ('n01854415', (1, 1)),
 ('n01493541', (1, 1)),
 ('n02433925', (1, 1)),
 ('n02590495', (1, 1)),
 ('n01446589', (1, 1)),
 ('n01448951', (1, 1)),
 ('n02127052', (1, 1)),
 ('n02590702', (1, 1)),
 ('n02610980', (1, 1)),
 ('n02086346', (1, 3)),
 ('n02532028', (1, 3)),
 ('n02122725', (1, 3)),
 ('n02068974', (1, 1)),
 ('n02419796', (1, 1)),
 ('n02092468', (1, 3)),
 ('n02382948', (1, 1)),
 ('n02537085', (1, 3)),
 ('n02561108', (1, 1)),
 ('n02386310', (1, 1)),
 ('n02432511', (1, 1)),
 ('n02533209', (1, 1)),
 ('n01484097', (1, 1)),
 ('n02607862', (1, 1)),
 ('n02390454', (

In [89]:
def path_string(node):
    """Render the attributes of ``node`` and its ancestors, outermost first.

    Args:
        node: Object exposing ``attrib`` and ``getparent()``.

    Returns:
        ``str(attrib)`` of each element on the path, one per line, ending
        with ``node`` itself. The upward walk stops when there is no parent
        or when the parent's ``attrib`` is None.
    """
    lines = [str(node.attrib)]
    ancestor = node.getparent()
    while ancestor is not None and ancestor.attrib is not None:
        lines.append(str(ancestor.attrib))
        ancestor = ancestor.getparent()
    # Collected from the node upwards; flip so the outermost ancestor is first.
    lines.reverse()
    return '\n'.join(lines)

In [88]:
# Print every occurrence of synset n02566109 together with its ancestor chain.
# print(...) with a single argument behaves the same on Python 2 and 3.
temp = list(struct_root.xpath('//synset[@wnid="n02566109"]'))
for node in temp:
    print('%d childs' % len(node) if len(node) > 0 else 'Leaf node')
    print(path_string(node))
    print('\n\n\n')

4 childs
{}
{'wnid': 'fall11', 'gloss': 'ImageNet 2011 Fall Release.', 'words': 'ImageNet 2011 Fall Release'}
{'wnid': 'n00015388', 'gloss': 'a living organism characterized by voluntary movement', 'words': 'animal, animate being, beast, brute, creature, fauna'}
{'wnid': 'n01466257', 'gloss': 'any animal of the phylum Chordata having a notochord or spinal column', 'words': 'chordate'}
{'wnid': 'n01471682', 'gloss': 'animals having a bony or cartilaginous skeleton with a segmented spinal column and a large brain enclosed in a skull or cranium', 'words': 'vertebrate, craniate'}
{'wnid': 'n01473806', 'gloss': 'animal living wholly or chiefly in or on water', 'words': 'aquatic vertebrate'}
{'wnid': 'n02512053', 'gloss': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills; "the shark is a large fish"; "in the living room there was a tank of colorful fish"', 'words': 'fish'}
{'wnid': 'n02514825', 'gloss': 'any fish of the class Osteichthy

In [92]:
# Print every occurrence of synset n02102605 together with its ancestor chain.
# print(...) with a single argument behaves the same on Python 2 and 3.
temp = list(struct_root.xpath('//synset[@wnid="n02102605"]'))
for node in temp:
    print('%d childs' % len(node) if len(node) > 0 else 'Leaf node')
    print(path_string(node))
    print('\n\n\n')

2 childs
{}
{'wnid': 'fall11', 'gloss': 'ImageNet 2011 Fall Release.', 'words': 'ImageNet 2011 Fall Release'}
{'wnid': 'n00015388', 'gloss': 'a living organism characterized by voluntary movement', 'words': 'animal, animate being, beast, brute, creature, fauna'}
{'wnid': 'n01317541', 'gloss': 'any of various animals that have been tamed and made fit for a human environment', 'words': 'domestic animal, domesticated animal'}
{'wnid': 'n02084071', 'gloss': 'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds; "the dog barked all night"', 'words': 'dog, domestic dog, Canis familiaris'}
{'wnid': 'n02087122', 'gloss': 'a dog used in hunting game', 'words': 'hunting dog'}
{'wnid': 'n02098550', 'gloss': 'a dog trained to work with sportsmen when they hunt with guns', 'words': 'sporting dog, gun dog'}
{'wnid': 'n02101108', 'gloss': 'any of several breeds of small to medium-sized gun dogs with a lo

In [95]:
# Print every occurrence of synset n00021265 together with its ancestor chain.
# print(...) with a single argument behaves the same on Python 2 and 3.
temp = list(struct_root.xpath('//synset[@wnid="n00021265"]'))
for node in temp:
    print('%d childs' % len(node) if len(node) > 0 else 'Leaf node')
    print(path_string(node))
    print('\n\n\n')

15 childs
{}
{'wnid': 'fall11', 'gloss': 'ImageNet 2011 Fall Release.', 'words': 'ImageNet 2011 Fall Release'}
{'wnid': 'fa11misc', 'gloss': 'Miscellaneous synsets not in the major subtrees in the ImageNet 2011 Fall Release.', 'words': 'Misc'}
{'wnid': 'n00021265', 'gloss': 'any substance that can be metabolized by an animal to give energy and build tissue', 'words': 'food, nutrient'}




