In [55]:
import json
import spacy
import numpy as np
import pandas as pd
from collections import defaultdict

nlp = spacy.load("en_core_web_sm")

def attach_part_name(concepts: list[str], part_name: str):
    concepts_processed = []
    for cpt in concepts:
        doc = nlp(cpt)
        if not any('NOUN' == word.pos_ for word in doc):
            cpt = cpt + ' ' + part_name
        # if 'than' in cpt or 'male' in cpt:  # Note that this would cause Purple Finch to have 0 concept for torso and American GoldFinch to have 0 concept for head
        #     continue 
        concepts_processed.append(cpt)
    return concepts_processed

concept_sets = defaultdict(set)
with open('concepts/CUB/concepts_processed.json', 'rb') as fp:
    concepts_processed = json.load(fp=fp)

# Add a noun to purely adjective concepts
for class_name, concept_dict in concepts_processed.items():
    for part_name, concepts in concept_dict.items():
        concepts_with_part_name = attach_part_name(concepts, part_name)
        concept_dict[part_name] = concepts_with_part_name
        concept_sets[part_name].update(concepts_with_part_name)

concept_sets_sorted = {k: sorted(list(v)) for k, v in concept_sets.items()}

Concepts with `male`, `than`, etc:

In [56]:
for part_name, concepts in concept_sets_sorted.items():
    for cpt in concepts:
        if 'male' in cpt or 'than' in cpt:
            print(cpt)

Black cap in breeding males
Bright yellow in males during breeding season
Olive or dull yellow in females and non-breeding males
black chin and throat in males
black hood and throat in males
black malar mark (mustache in males)
black necklace on males
bright blue in males, brown in females
bright blue on males
bright yellow throat and upper breast in males
brownish (female) head
buffy or brown crown in females
distinct black face mask in males
duller black with slight rusty edges in females and juveniles
duller coloration in females, often olive-brown
glossy black in breeding males
grey mask on face (female)
males have iridescent pinkish-red throat
orange patches on sides for males
rosy-pink in males, grayish in females
rosy-red head and throat (male)
rusty orange on females
small red streak on males
smaller than the American Crow head
striped pattern in females
yellow patch (males only)
darker than body
orange and brown beak in females
blue in males, brown in females
males have a dark

Duplicated Concepts:

In [57]:
all_concepts_set = set()

for part_name, concepts in concept_sets_sorted.items():
    for cpt in concepts:
        if cpt in all_concepts_set:
            print(cpt)
        else:
            all_concepts_set.add(cpt)

dark streaks
iridescent dark blue-black
medium length
olive-brown
Olive or dull yellow in females and non-breeding males
black with a blue-green sheen
bright blue in males, brown in females
dark gray
iridescent dark blue-black
mottled gray and black
white underparts


In [58]:
len(all_concepts_set)

979

In [54]:
import itertools

all_concepts = []
for v in concept_sets_sorted.values():
    all_concepts += v

num_concepts = sum(len(v) for v in concept_sets_sorted.values())
concept_matrix = np.zeros((len(concepts_processed), len(concept_sets_sorted), num_concepts))

with open('concepts/CUB/parts.txt', 'r') as fp:
    all_parts = fp.read().splitlines()
all_class_names = list(concepts_processed.keys())
for class_idx, class_name in enumerate(all_class_names):
    class_concepts = concepts_processed[class_name]
    for part_idx, part_name in enumerate(all_parts):
        cpt_indices = [all_concepts.index(cpt) for cpt in class_concepts[part_name]]
        concept_matrix[class_idx, part_idx, cpt_indices] = 1

weight_matrix = np.ones((len(all_parts), num_concepts))
for part_idx, part_name in enumerate(all_parts):
    part_concepts = concept_sets_sorted[part_name]
    cpt_indices = [all_concepts.index(cpt) for cpt in part_concepts]
    weight_matrix[part_idx, cpt_indices] = 50
weight_matrix

array([[10., 10., 10., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ...,
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1., 10., ..., 10., 10., 10.]])