In [35]:
import json
import spacy
import numpy as np
import pandas as pd
from collections import defaultdict

# Load spaCy's small English pipeline once; `attach_part_name` below uses it to read token POS tags.
nlp = spacy.load("en_core_web_sm")

def attach_part_name(concepts: list[str], part_name: str):
    """Append ``part_name`` to every concept that contains no noun.

    Each concept string is run through the module-level spaCy pipeline ``nlp``.
    If none of its tokens is tagged ``'NOUN'``, the concept is extended with a
    space followed by ``part_name``; otherwise it is kept unchanged.

    Args:
        concepts: Concept descriptions for a single part.
        part_name: Name of the part, used as the noun to attach.

    Returns:
        A new list with one entry per input concept, in the same order.

    Note:
        No concepts are dropped here. An earlier variant skipped concepts
        containing 'than' or 'male'; according to the original author's note
        that left Purple Finch with 0 torso concepts and American GoldFinch
        with 0 head concepts, so the filter is intentionally not applied.
    """
    def _ensure_noun(text):
        # Scan the tagged tokens and stop at the first noun.
        for token in nlp(text):
            if token.pos_ == 'NOUN':
                return text
        return text + ' ' + part_name

    return [_ensure_noun(text) for text in concepts]

# Per-class, per-part concept lists as stored on disk.
with open('concepts/CUB/concepts_processed.json', 'rb') as fp:
    concepts_processed = json.load(fp)

# part name -> set of every concept string seen for that part across all classes
concept_sets = defaultdict(set)

# Add a noun to purely adjective concepts.
# The loaded dict is updated in place so later cells see the augmented strings.
for concept_dict in concepts_processed.values():
    for part in list(concept_dict):
        augmented = attach_part_name(concept_dict[part], part)
        concept_dict[part] = augmented
        concept_sets[part] |= set(augmented)

# Freeze each part's vocabulary into a sorted list so positions are deterministic.
concept_sets_sorted = {part: sorted(vocab) for part, vocab in concept_sets.items()}

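Duplicated concepts: the same concept string may occur under more than one part, so the flattened concept list built below can contain repeated entries.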

In [54]:
import itertools

# Flatten the per-part vocabularies into one list, remembering where each part's
# slice starts. Order follows `concept_sets_sorted`, so every part owns one
# contiguous run of indices in `all_concepts`.
all_concepts = []
part_offsets = {}  # part name -> first flat index of that part's slice
for part_name, part_concepts in concept_sets_sorted.items():
    part_offsets[part_name] = len(all_concepts)
    all_concepts += part_concepts

num_concepts = len(all_concepts)

# (part, concept text) -> flat index.
# Keying on the part as well as the text matters: the same string can occur under
# several parts, and `all_concepts.index(cpt)` always returns the FIRST occurrence,
# i.e. possibly a column inside another part's slice. The dict also replaces one
# linear scan of `all_concepts` per lookup.
flat_index = {
    (name, cpt): part_offsets[name] + pos
    for name, cpts in concept_sets_sorted.items()
    for pos, cpt in enumerate(cpts)
}

with open('concepts/CUB/parts.txt', 'r') as fp:
    all_parts = fp.read().splitlines()
all_class_names = list(concepts_processed.keys())

# concept_matrix[c, p, k] == 1 iff flat concept k is listed for class c under part p.
# NOTE(review): axis 1 is sized from `concept_sets_sorted` but indexed via `all_parts`;
# this assumes parts.txt lists exactly the parts present in the JSON -- confirm.
concept_matrix = np.zeros((len(concepts_processed), len(concept_sets_sorted), num_concepts))
for class_idx, class_name in enumerate(all_class_names):
    class_concepts = concepts_processed[class_name]
    for part_idx, part_name in enumerate(all_parts):
        cpt_indices = [flat_index[(part_name, cpt)] for cpt in class_concepts[part_name]]
        concept_matrix[class_idx, part_idx, cpt_indices] = 1

# Weight given to a part's own concepts; every other (part, concept) pair keeps weight 1.
OWN_PART_WEIGHT = 50
weight_matrix = np.ones((len(all_parts), num_concepts))
for part_idx, part_name in enumerate(all_parts):
    # A part's concepts occupy exactly its contiguous slice of the flat list.
    start = part_offsets[part_name]
    stop = start + len(concept_sets_sorted[part_name])
    weight_matrix[part_idx, start:stop] = OWN_PART_WEIGHT
weight_matrix

array([[10., 10., 10., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ...,
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1., 10., ..., 10., 10., 10.]])

In [48]:
# Spot-check: entries of flat concept indices 1-3 for class index 1, part index 1.
concept_matrix[1, 1][[1, 2, 3]]

array([0., 0., 0.])

In [45]:
# Rebuild the flat concept list by concatenating each part's sorted vocabulary,
# in the order the parts appear in `concept_sets_sorted`, then display it.
all_concepts = []
for part_vocab in concept_sets_sorted.values():
    all_concepts.extend(part_vocab)
all_concepts

['Black cap in breeding males',
 'Bright yellow in males during breeding season',
 'Olive or dull yellow in females and non-breeding males',
 'black and white striped head',
 'black cap',
 'black chin and throat in males',
 'black crown and back of the head',
 'black eye line',
 'black face and neck',
 'black facial markings',
 'black feathers with a slight sheen',
 'black forehead in some subspecies',
 'black head',
 'black hood',
 'black hood and throat in males',
 'black malar mark (mustache in males)',
 'black malar streaks (cheek patches)',
 'black malar stripes',
 'black necklace on males',
 'black plumage',
 'black spot on the ear coverts',
 'black stripes',
 'black throat patch',
 'black with a blue-green sheen',
 'blue and black crest',
 'blue and brown head',
 'blue-black cap',
 'blue-gray color with white markings',
 'blue-gray crown and back',
 'bright blue in males, brown in females',
 'bright blue on males',
 'bright green plumage',
 'bright orange face',
 'bright red hea