This notebook explores how the provided image labels relate to one another: for each label it finds the most similar other label, measured by cosine similarity in a pre-trained word embedding (Word2Vec and GloVe).

In [1]:
import gensim
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd

In [2]:
# Pre-trained 'word2vec-google-news-300' vectors, loaded by name through gensim's downloader API.
word2vec_model = api.load('word2vec-google-news-300')  

In [3]:
# Pre-trained 'glove-wiki-gigaword-300' vectors, loaded by name through gensim's downloader API.
glove_model = api.load('glove-wiki-gigaword-300')

In [25]:
def get_labels(filename):
    """Read a one-column label file and normalise the labels for embedding lookup.

    The first line of the file is skipped and every remaining line is read
    into a single ``label`` column. Each label is then cleaned by replacing
    underscores with spaces, stripping surrounding whitespace and lowercasing.

    Labels are always read as text: type inference and NA detection are
    disabled so that a label such as ``42`` or ``NA`` stays a string instead
    of becoming a number / NaN and breaking the string clean-up below.

    Args:
        filename: Path (or any source accepted by ``pandas.read_csv``) of the
            label file.

    Returns:
        tuple: ``(raw_labels, labels)`` where ``raw_labels`` is the 2-D array
        of labels exactly as read (one single-element row per label) and
        ``labels`` is a flat list with the cleaned label strings, in file
        order.
    """
    raw_labels = pd.read_csv(
        filename,
        names=['label'],
        skiprows=1,
        dtype=str,              # never coerce numeric-looking labels
        keep_default_na=False,  # keep "NA", "null", "nan", ... as plain text
    ).values

    # raw_labels has one column, so ravel() yields the labels in file order.
    labels = [raw.replace('_', ' ').strip().lower() for raw in raw_labels.ravel()]

    return raw_labels, labels

In [26]:
def init_dataframe(raw_labels, labels):
    """Build the result table with one row per label and empty result columns.

    Args:
        raw_labels: Labels as read from the file. May be a flat sequence or
            an ``(n, 1)`` array such as the one returned by ``get_labels``;
            it is flattened so that exactly one raw label lands in each row.
        labels: Cleaned label strings, one per row, in the same order as
            ``raw_labels``.

    Returns:
        pandas.DataFrame: Columns ``label``, ``raw_label``,
        ``most_similar_label`` (initialised to ``''``) and
        ``cosine_similarity`` (initialised to NaN).
    """
    # Assigning a 2-D array to a single column is fragile across pandas
    # versions; a 1-D view of the same values is always accepted.
    flat_raw_labels = np.ravel(raw_labels)

    df = pd.DataFrame()
    df['label'] = labels
    df['raw_label'] = flat_raw_labels
    df['most_similar_label'] = ''
    df['cosine_similarity'] = np.nan

    return df

In [27]:
def check_label_in_vocab(label, model):
    """Return ``label`` if the embedding model knows it, otherwise ``None``.

    Uses the model's membership test (``label in model``) rather than running
    a full nearest-neighbour query and waiting for a ``KeyError``; the query
    scores the label against the whole vocabulary only to discard the result.

    Args:
        label: Label string to look up.
        model: Word-embedding model supporting ``in`` for vocabulary
            membership (e.g. the keyed vectors returned by gensim's
            downloader).

    Returns:
        The unchanged ``label`` when it is in the model's vocabulary,
        ``None`` otherwise.
    """
    if label in model:
        return label
    return None

In [28]:
def find_label_similarities(df, labels_in_vocab, model):
    """Fill in, for every row, the candidate label most similar to its label.

    For each row the row's ``label`` is compared with every *other* entry of
    ``labels_in_vocab`` via ``model.similarity``. The candidate with the
    highest similarity is written to ``most_similar_label`` and its score to
    ``cosine_similarity``.

    A row is left untouched (keeping its initial values) when
    * ``model.similarity`` raises ``KeyError`` for it, i.e. the model has no
      vector for one of the compared labels, or
    * there is no other candidate to compare against.

    The search starts from negative infinity rather than 0, so a label whose
    best match has a negative similarity still gets that match reported.

    Args:
        df: Table with a ``label`` column plus ``most_similar_label`` and
            ``cosine_similarity`` columns to fill. Modified in place.
        labels_in_vocab: Candidate labels to compare each row against.
        model: Embedding model exposing ``similarity(word_a, word_b)``.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the result columns
        filled where a match was found.
    """
    for idx, target_label in df['label'].items():
        best_label = None
        best_similarity = float('-inf')

        try:
            for label in labels_in_vocab:
                if label == target_label:
                    continue  # never match a label with itself
                similarity = model.similarity(target_label, label)
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_label = label
        except KeyError:
            # No vector available for this comparison: leave the row unfilled.
            continue

        # Only report a result when a real candidate was found.
        if best_label is not None:
            df.at[idx, 'most_similar_label'] = best_label
            df.at[idx, 'cosine_similarity'] = float(best_similarity)

    return df
        


In [30]:
# Experiment: every ImageNet label, scored with the Word2Vec embedding.
model, filename = word2vec_model, '../imagenet_labels.txt'

raw_labels, labels = get_labels(filename)

# Candidates are restricted to the labels the embedding has a vector for.
labels_in_vocab = [
    candidate
    for candidate in labels
    if check_label_in_vocab(candidate, model) is not None
]

df = find_label_similarities(init_dataframe(raw_labels, labels), labels_in_vocab, model)

# Persist the per-label best match without the positional index.
df.to_csv('../label_similarities/all_labels_word2vec_sim.csv', index=False)

In [31]:
# Rows with a similarity score are the labels that were matched in the embedding.
print('Labels found in Word2Vec:', int(df['cosine_similarity'].count()))

Labels found in Word2Vec: 545


In [32]:
# Experiment: the class-label subset, scored with the Word2Vec embedding.
model, filename = word2vec_model, '../labels/class_labels_subset.txt'

raw_labels, labels = get_labels(filename)

# Candidates are restricted to the labels the embedding has a vector for.
labels_in_vocab = [
    candidate
    for candidate in labels
    if check_label_in_vocab(candidate, model) is not None
]

df = find_label_similarities(init_dataframe(raw_labels, labels), labels_in_vocab, model)

# Persist the per-label best match without the positional index.
df.to_csv('../label_similarities/class_label_subset_word2vec_sim.csv', index=False)

In [33]:
# Rows with a similarity score are the subset labels that were matched in the embedding.
print('Subset labels found in Word2Vec:', int(df['cosine_similarity'].count()))

Subset labels found in Word2Vec: 58


In [34]:
# Experiment: every ImageNet label, scored with the GloVe embedding.
model, filename = glove_model, '../imagenet_labels.txt'

raw_labels, labels = get_labels(filename)

# Candidates are restricted to the labels the embedding has a vector for.
labels_in_vocab = [
    candidate
    for candidate in labels
    if check_label_in_vocab(candidate, model) is not None
]

df = find_label_similarities(init_dataframe(raw_labels, labels), labels_in_vocab, model)

# Persist the per-label best match without the positional index.
df.to_csv('../label_similarities/all_labels_glove_sim.csv', index=False)

In [35]:
# Rows with a similarity score are the labels that were matched in the embedding.
print('Labels found in GloVe:', int(df['cosine_similarity'].count()))

Labels found in GloVe: 555


In [36]:
# Experiment: the class-label subset, scored with the GloVe embedding.
model, filename = glove_model, '../labels/class_labels_subset.txt'

raw_labels, labels = get_labels(filename)

# Candidates are restricted to the labels the embedding has a vector for.
labels_in_vocab = [
    candidate
    for candidate in labels
    if check_label_in_vocab(candidate, model) is not None
]

df = find_label_similarities(init_dataframe(raw_labels, labels), labels_in_vocab, model)

# Persist the per-label best match without the positional index.
df.to_csv('../label_similarities/class_label_subset_glove_sim.csv', index=False)

In [37]:
# Rows with a similarity score are the subset labels that were matched in the embedding.
print('Subset labels found in GloVe:', int(df['cosine_similarity'].count()))

Subset labels found in GloVe: 57
