# Bag-of-Objects TF-IDF

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import pickle
import pandas as pd
from tqdm import tqdm
import os
import json

from dataloader import he_sampling
from data import FlickrDataset, CocoDataset, VGDataset
from PIL import Image
import matplotlib.patches as patches

from sklearn.feature_extraction.text import TfidfTransformer


In [12]:
# Dataset selection: keep exactly one `ds = ...` assignment active.
# Active choice: VGDataset with the scene-graph pickle below (the path name
# suggests GRCNN-generated scene graphs with adjacency -- confirm).
ds = VGDataset(
    sg_path='/data/project/rw/CBIR/data/vg_coco/sgg_grcnn/vgcoco_sgg_grcnn_with_adj.pkl',
)
# Alternatives used for other runs:
# ds = VGDataset(sg_path=None)
# ds = FlickrDataset(sg_path=None)
# ds = CocoDataset(sg_path=None)
# ds = VGDataset(sg_path='/data/project/rw/CBIR/data/vg_coco/vg_coco_gt_sg.pkl')

In [4]:
# Input files for the Flickr30k run.  Judging by the file names these are the
# embedding table, the two vocabulary lookups and the scene graphs produced by
# the same scene-graph generator -- confirm against the data pipeline.
emb_file = '/data/project/rw/CBIR/data/f30k/butd_freq_prior_LR/glove_embs_f30k_sgg_butd_freq_prior_LR.pkl'
vocab2idx_file = '/data/project/rw/CBIR/data/f30k/butd_freq_prior_LR/vocab2idx_f30k_sgg_butd_freq_prior_LR.pkl'
idx2vocab_file = '/data/project/rw/CBIR/data/f30k/butd_freq_prior_LR/idx2vocab_f30k_sgg_butd_freq_prior_LR.pkl'
scene_graph_path = '/data/project/rw/CBIR/data/f30k/butd_freq_prior_LR/f30k_sgg_butd_freq_prior_LR_with_adj.pkl'

# Rebinds `ds`; whichever dataset cell ran last determines what later cells see.
ds = FlickrDataset(
    vocab_emb=emb_file,
    vocab2idx=vocab2idx_file,
    idx2vocab=idx2vocab_file,
    sg_path=scene_graph_path,
)

In [5]:
# Inspect which fields the first scene-graph entry exposes.
ds.sg[0].keys()

dict_keys(['node_labels', 'adj', 'filename', 'imgid', 'bboxes', 'width', 'height'])

In [6]:
# Build one "document" per image: the list of object labels in its scene graph.
#
# Visual Genome variant (not active): use sg['obj_labels'] for each image
# instead of the clipped sg['node_labels'] used below.


def _split_object_terms(dataset, split):
    """Return one list of object labels per image id in ``dataset.d_split[split]``.

    For each scene graph only the first ``len(sg['bboxes'])`` entries of
    ``sg['node_labels']`` are kept; labels past the bounding-box count are
    dropped (presumably non-object nodes -- TODO confirm).
    """
    terms = []
    for img_id in dataset.d_split[split]:
        sg = dataset.imgid2sg(img_id)
        n_boxes = len(sg['bboxes'])
        terms.append(list(sg['node_labels'][:n_boxes]))
    return terms


# F30K and COCO
l_train_terms = _split_object_terms(ds, 'train')
l_test_terms = _split_object_terms(ds, 'test')

In [None]:
# GT scene graph
gen_scene_graph = '/data/project/rw/CBIR/vg_generated_sg_adj_full_butd_freq_train.pkl'
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
# `with` closes the file handle even if loading fails.
with open(gen_scene_graph, 'rb') as fh:
    gen_sg = pickle.load(fh)

# full scene graph
with open('/data/public/rw/datasets/visual_genome/filtered_scene_graphs_coco.json', 'r') as fh:
    sg = json.load(fh)

# Index the scene graphs by integer image id for direct lookup.
id2sg = {int(sg_['image_id']): sg_ for sg_ in sg}

In [14]:
# GT Scene Graph VG-COCO
# Build one "document" per image from the ground-truth scene graphs in
# `id2sg`: the list of object names, one entry per annotated object.


def _gt_object_names(scene_graph):
    """Return the first listed name of every object in ``scene_graph['objects']``."""
    return [obj['names'][0] for obj in scene_graph['objects']]


l_train_terms = [_gt_object_names(id2sg[img_id]) for img_id in ds.d_split['train']]
l_test_terms = [_gt_object_names(id2sg[img_id]) for img_id in ds.d_split['test']]

# An earlier variant built l_test_terms from ds.imgid2sg(img_id)['node_labels']
# (clipped to len(sg['bboxes'])); this cell uses ground-truth objects for
# both splits instead.

In [15]:
# Image ids of the test split; used below as both the queries and the retrieval targets.
l_test = ds.d_split['test']

In [16]:
# Distinct terms occurring in any training document.
train_vocab = {term for doc in l_train_terms for term in doc}

In [17]:
# Distinct terms occurring in any test document.
test_vocab = {term for doc in l_test_terms for term in doc}

In [18]:
# Joint vocabulary over both splits; sorting fixes a deterministic column order.
vocab = train_vocab | test_vocab
vocab2idx = {term: col for col, term in enumerate(sorted(vocab))}

In [19]:
# Dense document-by-term count matrices (one row per image, one column per
# vocabulary term), zero-initialised with np.zeros' default dtype.
train_mat, test_mat = (
    np.zeros((len(docs), len(vocab)))
    for docs in (l_train_terms, l_test_terms)
)

In [20]:
len(train_vocab)

48387

In [21]:
len(test_vocab)

24484

In [22]:
# Accumulate raw term counts: entry [row, col] is how often the term mapped to
# column `col` occurs in document `row`.  The same fill is applied to each split.
for count_mat, docs in ((train_mat, l_train_terms), (test_mat, l_test_terms)):
    for row, terms in enumerate(docs):
        for term in terms:
            count_mat[row, vocab2idx[term]] += 1

In [23]:
# use_idf=False turns off IDF re-weighting, so this is a term-frequency-only
# transform; the fitted repr printed by the next cell shows norm='l2'.
tfidf = TfidfTransformer(use_idf=False)

In [24]:
# Fit on the training counts.  NOTE(review): with use_idf=False there are
# presumably no IDF weights to learn, so this may be effectively a no-op --
# confirm against the installed scikit-learn version.
tfidf.fit(train_mat)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=False)

In [25]:
# Transform the test counts and densify the result with .toarray() so it can
# be used with plain NumPy operations afterwards.
test_tfidf = tfidf.transform(test_mat).toarray()

In [26]:
# Pairwise dot products between all test documents: sim_mat[i, j] compares test
# image i with test image j.  Given the transformer's norm='l2' setting, the
# rows should be unit length, making this a cosine similarity.
sim_mat = test_tfidf.dot(test_tfidf.T)

In [27]:
# Output directories used by other runs (enable the one matching the run):
# os.mkdir('/data/project/rw/viewer_CBIR/viewer/vg_coco_results/boo_tf')
# os.mkdir('/data/project/rw/viewer_CBIR/viewer/vg_coco_results/grcnn_boo_tfidf')
# os.mkdir('/data/project/rw/viewer_CBIR/viewer/f30k_results/butd_boo_tfidf')
# os.mkdir('/data/project/rw/viewer_CBIR/viewer/f30k_results/butd_boo_tf')
# os.mkdir('/data/project/rw/viewer_CBIR/viewer/coco_results/boo_tfidf')

# Create the directory for this run.  makedirs with exist_ok=True keeps the
# cell re-runnable (os.mkdir raises FileExistsError the second time) and also
# creates any missing parent directories.
os.makedirs('/data/project/rw/viewer_CBIR/viewer/vg_coco_results/gt_boo_tf', exist_ok=True)

In [28]:
result_dir = '/data/project/rw/viewer_CBIR/viewer/vg_coco_results/gt_boo_tf'
# Write one headerless TSV per query image, named '<query id>.tsv', with a
# (target_id, sim) row for every test image.
for i_query, test_id in enumerate(tqdm(l_test)):  # query id
    # Row i_query of sim_mat already holds the similarity to every test image
    # in l_test order, so it is used directly instead of being rebuilt
    # element by element in a Python loop.
    # NOTE(review): rows are ordered by target_id, not by similarity --
    # presumably the consumer ranks them itself; confirm.
    df = pd.DataFrame({'target_id': l_test, 'sim': sim_mat[i_query]}).sort_values('target_id')

    df[['target_id', 'sim']].to_csv(os.path.join(result_dir, f'{test_id}.tsv'), sep='\t', header=False, index=False)

100%|██████████| 13203/13203 [14:21<00:00, 15.32it/s]
