<a href="https://colab.research.google.com/github/surajsrivathsa/thesis_comics_search_xai/blob/main/feature_extraction/jupyter_notebooks/search_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

In [None]:
import os, sys, pickle, glob, numpy as np, pandas as pd
from sklearn.decomposition import PCA


In [None]:
!pip3 install pickle5
import pickle5 as pickle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 4.2 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls drive

MyDrive


In [None]:
home_filepath = '/content/drive/MyDrive/suraj/thesis'
os.chdir(home_filepath)
os.getcwd()

'/content/drive/MyDrive/suraj/thesis'

In [None]:
featureset_path = os.path.join(home_filepath, 'vgg_features')

# Create dict of panels to pages to books from csv

In [None]:
comic_info_dev_df = pd.read_csv(os.path.join(home_filepath, 'vgg_features', 'comic_info_dev_df.csv'))
print(comic_info_dev_df.shape)
comic_info_dev_df.head(3)
# 126233

(24497, 6)


Unnamed: 0,idx,comic_no,page_no,panel_no,new_comic_no,max_page_no
0,145400,2946,2,1,2929,35
1,145401,2946,3,7,2929,35
2,145402,2946,4,9,2929,35


In [None]:
panels_per_book_df = comic_info_dev_df.groupby(['new_comic_no'])['panel_no'].sum().reset_index()
print(panels_per_book_df.shape)
panels_per_book_df.head(5)

(501, 2)


Unnamed: 0,new_comic_no,panel_no
0,2929,190
1,2930,162
2,2931,156
3,2932,264
4,2933,529


In [None]:
panel_no_lst = list(panels_per_book_df['panel_no'])

# Load PCA pickle file

In [None]:
def load_pickle_files(pkl_filepath):
  """Read one pickled object from disk and return it.

  Args:
    pkl_filepath: path of the pickle file to open in binary mode.

  Returns:
    Whatever object the file unpickles to.

  Note:
    Unpickling can execute arbitrary code, so only pass files you trust.
  """
  handle = open(pkl_filepath, 'rb')
  try:
    return pickle.load(handle)
  finally:
    # Mirrors the context-manager behaviour: the file is always closed.
    handle.close()

In [None]:
dev_pca_np = load_pickle_files(os.path.join(home_filepath, 'vgg_features', 'dev_pca_np.pickle'))
dev_pca_np.shape

(144567, 64)

# Average embeddings of all panels to create one embedding per book

In [None]:
def average_panel_embedding_per_book(panel_pca_embedding, panel_no_counter_lst, start_book_id=2929):
  """Average consecutive runs of panel embeddings into one embedding per book.

  The rows of ``panel_pca_embedding`` are consumed in order: the first
  ``panel_no_counter_lst[0]`` rows belong to the first book, the next
  ``panel_no_counter_lst[1]`` rows to the second book, and so on.

  Args:
    panel_pca_embedding: 2-D array of shape (n_panels, n_features).
    panel_no_counter_lst: number of panel rows belonging to each book, in
      the same order as the rows of ``panel_pca_embedding``.
    start_book_id: id reported for the first book in the progress output;
      book ``i`` is reported as ``start_book_id + i``.

  Returns:
    Array of shape (len(panel_no_counter_lst), n_features) whose row ``i``
    is the feature-wise mean of the panels of book ``i``.
  """
  # Take the width from the input instead of assuming 64 components.
  embedding_dim = panel_pca_embedding.shape[1]
  book_embedding_np = np.zeros((len(panel_no_counter_lst), embedding_dim))
  previous_idx = 0
  for idx, panel_count in enumerate(panel_no_counter_lst):
    current_idx = previous_idx + panel_count
    # axis=0 averages over the panels and keeps one value per feature.
    # Without it np.mean collapses the whole slice to a single scalar that
    # is then broadcast into every dimension of the book embedding.
    book_embedding_np[idx, :] = np.mean(panel_pca_embedding[previous_idx:current_idx, :], axis=0)
    if idx%20 == 0:
      # The second value is the number of panel rows averaged for this book.
      print('book_number: {} and page number: {}'.format(start_book_id + idx, current_idx - previous_idx))
    previous_idx = current_idx

  return book_embedding_np

In [None]:
book_embedding_np = average_panel_embedding_per_book(dev_pca_np, panel_no_lst)

book_number: 2929 and page number: 190
book_number: 3119 and page number: 216
book_number: 3709 and page number: 298
book_number: 4699 and page number: 342
book_number: 6089 and page number: 169
book_number: 7879 and page number: 397
book_number: 10069 and page number: 360
book_number: 12659 and page number: 204
book_number: 15649 and page number: 165
book_number: 19039 and page number: 163
book_number: 22829 and page number: 490
book_number: 27019 and page number: 172
book_number: 31609 and page number: 387
book_number: 36599 and page number: 154
book_number: 41989 and page number: 329
book_number: 47779 and page number: 179
book_number: 53969 and page number: 273
book_number: 60559 and page number: 231
book_number: 67549 and page number: 425
book_number: 74939 and page number: 297
book_number: 82729 and page number: 236
book_number: 90919 and page number: 432
book_number: 99509 and page number: 256
book_number: 108499 and page number: 252
book_number: 117889 and page number: 356
book

# compute similarity based on picked panels

In [None]:
dev_start_book_id = 2929
selected_book_num = 2930

# The query cells slice rows [np_book_idx-1:np_book_idx], i.e. they expect a
# 1-based position. The plain difference is 0-based and would select the
# book *before* the one chosen here, hence the + 1.
np_book_idx = selected_book_num - dev_start_book_id + 1

In [None]:
def np_cosine_similarity(u, v):
  """Cosine similarity between every row of ``u`` and every row of ``v``.

  Args:
    u: 2-D array, one vector per row.
    v: 2-D array, one vector per row.

  Returns:
    2-D array whose entry (i, j) is the cosine of the angle between
    ``u[i]`` and ``v[j]``. A zero-length vector produces a division by zero.
  """
  u_rows = np.expand_dims(u, axis=1)
  dot_products = np.sum(u_rows * v, axis=2)
  norm_products = np.linalg.norm(u_rows, axis=2) * np.linalg.norm(v, axis=1)

  return dot_products / norm_products

In [None]:
def np_l2_similarity(u, v):
  """Euclidean (L2) distance between each row of ``u`` and ``v``.

  Despite the name this is a distance: smaller values mean closer rows.

  Args:
    u: 2-D array, one vector per row.
    v: array that broadcasts against ``u`` (e.g. a single-row 2-D array).

  Returns:
    1-D array with one distance per row of the broadcast difference.
  """
  difference = u - v

  return np.linalg.norm(difference, axis=1)

In [None]:
results_cosine_similarity = np_cosine_similarity(book_embedding_np, book_embedding_np[np_book_idx-1:np_book_idx, :])

In [None]:
indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)[:50]

In [None]:
results_l2_similarity = np_l2_similarity(book_embedding_np, book_embedding_np[np_book_idx-1:np_book_idx,:])

In [None]:
indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)[:50]

In [None]:
indices_cosine_similarity

array([  0, 287, 286, 276, 275, 273, 272, 271, 288, 270, 268, 267, 266,
       265, 260, 259, 258, 269, 499, 289, 291, 316, 312, 308, 305, 304,
       303, 302, 290, 301, 299, 298, 297, 296, 295, 293, 292, 300, 249,
       248, 247, 211, 207, 206, 205, 204, 195, 194, 213, 189])

In [None]:
indices_l2_similarity

array([  0,  14, 206, 266, 365, 242, 267, 269, 303, 265, 232, 472, 213,
         7, 270, 249, 471, 260, 271, 364, 287,  86, 499, 366, 420, 356,
        20, 299, 288, 211, 168, 319,   8, 194,  15, 312,  24,  21,  99,
       247, 292,  13, 250, 459, 240, 295,  17, 363, 226, 169])

In [None]:
np.intersect1d(indices_cosine_similarity, indices_l2_similarity)

array([  0, 194, 206, 211, 213, 247, 249, 260, 265, 266, 267, 269, 270,
       271, 287, 288, 292, 295, 299, 303, 312, 499])

# Apply PCA on VGG Features - Old

In [None]:
def load_pickle_files(pkl_filepath):
  """Unpickle and return the object stored at ``pkl_filepath``.

  Only use on trusted files: unpickling can execute arbitrary code.
  """
  with open(pkl_filepath, 'rb') as pkl_handle:
    return pickle.load(pkl_handle)

In [None]:
dev_panels_np = load_pickle_files(os.path.join(home_filepath, 'vgg_features', 'dev_filtered_np.pickle'))
# test_dict = load_pickle_files(os.path.join(home_filepath, 'vgg_features', 'test_vgg_embedding.pickle'))
# training_dict = load_pickle_files(os.path.join(home_filepath, 'vgg_features', 'training_vgg_embedding.pickle'))

In [None]:
dev_panels_np.shape

(144567, 4096)

In [None]:
# Reduce the panel features to 64 principal components.
# random_state is pinned because, for an input this large, PCA's automatic
# solver choice may pick the randomized SVD; without a fixed seed the
# projection (and everything derived from it) changes from run to run.
pca_obj = PCA(n_components=64, random_state=0)
pca_obj.fit(dev_panels_np)

PCA(n_components=64)

In [None]:
dev_pca_np = pca_obj.transform(dev_panels_np)

In [None]:
with open(os.path.join(home_filepath, 'vgg_features', 'dev_pca_np.pickle'), 'wb') as handle:
    pickle.dump(dev_pca_np, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
del dev_panels_np, pca_obj

# Similarity on Averaged Book Embedding

In [None]:
def np_cosine_similarity(u, v):
  """Pairwise cosine similarity of the rows of ``u`` against the rows of ``v``.

  Returns a 2-D array with one row per row of ``u`` and one column per row
  of ``v``. Zero-length vectors lead to a division by zero.
  """
  expanded = np.expand_dims(u, 1)
  numerator = np.sum(expanded * v, axis=2)
  denominator = np.linalg.norm(expanded, axis=2) * np.linalg.norm(v, axis=1)
  return numerator / denominator

In [None]:
def np_l2_similarity(u, v):
  """Row-wise Euclidean distance between ``u`` and ``v`` (smaller = closer)."""
  row_offsets = u - v
  return np.linalg.norm(row_offsets, axis=1)

In [None]:
comic_book_name_path = os.path.join(home_filepath, 'vgg_features', 'comicnum_to_book_title.csv')
comic_book_name_df = pd.read_csv(comic_book_name_path)
comic_book_name_df['our_idx'] = comic_book_name_df.index.copy()
comic_book_name_df.head()

Unnamed: 0,comic_no,Book Title,Vol,Issue,Year,Month,genre,link,our_idx
0,3451,Blue Bolt,10.0,2.0,1949.0,Sep,superhero|vigilante,,0
1,3452,Dick Cole - Blue Bolt,3.0,1.0,1949.0,Jun,superhero|vigilante,,1
2,3453,Dick Cole - Blue Bolt,5.0,3.0,,Dec,superhero|vigilante,,2
3,3454,Dick Cole - Blue Bolt,2.0,7.0,,Dec,superhero|vigilante,,3
4,3455,Dick Cole - Blue Bolt,5.0,7.0,,,superhero|vigilante,,4


In [None]:
# Drop, in place, every row that lacks a comic number or a book title.
comic_book_name_df.dropna(subset=['comic_no', 'Book Title'], how='any', inplace=True)
# Manual correction: force the comic number of this one title to 3618.
# NOTE(review): the reason for the override is not visible here - confirm.
comic_book_name_df.loc[comic_book_name_df[ 'Book Title'] == 'Kaanga - Colossus of the Congo', 'comic_no'] = 3618
comic_book_name_df.tail(10)

Unnamed: 0,comic_no,Book Title,Vol,Issue,Year,Month,genre,link,our_idx
163,3614,Jumbo Comics - Sheena - Banshee Wail of the Un...,,97.0,1947,Mar,jungle|animal|female,https://digitalcomicmuseum.com/index.php?dlid=...,163
164,3615,Kaanga - ing of the Jungle,,106.0,1948,,jungle|adventure,https://www.coverbrowser.com/covers/jungle-com...,164
165,3616,Kaanga - Banshee Valley,,107.0,1948,Nov,jungle|animal|non fiction,https://digitalcomicmuseum.com/index.php?dlid=...,165
166,3617,Kaanga - Cavern of the Golden Bones,,109.0,1949,Jan,animal|jungle|adventure,https://digitalcomicmuseum.com/index.php?dlid=...,166
167,3618,Kaanga - Colossus of the Congo,,81.0,1946,Sep,jungle|animal|non fiction,https://digitalcomicmuseum.com/index.php?dlid=...,167
168,3619,Kaanga - Witch Queen of the Hairy Ones,,93.0,1947,Sep,jungle|animal|adventure,https://digitalcomicmuseum.com/index.php?dlid=...,168
169,3620,kaanga - Vendetta of the Free Tribes,,92.0,1947,Aug,jungle|animal|adventure,https://digitalcomicmuseum.com/index.php?dlid=...,169
170,3621,JoJo - The Mountain of Skulls,,8.0,1947,Nov,jungle|adventure,https://digitalcomicmuseum.com/index.php?dlid=...,170
171,3622,JoJo - The Copy-Mad Killers,,28.0,1949,Jun,jungle|adventure,https://digitalcomicmuseum.com/index.php?dlid=...,171
172,3623,Jumbo Comics - Sheena - Tigerman terror,,26.0,1941,Apr,jungle|adventure|female|scifi|children,https://digitalcomicmuseum.com/index.php?dlid=...,172


In [None]:
# Lookup table: DataFrame index label -> [comic_no, book title, genre].
# Keys are the index labels of the rows still present in the frame, so they
# may have gaps if rows were dropped earlier.
idx_comicno_bookname_genre_mapping_dict = {}

for idx, row in comic_book_name_df.iterrows():
  idx_comicno_bookname_genre_mapping_dict[idx] = [row['comic_no'], row['Book Title'], row['genre']]

# Spot-check the first entry.
idx_comicno_bookname_genre_mapping_dict[0]

[3451, 'Blue Bolt', 'superhero|vigilante']

In [None]:
# Load the per-book averaged embeddings from disk.
book_embedding_path = os.path.join(home_filepath, 'vgg_features', 'averaged_embedding_per_book_np.pickle')
print(book_embedding_path)

# The pickle backport is imported under the name `pickle`; the name
# `pickle5` itself is never bound and raised NameError on a fresh kernel.
# Only unpickle files you trust: loading can execute arbitrary code.
with open(book_embedding_path, 'rb') as handle:
  averaged_embedding_per_book_np = pickle.load(handle)

averaged_embedding_per_book_np.shape

/content/drive/MyDrive/suraj/thesis/vgg_features/averaged_embedding_per_book_np.pickle


(500, 4096)

In [None]:
averaged_embedding_per_book_np[0:2, :]

array([[0.00100002, 0.00133177, 0.0210361 , ..., 0.00212082, 0.00019009,
        0.00399826],
       [0.00049768, 0.00141695, 0.02880869, ..., 0.00192987, 0.00017784,
        0.00611856]])

In [None]:
averaged_embedding_per_book_limited_172_np =averaged_embedding_per_book_np[:173, :] # we have book titles for only 173 books, hence limiting it 

In [None]:
selected_book_num = 5 # select till 172 idx
np_book_idx = selected_book_num
selected_book_info = idx_comicno_bookname_genre_mapping_dict[selected_book_num-1]
print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))

comic number: 3455 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 


In [None]:
# Compare every book embedding with the single query row selected by the
# 1-based np_book_idx (rows [np_book_idx-1:np_book_idx]).
# Cosine: scores are negated so an ascending sort puts the most similar book
# first; note that sorted_results_cosine_similarity keeps the negated values.
results_cosine_similarity = np_cosine_similarity(averaged_embedding_per_book_limited_172_np, averaged_embedding_per_book_limited_172_np[max(np_book_idx-1, 0):np_book_idx, :])
indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)[:50]
sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )[:50]

# L2: a distance, so ascending order already ranks the closest book first.
results_l2_similarity = np_l2_similarity(averaged_embedding_per_book_limited_172_np, averaged_embedding_per_book_limited_172_np[max(np_book_idx-1, 0):np_book_idx,:])
indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)[:50]
sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )[:50]

# np.intersect1d(indices_cosine_similarity, indices_l2_similarity)

In [None]:
def print_top_n_matching_info(idx_top_n_np, sim_score_top_n_np, comic_info_dict={}, print_n=10):
  """Print one line per ranked book with its score, comic number, title and genre.

  Args:
    idx_top_n_np: book indices ordered from best to worst match.
    sim_score_top_n_np: scores in the same order; singleton dimensions are
      squeezed away before use.
    comic_info_dict: maps a book index to ``[comic_no, book_title, genre]``.
      Indices missing from it are printed as ``-1 / 'not exist'``.
    print_n: maximum number of rows to print.

  Returns:
    None. The result is the printed output.
  """
  # atleast_1d keeps a single score indexable after the squeeze.
  sim_score_top_n_squeezed_np = np.atleast_1d(np.squeeze(sim_score_top_n_np))
  # Never read past the end of either array when print_n is larger than
  # the number of ranked books available.
  n_rows = min(print_n, len(idx_top_n_np), len(sim_score_top_n_squeezed_np))

  for i in range(n_rows):
    book_idx = idx_top_n_np[i]
    sim_score_book = sim_score_top_n_squeezed_np[i]
    try:
      comic_no, book_title, genre = comic_info_dict[book_idx]
    except KeyError:
      # Only an unknown book index is expected here; other errors should surface.
      comic_no, book_title, genre = (-1, 'not exist', 'not exist')
    print("rank: {} | sim_score: {} | comic_no: {} | book_title: {} | genre: {} ".format(i,sim_score_book, comic_no, book_title, genre ))

  return


In [None]:
print_top_n_matching_info(indices_cosine_similarity, sorted_results_cosine_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=15)

rank: 0 | sim_score: -1.0 | comic_no: 3455 | book_title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
rank: 1 | sim_score: -0.9918681851067284 | comic_no: 3456 | book_title: Blue Bolt | genre: superhero|vigilante 
rank: 2 | sim_score: -0.9883338472308044 | comic_no: 3545 | book_title: Feature Comics - lala Palooza | genre: superhero|humor|detective 
rank: 3 | sim_score: -0.987424248422107 | comic_no: 3453 | book_title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
rank: 4 | sim_score: -0.9870748156282712 | comic_no: 3575 | book_title: Frankenstein and Voice of His Conscience | genre: humor 
rank: 5 | sim_score: -0.9867219959662591 | comic_no: 3523 | book_title: Dynamic Man - man on the run | genre: superhero|war|humor|scifi 
rank: 6 | sim_score: -0.9864321264919635 | comic_no: 3581 | book_title: Ghost Rider - The Vulture Swoops | genre: western|fantasy|superhero 
rank: 7 | sim_score: -0.9860135209022758 | comic_no: 3469 | book_title: Buster Crabbe and the Maid of mars | ge

In [None]:
print_top_n_matching_info(indices_l2_similarity, sorted_results_l2_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=15)

rank: 0 | sim_score: 0.0 | comic_no: 3455 | book_title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
rank: 1 | sim_score: 0.08114350898857563 | comic_no: 3456 | book_title: Blue Bolt | genre: superhero|vigilante 
rank: 2 | sim_score: 0.0919425163183615 | comic_no: 3545 | book_title: Feature Comics - lala Palooza | genre: superhero|humor|detective 
rank: 3 | sim_score: 0.0966015381342613 | comic_no: 3575 | book_title: Frankenstein and Voice of His Conscience | genre: humor 
rank: 4 | sim_score: 0.10340422356929974 | comic_no: 3469 | book_title: Buster Crabbe and the Maid of mars | genre: Western 
rank: 5 | sim_score: 0.10390559825227569 | comic_no: 3581 | book_title: Ghost Rider - The Vulture Swoops | genre: western|fantasy|superhero 
rank: 6 | sim_score: 0.10435854357003163 | comic_no: 3464 | book_title: Brenda Starr - Shopping for Trouble | genre: romance|adventure|detective|female 
rank: 7 | sim_score: 0.1043840633201284 | comic_no: 3523 | book_title: Dynamic Man - man on the 

## Use k-means and t-SNE features

In [None]:
kmeans_tsne_features_df_path = os.path.join(home_filepath, 'vgg_features', 'kmeans_tsne_features_df.csv')
kmeans_tsne_features_df = pd.read_csv(kmeans_tsne_features_df_path)
print(kmeans_tsne_features_df.shape)
kmeans_tsne_features_df.tail(10)

(500, 14)


Unnamed: 0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,tsne_0,tsne_1,tsne_2,max_scaled_panel_count
490,0.0,0.094527,0.0,0.791045,0.0,0.0,0.014925,0.014925,0.044776,0.039801,1.228514,-0.100683,0.09496,0.365455
491,0.0,0.032164,0.0,0.953216,0.0,0.0,0.0,0.0,0.0,0.01462,1.060908,0.066837,0.06449,0.621818
492,0.0,0.0,0.0,0.960133,0.0,0.0,0.0,0.039867,0.0,0.0,0.962488,-0.037293,0.082629,0.547273
493,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.527273
494,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.350909
495,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.229091
496,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.318182
497,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.329091
498,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.785455
499,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.849012,0.040408,0.010291,0.398182


In [None]:
kmeans_tsne_features_df.head(5)

Unnamed: 0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,tsne_0,tsne_1,tsne_2,max_scaled_panel_count
0,0.132911,0.031646,0.120253,0.246835,0.120253,0.0,0.094937,0.056962,0.082278,0.113924,-0.071253,0.10169,-0.176826,0.287273
1,0.112299,0.045455,0.21123,0.208556,0.069519,0.0,0.122995,0.034759,0.037433,0.157754,-0.109551,0.28107,-0.370177,0.68
2,0.092466,0.05137,0.260274,0.164384,0.106164,0.0,0.09589,0.041096,0.068493,0.119863,-0.256169,0.277131,-0.266444,0.530909
3,0.098734,0.065823,0.121519,0.205063,0.070886,0.0,0.081013,0.063291,0.139241,0.15443,0.01313,0.200849,-0.04651,0.718182
4,0.13871,0.048387,0.170968,0.23871,0.109677,0.0,0.090323,0.06129,0.06129,0.080645,-0.172214,0.077585,-0.132775,0.563636


In [None]:
kmeans_tsne_features_df.columns

Index(['clust_0', 'clust_1', 'clust_2', 'clust_3', 'clust_4', 'clust_5',
       'clust_6', 'clust_7', 'clust_8', 'clust_9', 'tsne_0', 'tsne_1',
       'tsne_2', 'max_scaled_panel_count'],
      dtype='object')

In [None]:
clustering_features_np = kmeans_tsne_features_df[['clust_0', 'clust_1', 'clust_2', 'clust_3', 'clust_4', 'clust_5','clust_6', 'clust_7', 'clust_8', 'clust_9']].to_numpy()
clustering_features_limited_172_np = clustering_features_np[:173, :]

tsne_features_np = kmeans_tsne_features_df[['tsne_0', 'tsne_1', 'tsne_2']].to_numpy()
tsne_features_limited_172_np = tsne_features_np[:173, :]

all_features_np = kmeans_tsne_features_df.to_numpy()
all_features_limited_172_np= all_features_np[:173, :]

print(clustering_features_np.shape, clustering_features_limited_172_np.shape, tsne_features_np.shape, tsne_features_limited_172_np.shape, all_features_np.shape, all_features_limited_172_np.shape)

(500, 10) (173, 10) (500, 3) (173, 3) (500, 14) (173, 14)


In [None]:
selected_book_num = 91 # select till 172 idx
np_book_idx = selected_book_num
selected_book_info = idx_comicno_bookname_genre_mapping_dict[selected_book_num-1]
print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))

comic number: 3541 | title: Feature Comics - Reynolds of the mounted | genre: humor|detective|mystery 


In [None]:
def save_top_n_matching_info(idx_top_n_np, sim_score_top_n_np, comic_info_dict={}, print_n=10, query_book_id=1, feature_similarity_type='vgg_cosine'):
  """Collect the ranked matches of one query book into a DataFrame.

  Position 0 of the ranking is skipped and positions ``1 .. print_n-1`` are
  reported (presumably position 0 is the query book itself - confirm against
  the caller).

  Args:
    idx_top_n_np: book indices ordered from best to worst match.
    sim_score_top_n_np: scores in the same order; singleton dimensions are
      squeezed away before use.
    comic_info_dict: maps a book index to ``[comic_no, book_title, genre]``.
      Matches missing from it are recorded as ``-1 / 'not exist'``.
    print_n: exclusive upper bound on the reported rank.
    query_book_id: key of the query book in ``comic_info_dict``.
    feature_similarity_type: label stored with every row.

  Returns:
    DataFrame with the columns rank, sim_score, comic_no, book_title,
    genre, query_comic_no, query_book_title, query_genre and
    feature_similarity_type (empty, but with those columns, when there is
    nothing to report).

  Raises:
    KeyError: if ``query_book_id`` is not a key of ``comic_info_dict``.
  """
  column_names = ['rank', 'sim_score', 'comic_no', 'book_title', 'genre',
                  'query_comic_no', 'query_book_title', 'query_genre', 'feature_similarity_type']
  # atleast_1d keeps a single score indexable after the squeeze.
  sim_score_top_n_squeezed_np = np.atleast_1d(np.squeeze(sim_score_top_n_np))
  query_comic_no, query_book_title, query_genre = comic_info_dict[query_book_id]
  # Never read past the end of either array when print_n is larger than
  # the number of ranked books available.
  n_rows = min(print_n, len(idx_top_n_np), len(sim_score_top_n_squeezed_np))

  list_of_records = []
  for i in range(1, n_rows):
    book_idx = idx_top_n_np[i]
    sim_score_book = sim_score_top_n_squeezed_np[i]

    try:
      comic_no, book_title, genre = comic_info_dict[book_idx]
    except KeyError:
      # Only an unknown book index is expected here; other errors should surface.
      comic_no, book_title, genre = (-1, 'not exist', 'not exist')

    list_of_records.append({'rank': i, 'sim_score': sim_score_book, 'comic_no': comic_no, 'book_title': book_title, 'genre': genre,
                            'query_comic_no':query_comic_no, 'query_book_title':query_book_title, 'query_genre':query_genre, 'feature_similarity_type': feature_similarity_type})

  # Passing the column names keeps the schema stable for an empty result.
  df = pd.DataFrame(list_of_records, columns=column_names)
  return df


In [None]:
def run_all_similarity_on_features(all_feature_dict={}, query_book_id=2, top_n=21, comic_info_dict={}):
  """Rank all books against one query book for every feature set and save the result.

  For each ``name -> feature matrix`` entry of ``all_feature_dict`` the query
  row is compared with every row of the matrix. Names ending in
  ``'_cosine'`` use ``np_cosine_similarity``; every other name uses
  ``np_l2_similarity``. The per-feature tables produced by
  ``save_top_n_matching_info`` are concatenated, written to a CSV file in
  the current working directory and returned.

  Args:
    all_feature_dict: maps a feature/metric name to a 2-D feature matrix
      with one row per book.
    query_book_id: 1-based position of the query book; row
      ``query_book_id - 1`` of each matrix is the query and the same value
      is the key looked up in ``comic_info_dict``.
    top_n: passed on as ``print_n`` to ``save_top_n_matching_info``.
    comic_info_dict: maps a book index to ``[comic_no, book_title, genre]``.

  Returns:
    The concatenated DataFrame of all feature sets.

  Raises:
    KeyError: if ``query_book_id - 1`` is not a key of ``comic_info_dict``.
    ValueError: from ``pd.concat`` if ``all_feature_dict`` is empty.
  """
  query_row = query_book_id - 1
  selected_book_info = comic_info_dict[query_row]
  # save_top_n_matching_info reads ranking positions up to top_n - 1, so the
  # previous fixed cap of 50 is widened whenever top_n asks for more.
  keep_n = max(top_n, 50)

  pdList = []  # one DataFrame per feature set
  for k, v in all_feature_dict.items():
    query_features = v[max(query_row, 0):query_book_id, :]

    if k.endswith('_cosine'):
      # Negated so that an ascending sort puts the most similar book first;
      # the negated values are what ends up in the sim_score column.
      scores = -np_cosine_similarity(v, query_features)
    else:
      # L2 is a distance: ascending order already ranks the closest first.
      scores = np_l2_similarity(v, query_features)

    ranked_indices = np.argsort(np.squeeze(scores), axis=0)[:keep_n]
    ranked_scores = np.sort(scores, axis=0)[:keep_n]
    pdList.append(save_top_n_matching_info(ranked_indices, ranked_scores, comic_info_dict=comic_info_dict, print_n=top_n, query_book_id=query_row, feature_similarity_type=k))

  concatenated_df = pd.concat(pdList)

  # A path separator inside the title would otherwise be read as a directory.
  safe_title = str(selected_book_info[1])
  for separator in (os.sep, os.altsep):
    if separator:
      safe_title = safe_title.replace(separator, '_')
  out_filename = str(selected_book_info[0])+'_'+safe_title+'_concatenated_df.csv'
  output_filepath = os.path.join(os.getcwd(),out_filename)
  print(output_filepath)
  print()
  concatenated_df.to_csv(output_filepath, index=False)

  return concatenated_df

In [None]:
# Feature sets to compare; the key suffix ('_cosine' or anything else)
# selects the metric inside run_all_similarity_on_features.
all_feature_dict = {
    'vgg_cosine': averaged_embedding_per_book_limited_172_np,
    'vgg_l2': averaged_embedding_per_book_limited_172_np,
    'kmeans_cosine': clustering_features_limited_172_np,
    'kmeans_l2': clustering_features_limited_172_np,
    'tsne_cosine': tsne_features_limited_172_np,
    'tsne_l2': tsne_features_limited_172_np,
    'kmeans_tsne_cosine': all_features_limited_172_np,
    'kmeans_tsne_l2': all_features_limited_172_np,
}

comic_info_dict = idx_comicno_bookname_genre_mapping_dict

top_n = 21

query_book_id_lst = list(range(2, 150))
lst_of_df = []

# Run every query book in turn. The loop variable is query_book_id itself:
# the builtin `id` is no longer shadowed, and the global np_book_idx (read
# by other cells) is no longer overwritten as a side effect of this loop.
for query_book_id in query_book_id_lst:
  selected_book_info = comic_info_dict[query_book_id-1]
  print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))
  concatenated_df = run_all_similarity_on_features(all_feature_dict=all_feature_dict, query_book_id=query_book_id, top_n=top_n, comic_info_dict=comic_info_dict)
  lst_of_df.append(concatenated_df)

comic number: 3452 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/3452_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3453 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/3453_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3454 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/3454_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3455 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/3455_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3456 | title: Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/3456_Blue Bolt_concatenated_df.csv

comic number: 3457 | title: Dick Cole - Blue Bolt | genre: superhero 
/content/drive/MyDrive/suraj/thesis/3457_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3458 | title: Lightning Man - Blue B

In [None]:
lst_of_df[40].tail(50)

Unnamed: 0,rank,sim_score,comic_no,book_title,genre,query_comic_no,query_book_title,query_genre,feature_similarity_type
10,11,0.158656,3462,Boy - Detective,detective|thriller|short story,3492,Cow Puncher - Devils Scourge,western,tsne_l2
11,12,0.176507,3461,Boy - Crimebuster,adventure|detective|romance|superhero,3492,Cow Puncher - Devils Scourge,western,tsne_l2
12,13,0.17891,3515,Dollman - The eerie tale of mind monster,superhero|humor,3492,Cow Puncher - Devils Scourge,western,tsne_l2
13,14,0.179274,3539,Dollman - Crime goes to college,superhero|humor,3492,Cow Puncher - Devils Scourge,western,tsne_l2
14,15,0.182744,3463,Brenda Starr - Silver lining in sun valley,romance|adventure|female,3492,Cow Puncher - Devils Scourge,western,tsne_l2
15,16,0.183402,3610,Jumbo Comics - Sheena - Stuart Taylor in Weird...,jungle|scifi|humor|adventure|mystery|female,3492,Cow Puncher - Devils Scourge,western,tsne_l2
16,17,0.193622,3473,Captain Marvel Jr - Voodoo Clock,superhero|humour,3492,Cow Puncher - Devils Scourge,western,tsne_l2
17,18,0.193955,3576,Frisky fables - Smart Daddy,animal|humor|romance,3492,Cow Puncher - Devils Scourge,western,tsne_l2
18,19,0.204642,-1,not exist,not exist,3492,Cow Puncher - Devils Scourge,western,tsne_l2
19,20,0.20606,3472,Captain Marvel Jr - The acrobat death trap,superhero|humour,3492,Cow Puncher - Devils Scourge,western,tsne_l2


In [None]:
concatenated_df.shape

(160, 9)

In [None]:
concatenated_df['feature_similarity_type'].value_counts()

vgg_cosine            20
vgg_l2                20
kmeans_cosine         20
kmeans_l2             20
tsne_cosine           20
tsne_l2               20
kmeans_tsne_cosine    20
kmeans_tsne_l2        20
Name: feature_similarity_type, dtype: int64

In [None]:
# VGG embeddings: rank all books against the query row [np_book_idx-1:np_book_idx].
# NOTE(review): depends on the global np_book_idx set in an earlier cell -
# confirm it still holds the intended query book before running.
# Cosine scores are negated for the ascending sort and are kept negated.
results_cosine_similarity = np_cosine_similarity(averaged_embedding_per_book_limited_172_np, averaged_embedding_per_book_limited_172_np[max(np_book_idx-1, 0):np_book_idx, :])
indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)[:50]
sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )[:50]

# L2 distances: ascending order already ranks the closest book first.
results_l2_similarity = np_l2_similarity(averaged_embedding_per_book_limited_172_np, averaged_embedding_per_book_limited_172_np[max(np_book_idx-1, 0):np_book_idx,:])
indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)[:50]
sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )[:50]

In [None]:
vgg_cosine_df = save_top_n_matching_info(indices_cosine_similarity, sorted_results_cosine_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='vgg_cosine')

In [None]:
vgg_l2_df = save_top_n_matching_info(indices_l2_similarity, sorted_results_l2_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='vgg_l2')

In [None]:
# Cluster-membership features: same ranking as above, overwriting the
# shared result variables. Cosine scores are kept negated.
results_cosine_similarity = np_cosine_similarity(clustering_features_limited_172_np, clustering_features_limited_172_np[max(np_book_idx-1, 0):np_book_idx, :])
indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)[:50]
sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )[:50]

# L2 distances: ascending order already ranks the closest book first.
results_l2_similarity = np_l2_similarity(clustering_features_limited_172_np, clustering_features_limited_172_np[max(np_book_idx-1, 0):np_book_idx,:])
indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)[:50]
sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )[:50]

# np.intersect1d(indices_cosine_similarity, indices_l2_similarity)

In [None]:
kmeans_cosine_df = save_top_n_matching_info(indices_cosine_similarity, sorted_results_cosine_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='kmeans_cosine')

In [None]:
kmeans_l2_df = save_top_n_matching_info(indices_l2_similarity, sorted_results_l2_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='kmeans_l2')

In [None]:
# t-SNE features: same ranking as above, overwriting the shared result
# variables. Cosine scores are kept negated.
results_cosine_similarity = np_cosine_similarity(tsne_features_limited_172_np, tsne_features_limited_172_np[max(np_book_idx-1, 0):np_book_idx, :])
indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)[:50]
sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )[:50]

# L2 distances: ascending order already ranks the closest book first.
results_l2_similarity = np_l2_similarity(tsne_features_limited_172_np, tsne_features_limited_172_np[max(np_book_idx-1, 0):np_book_idx,:])
indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)[:50]
sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )[:50]

In [None]:
tsne_cosine_df = save_top_n_matching_info(indices_cosine_similarity, sorted_results_cosine_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='tsne_cosine')

In [None]:
tsne_l2_df = save_top_n_matching_info(indices_l2_similarity, sorted_results_l2_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='tsne_l2')

In [None]:
# All columns of the k-means/t-SNE feature table: same ranking as above,
# overwriting the shared result variables. Cosine scores are kept negated.
results_cosine_similarity = np_cosine_similarity(all_features_limited_172_np, all_features_limited_172_np[max(np_book_idx-1, 0):np_book_idx, :])
indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)[:50]
sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )[:50]

# L2 distances: ascending order already ranks the closest book first.
results_l2_similarity = np_l2_similarity(all_features_limited_172_np, all_features_limited_172_np[max(np_book_idx-1, 0):np_book_idx,:])
indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)[:50]
sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )[:50]

In [None]:
kmeans_tsne_cosine_df = save_top_n_matching_info(indices_cosine_similarity, sorted_results_cosine_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='kmeans_tsne_cosine')


In [None]:
kmeans_tsne_l2_df = save_top_n_matching_info(indices_l2_similarity, sorted_results_l2_similarity, comic_info_dict=idx_comicno_bookname_genre_mapping_dict, print_n=21, query_book_id=np_book_idx-1, feature_similarity_type='kmeans_tsne_l2')

In [None]:
pdList = [vgg_cosine_df, vgg_l2_df, kmeans_cosine_df, kmeans_l2_df, tsne_cosine_df,tsne_l2_df, kmeans_tsne_cosine_df, kmeans_tsne_l2_df  ]  # List of your dataframes
concatenated_df = pd.concat(pdList)

In [None]:
concatenated_df.shape

(168, 10)

In [None]:
concatenated_df.to_csv('concatenated_90_reynoldsofthemounted_df.csv', index=False)

In [None]:
concatenated_df.head(50)

Unnamed: 0,rank,sim_score,comic_no,book_title,genre,query_comic_no,query_book_title,query_genre,feature_similarity_type,query_book_id
0,0,-1.0,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
1,1,-0.993449,3542,Feature Comics - The death room,superhero|humor|drama|detective,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
2,2,-0.993295,3494,Crack Comics - Black Condor and The Clock,detective|mystery|superhero|humor,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
3,3,-0.992411,3543,Feature Comics - Rusty Ryan,humor|superhero|detective|jungle,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
4,4,-0.990397,3544,Feature Comics - beyond the head lines,superhero|detective|western|jungle|humor,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
5,5,-0.990066,3495,Crack Comics - Spitfire,detective|mystery|superhero|humor|aviation,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
6,6,-0.989298,3581,Ghost Rider - The Vulture Swoops,western|fantasy|superhero,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
7,7,-0.989071,3540,Feature Funnies - 2,humor|sports|drama|non fiction,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
8,8,-0.989029,3545,Feature Comics - lala Palooza,superhero|humor|detective,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,
9,9,-0.98815,3552,Feature Comics - College for radicals,superhero|children|humor,3541,Feature Comics - Reynolds of the mounted,humor|detective|mystery,vgg_cosine,


# Search with TF-IDF and Emotions

## Similarity metric functions

In [None]:
def np_cosine_similarity(u, v):
  """Cosine similarity between every row of `u` and every row of `v`.

  Args:
    u: array-like of shape (N, D); a single 1-D vector of length D is also accepted.
    v: array-like of shape (M, D); a single 1-D vector of length D is also accepted.

  Returns:
    Array of shape (N, M) where entry [i, j] is the cosine of the angle between
    u[i] and v[j]. Pairs that involve an all-zero vector score 0 instead of NaN.
  """
  # Promote 1-D inputs to a single row so the axis arguments below are always valid.
  u = np.expand_dims(np.atleast_2d(u), 1)   # (N, 1, D): broadcasts against (M, D)
  v = np.atleast_2d(v)
  n = np.sum(u * v, axis=2)                 # pairwise dot products, (N, M)
  d = np.linalg.norm(u, axis=2) * np.linalg.norm(v, axis=1)
  # A zero norm means a zero vector, whose dot product is 0 as well. Dividing by 1
  # in that case yields 0 and avoids the 0/0 -> NaN (and the RuntimeWarning).
  return n / np.where(d == 0, 1, d)

def np_l2_similarity(u, v):
  """Euclidean (L2) distance between each row of `u` and `v`.

  Despite the name this is a distance: smaller values mean closer rows.
  `v` must broadcast against `u`; the norm is taken along axis 1.
  """
  difference = u - v
  return np.linalg.norm(difference, ord=2, axis=1)

## comic book information

In [None]:
# Load the comic_no -> title/genre lookup table and remember each row's original position.
comic_book_name_path = os.path.join(home_filepath, 'vgg_features', 'comicnum_to_book_title.csv')
comic_book_name_df = pd.read_csv(comic_book_name_path)
comic_book_name_df['our_idx'] = comic_book_name_df.index.copy()

# Build {running counter: [comic_no, title, genre]} over the rows that have a title.
# The counter only advances for kept rows, so the keys stay contiguous from 0
# even when untitled rows are skipped.
idx_comicno_bookname_genre_mapping_dict = {}
counter = 0
for idx, row in comic_book_name_df.iterrows():
  if str(row['Book Title']) == 'nan':
    continue  # missing title (NaN stringifies to 'nan')
  idx_comicno_bookname_genre_mapping_dict[counter] = [row[col] for col in ('comic_no', 'Book Title', 'genre')]
  counter += 1

idx_comicno_bookname_genre_mapping_dict[0]

[3451, 'Blue Bolt', 'superhero|vigilante']

In [None]:
# Spot-check the entry stored under key 164 (keys are the 0-based counters assigned to titled rows).
idx_comicno_bookname_genre_mapping_dict[164]

[3623,
 'Jumbo Comics - Sheena - Tigerman terror',
 'jungle|adventure|female|scifi|children']

In [None]:
# 1-based book number of the query; the mapping dict is keyed from 0, hence the -1 in the lookup.
# The feature matrices in the cells below keep only the first 164 rows.
selected_book_num = 91
np_book_idx = selected_book_num
selected_book_info = idx_comicno_bookname_genre_mapping_dict[selected_book_num-1]
print('comic number: {} | title: {} | genre: {} '.format(*selected_book_info[:3]))

comic number: 3541 | title: Feature Comics - Reynolds of the mounted | genre: humor|detective|mystery 


## vgg embedding features - read

In [None]:
# Per-book VGG embeddings, stored as a pickled array.
book_embedding_path = os.path.join(home_filepath, 'vgg_features', 'averaged_embedding_per_book_np.pickle')
print(book_embedding_path)

# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with open(book_embedding_path, mode='rb') as embedding_file:
  averaged_embedding_per_book_np = pickle.load(embedding_file)

averaged_embedding_per_book_np.shape

/content/drive/MyDrive/suraj/thesis/vgg_features/averaged_embedding_per_book_np.pickle


(500, 4096)

In [None]:
# Keep only the first 164 books (the variable name still says "172", but the slice is 164 rows).
averaged_embedding_per_book_limited_172_np = averaged_embedding_per_book_np[:164]
print(averaged_embedding_per_book_limited_172_np.shape)

(164, 4096)


## kmeans tsne features - read

In [None]:
# Per-book cluster-membership and t-SNE features, one row per book.
kmeans_tsne_features_df_path = os.path.join(home_filepath, 'vgg_features', 'kmeans_tsne_features_df.csv')
kmeans_tsne_features_df = pd.read_csv(filepath_or_buffer=kmeans_tsne_features_df_path)
print(kmeans_tsne_features_df.shape)
kmeans_tsne_features_df.tail(5)

(500, 14)


Unnamed: 0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,tsne_0,tsne_1,tsne_2,max_scaled_panel_count
495,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.229091
496,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.318182
497,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.329091
498,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.785455
499,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.849012,0.040408,0.010291,0.398182


In [None]:
# Column groups: ten cluster columns (clust_0..clust_9) and three t-SNE columns (tsne_0..tsne_2).
cluster_cols = ['clust_' + str(i) for i in range(10)]
tsne_cols = ['tsne_' + str(i) for i in range(3)]

# Full matrices: cluster columns only, t-SNE columns only, and every column of the frame.
clustering_features_np = kmeans_tsne_features_df[cluster_cols].to_numpy()
tsne_features_np = kmeans_tsne_features_df[tsne_cols].to_numpy()
all_features_np = kmeans_tsne_features_df.to_numpy()

# Restrict each matrix to the first 164 books (names still carry the older "172" suffix).
clustering_features_limited_172_np = clustering_features_np[:164]
tsne_features_limited_172_np = tsne_features_np[:164]
all_features_limited_172_np = all_features_np[:164]

print(*(arr.shape for arr in (clustering_features_np, clustering_features_limited_172_np,
                              tsne_features_np, tsne_features_limited_172_np,
                              all_features_np, all_features_limited_172_np)))

(500, 10) (164, 10) (500, 3) (164, 3) (500, 14) (164, 14)


In [None]:
# Preview the first 175 rows of the cluster / t-SNE feature table.
kmeans_tsne_features_df.iloc[:175]

Unnamed: 0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,tsne_0,tsne_1,tsne_2,max_scaled_panel_count
0,0.132911,0.031646,0.120253,0.246835,0.120253,0.000000,0.094937,0.056962,0.082278,0.113924,-0.071253,0.101690,-0.176826,0.287273
1,0.112299,0.045455,0.211230,0.208556,0.069519,0.000000,0.122995,0.034759,0.037433,0.157754,-0.109551,0.281070,-0.370177,0.680000
2,0.092466,0.051370,0.260274,0.164384,0.106164,0.000000,0.095890,0.041096,0.068493,0.119863,-0.256169,0.277131,-0.266444,0.530909
3,0.098734,0.065823,0.121519,0.205063,0.070886,0.000000,0.081013,0.063291,0.139241,0.154430,0.013130,0.200849,-0.046510,0.718182
4,0.138710,0.048387,0.170968,0.238710,0.109677,0.000000,0.090323,0.061290,0.061290,0.080645,-0.172214,0.077585,-0.132775,0.563636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0.065637,0.098456,0.106178,0.146718,0.083012,0.000000,0.131274,0.088803,0.167954,0.111969,0.102246,0.192451,0.049611,0.941818
171,0.071287,0.198020,0.065347,0.146535,0.077228,0.000000,0.120792,0.097030,0.134653,0.089109,0.164317,-0.173870,0.000356,0.918182
172,0.043121,0.108830,0.047228,0.139630,0.071869,0.000000,0.125257,0.102669,0.147844,0.213552,0.272130,0.303386,-0.000927,0.885455
173,0.096070,0.052402,0.087336,0.161572,0.096070,0.002183,0.041485,0.052402,0.286026,0.124454,0.021654,0.383857,0.341284,0.832727


## TF-IDF features - read

In [None]:
# Per-book text features: a comic_no column followed by f0..f63.
tf_idf_features_df_path = os.path.join(home_filepath, 'vgg_features', 'Text_TF_IDF_Features.csv')
tf_idf_features_df = pd.read_csv(filepath_or_buffer=tf_idf_features_df_path)
print(tf_idf_features_df.shape)
tf_idf_features_df.tail(5)

(500, 65)


Unnamed: 0,comic_no,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63
495,3954,0.040774,0.0,0.0,0.0,0.0,0.0,0.241251,0.043554,0.022442,...,0.020965,0.061652,0.1618,0.020306,0.021388,0.083526,0.161155,0.021603,0.021007,0.083193
496,3955,0.143301,0.053054,0.029029,0.0,0.05972,0.0,0.089926,0.0,0.043022,...,0.026793,0.013132,0.142162,0.14273,0.068336,0.08006,0.025745,0.055218,0.0,0.01329
497,3956,0.159456,0.081173,0.059219,0.0,0.076143,0.0,0.209658,0.056776,0.029255,...,0.0,0.040184,0.145005,0.13235,0.097584,0.081661,0.131298,0.042242,0.054768,0.06778
498,3957,0.162522,0.110313,0.017245,0.016667,0.026608,0.011173,0.236585,0.008267,0.042596,...,0.063669,0.109217,0.084454,0.077083,0.10555,0.095122,0.137646,0.0,0.039872,0.126324
499,3958,0.059791,0.076094,0.033308,0.032191,0.0,0.215803,0.073702,0.063867,0.049363,...,0.076857,0.045203,0.118631,0.059553,0.0,0.01531,0.118158,0.095037,0.13862,0.0


In [None]:
# Select the 64 feature columns f0..f63 (dropping comic_no), then keep the first 164 books.
tf_idf_feature_cols = ['f{}'.format(i) for i in range(64)]
tf_idf_features_np = tf_idf_features_df[tf_idf_feature_cols].to_numpy()
print(tf_idf_features_np.shape)
tf_idf_features_172_np = tf_idf_features_np[:164]
print(tf_idf_features_172_np.shape)

(500, 64)
(164, 64)


## emotions features - read

In [None]:
# Per-book emotion features grouped by label.
emotions_features_df_path = os.path.join(home_filepath, 'vgg_features', 'emotions_grouped_labels_df.csv')
emotions_features_df = pd.read_csv(filepath_or_buffer=emotions_features_df_path)
print(emotions_features_df.shape)
emotions_features_df.tail(5)

(166, 25)


Unnamed: 0,comic_no,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,Angry_bool,...,Neutral_bool,Others_bool,Angry_bool_normalized,Disgust_bool_normalized,Fear_bool_normalized,Happy_bool_normalized,Sad_bool_normalized,Surprise_bool_normalized,Neutral_bool_normalized,Others_bool_normalized
161,3620,0.43764,0.293917,0.353442,0.33462,0.188949,0.395925,0.763303,0.126515,120,...,238,6,0.160643,0.092369,0.135207,0.109772,0.034806,0.140562,0.318608,0.008032
162,3621,0.48696,0.323807,0.434981,0.334716,0.162287,0.400362,0.688288,0.088241,81,...,129,0,0.176471,0.108932,0.159041,0.111111,0.026144,0.137255,0.281046,0.0
163,3622,0.393722,0.389991,0.410423,0.405186,0.201887,0.384647,0.653067,0.116746,88,...,160,8,0.140351,0.14992,0.145136,0.138756,0.028708,0.129187,0.255183,0.012759
164,3623,0.400482,0.264482,0.293063,0.439713,0.190383,0.374375,0.796398,0.128492,190,...,443,14,0.147287,0.082171,0.096124,0.162791,0.031008,0.126357,0.343411,0.010853
165,3624,0.400267,0.265771,0.309041,0.445888,0.188209,0.355235,0.745862,0.100298,169,...,373,6,0.147084,0.086162,0.111401,0.167972,0.035683,0.121845,0.32463,0.005222


In [None]:
# Raw emotion-score columns, and the matching "<emotion>_bool_normalized" columns derived from the same names.
emotions_actual_features_cols = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral', 'Others']
emotions_count_features_cols = [emotion + '_bool_normalized' for emotion in emotions_actual_features_cols]
# ['idx', 'image_id', 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral', 'Others', 'Angry_bool', 'Disgust_bool', 'Fear_bool', 'Happy_bool', 'Sad_bool', 'Surprise_bool', 'Neutral_bool', 'Others_bool']

In [None]:
# Two emotion matrices: raw scores and normalised counts.
emotions_actual_features_np = emotions_features_df.loc[:, emotions_actual_features_cols].to_numpy()
emotions_count_features_np = emotions_features_df.loc[:, emotions_count_features_cols].to_numpy()

print(emotions_actual_features_np.shape, emotions_count_features_np.shape)

# Keep the first 164 books so the row count matches the other truncated feature matrices.
emotions_actual_features_172_np = emotions_actual_features_np[:164]
emotions_count_features_172_np = emotions_count_features_np[:164]

print(emotions_actual_features_172_np.shape, emotions_count_features_172_np.shape)

(166, 8) (166, 8)
(164, 8) (164, 8)


## search functions

In [None]:
def save_top_n_matching_info(idx_top_n_np, sim_score_top_n_np, comic_info_dict={}, print_n=10, query_book_id=1, feature_similarity_type='vgg_cosine'):
  """Turn a ranked list of book indices into a tidy result table.

  Position 0 of the ranking is skipped (callers pass the query itself there), so
  at most `print_n - 1` rows are produced, with ranks 1 .. print_n-1.

  Args:
    idx_top_n_np: book indices ordered best-first; each is used as a key into
      `comic_info_dict`.
    sim_score_top_n_np: scores aligned with `idx_top_n_np`; any shape that
      flattens to one value per index (e.g. (N,) or (N, 1)).
    comic_info_dict: mapping index -> [comic_no, book_title, genre]. The default
      is never mutated here.
    print_n: exclusive upper bound on the rank; clamped to the number of
      available candidates.
    query_book_id: key of the query book in `comic_info_dict`.
    feature_similarity_type: label copied into every output row.

  Returns:
    DataFrame with columns rank, sim_score, comic_no, book_title, genre,
    query_comic_no, query_book_title, query_genre, feature_similarity_type.
    The columns are present even when no row qualifies.

  Raises:
    KeyError: if `query_book_id` is not a key of `comic_info_dict`.
  """
  result_columns = ['rank', 'sim_score', 'comic_no', 'book_title', 'genre',
                    'query_comic_no', 'query_book_title', 'query_genre', 'feature_similarity_type']
  ranked_indices = np.atleast_1d(idx_top_n_np)
  ranked_scores = np.reshape(sim_score_top_n_np, -1)
  query_comic_no, query_book_title, query_genre = comic_info_dict[query_book_id]

  # Never read past the end of a short candidate list (e.g. a re-ranking shortlist smaller than print_n).
  n_rows = min(print_n, len(ranked_indices), len(ranked_scores))

  list_of_records = []
  for rank in range(1, n_rows):
    book_idx = ranked_indices[rank]
    try:
      comic_no, book_title, genre = comic_info_dict[book_idx]
    except LookupError:
      # Index without metadata: keep the row but mark it with a placeholder.
      comic_no, book_title, genre = (-1, 'not exist', 'not exist')
    list_of_records.append({'rank': rank, 'sim_score': ranked_scores[rank], 'comic_no': comic_no, 'book_title': book_title, 'genre': genre,
                            'query_comic_no': query_comic_no, 'query_book_title': query_book_title, 'query_genre': query_genre,
                            'feature_similarity_type': feature_similarity_type})

  return pd.DataFrame(list_of_records, columns=result_columns)


In [None]:
# Combo strategies: name prefix -> the two feature-dict keys whose scores are added together.
_COMBO_FEATURE_PARTS = {
  'text_visual_combo': ('tf_idf', 'kmeans_tsne'),
  'text_emotions_combo': ('tf_idf', 'emotions_count'),
  'emotion_visual_combo': ('kmeans_tsne', 'emotions_count'),
}


def _score_against_query(features, query_book_id, use_cosine, candidate_rows=None):
  """Score candidate rows of `features` against the query row; returns a flat 1-D array."""
  query_row = features[max(query_book_id-1, 0):query_book_id, :]
  candidates = features if candidate_rows is None else features[candidate_rows, :]
  metric = np_cosine_similarity if use_cosine else np_l2_similarity
  return np.reshape(metric(candidates, query_row), -1)


def _rank_books(feature_name, features, query_book_id, use_cosine, rerank_pool_size):
  """Return (book indices best-first, their sort keys) for one feature configuration."""
  # Cosine is "larger is better", L2 is "smaller is better"; negating cosine lets
  # both be sorted ascending.
  sign = -1 if use_cosine else 1

  if feature_name.startswith('text_visual_reranking'):
    # Stage 1: shortlist the books closest to the query by text features.
    text_key = sign * _score_against_query(features['tf_idf'], query_book_id, use_cosine)
    shortlist = np.argsort(text_key)[:rerank_pool_size]
    # Stage 2: re-order the shortlist by visual features, using the same metric family.
    visual_key = sign * _score_against_query(features['kmeans_tsne'], query_book_id, use_cosine, candidate_rows=shortlist)
    order = np.argsort(visual_key)
    # `order` indexes into the shortlist; map it back to global book indices.
    return shortlist[order], visual_key[order]

  for prefix, (first_part, second_part) in _COMBO_FEATURE_PARTS.items():
    if feature_name.startswith(prefix):
      combined = (_score_against_query(features[first_part], query_book_id, use_cosine)
                  + _score_against_query(features[second_part], query_book_id, use_cosine))
      break
  else:
    # Plain single-matrix feature.
    combined = _score_against_query(features, query_book_id, use_cosine)

  sort_key = sign * combined
  order = np.argsort(sort_key)
  return order, sort_key[order]


def run_all_similarity_on_features(all_feature_dict={}, query_book_id=2, top_n=21, comic_info_dict={}, rerank_pool_size=50):
  """Run every configured retrieval strategy for one query book and save the results.

  Each key of `all_feature_dict` names a strategy. Keys ending in '_cosine' use
  cosine similarity; all other keys use L2 distance. Keys starting with a combo
  prefix ('text_visual_combo', 'text_emotions_combo', 'emotion_visual_combo') add
  the scores of two feature matrices. Keys starting with 'text_visual_reranking'
  shortlist by 'tf_idf' and re-order the shortlist by 'kmeans_tsne'. Any other key
  maps directly to a single feature matrix.

  Args:
    all_feature_dict: strategy name -> feature matrix, or dict of named matrices
      for combo / reranking strategies. Not mutated.
    query_book_id: 1-based query book number; row `query_book_id - 1` of each
      matrix is the query.
    top_n: forwarded to `save_top_n_matching_info` as `print_n`.
    comic_info_dict: 0-based index -> [comic_no, book_title, genre]. Not mutated.
    rerank_pool_size: size of the text shortlist used by the reranking strategies.

  Returns:
    The concatenation of the per-strategy result tables. The same table is
    written as CSV under
    `home_filepath/vgg_features/search_results/vgg_kmeans_tsne_tfidf_emotions`
    (`home_filepath` is a notebook-level global).

  Note:
    The `sim_score` column holds the sort key: the negated cosine similarity
    for '_cosine' strategies (so -1.0 is a perfect match) and the raw L2
    distance otherwise.
  """
  selected_book_info = comic_info_dict[query_book_id-1]

  per_strategy_dfs = []
  for feature_name, features in all_feature_dict.items():
    use_cosine = feature_name.endswith('_cosine')
    ranked_indices, ranked_scores = _rank_books(feature_name, features, query_book_id, use_cosine, rerank_pool_size)
    per_strategy_dfs.append(
      save_top_n_matching_info(ranked_indices, ranked_scores, comic_info_dict=comic_info_dict, print_n=top_n,
                               query_book_id=query_book_id-1, feature_similarity_type=feature_name))

  concatenated_df = pd.concat(per_strategy_dfs)

  # A path separator inside a title would otherwise be interpreted as a sub-directory.
  safe_title = str(selected_book_info[1]).replace(os.sep, '_')
  out_filename = str(selected_book_info[0])+'_'+safe_title+'_concatenated_df.csv'
  output_dir = os.path.join(home_filepath, 'vgg_features', 'search_results','vgg_kmeans_tsne_tfidf_emotions')
  os.makedirs(output_dir, exist_ok=True)
  output_filepath = os.path.join(output_dir, out_filename)
  print(output_filepath)
  print()
  concatenated_df.to_csv(output_filepath, index=False)

  return concatenated_df

## initialize feature dict

In [None]:
# Named feature bundles for the strategies that need more than one matrix.
text_visual_features = {'tf_idf': tf_idf_features_172_np, 'kmeans_tsne': all_features_limited_172_np}
# NOTE(review): the 'emotions_count' slot in the next two bundles is filled with the
# *actual* emotion-score matrix, not the normalised-count matrix. Kept as-is; confirm intent.
text_emotions_features = {'emotions_count': emotions_actual_features_172_np, 'tf_idf': tf_idf_features_172_np}
emotion_visual_features = {'emotions_count': emotions_actual_features_172_np, 'kmeans_tsne': all_features_limited_172_np}

# Every feature set is evaluated with both metrics; the '<name>_<metric>' suffix
# selects the metric inside run_all_similarity_on_features.
feature_sets = [
  ('vgg', averaged_embedding_per_book_limited_172_np),
  ('kmeans', clustering_features_limited_172_np),
  ('tsne', tsne_features_limited_172_np),
  ('kmeans_tsne', all_features_limited_172_np),
  ('tf_idf', tf_idf_features_172_np),
  ('text_visual_combo', text_visual_features),
  ('text_visual_reranking', text_visual_features),
  ('emotions_actual', emotions_actual_features_172_np),
  ('emotions_count', emotions_count_features_172_np),
  ('text_emotions_combo', text_emotions_features),
  ('emotion_visual_combo', emotion_visual_features),
]
all_feature_dict = {
  feature_name + '_' + metric: features
  for feature_name, features in feature_sets
  for metric in ('cosine', 'l2')
}

comic_info_dict = idx_comicno_bookname_genre_mapping_dict

# Rank 0 (the query itself) is skipped downstream, so 21 yields 20 results per strategy.
top_n = 21

# NOTE(review): this covers query ids 2..163; ids 1 and 164 are not queried. Confirm whether that is intended.
query_book_id_lst = list(range(2, 164))
lst_of_df = []

for query_book_id in query_book_id_lst:
  np_book_idx = query_book_id
  selected_book_info = idx_comicno_bookname_genre_mapping_dict[query_book_id-1]
  print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))
  concatenated_df = run_all_similarity_on_features(all_feature_dict=all_feature_dict, query_book_id=query_book_id, top_n=top_n, comic_info_dict=comic_info_dict)
  lst_of_df.append(concatenated_df)

comic number: 3452 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/vgg_features/search_results/vgg_kmeans_tsne_tfidf_emotions/3452_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3453 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/vgg_features/search_results/vgg_kmeans_tsne_tfidf_emotions/3453_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3454 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/vgg_features/search_results/vgg_kmeans_tsne_tfidf_emotions/3454_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3455 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/vgg_features/search_results/vgg_kmeans_tsne_tfidf_emotions/3455_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3456 | title: Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/vgg_

In [None]:
# Sanity check on the most recent query: overall table size, then rows per strategy.
print(concatenated_df.shape)
concatenated_df.loc[:, 'feature_similarity_type'].value_counts()

(440, 9)


vgg_cosine                      20
vgg_l2                          20
emotion_visual_combo_cosine     20
text_emotions_combo_l2          20
text_emotions_combo_cosine      20
emotions_count_l2               20
emotions_count_cosine           20
emotions_actual_l2              20
emotions_actual_cosine          20
text_visual_reranking_l2        20
text_visual_reranking_cosine    20
text_visual_combo_l2            20
text_visual_combo_cosine        20
tf_idf_l2                       20
tf_idf_cosine                   20
kmeans_tsne_l2                  20
kmeans_tsne_cosine              20
tsne_l2                         20
tsne_cosine                     20
kmeans_l2                       20
kmeans_cosine                   20
emotion_visual_combo_l2         20
Name: feature_similarity_type, dtype: int64