<a href="https://colab.research.google.com/github/surajsrivathsa/thesis_comics_search_xai/blob/main/feature_extraction/jupyter_notebooks/search_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preamble

In [35]:
import os, sys, pickle, glob, numpy as np, pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
!pip3 install pickle5
import pickle5 as pickle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 4.3 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!ls drive

MyDrive


In [6]:
home_filepath = '/content/drive/MyDrive/suraj/thesis'
os.chdir(home_filepath)
os.getcwd()

'/content/drive/MyDrive/suraj/thesis'

In [7]:
featureset_path = os.path.join(home_filepath, 'features')

# Search with TF IDF and Emotions

## similarity metrics functions

In [8]:
def np_cosine_similarity(u, v):
  u = np.expand_dims(u, 1)
  n = np.sum(u * v, axis=2)
  d = np.linalg.norm(u, axis=2) * np.linalg.norm(v, axis=1)
  return n / d

def np_l2_similarity(u, v):
  d = np.linalg.norm(u - v, axis=1)
  return d

## comic book information

In [10]:
comic_book_name_path = os.path.join(featureset_path, 'comicnum_to_book_title.csv')
comic_book_name_df = pd.read_csv(comic_book_name_path)
comic_book_name_df['our_idx'] = comic_book_name_df.index.copy()

idx_comicno_bookname_genre_mapping_dict = {}
counter = 0
for idx, row in comic_book_name_df.iterrows():
  if str(row['Book Title']) != 'nan':
    # print(counter, row['comic_no'], row['Book Title'], row['genre'])
    idx_comicno_bookname_genre_mapping_dict[counter] = [row['comic_no'], row['Book Title'], row['genre']]
    counter += 1

idx_comicno_bookname_genre_mapping_dict[0]

[3451, 'Blue Bolt', 'superhero|vigilante']

In [11]:
idx_comicno_bookname_genre_mapping_dict[164]

[3623,
 'Jumbo Comics - Sheena - Tigerman terror',
 'jungle|adventure|female|scifi|children']

In [13]:
selected_book_num = 91 # select till 172 idx
np_book_idx = selected_book_num
selected_book_info = idx_comicno_bookname_genre_mapping_dict[selected_book_num-1]
print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))

comic number: 3541 | title: Feature Comics - Reynolds of the mounted | genre: humor|detective|mystery 


## vgg embedding features - read

In [15]:
book_embedding_path = os.path.join(featureset_path, 'averaged_embedding_per_book_np.pickle')
print(book_embedding_path)

with open(book_embedding_path, 'rb') as handle:
  averaged_embedding_per_book_np = pickle.load(handle)

averaged_embedding_per_book_np.shape

/content/drive/MyDrive/suraj/thesis/features/averaged_embedding_per_book_np.pickle


(500, 4096)

In [16]:
averaged_embedding_per_book_limited_172_np =averaged_embedding_per_book_np[:164, :]
print(averaged_embedding_per_book_limited_172_np.shape)

(164, 4096)


In [38]:
tsne_averaged_embedding_per_book_np = TSNE(n_components=3).fit_transform(averaged_embedding_per_book_limited_172_np)



In [41]:
featureset_path

'/content/drive/MyDrive/suraj/thesis/features'

In [42]:
with open(os.path.join(featureset_path, 'tsne_averaged_embedding_per_book_np.pickle'),'wb') as f:
    pickle.dump(tsne_averaged_embedding_per_book_np, f)

In [43]:
with open(os.path.join(featureset_path, 'tsne_averaged_embedding_per_book_np.pickle'),'rb') as f:
    tsne_averaged_embedding_per_book_np = pickle.load( f)

## kmeans tsne features - read

In [17]:
kmeans_tsne_features_df_path = os.path.join(featureset_path, 'kmeans_tsne_features_df.csv')
kmeans_tsne_features_df = pd.read_csv(kmeans_tsne_features_df_path)
print(kmeans_tsne_features_df.shape)
kmeans_tsne_features_df.tail()

(500, 14)


Unnamed: 0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,tsne_0,tsne_1,tsne_2,max_scaled_panel_count
495,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.229091
496,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.318182
497,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.329091
498,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847125,0.039475,0.010068,0.785455
499,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.849012,0.040408,0.010291,0.398182


In [18]:
clustering_features_np = kmeans_tsne_features_df[['clust_0', 'clust_1', 'clust_2', 'clust_3', 'clust_4', 'clust_5','clust_6', 'clust_7', 'clust_8', 'clust_9']].to_numpy()
clustering_features_limited_172_np = clustering_features_np[:164, :]

tsne_features_np = kmeans_tsne_features_df[['tsne_0', 'tsne_1', 'tsne_2']].to_numpy()
tsne_features_limited_172_np = tsne_features_np[:164, :]

all_features_np = kmeans_tsne_features_df.to_numpy()
all_features_limited_172_np= all_features_np[:164, :]

print(clustering_features_np.shape, clustering_features_limited_172_np.shape, tsne_features_np.shape, tsne_features_limited_172_np.shape, all_features_np.shape, all_features_limited_172_np.shape)

(500, 10) (164, 10) (500, 3) (164, 3) (500, 14) (164, 14)


In [19]:
kmeans_tsne_features_df.head(2)

Unnamed: 0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,tsne_0,tsne_1,tsne_2,max_scaled_panel_count
0,0.132911,0.031646,0.120253,0.246835,0.120253,0.0,0.094937,0.056962,0.082278,0.113924,-0.071253,0.10169,-0.176826,0.287273
1,0.112299,0.045455,0.21123,0.208556,0.069519,0.0,0.122995,0.034759,0.037433,0.157754,-0.109551,0.28107,-0.370177,0.68


## tf idf features - read

In [20]:
tf_idf_features_df_path = os.path.join(featureset_path, 'Text_TF_IDF_Features.csv')
tf_idf_features_df = pd.read_csv(tf_idf_features_df_path)
print(tf_idf_features_df.shape)
tf_idf_features_df.tail()

(500, 65)


Unnamed: 0,comic_no,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63
495,3954,0.040774,0.0,0.0,0.0,0.0,0.0,0.241251,0.043554,0.022442,...,0.020965,0.061652,0.1618,0.020306,0.021388,0.083526,0.161155,0.021603,0.021007,0.083193
496,3955,0.143301,0.053054,0.029029,0.0,0.05972,0.0,0.089926,0.0,0.043022,...,0.026793,0.013132,0.142162,0.14273,0.068336,0.08006,0.025745,0.055218,0.0,0.01329
497,3956,0.159456,0.081173,0.059219,0.0,0.076143,0.0,0.209658,0.056776,0.029255,...,0.0,0.040184,0.145005,0.13235,0.097584,0.081661,0.131298,0.042242,0.054768,0.06778
498,3957,0.162522,0.110313,0.017245,0.016667,0.026608,0.011173,0.236585,0.008267,0.042596,...,0.063669,0.109217,0.084454,0.077083,0.10555,0.095122,0.137646,0.0,0.039872,0.126324
499,3958,0.059791,0.076094,0.033308,0.032191,0.0,0.215803,0.073702,0.063867,0.049363,...,0.076857,0.045203,0.118631,0.059553,0.0,0.01531,0.118158,0.095037,0.13862,0.0


In [21]:
tf_idf_features_np = tf_idf_features_df[['f'+str(i) for i in range(64)]].to_numpy()
print(tf_idf_features_np.shape)
tf_idf_features_172_np = tf_idf_features_np[:164, :]
print(tf_idf_features_172_np.shape)

(500, 64)
(164, 64)


## emotions features - read

In [22]:
emotions_features_df_path = os.path.join(featureset_path, 'emotions_grouped_labels_df.csv')
emotions_features_df = pd.read_csv(emotions_features_df_path)
print(emotions_features_df.shape)
emotions_features_df.tail(2)

(166, 25)


Unnamed: 0,comic_no,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,Angry_bool,...,Neutral_bool,Others_bool,Angry_bool_normalized,Disgust_bool_normalized,Fear_bool_normalized,Happy_bool_normalized,Sad_bool_normalized,Surprise_bool_normalized,Neutral_bool_normalized,Others_bool_normalized
164,3623,0.400482,0.264482,0.293063,0.439713,0.190383,0.374375,0.796398,0.128492,190,...,443,14,0.147287,0.082171,0.096124,0.162791,0.031008,0.126357,0.343411,0.010853
165,3624,0.400267,0.265771,0.309041,0.445888,0.188209,0.355235,0.745862,0.100298,169,...,373,6,0.147084,0.086162,0.111401,0.167972,0.035683,0.121845,0.32463,0.005222


In [23]:
emotions_actual_features_cols = [ 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise',]
emotions_count_features_cols = [ 'Angry_bool_normalized', 'Disgust_bool_normalized', 'Fear_bool_normalized', 'Happy_bool_normalized', 'Sad_bool_normalized', 'Surprise_bool_normalized',]
# ['idx', 'image_id', 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral', 'Others', 'Angry_bool', 'Disgust_bool', 'Fear_bool', 'Happy_bool', 'Sad_bool', 'Surprise_bool', 'Neutral_bool', 'Others_bool']

In [24]:
emotions_actual_features_np = emotions_features_df[emotions_actual_features_cols].to_numpy()
emotions_count_features_np = emotions_features_df[emotions_count_features_cols].to_numpy()

print(emotions_actual_features_np.shape, emotions_count_features_np.shape)

emotions_actual_features_172_np = emotions_actual_features_np[:164, :]
emotions_count_features_172_np = emotions_count_features_np[:164, :]

print(emotions_actual_features_172_np.shape, emotions_count_features_172_np.shape)

(166, 6) (166, 6)
(164, 6) (164, 6)


## supersense and gender features - read

In [25]:
gender_supersense_features_df_path = os.path.join(featureset_path, 'features_supersense_character_gender_df_20220619.csv')
gender_supersense_features_df = pd.read_csv(gender_supersense_features_df_path)
print(gender_supersense_features_df.shape)
gender_supersense_features_df.tail(2)

(165, 103)


Unnamed: 0,comic_no,Book Title,book_title_clean,unique_id,dialogue_text_filepath,character_dict,unique_character_count,male_orientation_count,female_orientation_count,neutral_orientation_count,...,verb.possession_count,verb.possession_ratio,verb.social_count,verb.social_ratio,verb.stative_count,verb.stative_ratio,verb.weather_count,verb.weather_ratio,adj.ppl_count,adj.ppl_ratio
163,3622,JoJo - The Copy-Mad Killers,jojo_the_copy_mad_killers,3622_jojo_the_copy_mad_killers,/content/drive/MyDrive/suraj/thesis/dataset/em...,"{'3622': [{'name': 'jo jo', 'character_id': 19...",13,51.0,50.0,2.0,...,152,0.041908,218,0.060105,409,0.112765,0.0,0.0,0.0,0.0
164,3623,Jumbo Comics - Sheena - Tigerman terror,jumbo_comics_sheena_tigerman_terror,3623_jumbo_comics_sheena_tigerman_terror,/content/drive/MyDrive/suraj/thesis/dataset/em...,"{'3623': [{'name': 'rick', 'character_id': 21,...",15,118.0,17.0,0.0,...,56,0.02,97,0.034643,258,0.092143,0.0,0.0,0.0,0.0


In [29]:
max_characters_in_book = gender_supersense_features_df['unique_character_count'].max()
gender_supersense_features_df['plot_complexity'] = gender_supersense_features_df['unique_character_count']/max_characters_in_book
gender_supersense_features_df.tail(3)

Unnamed: 0,comic_no,Book Title,book_title_clean,unique_id,dialogue_text_filepath,character_dict,unique_character_count,male_orientation_count,female_orientation_count,neutral_orientation_count,...,verb.possession_ratio,verb.social_count,verb.social_ratio,verb.stative_count,verb.stative_ratio,verb.weather_count,verb.weather_ratio,adj.ppl_count,adj.ppl_ratio,plot_complexity
162,3621,JoJo - The Mountain of Skulls,jojo_the_mountain_of_skulls,3621_jojo_the_mountain_of_skulls,/content/drive/MyDrive/suraj/thesis/dataset/em...,"{'3621': [{'name': 'jo jo', 'character_id': 3,...",7,27.0,14.0,0.0,...,0.03937,79,0.05655,143,0.102362,0.0,0.0,0.0,0.0,0.14
163,3622,JoJo - The Copy-Mad Killers,jojo_the_copy_mad_killers,3622_jojo_the_copy_mad_killers,/content/drive/MyDrive/suraj/thesis/dataset/em...,"{'3622': [{'name': 'jo jo', 'character_id': 19...",13,51.0,50.0,2.0,...,0.041908,218,0.060105,409,0.112765,0.0,0.0,0.0,0.0,0.26
164,3623,Jumbo Comics - Sheena - Tigerman terror,jumbo_comics_sheena_tigerman_terror,3623_jumbo_comics_sheena_tigerman_terror,/content/drive/MyDrive/suraj/thesis/dataset/em...,"{'3623': [{'name': 'rick', 'character_id': 21,...",15,118.0,17.0,0.0,...,0.02,97,0.034643,258,0.092143,0.0,0.0,0.0,0.0,0.3


In [32]:
all_cols_lst = list(gender_supersense_features_df.columns)
gender_orientation_cols_lst = [col for col in all_cols_lst if col.endswith('orientation_ratio')]

supersense_cols_lst = [col for col in all_cols_lst if col.endswith('ratio') and not col.endswith('orientation_ratio')]

plot_complexity_cols_lst = ['plot_complexity']

In [33]:
combined_feature_cols = gender_orientation_cols_lst+supersense_cols_lst+plot_complexity_cols_lst

In [52]:
gender_orientation_features_np = gender_supersense_features_df[gender_orientation_cols_lst].to_numpy()[:164, :]
supersense_features_np = gender_supersense_features_df[supersense_cols_lst].to_numpy()[:164, :]
plot_complexity_features_np = gender_supersense_features_df[plot_complexity_cols_lst].to_numpy()[:164, :]
combined_features_np = gender_supersense_features_df[combined_feature_cols].to_numpy()[:164, :]

print(gender_orientation_features_np.shape, combined_features_np.shape, plot_complexity_features_np.shape)

(164, 3) (164, 49) (164, 1)


## search functions

In [55]:
if not os.path.isdir(os.path.join(featureset_path, 'search_results_20220620')):
    print('directory doesnt exist, making new directory')
    os.mkdir(os.path.join(featureset_path, 'search_results_20220620'))

In [56]:
def save_top_n_matching_info(idx_top_n_np, sim_score_top_n_np, comic_info_dict={}, print_n=10, query_book_id=1, feature_similarity_type='vgg_cosine'):
  sim_score_top_n_squeezed_np = np.squeeze(sim_score_top_n_np)
  list_of_records = []
  query_comic_no, query_book_title, query_genre = comic_info_dict[query_book_id]
  # df = pd.DataFrame({'rank': pd.Series(dtype='int'),'sim_score': pd.Series(dtype='float'), 'comic_no': pd.Series(dtype='int'),'book_title': pd.Series(dtype='str'), 'genre': pd.Series(dtype='str')})
  for i in range(1, print_n):
    # print(i, idx_top_n_np[i])
    book_idx = idx_top_n_np[i]
    sim_score_book = sim_score_top_n_squeezed_np[i]

    try:
      comic_no, book_title, genre = comic_info_dict[book_idx]
    except Exception as e:
      comic_no, book_title, genre = (-1, 'not exist', 'not exist')
    list_of_records.append({'rank': i, 'sim_score': sim_score_book, 'comic_no': comic_no, 'book_title': book_title, 'genre': genre, 
                            'query_comic_no':query_comic_no, 'query_book_title':query_book_title, 'query_genre':query_genre, 'feature_similarity_type': feature_similarity_type})

  df = pd.DataFrame.from_dict(list_of_records)
  return df


In [57]:
def run_all_similarity_on_features(all_feature_dict={}, query_book_id=2, top_n=21, comic_info_dict={}):
  pdList = []  # List of your dataframes
  np_book_idx = query_book_id
  selected_book_info = comic_info_dict[np_book_idx-1]
  # print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))
  for k, v in all_feature_dict.items():
    # print(k)

    if k.endswith('_cosine'):

      if k.startswith('text_visual_combo'):
          text_results_cosine_similarity = np_cosine_similarity(v['tf_idf'], v['tf_idf'][max(query_book_id-1, 0):query_book_id, :])
          image_results_cosine_similarity = np_cosine_similarity(v['kmeans_tsne'], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = text_results_cosine_similarity + image_results_cosine_similarity
          indices_cosine_similarity = np.argsort(np.squeeze(-mixed_results_similarity), axis=0)
          sorted_results_cosine_similarity = np.sort(-mixed_results_similarity,axis=0 )
      
      elif k.startswith('text_emotions_combo'):
          text_results_cosine_similarity = np_cosine_similarity(v['tf_idf'], v['tf_idf'][max(query_book_id-1, 0):query_book_id, :])
          emotion_results_cosine_similarity = np_cosine_similarity(v['emotions_count'], v['emotions_count'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = text_results_cosine_similarity + emotion_results_cosine_similarity
          indices_cosine_similarity = np.argsort(np.squeeze(-mixed_results_similarity), axis=0)
          sorted_results_cosine_similarity = np.sort(-mixed_results_similarity,axis=0 )
      
      elif k.startswith('emotion_visual_combo'):
          image_results_cosine_similarity = np_cosine_similarity(v['kmeans_tsne'], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id,:])
          emotion_results_cosine_similarity = np_cosine_similarity(v['emotions_count'], v['emotions_count'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = image_results_cosine_similarity + emotion_results_cosine_similarity
          indices_cosine_similarity = np.argsort(np.squeeze(-mixed_results_similarity), axis=0)
          sorted_results_cosine_similarity = np.sort(-mixed_results_similarity,axis=0 )

      elif k.startswith('text_visual_reranking'):
          text_results_cosine_similarity = np_cosine_similarity(v['tf_idf'], v['tf_idf'][max(query_book_id-1, 0):query_book_id, :])
          text_results_indices_cosine_similarity = np.argsort(np.squeeze(-text_results_cosine_similarity), axis=0)[:50]
          text_results_sorted_results_cosine_similarity = np.sort(-text_results_cosine_similarity,axis=0 )[:50]

          results_cosine_similarity = np_cosine_similarity(v['kmeans_tsne'][text_results_indices_cosine_similarity,:], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id, :])
          indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)
          sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )

      elif k.startswith('without_tf_idf'):
          emotion_results_cosine_similarity = np_cosine_similarity(v['emotions_count'], v['emotions_count'][max(query_book_id-1, 0):query_book_id,:])
          image_results_cosine_similarity = np_cosine_similarity(v['kmeans_tsne'], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id,:])
          combined_gsp_cosine_similarity = np_cosine_similarity(v['combined_gsp'], v['combined_gsp'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = emotion_results_cosine_similarity + image_results_cosine_similarity + combined_gsp_cosine_similarity
          indices_cosine_similarity = np.argsort(np.squeeze(mixed_results_similarity), axis=0)
          sorted_results_cosine_similarity = np.sort(mixed_results_similarity,axis=0 )

      else:    
          results_cosine_similarity = np_cosine_similarity(v, v[max(query_book_id-1, 0):query_book_id, :])
          indices_cosine_similarity = np.argsort(np.squeeze(-results_cosine_similarity), axis=0)
          sorted_results_cosine_similarity = np.sort(-results_cosine_similarity,axis=0 )
      
      cosine_df = save_top_n_matching_info(indices_cosine_similarity, sorted_results_cosine_similarity, comic_info_dict=comic_info_dict, print_n=top_n, query_book_id=query_book_id-1, feature_similarity_type=k)
      pdList.append(cosine_df)

    else:

      if k.startswith('text_visual_combo'):
          text_results_l2_similarity = np_l2_similarity(v['tf_idf'], v['tf_idf'][max(query_book_id-1, 0):query_book_id, :])
          image_results_l2_similarity = np_l2_similarity(v['kmeans_tsne'], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = text_results_l2_similarity + image_results_l2_similarity
          indices_l2_similarity = np.argsort(np.squeeze(mixed_results_similarity), axis=0)
          sorted_results_l2_similarity = np.sort(mixed_results_similarity,axis=0 )

      elif k.startswith('text_emotions_combo'):
          text_results_l2_similarity = np_l2_similarity(v['tf_idf'], v['tf_idf'][max(query_book_id-1, 0):query_book_id, :])
          emotion_results_l2_similarity = np_l2_similarity(v['emotions_count'], v['emotions_count'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = text_results_l2_similarity + emotion_results_l2_similarity
          indices_l2_similarity = np.argsort(np.squeeze(mixed_results_similarity), axis=0)
          sorted_results_l2_similarity = np.sort(mixed_results_similarity,axis=0 )
      
      elif k.startswith('emotion_visual_combo'):
          emotion_results_l2_similarity = np_l2_similarity(v['emotions_count'], v['emotions_count'][max(query_book_id-1, 0):query_book_id,:])
          image_results_l2_similarity = np_l2_similarity(v['kmeans_tsne'], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = emotion_results_l2_similarity + image_results_l2_similarity
          indices_l2_similarity = np.argsort(np.squeeze(mixed_results_similarity), axis=0)
          sorted_results_l2_similarity = np.sort(mixed_results_similarity,axis=0 )

      elif k.startswith('text_visual_reranking'):
          text_results_l2_similarity = np_l2_similarity(v['tf_idf'], v['tf_idf'][max(query_book_id-1, 0):query_book_id, :])
          text_results_indices_l2_similarity = np.argsort(np.squeeze(text_results_l2_similarity), axis=0)[:50]
          text_results_sorted_results_l2_similarity = np.sort(text_results_l2_similarity,axis=0 )[:50]

          results_l2_similarity = np_cosine_similarity(v['kmeans_tsne'][text_results_indices_l2_similarity,:], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id, :])
          indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)
          sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )

      elif k.startswith('without_tf_idf'):
          emotion_results_l2_similarity = np_l2_similarity(v['emotions_count'], v['emotions_count'][max(query_book_id-1, 0):query_book_id,:])
          image_results_l2_similarity = np_l2_similarity(v['kmeans_tsne'], v['kmeans_tsne'][max(query_book_id-1, 0):query_book_id,:])
          combined_gsp_l2_similarity = np_l2_similarity(v['combined_gsp'], v['combined_gsp'][max(query_book_id-1, 0):query_book_id,:])
          mixed_results_similarity = emotion_results_l2_similarity + image_results_l2_similarity + combined_gsp_l2_similarity
          indices_l2_similarity = np.argsort(np.squeeze(mixed_results_similarity), axis=0)
          sorted_results_l2_similarity = np.sort(mixed_results_similarity,axis=0 )

      else:
          results_l2_similarity = np_l2_similarity(v, v[max(query_book_id-1, 0):query_book_id,:])
          indices_l2_similarity = np.argsort(np.squeeze(results_l2_similarity), axis=0)
          sorted_results_l2_similarity = np.sort(results_l2_similarity,axis=0 )

      l2_df = save_top_n_matching_info(indices_l2_similarity, sorted_results_l2_similarity, comic_info_dict=comic_info_dict, print_n=top_n, query_book_id=query_book_id-1, feature_similarity_type=k)
      pdList.append(l2_df)

  concatenated_df = pd.concat(pdList)
  out_filename = str(selected_book_info[0])+'_'+selected_book_info[1]+'_concatenated_df.csv'
  output_filepath = os.path.join(featureset_path, 'search_results_20220620', out_filename)
  print(output_filepath)
  print()
  concatenated_df.to_csv(output_filepath, index=False)

  return concatenated_df

## initialize feature dict

In [50]:
# gender_orientation_features_np = gender_supersense_features_df[gender_orientation_cols_lst].to_numpy()[:164, :]
# supersense_features_np = gender_supersense_features_df[supersense_cols_lst].to_numpy()[:164, :]
# plot_complexity_features_np = gender_supersense_features_df[plot_complexity_cols_lst].to_numpy()[:164, :]
# combined_features_np = gender_supersense_features_df[combined_feature_cols].to_numpy()[:164, :]

In [60]:
all_feature_dict = {
                    'vgg_cosine': tsne_averaged_embedding_per_book_np, 'vgg_l2': tsne_averaged_embedding_per_book_np, 'kmeans_cosine':clustering_features_limited_172_np, 'kmeans_l2':clustering_features_limited_172_np,
                    'tsne_cosine': tsne_features_limited_172_np, 'tsne_l2': tsne_features_limited_172_np, 'kmeans_tsne_cosine': all_features_limited_172_np, 'kmeans_tsne_l2': all_features_limited_172_np, 
                    'tf_idf_cosine': tf_idf_features_172_np, 'tf_idf_l2': tf_idf_features_172_np, 'text_visual_combo_cosine': {'tf_idf': tf_idf_features_172_np, 'kmeans_tsne':all_features_limited_172_np},
                    'text_visual_combo_l2': {'tf_idf': tf_idf_features_172_np, 'kmeans_tsne':all_features_limited_172_np}, 'text_visual_reranking_cosine':{'tf_idf': tf_idf_features_172_np, 'kmeans_tsne':all_features_limited_172_np},
                    'text_visual_reranking_l2':{'tf_idf': tf_idf_features_172_np, 'kmeans_tsne':all_features_limited_172_np}, 'emotions_actual_cosine': emotions_actual_features_172_np, 'emotions_actual_l2': emotions_actual_features_172_np,
                    'emotions_count_cosine': emotions_count_features_172_np, 'emotions_count_l2': emotions_count_features_172_np, 
                    'text_emotions_combo_cosine': {'emotions_count': emotions_actual_features_172_np, 'tf_idf': tf_idf_features_172_np,},
                    'text_emotions_combo_l2': {'emotions_count': emotions_actual_features_172_np, 'tf_idf': tf_idf_features_172_np,}, 
                    'emotion_visual_combo_cosine':{'emotions_count': emotions_actual_features_172_np, 'kmeans_tsne':all_features_limited_172_np},
                    'emotion_visual_combo_l2':{'emotions_count': emotions_actual_features_172_np, 'kmeans_tsne':all_features_limited_172_np},
                    'gender_orientation_cosine': gender_orientation_features_np, 'gender_orientation_l2': gender_orientation_features_np,
                    'supersense_features_cosine': supersense_features_np, 'supersense_features_l2': supersense_features_np,
                    'plot_complexity_features_cosine': plot_complexity_features_np, 'plot_complexity_features_l2': plot_complexity_features_np,
                    'combined_gsp_cosine': combined_features_np, 'combined_gsp_l2': combined_features_np,
                    'without_tf_idf_cosine':  {'combined_gsp': combined_features_np, 'kmeans_tsne':all_features_limited_172_np, 'emotions_count': emotions_actual_features_172_np},
                    'without_tf_idf_l2':  {'combined_gsp': combined_features_np, 'kmeans_tsne':all_features_limited_172_np, 'emotions_count': emotions_actual_features_172_np},
                    }

comic_info_dict=idx_comicno_bookname_genre_mapping_dict

top_n=31

query_book_id_lst = [i for i in range(1, 166)]
lst_of_df = []

for id in query_book_id_lst:
  query_book_id = id
  np_book_idx = query_book_id
  selected_book_info = idx_comicno_bookname_genre_mapping_dict[query_book_id-1]
  print('comic number: {} | title: {} | genre: {} '.format(selected_book_info[0], selected_book_info[1], selected_book_info[2] ))
  concatenated_df = run_all_similarity_on_features(all_feature_dict=all_feature_dict, query_book_id=query_book_id, top_n=21, comic_info_dict=comic_info_dict)
  lst_of_df.append(concatenated_df)

comic number: 3451 | title: Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/features/search_results_20220620/3451_Blue Bolt_concatenated_df.csv

comic number: 3452 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/features/search_results_20220620/3452_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3453 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/features/search_results_20220620/3453_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3454 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/features/search_results_20220620/3454_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3455 | title: Dick Cole - Blue Bolt | genre: superhero|vigilante 
/content/drive/MyDrive/suraj/thesis/features/search_results_20220620/3455_Dick Cole - Blue Bolt_concatenated_df.csv

comic number: 3456 | title: Blue Bolt |

ValueError: ignored

In [61]:
print(concatenated_df.shape)
concatenated_df['feature_similarity_type'].value_counts()

(640, 9)


vgg_cosine                         20
vgg_l2                             20
without_tf_idf_cosine              20
combined_gsp_l2                    20
combined_gsp_cosine                20
plot_complexity_features_l2        20
plot_complexity_features_cosine    20
supersense_features_l2             20
supersense_features_cosine         20
gender_orientation_l2              20
gender_orientation_cosine          20
emotion_visual_combo_l2            20
emotion_visual_combo_cosine        20
text_emotions_combo_l2             20
text_emotions_combo_cosine         20
emotions_count_l2                  20
emotions_count_cosine              20
emotions_actual_l2                 20
emotions_actual_cosine             20
text_visual_reranking_l2           20
text_visual_reranking_cosine       20
text_visual_combo_l2               20
text_visual_combo_cosine           20
tf_idf_l2                          20
tf_idf_cosine                      20
kmeans_tsne_l2                     20
kmeans_tsne_