In [56]:
import pandas as pd
import numpy as np
from tqdm._tqdm_notebook import tqdm_notebook
from sentence_transformers import SentenceTransformer


# Enable the `progress_apply` methods used throughout this notebook, so that
# long-running pandas applies render a progress bar.
tqdm_notebook.pandas()

# Sentence-embedding model shared by the cells below (passed to `embedding_text`).
model = SentenceTransformer("all-distilroberta-v1")

def embedding_text(model, text):
    """Encode `text` with `model` and hand back the result.

    Thin wrapper around `model.encode`: whatever that call returns is passed
    through unchanged.

    Args:
        model: object exposing an `encode(text)` method.
        text: the input forwarded to `model.encode`.

    Returns:
        The value produced by `model.encode(text)`.
    """
    return model.encode(text)

In [5]:
# Ground-truth relevance data. Per the preview below, each row pairs a `query`
# (of a given `query_type`) with a `content_id`, plus click/position/score columns.
df = pd.read_excel('data/search_ground_truth.xlsx')
df.tail()

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,query,query_type,content_id,total_click,position,score
15704,yumis cells,search,9572,64,1,1
15705,zombie,search,1838,261,1,3
15706,zombie,search,8948,197,2,2
15707,zombie,search,2466,194,3,1
15708,zombie detective,search,1838,71,1,1


In [61]:
# Film catalogue stored as JSON Lines; `id` is cast to str so it can be compared
# with the `query` column of the ground truth when the two frames are merged.
film_df = pd.read_json('data/film_metadata.json', lines=True).astype({'id': str})

# One embedding per film, computed from its `search_text` field.
film_df['embedding'] = film_df['search_text'].progress_apply(
    lambda text: embedding_text(model, text)
)
film_df.head()

  0%|          | 0/5771 [00:00<?, ?it/s]

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
0,9372,Lasja mendapati suami dan ayahnya terlibat dal...,2024-01-27,393890,18 or more,ratu-adil-57125a.jpg,https://www.vidio.com/premier/9372,https://thumbor.prod.vidiocdn.com/xXSubtNx3jiO...,ratu adil,series,vidio original,"action, crime, drama","abdurrahman arif, andri mashadi, budi ros, dia...","ginanti rona, tommy dewo",indonesia,2024,trending,title: ratu adil\ndescription: Lasja mendapati...,"[-0.019320933, -0.054715045, -0.014341731, 0.0..."
1,9456,Kehidupan sempurna Amara hancur karena malam p...,2024-01-27,340919,18 or more,cinta-pertama-ayah-66e828.jpg,https://www.vidio.com/premier/9456,https://thumbor.prod.vidiocdn.com/ZONjpNuq_CcE...,cinta pertama ayah,series,vidio original,"crime, drama, family, mystery","aisha nurra datau, al ghazali, dwi sasono, ers...",hadrah daeng ratu,indonesia,2024,trending,title: cinta pertama ayah\ndescription: Kehidu...,"[-0.0075485576, -0.06536246, -0.015011005, -0...."
2,7617,Menceritakan kisah seorang gadis bernama Sakin...,2023-03-13,305402,13 or more,bidadari-surgamu-2bbb68.jpg,https://www.vidio.com/premier/7617,https://thumbor.prod.vidiocdn.com/JxL1jUZmTFdc...,bidadari surgamu,series,tv sinetron,"drama, family, religi, romance","josephine firmstone, michelle joan, rizky naza...",anurag vaishnav,indonesia,2023,trending,title: bidadari surgamu\ndescription: Mencerit...,"[0.010699028, -0.02576401, -0.0075895544, 0.03..."
3,7576,"Naura (Basmalah), Rahsya (Raden Rakha), Adara ...",2023-02-23,263464,13 or more,magic-5-def33f.jpg,https://www.vidio.com/premier/7576,https://thumbor.prod.vidiocdn.com/kha0tcadN-wD...,magic 5,series,tv sinetron,"drama, fantasy, friendship","afan da5, basmalah, eby da 5, raden rakha, sri...","a. septian, bobby moeryawan, sondang pratama, ...",indonesia,2023,trending,"title: magic 5\ndescription: Naura (Basmalah),...","[0.0030388115, -0.021281231, 0.01229152, 0.046..."
4,9535,Alya dan Alyssa adalah anak kembar yang terpis...,2024-01-22,213724,13 or more,tertawan-hati-592778.jpg,https://www.vidio.com/premier/9535,https://thumbor.prod.vidiocdn.com/sXZL0cyvwRdI...,tertawan hati,series,tv sinetron,"drama, romance","afifah ifah'nda, jonas rivanno, naysilla mirda...",sanjeev ram kishan,indonesia,2024,trending,title: tertawan hati\ndescription: Alya dan Al...,"[0.017310545, -0.040223982, -0.014482203, -0.0..."


In [68]:
# Attach film text/embedding to ground-truth rows whose `query` is a film id
# (left join, so rows without a matching film are kept with NaN). For rows of
# type 'search' the text to embed is the query string itself.
joined_df = (
    df.merge(
        film_df[['id', 'search_text', 'embedding']],
        left_on='query',
        right_on='id',
        how='left',
    )
    .drop(columns=['id'])  # join key from the film side is no longer needed
    .assign(
        search_text=lambda merged: np.where(
            merged['query_type'] == 'search',
            merged['query'],
            merged['search_text'],
        )
    )
)

In [69]:
# Small sample (first five rows) of queries that are not film ids, used to try
# the pipeline before running it on the full frame.
# `.copy()` makes the sample an independent DataFrame: later cells add columns to
# `trial_df`, and doing that on a slice of `joined_df` raises pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
trial_df = joined_df[joined_df['query_type'] != 'film_id'].head().copy()

In [70]:
# Embed the text of each sampled row and show the result.
trial_df['embedding'] = trial_df['search_text'].progress_apply(
    lambda text: embedding_text(model, text)
)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
930,172 days,search,7337,119,1,1,172 days,"[-0.012481329, -0.0025321585, -0.005856002, 0...."
1087,18 again,search,1855,355,1,1,18 again,"[0.016855573, 9.0014755e-06, -0.04532912, -0.0..."
1416,1988,search,69,94,1,1,1988,"[-0.011472764, -0.016984362, -0.032218765, -0...."
1423,2012,search,7752,210,1,1,2012,"[-0.016047977, -0.023321731, -0.05679598, 0.05..."
2974,3 semprul,search,710,158,1,1,3 semprul,"[-0.010308068, -0.0086142365, -0.04801666, 0.0..."


In [71]:
# slow operation
# The ground truth repeats the same query text on several rows (one row per
# relevant content_id), so each distinct text is encoded once and the vector is
# reused. Rows with the same text therefore share one array object.
_query_embedding_cache = {}


def _embedding_for_row(row):
    """Return the embedding to store for one `joined_df` row.

    Rows whose `query_type` is 'search' get the encoding of their
    `search_text`; every other row keeps the embedding it already carries.
    """
    if row['query_type'] != 'search':
        return row['embedding']
    text = row['search_text']
    if text not in _query_embedding_cache:
        _query_embedding_cache[text] = embedding_text(model, text)
    return _query_embedding_cache[text]


joined_df['embedding'] = joined_df.progress_apply(_embedding_for_row, axis=1)

  0%|          | 0/15709 [00:00<?, ?it/s]

In [72]:
# Discard rows that ended up without text or without an embedding
# (presumably film-id queries with no match in `film_df` — verify against the data).
# In the recorded run this reduces the frame from 15709 to 15623 rows.
joined_df = joined_df.dropna(subset=['search_text', 'embedding'])

In [73]:
from annoy import AnnoyIndex

# The index dimensionality has to equal the embedding size. It is read from the
# model instead of being hard-coded, so the index keeps matching if the model
# is changed.
annoy_index = AnnoyIndex(model.get_sentence_embedding_dimension(), 'dot')

# Annoy item ids are integers, so the string film id is converted back to int.
# Only the two columns that are needed are iterated (no per-row Series objects).
for film_id, film_vector in zip(film_df['id'], film_df['embedding']):
    annoy_index.add_item(int(film_id), film_vector)

annoy_index.build(30)  # build the index with 30 trees

def retrieve_by_embedding(embedding, top_k=20):
    """Return the ids of the nearest indexed items for `embedding`.

    Args:
        embedding: query vector to look up in the module-level `annoy_index`.
            A missing value (None, NaN or any other object without a length)
            and an empty vector both yield no results.
        top_k: number of neighbours to request from the index (default 20).

    Returns:
        The list returned by `annoy_index.get_nns_by_vector`, or `[]` when
        there is no usable embedding.
    """
    # Scalars such as None/NaN have no length; treat them like an empty vector
    # instead of letting len() raise TypeError.
    if embedding is None or not hasattr(embedding, '__len__') or len(embedding) == 0:
        return []
    # search_k=-1 leaves the search effort at Annoy's default.
    return annoy_index.get_nns_by_vector(embedding, top_k, search_k=-1)

In [74]:
# Retrieve the nearest films for each sampled query embedding and show the result.
trial_df['search_result'] = trial_df['embedding'].progress_apply(retrieve_by_embedding)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
930,172 days,search,7337,119,1,1,172 days,"[-0.012481329, -0.0025321585, -0.005856002, 0....","[4703, 9158, 4771, 5765, 8937, 5382, 2145, 450..."
1087,18 again,search,1855,355,1,1,18 again,"[0.016855573, 9.0014755e-06, -0.04532912, -0.0...","[4563, 5016, 5178, 5410, 5126, 5189, 4724, 471..."
1416,1988,search,69,94,1,1,1988,"[-0.011472764, -0.016984362, -0.032218765, -0....","[7881, 9223, 8695, 6053, 6279, 6052, 7992, 821..."
1423,2012,search,7752,210,1,1,2012,"[-0.016047977, -0.023321731, -0.05679598, 0.05...","[9151, 5476, 7154, 6133, 9357, 5178, 5940, 613..."
2974,3 semprul,search,710,158,1,1,3 semprul,"[-0.010308068, -0.0086142365, -0.04801666, 0.0...","[2085, 3679, 5555, 8980, 5590, 2575, 6613, 236..."


In [75]:
# Run the nearest-neighbour lookup for every ground-truth row.
joined_df['search_result'] = joined_df['embedding'].progress_apply(retrieve_by_embedding)

  0%|          | 0/15623 [00:00<?, ?it/s]

In [76]:
# Preview the ground-truth rows together with their embedding and retrieved ids.
joined_df.head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
0,1003,film_id,1027,93,1,3,title: hot young bloods\ndescription: Berlatar...,"[-0.010659831, -0.07717221, -0.04447361, -0.03...","[1003, 5654, 1720, 7757, 5182, 9298, 1926, 728..."
1,1003,film_id,1559,54,2,2,title: hot young bloods\ndescription: Berlatar...,"[-0.010659831, -0.07717221, -0.04447361, -0.03...","[1003, 5654, 1720, 7757, 5182, 9298, 1926, 728..."
2,1003,film_id,831,46,3,1,title: hot young bloods\ndescription: Berlatar...,"[-0.010659831, -0.07717221, -0.04447361, -0.03...","[1003, 5654, 1720, 7757, 5182, 9298, 1926, 728..."
3,1011,film_id,999,5,1,3,title: the fatal encounter\ndescription: Usaha...,"[-0.0027073447, -0.061096076, -0.042264882, -0...","[1011, 8821, 4675, 4490, 7335, 5179, 7332, 835..."
4,1011,film_id,2329,3,2,2,title: the fatal encounter\ndescription: Usaha...,"[-0.0027073447, -0.061096076, -0.042264882, -0...","[1011, 8821, 4675, 4490, 7335, 5179, 7332, 835..."


In [77]:
def is_true_positive(row):
    """Tell whether the expected content was retrieved for this row.

    Args:
        row: mapping-like object with a `content_id` value and a
            `search_result` container.

    Returns:
        1 when `content_id` is contained in `search_result`, otherwise 0.
    """
    hit = row['content_id'] in row['search_result']
    return int(hit)

In [85]:
# Flag each ground-truth row: 1 if its content_id is among the retrieved ids, else 0.
joined_df['is_tp'] = joined_df.progress_apply(is_true_positive, axis=1)

  0%|          | 0/15623 [00:00<?, ?it/s]

In [86]:
# Because `is_tp` is 0/1, its mean is the share of ground-truth rows whose
# content_id appears in the retrieved neighbours (the hit rate).
joined_df[['is_tp']].describe()

Unnamed: 0,is_tp
count,15623.0
mean,0.142866
std,0.349948
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [84]:
# Spot check: inspect all ground-truth rows for a single example query.
joined_df[joined_df['query'] == 'spiderman']

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result,is_tp
15537,spiderman,search,3426,240,1,3,spiderman,"[0.0042682276, 0.009095928, -0.00083625066, 0....","[4175, 9351, 7747, 9354, 9352, 7753, 3426, 545...",1
15538,spiderman,search,7747,152,2,2,spiderman,"[0.0042682276, 0.009095928, -0.00083625066, 0....","[4175, 9351, 7747, 9354, 9352, 7753, 3426, 545...",1
15539,spiderman,search,9354,134,3,1,spiderman,"[0.0042682276, 0.009095928, -0.00083625066, 0....","[4175, 9351, 7747, 9354, 9352, 7753, 3426, 545...",1


In [87]:
# Look up the catalogue entry for one content id (compared as a string, since
# `film_df['id']` was cast to str when the catalogue was loaded).
film_df[film_df['id'] == '3426']

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
287,3426,Peter Parker menyeimbangkan hidupnya sebagai s...,2017-07-07,1739,13 or more,spider-man-homecoming-e4b5ed.jpg,https://www.vidio.com/premier/3426,https://thumbor.prod.vidiocdn.com/TBeIDDTwf35r...,spider-man: homecoming,movies,western,"action, adventure, comedy, fantasy, sci-fi","marisa tomei, michael keaton, robert downey jr...",jon watts,western,2017,average,title: spider-man: homecoming\ndescription: Pe...,"[-0.012979601, -0.012500955, -0.003738276, 0.0..."


In [None]:
# Write the benchmark rows as JSON Lines (one record per line), leaving out the
# evaluation-only columns `search_result` and `is_tp`.
# NOTE(review): this cell has no execution count in the saved notebook — confirm
# that the output file was actually written.
joined_df.drop(columns=['search_result', 'is_tp']).to_json('data/benchmark_sbert.json', orient='records', lines=True)