In [1]:
import os
import pandas as pd
import numpy as np
from tqdm._tqdm_notebook import tqdm_notebook
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))


tqdm_notebook.pandas()

model = "text-embedding-3-small"

def embedding_text(model, text):
    vector = client.embeddings.create(input = [text], model=model).data[0].embedding
    return vector

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [4]:
df = pd.read_excel('data/search_ground_truth.xlsx')
df.tail()

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,query,query_type,content_id,total_click,position,score
15704,yumis cells,search,9572,64,1,1
15705,zombie,search,1838,261,1,3
15706,zombie,search,8948,197,2,2
15707,zombie,search,2466,194,3,1
15708,zombie detective,search,1838,71,1,1


In [5]:
film_df = pd.read_json('data/film_metadata.json', lines=True)
film_df['id'] = film_df['id'].astype(str)
film_df['embedding'] = film_df.progress_apply(lambda x: embedding_text(model, x['search_text']), axis=1)
film_df.head()

  0%|          | 0/5771 [00:00<?, ?it/s]

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
0,9372,Lasja mendapati suami dan ayahnya terlibat dal...,2024-01-27,393890,18 or more,ratu-adil-57125a.jpg,https://www.vidio.com/premier/9372,https://thumbor.prod.vidiocdn.com/xXSubtNx3jiO...,ratu adil,series,vidio original,"action, crime, drama","abdurrahman arif, andri mashadi, budi ros, dia...","ginanti rona, tommy dewo",indonesia,2024,trending,title: ratu adil\ndescription: Lasja mendapati...,"[-0.028334420174360275, 0.045475538820028305, ..."
1,9456,Kehidupan sempurna Amara hancur karena malam p...,2024-01-27,340919,18 or more,cinta-pertama-ayah-66e828.jpg,https://www.vidio.com/premier/9456,https://thumbor.prod.vidiocdn.com/ZONjpNuq_CcE...,cinta pertama ayah,series,vidio original,"crime, drama, family, mystery","aisha nurra datau, al ghazali, dwi sasono, ers...",hadrah daeng ratu,indonesia,2024,trending,title: cinta pertama ayah\ndescription: Kehidu...,"[0.030531717464327812, 0.05053670331835747, -0..."
2,7617,Menceritakan kisah seorang gadis bernama Sakin...,2023-03-13,305402,13 or more,bidadari-surgamu-2bbb68.jpg,https://www.vidio.com/premier/7617,https://thumbor.prod.vidiocdn.com/JxL1jUZmTFdc...,bidadari surgamu,series,tv sinetron,"drama, family, religi, romance","josephine firmstone, michelle joan, rizky naza...",anurag vaishnav,indonesia,2023,trending,title: bidadari surgamu\ndescription: Mencerit...,"[-0.01604270003736019, 0.02698090672492981, -0..."
3,7576,"Naura (Basmalah), Rahsya (Raden Rakha), Adara ...",2023-02-23,263464,13 or more,magic-5-def33f.jpg,https://www.vidio.com/premier/7576,https://thumbor.prod.vidiocdn.com/kha0tcadN-wD...,magic 5,series,tv sinetron,"drama, fantasy, friendship","afan da5, basmalah, eby da 5, raden rakha, sri...","a. septian, bobby moeryawan, sondang pratama, ...",indonesia,2023,trending,"title: magic 5\ndescription: Naura (Basmalah),...","[-0.017627734690904617, 0.049079325050115585, ..."
4,9535,Alya dan Alyssa adalah anak kembar yang terpis...,2024-01-22,213724,13 or more,tertawan-hati-592778.jpg,https://www.vidio.com/premier/9535,https://thumbor.prod.vidiocdn.com/sXZL0cyvwRdI...,tertawan hati,series,tv sinetron,"drama, romance","afifah ifah'nda, jonas rivanno, naysilla mirda...",sanjeev ram kishan,indonesia,2024,trending,title: tertawan hati\ndescription: Alya dan Al...,"[-0.023380404338240623, -0.003916167188435793,..."


In [6]:
joined_df = df.merge(film_df[['id', 'search_text', 'embedding']], left_on='query', right_on='id', how='left')
joined_df.drop(columns=['id'], inplace=True)
joined_df['search_text'] = np.where(joined_df['query_type'] == 'search', joined_df['query'], joined_df['search_text'])

In [7]:
trial_df = joined_df[joined_df['query_type'] != 'film_id'].head()

In [8]:
trial_df['embedding'] = trial_df.progress_apply(lambda x: embedding_text(model, x['search_text']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
930,172 days,search,7337,119,1,1,172 days,"[-0.02434447407722473, 0.03994518518447876, 0...."
1087,18 again,search,1855,355,1,1,18 again,"[-0.001787033979780972, 0.017965495586395264, ..."
1416,1988,search,69,94,1,1,1988,"[-0.01428853441029787, 0.020473599433898926, 0..."
1423,2012,search,7752,210,1,1,2012,"[0.030848190188407898, -0.011176170781254768, ..."
2974,3 semprul,search,710,158,1,1,3 semprul,"[-0.017543114721775055, -0.0002211366081610322..."


In [9]:
# slow operation
joined_df['embedding'] = joined_df.progress_apply(lambda x: embedding_text(model, x['search_text']) if x['query_type'] == 'search' else x['embedding'], axis=1)

  0%|          | 0/15709 [00:00<?, ?it/s]

In [10]:
joined_df = joined_df.dropna(subset=['search_text', 'embedding'])

In [12]:
from annoy import AnnoyIndex
annoy_index = AnnoyIndex(1536, 'dot')
for index, row in film_df.iterrows():
    annoy_index.add_item(int(row['id']), row['embedding'])
annoy_index.build(30)

def retrieve_by_embedding(embedding):
    if len(embedding) > 0:
        return annoy_index.get_nns_by_vector(embedding, 20, search_k=-1)
    else:
        return []

In [13]:
trial_df['search_result'] = trial_df.progress_apply(lambda x: retrieve_by_embedding(x['embedding']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
930,172 days,search,7337,119,1,1,172 days,"[-0.02434447407722473, 0.03994518518447876, 0....","[7396, 650, 4955, 4448, 6001, 2415, 5020, 6091..."
1087,18 again,search,1855,355,1,1,18 again,"[-0.001787033979780972, 0.017965495586395264, ...","[1517, 1855, 5189, 6542, 7798, 5622, 4817, 502..."
1416,1988,search,69,94,1,1,1988,"[-0.01428853441029787, 0.020473599433898926, 0...","[69, 4468, 4469, 6054, 6279, 1607, 5910, 6274,..."
1423,2012,search,7752,210,1,1,2012,"[0.030848190188407898, -0.011176170781254768, ...","[7752, 7780, 7775, 4724, 9563, 9307, 3430, 546..."
2974,3 semprul,search,710,158,1,1,3 semprul,"[-0.017543114721775055, -0.0002211366081610322...","[1108, 9354, 4829, 3429, 5456, 5579, 5948, 876..."


In [14]:
joined_df['search_result'] = joined_df.progress_apply(lambda x: retrieve_by_embedding(x['embedding']), axis=1)

  0%|          | 0/15623 [00:00<?, ?it/s]

In [15]:
joined_df.head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
0,1003,film_id,1027,93,1,3,title: hot young bloods\ndescription: Berlatar...,"[-0.021782688796520233, 0.04166099801659584, -...","[1003, 5654, 9298, 1720, 1027, 8291, 1855, 165..."
1,1003,film_id,1559,54,2,2,title: hot young bloods\ndescription: Berlatar...,"[-0.021782688796520233, 0.04166099801659584, -...","[1003, 5654, 9298, 1720, 1027, 8291, 1855, 165..."
2,1003,film_id,831,46,3,1,title: hot young bloods\ndescription: Berlatar...,"[-0.021782688796520233, 0.04166099801659584, -...","[1003, 5654, 9298, 1720, 1027, 8291, 1855, 165..."
3,1011,film_id,999,5,1,3,title: the fatal encounter\ndescription: Usaha...,"[-0.05172013118863106, 0.02686239406466484, 0....","[1011, 1514, 997, 1314, 7335, 3919, 3982, 1885..."
4,1011,film_id,2329,3,2,2,title: the fatal encounter\ndescription: Usaha...,"[-0.05172013118863106, 0.02686239406466484, 0....","[1011, 1514, 997, 1314, 7335, 3919, 3982, 1885..."


In [16]:
def is_true_positive(row):
    return 1 if row['content_id'] in row['search_result'] else 0

In [17]:
joined_df['is_tp'] = joined_df.progress_apply(is_true_positive, axis=1)

  0%|          | 0/15623 [00:00<?, ?it/s]

In [18]:
joined_df[['is_tp']].describe()

Unnamed: 0,is_tp
count,15623.0
mean,0.211163
std,0.408147
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [19]:
joined_df[joined_df['query'] == 'spiderman']

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result,is_tp
15537,spiderman,search,3426,240,1,3,spiderman,"[-0.025672562420368195, -0.04033959284424782, ...","[9351, 3426, 7747, 7753, 9354, 4175, 9352, 545...",1
15538,spiderman,search,7747,152,2,2,spiderman,"[-0.025672562420368195, -0.04033959284424782, ...","[9351, 3426, 7747, 7753, 9354, 4175, 9352, 545...",1
15539,spiderman,search,9354,134,3,1,spiderman,"[-0.02567915990948677, -0.04028681293129921, -...","[9351, 3426, 7747, 7753, 9354, 4175, 9352, 545...",1


In [21]:
film_df[film_df['id'] == '3426']

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
287,3426,Peter Parker menyeimbangkan hidupnya sebagai s...,2017-07-07,1739,13 or more,spider-man-homecoming-e4b5ed.jpg,https://www.vidio.com/premier/3426,https://thumbor.prod.vidiocdn.com/TBeIDDTwf35r...,spider-man: homecoming,movies,western,"action, adventure, comedy, fantasy, sci-fi","marisa tomei, michael keaton, robert downey jr...",jon watts,western,2017,average,title: spider-man: homecoming\ndescription: Pe...,"[-0.04142177104949951, 0.006981788668781519, -..."


In [22]:
joined_df.drop(columns=['search_result', 'is_tp']).to_json('data/benchmark_openai.json', orient='records', lines=True)