In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

from google.cloud import aiplatform
from tqdm._tqdm_notebook import tqdm_notebook

now = datetime.now()
formatted_date_time = now.strftime('%Y%m%d_%H%M')

aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
model = aiplatform.Endpoint("7738653107357220864")
tqdm_notebook.pandas()

def embedding_text(model, text):
    prediction = model.predict(instances=[{
        "content": text,
        "task_type": "DEFAULT",
        "title": ""
    }])
    for embedding in prediction.predictions:
        vector = embedding
    return vector

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [78]:
def preprocess_film_metadata(df):
    df.fillna('', inplace=True)
    df['id'] = df['id'].astype(str)
    df['title'] = df['film_title'].str.lower()
    df['group_l1'] = df['group_name_l1'].str.lower()
    df['group_l2'] = df['group_name_l2'].str.lower()
    df['genres'] = df['film_genres'].apply(lambda x: ', '.join(x.split(',')))
    df['actors'] = df['film_actors'].apply(lambda x: ', '.join(x.split(',')))
    df['directors'] = df['film_directors'].apply(lambda x: ', '.join(x.split(',')))
    df.loc[df['actors'] == "various", 'actors'] = ""
    df['country'] = df['country_group'].str.lower()
    df['total_watchers'] = df['total_watchers'].astype('int')
    df['release_date'] = df['release_date'].str.replace(" 00:00:00", "")
    df['release_year'] = df['release_date'].apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").year if x != '' else '')

    def popularity(total_watchers):
        if total_watchers >= 50000:
            return "trending"
        elif total_watchers < 50000 and total_watchers >= 500:
            return "average"
        else:
            return "below average"

    df['popularity'] = df['total_watchers'].apply(lambda x: popularity(x))
    search_text_columns = ['title', 'description', 'group_l1', 'group_l2', 'film_main_genre', 'genres', 'directors', 'actors', 'country', 'release_year', 'age_rating', 'popularity']
    df['search_text'] = df[search_text_columns].apply(lambda row: search_text(*row), axis=1)
    df = df.loc[:,~df.columns.duplicated()]
    df.drop(columns=['film_title', 'group_name_l1', 'group_name_l2', 'film_main_genre', 'film_genres', 'film_directors', 'film_actors', 'country_group'], inplace=True)
    return df


def search_text(title, description, group_l1, group_l2, main_genre, genres, directors, actors, country, release_year, age_rating, popularity):
    
    return f"""title: {title}
actors: {actors}
group: {group_l1} > {group_l2}
genres: {main_genre}, {genres}
directors: {directors}
description: {description}
country: {country}
release year: {release_year}
age rating: {age_rating}
popularity: {popularity}"""

def search_by_actors(title, actors):
    actor_list = "\n- ".join(actors.split(', '))
    return f"""actors: 
- {actor_list}"""

In [56]:
df = pd.read_excel('data/search_ground_truth.xlsx')
df.tail()

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,query,query_type,content_id,total_click,position,score
13831,zombie,search,8948,209,3,3
13832,zombie,search,6116,117,4,2
13833,zombie,search,7323,116,5,1
13834,zombie detective,search,1838,79,1,1
13835,zorro,search,7765,13,1,1


In [57]:
film_df = pd.read_excel('data/film_metadata.xlsx', converters={'release_date':str})
film_df = preprocess_film_metadata(film_df)

  warn("Workbook contains no default style, apply openpyxl's default")


In [85]:
film_df['embedding'] = film_df.progress_apply(lambda x: embedding_text(model, x['search_text']), axis=1)
film_df.head()

  0%|          | 0/5796 [00:00<?, ?it/s]

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
0,7617,Menceritakan kisah seorang gadis bernama Sakin...,2023-03-13,294965,13 or more,bidadari-surgamu-2bbb68.jpg,https://www.vidio.com/premier/7617,https://thumbor.prod.vidiocdn.com/JxL1jUZmTFdc...,bidadari surgamu,series,tv sinetron,"drama, family, religi, romance","josephine firmstone, michelle joan, rizky naza...",anurag vaishnav,indonesia,2023,trending,title: bidadari surgamu\nactors: josephine fir...,"[0.00801349897, -0.0132515263, 0.0241768584, 0..."
1,7576,"Naura (Basmalah), Rahsya (Raden Rakha), Adara ...",2023-02-23,269774,13 or more,magic-5-def33f.jpg,https://www.vidio.com/premier/7576,https://thumbor.prod.vidiocdn.com/kha0tcadN-wD...,magic 5,series,tv sinetron,"drama, fantasy, friendship","afan da5, basmalah, eby da 5, raden rakha, sri...","a. septian, bobby moeryawan, sondang pratama, ...",indonesia,2023,trending,"title: magic 5\nactors: afan da5, basmalah, eb...","[0.0444368534, 0.0244383197, 0.0395935625, 0.0..."
2,9456,Kehidupan sempurna Amara hancur karena malam p...,2024-01-27,266437,18 or more,cinta-pertama-ayah-8aa6ca.jpg,https://www.vidio.com/premier/9456,https://thumbor.prod.vidiocdn.com/Q2S24BPh34Ds...,cinta pertama ayah,series,vidio original,"crime, drama, family, mystery","aisha nurra datau, al ghazali, dwi sasono, ers...",hadrah daeng ratu,indonesia,2024,trending,title: cinta pertama ayah\nactors: aisha nurra...,"[0.00752738677, -0.0229997095, -0.0045902105, ..."
3,9372,Lasja mendapati suami dan ayahnya terlibat dal...,2024-02-29,246905,18 or more,ratu-adil-45593f.jpg,https://www.vidio.com/premier/9372,https://thumbor.prod.vidiocdn.com/eVGqYp07_qb3...,ratu adil,series,vidio original,"action, crime, drama","abdurrahman arif, andri mashadi, budi ros, dia...","ginanti rona, tommy dewo",indonesia,2024,trending,"title: ratu adil\nactors: abdurrahman arif, an...","[-0.0119298976, -0.014275223, -0.000455235713,..."
4,8928,Di hari perayaan anniversary pernikahan Shafir...,2023-10-04,233564,13 or more,di-antara-dua-cinta-bddc55.jpg,https://www.vidio.com/premier/8928,https://thumbor.prod.vidiocdn.com/rnBg59O6bVIy...,di antara dua cinta,series,tv sinetron,"drama, romance","anggika bolsterli, anthony xie, asha assuncao,...",deni pusung,indonesia,2023,trending,title: di antara dua cinta\nactors: anggika bo...,"[-0.00349965668, -0.0041485359, 0.00802058, 0...."


In [86]:
joined_df = df.merge(film_df[['id', 'search_text', 'embedding']], left_on='query', right_on='id', how='left')
joined_df.drop(columns=['id'], inplace=True)
joined_df['search_text'] = np.where(joined_df['query_type'] == 'search', joined_df['query'], joined_df['search_text'])

In [87]:
joined_df[joined_df['query_type'] != 'film_id'].size

33224

In [88]:
trial_df = joined_df[joined_df['query_type'] != 'film_id'].head()

In [89]:
trial_df['embedding'] = trial_df.progress_apply(lambda x: embedding_text(model, x['search_text']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
146,12 strong,search,4724,19,1,1,12 strong,"[0.00380811165, -0.0115063414, 0.0907214731, 0..."
666,172 day,search,7337,24,1,1,172 day,"[-0.0157180298, -0.0471819341, 0.0336405672, 0..."
667,172 days,search,7337,120,1,2,172 days,"[-0.041564744, -0.0335650146, 0.0266157854, 0...."
668,172 days,search,2525,12,2,1,172 days,"[-0.041564744, -0.0335650146, 0.0266157854, 0...."
669,172 days film,search,7337,13,1,1,172 days film,"[-0.0337656215, -0.0321737304, 0.0246201046, 0..."


In [90]:
joined_df[joined_df['query_type'] != 'film_id'].head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
146,12 strong,search,4724,19,1,1,12 strong,
666,172 day,search,7337,24,1,1,172 day,
667,172 days,search,7337,120,1,2,172 days,
668,172 days,search,2525,12,2,1,172 days,
669,172 days film,search,7337,13,1,1,172 days film,


In [91]:
# slow operation
joined_df['embedding'] = joined_df.progress_apply(lambda x: embedding_text(model, x['search_text']) if x['query_type'] == 'search' else x['embedding'], axis=1)

  0%|          | 0/13836 [00:00<?, ?it/s]

In [92]:
joined_df = joined_df.dropna(subset=['search_text', 'embedding'])

In [93]:
joined_df[joined_df['query_type'] != 'film_id'].head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
146,12 strong,search,4724,19,1,1,12 strong,"[0.00380811165, -0.0115063414, 0.0907214731, 0..."
666,172 day,search,7337,24,1,1,172 day,"[-0.0157180298, -0.0471819341, 0.0336405672, 0..."
667,172 days,search,7337,120,1,2,172 days,"[-0.041564744, -0.0335650146, 0.0266157854, 0...."
668,172 days,search,2525,12,2,1,172 days,"[-0.041564744, -0.0335650146, 0.0266157854, 0...."
669,172 days film,search,7337,13,1,1,172 days film,"[-0.0337656215, -0.0321737304, 0.0246201046, 0..."


In [94]:
joined_df[joined_df['query_type'] == 'film_id'].head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
0,1003,film_id,1027,143,1,5,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0..."
1,1003,film_id,831,90,2,4,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0..."
2,1003,film_id,1559,81,3,3,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0..."
3,1003,film_id,2415,52,4,2,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0..."
4,1003,film_id,1299,46,5,1,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0..."


In [95]:
from annoy import AnnoyIndex
annoy_index = AnnoyIndex(768, 'dot')
for index, row in film_df.iterrows():
    annoy_index.add_item(int(row['id']), row['embedding'])
annoy_index.build(30)

def retrieve_by_embedding(embedding):
    if len(embedding) > 0:
        return annoy_index.get_nns_by_vector(embedding, 20, search_k=-1)
    else:
        return []

In [96]:
trial_df['search_result'] = trial_df.progress_apply(lambda x: retrieve_by_embedding(x['embedding']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
146,12 strong,search,4724,19,1,1,12 strong,"[0.00380811165, -0.0115063414, 0.0907214731, 0...","[4724, 9158, 7079, 3849, 6985, 9263, 997, 5054..."
666,172 day,search,7337,24,1,1,172 day,"[-0.0157180298, -0.0471819341, 0.0336405672, 0...","[4736, 7881, 9150, 5928, 9527, 4617, 5107, 936..."
667,172 days,search,7337,120,1,2,172 days,"[-0.041564744, -0.0335650146, 0.0266157854, 0....","[4736, 5928, 2415, 9150, 7396, 3737, 1961, 234..."
668,172 days,search,2525,12,2,1,172 days,"[-0.041564744, -0.0335650146, 0.0266157854, 0....","[4736, 5928, 2415, 9150, 7396, 3737, 1961, 234..."
669,172 days film,search,7337,13,1,1,172 days film,"[-0.0337656215, -0.0321737304, 0.0246201046, 0...","[4736, 4147, 5928, 1961, 5107, 8581, 9687, 915..."


In [97]:
joined_df['search_result'] = joined_df.progress_apply(lambda x: retrieve_by_embedding(x['embedding']), axis=1)

  0%|          | 0/13769 [00:00<?, ?it/s]

In [98]:
joined_df.head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
0,1003,film_id,1027,143,1,5,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554..."
1,1003,film_id,831,90,2,4,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554..."
2,1003,film_id,1559,81,3,3,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554..."
3,1003,film_id,2415,52,4,2,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554..."
4,1003,film_id,1299,46,5,1,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554..."


In [99]:
def is_true_positive(row):
    return 1 if row['content_id'] in row['search_result'] else 0

In [100]:
joined_df['is_tp'] = joined_df.progress_apply(is_true_positive, axis=1)

  0%|          | 0/13769 [00:00<?, ?it/s]

In [101]:
joined_df[['is_tp']].describe()

Unnamed: 0,is_tp
count,13769.0
mean,0.281575
std,0.449783
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [102]:
joined_df[joined_df['is_tp'] == 0].head(10)

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result,is_tp
3,1003,film_id,2415,52,4,2,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554...",0
4,1003,film_id,1299,46,5,1,"title: hot young bloods\nactors: kim hee-won, ...","[0.00341663091, 0.00968899857, 0.0142909465, 0...","[1003, 5654, 205, 1517, 7334, 1027, 1314, 1554...",0
17,1027,film_id,3584,425,2,4,title: kim ji-young: born 1982\nactors: gong y...,"[0.0179736074, -0.0175941829, 0.0208254922, -0...","[1027, 1855, 1517, 831, 7415, 39, 1233, 1591, ...",0
18,1027,film_id,1719,233,3,3,title: kim ji-young: born 1982\nactors: gong y...,"[0.0179736074, -0.0175941829, 0.0208254922, -0...","[1027, 1855, 1517, 831, 7415, 39, 1233, 1591, ...",0
20,1027,film_id,3512,199,5,1,title: kim ji-young: born 1982\nactors: gong y...,"[0.0179736074, -0.0175941829, 0.0208254922, -0...","[1027, 1855, 1517, 831, 7415, 39, 1233, 1591, ...",0
23,1030,film_id,1605,10,2,4,"title: princess mermaid\nactors: kenny austin,...","[0.0279201195, -0.00782657787, 0.0081119081, 0...","[1030, 1819, 322, 338, 359, 8490, 5832, 8503, ...",0
24,1031,film_id,1373,11,1,5,title: jungkir balik dunia sissy\nactors: jour...,"[0.0565780662, -0.0209649336, 0.0145230396, 0....","[1031, 5198, 5083, 3696, 2926, 1706, 863, 8636...",0
25,1031,film_id,307,10,2,4,title: jungkir balik dunia sissy\nactors: jour...,"[0.0565780662, -0.0209649336, 0.0145230396, 0....","[1031, 5198, 5083, 3696, 2926, 1706, 863, 8636...",0
26,1033,film_id,921,11,1,5,title: harta tahta cinta sasha\nactors: larasa...,"[0.00503867399, 0.0161691103, 0.00594937615, 0...","[1033, 6228, 1932, 3408, 822, 4626, 5405, 3355...",0
27,1033,film_id,735,10,2,4,title: harta tahta cinta sasha\nactors: larasa...,"[0.00503867399, 0.0161691103, 0.00594937615, 0...","[1033, 6228, 1932, 3408, 822, 4626, 5405, 3355...",0


In [125]:
joined_df[joined_df['query'].str.contains('reza rahardian')]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result,is_tp
12820,reza rahardian,search,6081,17,1,1,reza rahardian,"[-0.0402096435, -0.0192004628, 0.0272873268, 0...","[1714, 7266, 5142, 4574, 5409, 7699, 4941, 457...",0


In [113]:
query_embedding = embedding_text(model, "wulan guritno")

example_df = film_df[film_df['id'].isin(['7316', '8811', '3789', '1936', '1243', '6112', '7609', '4207'])]
example_df['similarity'] = example_df.apply(lambda x: np.dot(query_embedding, x['embedding']), axis=1)
example_df['contain_wulan_guritno'] = example_df['search_text'].str.contains('wulan guritno')
example_df['contain_wulan'] = example_df['search_text'].str.contains('wulan')
example_df['contain_guritno'] = example_df['search_text'].str.contains('guritno')
example_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example_df['similarity'] = example_df.apply(lambda x: np.dot(query_embedding, x['embedding']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example_df['contain_wulan_guritno'] = example_df['search_text'].str.contains('wulan guritno')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example_

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,...,directors,country,release_year,popularity,search_text,embedding,similarity,contain_wulan_guritno,contain_wulan,contain_guritno
28,7316,Perjaka culun penulis script sinteron azab ber...,2023-02-12,31304,21 or more,open-bo-25ae12.jpg,https://www.vidio.com/premier/7316,https://thumbor.prod.vidiocdn.com/pMao6UQ4hhug...,open bo,series,...,reka wijaya,indonesia,2023.0,average,"title: open bo\nactors: aryo wahab, brigitta c...","[-0.0299437065, -0.00584263774, 0.0435879827, ...",0.695362,True,True,True
54,7609,"3 narapidana kabur dari penjara, karena merasa...",2023-03-24,9236,13 or more,surga-belok-kanan-216562.jpg,https://www.vidio.com/premier/7609,https://thumbor.prod.vidiocdn.com/NTl7XpfARsUx...,surga belok kanan,series,...,angling sagaran,indonesia,2023.0,average,title: surga belok kanan\nactors: abidzar al g...,"[-0.024053283, 0.00576378405, 0.00595000479, 0...",0.704423,False,False,False
1408,1243,Tutur Tinular menceritakan kehidupan dua kadip...,,393,13 or more,tutur-tinular-2a9559.jpg,https://www.vidio.com/premier/1243,https://thumbor.prod.vidiocdn.com/gAj9ymP0L9oR...,tutur tinular,series,...,muchlis raya,indonesia,,below average,"title: tutur tinular \nactors: rico verald, ri...","[0.000110077606, -0.0112361806, 0.0134773487, ...",0.709651,False,False,False
2944,4207,Mendadak kaya Vino malah harus berpura - pura...,2022-01-06,118,13 or more,diprank-sama-good-lookingnya-tukang-kredit-pan...,https://www.vidio.com/premier/4207,https://thumbor.prod.vidiocdn.com/yUfLhZ7_p6ML...,diprank sama good lookingnya tukang kredit panci,movies,...,ninos joned,indonesia,2022.0,below average,title: diprank sama good lookingnya tukang kre...,"[0.0155163687, -0.0148840928, -0.0202281345, 0...",0.704389,False,False,False
3077,1936,FTV Sinema Wajah Indonesia | Untuk menunjang g...,2020-10-31,107,13 or more,ftv-sinema-wajah-indonesia-tertuduh-sukses-493...,https://www.vidio.com/premier/1936,https://thumbor.prod.vidiocdn.com/Vhsz31Zp55dw...,tertuduh sukses,movies,...,adys kayl,indonesia,2020.0,below average,"title: tertuduh sukses\nactors: arya saloka, d...","[0.0329873525, -0.0147541445, 0.019497145, 0.0...",0.709729,False,False,False
3588,8811,"Tak sengaja menabrak Ibu dari Wisnu, Kirana ju...",2023-09-14,75,13 or more,kuwa-kuwi-dia-mainnya-cantik-dan-elegan-3f728b...,https://www.vidio.com/premier/8811,https://thumbor.prod.vidiocdn.com/MUdr7N-yh3UI...,kuwa kuwi dia mainnya cantik dan elegan,movies,...,anika marani,indonesia,2023.0,below average,title: kuwa kuwi dia mainnya cantik dan elegan...,"[0.00981917325, 0.0181628801, -0.000957604032,...",0.716783,False,False,False
4759,3789,Mencari Jejak Bunda merupakan sebuah sinetron ...,2021-05-03,24,13 or more,mencari-jejak-bunda-c95834.jpg,https://www.vidio.com/premier/3789,https://thumbor.prod.vidiocdn.com/9qRaTxNn_pXZ...,mencari jejak bunda,series,...,budhi sutrisno,indonesia,2021.0,below average,title: mencari jejak bunda\nactors: ghea d'sya...,"[-0.000480858813, 0.00419106055, -0.0385343209...",0.710779,False,False,False
4966,6112,Penasaran sama pertanyaan-pertanyaan Miss Kepo...,2022-09-01,18,13 or more,binus-tv-w-o-w-7ff665.jpg,https://www.vidio.com/premier/6112,https://thumbor.prod.vidiocdn.com/ggvVafCvDJWP...,binus tv : w.o.w,entertainment,...,,indonesia,2022.0,below average,title: binus tv : w.o.w\nactors: \ngenres: hob...,"[0.0331686549, 0.0030181047, 0.043184828, 0.02...",0.70947,False,False,False


In [126]:
query_embedding = embedding_text(model, "reza rahadian")

example_df = film_df[film_df['id'].isin(['6081', '1714', '7266', '5142', '4574', '5409', '7699', '4941'])]
example_df['similarity'] = example_df.apply(lambda x: np.dot(query_embedding, x['embedding']), axis=1)
example_df['contain_reza_rahadian'] = example_df['search_text'].str.contains('reza rahadian')
example_df['contain_reza'] = example_df['search_text'].str.contains('reza')
example_df['contain_rahadian'] = example_df['search_text'].str.contains('rahardian')
example_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example_df['similarity'] = example_df.apply(lambda x: np.dot(query_embedding, x['embedding']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example_df['contain_reza_rahadian'] = example_df['search_text'].str.contains('reza rahadian')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example_

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,...,directors,country,release_year,popularity,search_text,embedding,similarity,contain_reza_rahadian,contain_reza,contain_rahadian
186,4574,Ra One adalah film Sci-Fi Hindi tentang Shekha...,2011-10-26,2958,13 or more,ra-one-e9fa0f.jpg,https://www.vidio.com/premier/4574,https://thumbor.prod.vidiocdn.com/7OLtOA6sS01b...,ra.one,movies,...,anubhav sinha,india,2011,average,"title: ra.one\nactors: arjun rampal, kareena k...","[-0.0329704061, -0.000693619368, 0.0202373043,...",0.638115,False,False,False
867,7699,Berawal dari sebuah tragedi membuat Erwin (Rid...,2023-03-15,753,13 or more,rasa-ini-salah-waktu-ftv-sctv-b063ee.jpg,https://www.vidio.com/premier/7699,https://thumbor.prod.vidiocdn.com/8BfRqnC6Hlg9...,rasa ini salah waktu,movies,...,otoy witoyo,indonesia,2023,average,title: rasa ini salah waktu\nactors: ridwan gh...,"[0.0105151832, 0.0106905373, 0.00936369132, 0....",0.624949,False,False,False
1502,5409,"Polisi yang tak henti-hentinya Radha, mencari ...",2020-08-21,363,18 or more,flesh-6ad005.png,https://www.vidio.com/premier/5409,https://thumbor.prod.vidiocdn.com/05gJj0kTn4AU...,flesh,series,...,danish aslam,india,2020,below average,"title: flesh\nactors: akshay oberoi, mahima ma...","[-0.0361354835, 0.0047861943, 0.0389020368, 0....",0.635357,False,False,False
1694,6081,Ketika novelis Tiana jatuh cinta pada Harun ya...,2018-05-24,299,13 or more,the-gift-2a66d9.jpg,https://www.vidio.com/premier/6081,https://thumbor.prod.vidiocdn.com/W-octrvBK0ND...,the gift,movies,...,hanung bramantyo,indonesia,2018,below average,"title: the gift\nactors: ayushita, christine h...","[0.0419949777, -0.00583861955, 0.0216156058, 0...",0.589011,True,True,False
3520,4941,Rasa bersalah karena kelambanan menghantui ora...,2019-01-01,79,13 or more,that-man-in-the-picture-f13b01.png,https://www.vidio.com/premier/4941,https://thumbor.prod.vidiocdn.com/8cR8QBqiuoo4...,that man in the picture,movies,...,gaurav sharma,india,2019,below average,title: that man in the picture\nactors: raghub...,"[-0.0210958552, 0.0335716605, 0.0344419032, 0....",0.63476,False,False,False
4701,5142,"Ketika Akash, seorang taipan terkenal bertemu ...",2011-09-01,26,13 or more,u-r-my-jaan-9eda9a.jpg,https://www.vidio.com/premier/5142,https://thumbor.prod.vidiocdn.com/Nf3U2M_Hch5m...,u r my jaan,movies,...,aron govil,india,2011,below average,"title: u r my jaan\nactors: anil dhawan, himan...","[0.0313511342, 0.0152070802, -0.0192654487, 0....",0.639106,False,False,False
4928,1714,Simak keseruan ngobrol-ngobrol bareng aktor da...,2019-03-13,19,13 or more,vidio-talk-74bf15.jpg,https://www.vidio.com/premier/1714,https://thumbor.prod.vidiocdn.com/SXZJg7tctWq3...,vidio talk,entertainment,...,various,indonesia,2019,below average,"title: vidio talk\nactors: asran shady, sheila...","[-0.015890738, -0.0306137912, 0.0130118309, -0...",0.661963,False,False,False
5635,7266,Don Bhargavan telah kehilangan kepercayaan dir...,2006-08-25,3,13 or more,bharghavacharitham-moonnam-khandam-68a16c.jpg,https://www.vidio.com/premier/7266,https://thumbor.prod.vidiocdn.com/d-XS8RgIHnr8...,bharghavacharitham moonnam khandam,movies,...,jomon theckan,india,2006,below average,title: bharghavacharitham moonnam khandam\nact...,"[0.0158710591, 0.00842774566, -0.0043984619, 0...",0.627423,False,False,False


In [108]:
film_df[film_df['id'] == '8811']['search_text'].values

array(['title: kuwa kuwi dia mainnya cantik dan elegan\nactors: angela gilsha, ridwan ghani\ngenres: drama, drama, romance\ncountry: indonesia\ngroup: movies > ftv\ndirectors: anika marani\nrelease year: 2023\nage rating: 13 or more\npopularity: below average\ndescription: Tak sengaja menabrak Ibu dari Wisnu, Kirana justru menyembunyikan hal tersebut agar tidak disalahkan. Pertemuan itu membuat keduanya semakin dekat. Namun tak disangka, akhirnya Wisnu mengetahui jika Kiranalah yang menabrak sang Ibu.'],
      dtype=object)

In [24]:
joined_df.drop(columns=['search_result', 'is_tp']).to_json('data/benchmark_gecko.json', orient='records', lines=True)

In [105]:
film_df.tail()

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
5791,5944,Tutorial ini menunjukkan kepada pemirsa bagaim...,2022-08-09,0,13 or more,dw-how-to-bauhaus-b6a1ca.png,https://www.vidio.com/premier/5944,https://thumbor.prod.vidiocdn.com/wkgzPesakyte...,dw - how to bauhaus,entertainment,lifestyle,hobbies,,various,western,2022,below average,title: dw - how to bauhaus\nactors: \ngenres: ...,"[0.0345598832, -0.00589417, 0.00903062243, -0...."
5792,6682,"Ulasan gadget atau gawai, seperti earphone, sp...",2022-11-03,0,13 or more,gonta-ganti-hape-review-gadgets-0ed4f9.png,https://www.vidio.com/premier/6682,https://thumbor.prod.vidiocdn.com/4WwhYo5tW3Bj...,gonta ganti hape - review gadgets,entertainment,lifestyle,gadget,,various,indonesia,2022,below average,title: gonta ganti hape - review gadgets\nacto...,"[0.00790511444, 0.00354635087, 0.00958709884, ..."
5793,6237,Sering menerka-nerka apa yang dipikirin cowok?...,2022-09-09,0,13 or more,asmaraku-tanya-cowok-40f356.png,https://www.vidio.com/premier/6237,https://thumbor.prod.vidiocdn.com/C4lFHjc2w7LH...,asmaraku - tanya cowok,entertainment,lifestyle,hobbies,,,indonesia,2022,below average,title: asmaraku - tanya cowok\nactors: \ngenre...,"[0.0241778921, 0.0189394671, 0.0150779253, 0.0..."
5794,9703,Memutuskan untuk berhenti bekerja dan menjadi ...,2022-04-29,0,13 or more,suamiku-lebih-memilih-jadi-bapak-rumah-tangga-...,https://www.vidio.com/premier/9703,https://thumbor.prod.vidiocdn.com/uFAlvYKp4sTQ...,suamiku lebih memilih jadi bapak rumah tangga ...,movies,ftv,"drama, family, romance","dea lestari, ichal muhammad",bobby moeryawan,indonesia,2022,below average,title: suamiku lebih memilih jadi bapak rumah ...,"[0.0229411796, -0.0171404257, -0.0225542542, 0..."
5795,6706,"Membahas tentang Korea dari budaya, fashion, k...",2022-11-08,0,13 or more,kpop-on-class-koc-podcast-333d58.jpg,https://www.vidio.com/premier/6706,https://thumbor.prod.vidiocdn.com/aursReaCAZPo...,kpop on class - koc podcast,entertainment,lifestyle,hobbies,,various,indonesia,2022,below average,title: kpop on class - koc podcast\nactors: \n...,"[-0.00855891313, 0.00493523804, -0.00816851482..."


In [106]:
film_df.to_json(f"data/film_metadata_tuned.json", orient='records', lines=True)