In [4]:
import pandas as pd
import numpy as np

from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel
from tqdm._tqdm_notebook import tqdm_notebook

aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
tqdm_notebook.pandas()

model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual")

def embedding_text(model, text):
    embeddings = model.get_embeddings([text])
    for embedding in embeddings:
        vector = embedding.values
        # print(f"Length of Embedding Vector: {len(vector)}")
    return vector

In [5]:
from datetime import datetime


def preprocess_film_metadata(df):
    df.fillna('', inplace=True)
    df['id'] = df['id'].astype(str)
    df['title'] = df['film_title'].str.lower()
    df['group_l1'] = df['group_name_l1'].str.lower()
    df['group_l2'] = df['group_name_l2'].str.lower()
    df['genres'] = df['film_genres'].apply(lambda x: ', '.join(x.split(',')))
    df['actors'] = df['film_actors'].apply(lambda x: ', '.join(x.split(',')))
    df['directors'] = df['film_directors'].apply(lambda x: ', '.join(x.split(',')))
    df.loc[df['actors'] == "various", 'actors'] = ""
    df['country'] = df['country_group'].str.lower()
    df['total_watchers'] = df['total_watchers'].astype('int')
    df['release_date'] = df['release_date'].str.replace(" 00:00:00", "")
    df['release_year'] = df['release_date'].apply(lambda x: str(datetime.strptime(str(x), "%Y-%m-%d").year) if x != '' else '')

    def popularity(total_watchers):
        if total_watchers >= 50000:
            return "trending"
        elif total_watchers < 50000 and total_watchers >= 500:
            return "average"
        else:
            return "below average"

    df['popularity'] = df['total_watchers'].apply(lambda x: popularity(x))
    search_text_columns = ['title', 'description', 'group_l1', 'group_l2', 'film_main_genre', 'genres', 'directors', 'actors', 'country', 'release_year', 'age_rating', 'popularity']
    df['search_text'] = df[search_text_columns].apply(lambda row: search_text(*row), axis=1)
    df = df.loc[:,~df.columns.duplicated()]
    df.drop(columns=['film_title', 'group_name_l1', 'group_name_l2', 'film_main_genre', 'film_genres', 'film_directors', 'film_actors', 'country_group'], inplace=True)
    return df


def search_text(title, description, group_l1, group_l2, main_genre, genres, directors, actors, country, release_year, age_rating, popularity):
    
    return f"""title: {title}
actors: {actors}
group: {group_l1} > {group_l2}
genres: {main_genre}, {genres}
directors: {directors}
description: {description}
country: {country}
release year: {release_year}
age rating: {age_rating}
popularity: {popularity}"""

In [6]:
df = pd.read_excel('data/search_ground_truth.xlsx')
df.tail()

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,query,query_type,content_id,total_click,position,score
14362,zombie,search,7323,138,4,2
14363,zombie,search,6116,135,5,1
14364,zombie detective,search,1838,101,1,2
14365,zombie detective,search,2466,12,2,1
14366,zorro,search,7765,13,1,1


In [7]:
film_df = pd.read_excel('data/film_metadata.xlsx', converters={'release_date':str})
film_df = preprocess_film_metadata(film_df)
film_df['embedding'] = film_df.progress_apply(lambda x: embedding_text(model, x['search_text']), axis=1)
film_df.head()

  warn("Workbook contains no default style, apply openpyxl's default")


  0%|          | 0/5757 [00:00<?, ?it/s]

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,is_premium,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
0,9643,Aliza terpaksa menerima perjodohan dari Bunda ...,2024-03-09,1083307,18 or more,santri-pilihan-bunda-ee25e0.jpg,https://www.vidio.com/premier/9643,https://thumbor.prod.vidiocdn.com/-PVWRIp3f0rh...,True,santri pilihan bunda,series,vidio original,"adaptation, drama, romance","fadi alaydrus, naura ayu, sarah sechan, teuku ...",angling sagaran,indonesia,2024,trending,title: santri pilihan bunda\nactors: fadi alay...,"[0.0015386489685624838, -0.02811456471681595, ..."
1,7617,Menceritakan kisah seorang gadis bernama Sakin...,2023-03-13,239263,13 or more,bidadari-surgamu-2bbb68.jpg,https://www.vidio.com/premier/7617,https://thumbor.prod.vidiocdn.com/JxL1jUZmTFdc...,True,bidadari surgamu,series,tv sinetron,"drama, family, religi, romance","josephine firmstone, michelle joan, rizky naza...",anurag vaishnav,indonesia,2023,trending,title: bidadari surgamu\nactors: josephine fir...,"[0.013242288492619991, -0.0075963991694152355,..."
2,7576,"Naura (Basmalah), Rahsya (Raden Rakha), Adara ...",2023-02-23,238254,13 or more,magic-5-def33f.jpg,https://www.vidio.com/premier/7576,https://thumbor.prod.vidiocdn.com/kha0tcadN-wD...,True,magic 5,series,tv sinetron,"drama, fantasy, friendship","afan da5, basmalah, eby da 5, raden rakha, sri...","a. septian, bobby moeryawan, sondang pratama, ...",indonesia,2023,trending,"title: magic 5\nactors: afan da5, basmalah, eb...","[0.04730440303683281, 0.02769733965396881, 0.0..."
3,8928,Di hari perayaan anniversary pernikahan Shafir...,2023-10-04,223074,13 or more,di-antara-dua-cinta-bddc55.jpg,https://www.vidio.com/premier/8928,https://thumbor.prod.vidiocdn.com/rnBg59O6bVIy...,True,di antara dua cinta,series,tv sinetron,"drama, romance","anggika bolsterli, anthony xie, asha assuncao,...",deni pusung,indonesia,2023,trending,title: di antara dua cinta\nactors: anggika bo...,"[0.0017294461140409112, -0.0005878254305571318..."
4,9372,Lasja mendapati suami dan ayahnya terlibat dal...,2024-02-29,207446,18 or more,ratu-adil-eede53.jpg,https://www.vidio.com/premier/9372,https://thumbor.prod.vidiocdn.com/L5oJmXWkmnEo...,True,ratu adil,series,vidio original,"action, crime, drama","abdurrahman arif, andri mashadi, budi ros, dia...","ginanti rona, tommy dewo",indonesia,2024,trending,"title: ratu adil\nactors: abdurrahman arif, an...","[-0.007469169329851866, -0.009112628176808357,..."


In [8]:
joined_df = df.merge(film_df[['id', 'search_text', 'embedding']], left_on='query', right_on='id', how='left')
joined_df.drop(columns=['id'], inplace=True)
joined_df['search_text'] = np.where(joined_df['query_type'] == 'search', joined_df['query'], joined_df['search_text'])

In [9]:
joined_df[joined_df['query_type'] != 'film_id'].size

37472

In [10]:
trial_df = joined_df[joined_df['query_type'] != 'film_id'].head()

In [11]:
trial_df['embedding'] = trial_df.progress_apply(lambda x: embedding_text(model, x['search_text']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
146,12 strong,search,4724,22,1,1,12 strong,"[0.00753067247569561, -0.01148197054862976, 0...."
666,172 day,search,7337,26,1,1,172 day,"[-0.01306532695889473, -0.0531916543841362, 0...."
667,172 days,search,7337,131,1,2,172 days,"[-0.0369473472237587, -0.040300723165273666, 0..."
668,172 days,search,2525,12,2,1,172 days,"[-0.0369473472237587, -0.040300723165273666, 0..."
669,172 days film,search,7337,13,1,1,172 days film,"[-0.032957036048173904, -0.03768826276063919, ..."


In [12]:
joined_df[joined_df['query_type'] != 'film_id'].head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
146,12 strong,search,4724,22,1,1,12 strong,
666,172 day,search,7337,26,1,1,172 day,
667,172 days,search,7337,131,1,2,172 days,
668,172 days,search,2525,12,2,1,172 days,
669,172 days film,search,7337,13,1,1,172 days film,


In [13]:
# slow operation
joined_df['embedding'] = joined_df.progress_apply(lambda x: embedding_text(model, x['search_text']) if x['query_type'] == 'search' else x['embedding'], axis=1)

  0%|          | 0/14367 [00:00<?, ?it/s]

In [14]:
joined_df = joined_df.dropna(subset=['search_text', 'embedding'])

In [15]:
joined_df[joined_df['query_type'] != 'film_id'].head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
146,12 strong,search,4724,22,1,1,12 strong,"[0.00753067247569561, -0.01148197054862976, 0...."
666,172 day,search,7337,26,1,1,172 day,"[-0.01306532695889473, -0.0531916543841362, 0...."
667,172 days,search,7337,131,1,2,172 days,"[-0.0369473472237587, -0.040300723165273666, 0..."
668,172 days,search,2525,12,2,1,172 days,"[-0.0369473472237587, -0.040300723165273666, 0..."
669,172 days film,search,7337,13,1,1,172 days film,"[-0.032957036048173904, -0.03768826276063919, ..."


In [16]:
joined_df[joined_df['query_type'] == 'film_id'].head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding
0,1003,film_id,1027,143,1,5,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0..."
1,1003,film_id,831,90,2,4,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0..."
2,1003,film_id,1559,81,3,3,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0..."
3,1003,film_id,2415,52,4,2,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0..."
4,1003,film_id,1299,46,5,1,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0..."


In [17]:
from annoy import AnnoyIndex
annoy_index = AnnoyIndex(768, 'dot')
for index, row in film_df.iterrows():
    annoy_index.add_item(int(row['id']), row['embedding'])
annoy_index.build(30)

def retrieve_by_embedding(embedding):
    if len(embedding) > 0:
        return annoy_index.get_nns_by_vector(embedding, 20, search_k=-1)
    else:
        return []

In [18]:
trial_df['search_result'] = trial_df.progress_apply(lambda x: retrieve_by_embedding(x['embedding']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
146,12 strong,search,4724,22,1,1,12 strong,"[0.00753067247569561, -0.01148197054862976, 0....","[4724, 7069, 9158, 9019, 9263, 7641, 7799, 384..."
666,172 day,search,7337,26,1,1,172 day,"[-0.01306532695889473, -0.0531916543841362, 0....","[9671, 4736, 9150, 9687, 9204, 5928, 1855, 725..."
667,172 days,search,7337,131,1,2,172 days,"[-0.0369473472237587, -0.040300723165273666, 0...","[9671, 4736, 7881, 5928, 9687, 9204, 9150, 241..."
668,172 days,search,2525,12,2,1,172 days,"[-0.0369473472237587, -0.040300723165273666, 0...","[9671, 4736, 7881, 5928, 9687, 9204, 9150, 241..."
669,172 days film,search,7337,13,1,1,172 days film,"[-0.032957036048173904, -0.03768826276063919, ...","[9671, 4147, 5107, 9348, 9150, 9151, 5102, 798..."


In [19]:
joined_df['search_result'] = joined_df.progress_apply(lambda x: retrieve_by_embedding(x['embedding']), axis=1)

  0%|          | 0/14262 [00:00<?, ?it/s]

In [20]:
joined_df.head()

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result
0,1003,film_id,1027,143,1,5,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0...","[1003, 1517, 7334, 1314, 1027, 1554, 1193, 131..."
1,1003,film_id,831,90,2,4,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0...","[1003, 1517, 7334, 1314, 1027, 1554, 1193, 131..."
2,1003,film_id,1559,81,3,3,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0...","[1003, 1517, 7334, 1314, 1027, 1554, 1193, 131..."
3,1003,film_id,2415,52,4,2,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0...","[1003, 1517, 7334, 1314, 1027, 1554, 1193, 131..."
4,1003,film_id,1299,46,5,1,"title: hot young bloods\nactors: kim hee-won, ...","[0.008422765880823135, 0.013056890107691288, 0...","[1003, 1517, 7334, 1314, 1027, 1554, 1193, 131..."


In [21]:
def is_true_positive(row):
    return 1 if row['content_id'] in row['search_result'] else 0

In [22]:
joined_df['is_tp'] = joined_df.progress_apply(is_true_positive, axis=1)

  0%|          | 0/14262 [00:00<?, ?it/s]

In [23]:
joined_df[['is_tp']].describe()

Unnamed: 0,is_tp
count,14262.0
mean,0.248002
std,0.431868
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [24]:
joined_df[joined_df['query_type'] == 'search'][['is_tp']].describe()

Unnamed: 0,is_tp
count,4684.0
mean,0.342656
std,0.474648
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [25]:
joined_df[joined_df['query_type'] != 'search'][['is_tp']].describe()

Unnamed: 0,is_tp
count,9578.0
mean,0.201712
std,0.401299
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [26]:
joined_df[joined_df['query'] == 'ratu adil']

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,embedding,search_result,is_tp
13181,ratu adil,search,9372,3184,1,7,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0
13182,ratu adil,search,5421,221,2,6,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0
13183,ratu adil,search,1756,208,3,5,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0
13184,ratu adil,search,3111,75,4,4,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0
13185,ratu adil,search,3982,21,5,1,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0
13186,ratu adil,search,8957,21,5,1,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0
13187,ratu adil,search,5178,21,5,1,ratu adil,"[-0.009534591808915138, -0.01045189332216978, ...","[6147, 731, 6100, 497, 499, 9538, 5699, 5735, ...",0


In [27]:
film_df[film_df['id'] == '5421']

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,is_premium,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text,embedding
25,5421,"Walau berusaha keluar dari lingkaran setan, El...",2023-11-03,33155,18 or more,pertaruhan-the-series-5e7b33.jpg,https://www.vidio.com/premier/5421,https://thumbor.prod.vidiocdn.com/jz6vOt9OQORS...,True,pertaruhan the series,series,vidio original,"action, adaptation, crime, drama, thriller","aulia sarah, bio one, clara bernadeth, giulio ...","fajar martha santosa, sidharta tata",indonesia,2023,average,title: pertaruhan the series\nactors: aulia sa...,"[0.013054502196609974, -0.010694384574890137, ..."


In [None]:
joined_df.drop(columns=['search_result', 'is_tp']).to_json('data/benchmark_gecko.json', orient='records', lines=True)