In [2]:
import pandas as pd
import numpy as np

from google.cloud import aiplatform
from tqdm._tqdm_notebook import tqdm_notebook
import google.auth
import google.auth.transport.requests
import requests
import json
creds, project = google.auth.default()

# creds.valid is False, and creds.token is None
# Need to refresh credentials to populate those

auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
token = creds.token

aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [3]:
data_store = "film-metadata-2024-03-19_1710826473633"
project_id = "328583281153"
location = "global"

def vertex_search_http(query, film_id=None):
    url = f"https://discoveryengine.googleapis.com/v1alpha/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{data_store}/servingConfigs/default_search:search"
    data = {
        "query": query, 
        "pageSize": 20,
    }
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.post(url, json=data, headers=headers)
    response_json = json.loads(response.text)
    result = list(map(lambda x: x["document"]["structData"], response_json["results"]))
    result = list(map(lambda x: x["id"], result))
    if film_id is not None:
        result = [element for element in result if element != film_id]

    return result

In [4]:
from datetime import datetime


def preprocess_film_metadata(df):
    df.fillna('', inplace=True)
    df['id'] = df['id'].astype(str)
    df['title'] = df['film_title'].str.lower()
    df['group_l1'] = df['group_name_l1'].str.lower()
    df['group_l2'] = df['group_name_l2'].str.lower()
    df['genres'] = df['film_genres'].apply(lambda x: ', '.join(x.split(',')))
    df['actors'] = df['film_actors'].apply(lambda x: ', '.join(x.split(',')))
    df['directors'] = df['film_directors'].apply(lambda x: ', '.join(x.split(',')))
    df.loc[df['actors'] == "various", 'actors'] = ""
    df['country'] = df['country_group'].str.lower()
    df['total_watchers'] = df['total_watchers'].astype('int')
    df['release_date'] = df['release_date'].str.replace(" 00:00:00", "")
    df['release_year'] = df['release_date'].apply(lambda x: str(datetime.strptime(str(x), "%Y-%m-%d").year) if x != '' else '')

    def popularity(total_watchers):
        if total_watchers >= 50000:
            return "trending"
        elif total_watchers < 50000 and total_watchers >= 500:
            return "average"
        else:
            return "below average"

    df['popularity'] = df['total_watchers'].apply(lambda x: popularity(x))
    search_text_columns = ['title', 'description', 'group_l1', 'group_l2', 'film_main_genre', 'genres', 'directors', 'actors', 'country', 'release_year', 'age_rating', 'popularity']
    df['search_text'] = df[search_text_columns].apply(lambda row: search_text(*row), axis=1)
    df = df.loc[:,~df.columns.duplicated()]
    df.drop(columns=['film_title', 'group_name_l1', 'group_name_l2', 'film_main_genre', 'film_genres', 'film_directors', 'film_actors', 'country_group'], inplace=True)
    return df


def search_text(title, description, group_l1, group_l2, main_genre, genres, directors, actors, country, release_year, age_rating, popularity):
    
    return f"""title: {title}
actors: {actors}
group: {group_l1} > {group_l2}
genres: {main_genre}, {genres}
directors: {directors}
description: {description}
country: {country}
release year: {release_year}
age rating: {age_rating}
popularity: {popularity}"""

In [5]:
df = pd.read_excel('data/search_ground_truth.xlsx')
df.sample(5)

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,query,query_type,content_id,total_click,position,score
7781,8452,film_id,4264,11,3,3
595,1674,film_id,3586,18,2,4
12514,momentum,search,275,12,1,1
1103,2066,film_id,2365,17,5,1
9250,9518,film_id,6893,19,4,2


In [6]:
film_df = pd.read_excel('data/film_metadata.xlsx', converters={'release_date':str})
film_df = preprocess_film_metadata(film_df)

  warn("Workbook contains no default style, apply openpyxl's default")


In [7]:
joined_df = df.merge(film_df[['id', 'search_text']], left_on='query', right_on='id', how='left')
joined_df.drop(columns=['id'], inplace=True)
joined_df['search_text'] = np.where(joined_df['query_type'] == 'search', joined_df['query'], joined_df['search_text'])
joined_df = joined_df.dropna(subset=['search_text'])

In [8]:
distinct_df = joined_df.drop_duplicates(subset=['search_text'])

In [9]:
trial_df = distinct_df.sample(5)

In [10]:
def search_route(query, query_type, search_text):
    if query_type == "film_id":
        return vertex_search_http(search_text, query)
    else:
        return vertex_search_http(search_text)

In [11]:
trial_df['search_result'] = trial_df.progress_apply(lambda x: search_route(x['query'], x['query_type'], x['search_text']), axis=1)
trial_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,search_result
11808,jilid 15,search,4932,12,1,1,jilid 15,"[4932, 2732, 2715, 9665, 7636, 841, 6537, 2155..."
7143,7949,film_id,3747,18,1,5,title: delivery order bukan salah jodoh\nactor...,"[9552, 4325, 9391, 654, 4086, 7519, 3447, 1188..."
1600,2429,film_id,2246,281,1,5,title: love story the series extras\nactors: a...,"[2246, 1433, 5811, 2428, 748, 1600, 2256, 3337..."
3447,4412,film_id,8832,61,1,5,title: the hunger games: catching fire\nactors...,"[1983, 8832, 9203, 9562, 5463, 7752, 5453, 893..."
13127,putih abu,search,1115,51,1,1,putih abu,"[1115, 3745, 5832, 391, 6712, 2926, 8503, 2763..."


In [12]:
distinct_df['search_result'] = distinct_df.progress_apply(lambda x: search_route(x['query'], x['query_type'], x['search_text']), axis=1)

  0%|          | 0/5952 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  distinct_df['search_result'] = distinct_df.progress_apply(lambda x: search_route(x['query'], x['query_type'], x['search_text']), axis=1)


In [13]:
final_df = joined_df.merge(distinct_df[['query', 'search_result']], left_on='query', right_on='query', how='left')

In [14]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,search_result


In [15]:
def is_true_positive(row):
    return 1 if str(row['content_id']) in row['search_result'] else 0

In [16]:
final_df['is_tp'] = final_df.progress_apply(is_true_positive, axis=1)

  0%|          | 0/14262 [00:00<?, ?it/s]

In [17]:
final_df[['is_tp']].describe()

Unnamed: 0,is_tp
count,14262.0
mean,0.547749
std,0.497732
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [18]:
final_df[final_df['query_type'] == 'search'][['is_tp']].describe()

Unnamed: 0,is_tp
count,4684.0
mean,0.821947
std,0.382598
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [19]:
final_df[final_df['query_type'] != 'search'][['is_tp']].describe()

Unnamed: 0,is_tp
count,9578.0
mean,0.413656
std,0.492514
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [20]:
final_df[final_df['is_tp'] == 0].sample(10)

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,search_result,is_tp
4117,5196,film_id,5974,17,3,3,title: ujung penantian\nactors: dr niken wenin...,"[2330, 2285, 8281, 3699, 7067, 7124, 841, 4625...",0
1519,2367,film_id,2046,217,1,5,title: demi cinta - catatan harianku\nactors: ...,"[3514, 3558, 2707, 4120, 2257, 632, 2335, 2149...",0
1520,2367,film_id,5313,96,2,4,title: demi cinta - catatan harianku\nactors: ...,"[3514, 3558, 2707, 4120, 2257, 632, 2335, 2149...",0
2186,2969,film_id,3175,56,2,4,title: max steel: turbo warriors\nactors: andr...,"[2968, 3138, 3078, 1092, 9364, 6686, 8734, 849...",0
6768,7684,film_id,3180,48,1,5,title: ftv ramadan\nactors: dude herlino\ngrou...,"[1181, 9696, 7538, 4816, 841, 1182, 6247, 861,...",0
7140,7999,film_id,960,12,5,1,title: wanted abang duda sempurna\nactors: rac...,"[1131, 6556, 9695, 8281, 5401, 1788, 9014, 851...",0
1577,2423,film_id,3925,48,2,4,title: terbebani cinta - catatan harianku\nact...,"[2361, 3580, 1782, 1409, 3557, 2436, 2277, 215...",0
13559,suami takut istri,search,4487,20,1,2,suami takut istri,"[1726, 7647, 3651, 8971, 6410, 8981, 6239, 609...",0
2920,3850,film_id,3708,53,2,4,title: resident evil: apocalypse\nactors: eric...,"[3826, 3916, 3843, 4177, 3917, 3908, 9333, 869...",0
5385,6325,film_id,3626,34,1,5,title: dunia cemburu\nactors: aquila firrina s...,"[1945, 5685, 5328, 9627, 5660, 1382, 348, 9575...",0


In [23]:
final_df[final_df['query'] == 'bidadari surga']

Unnamed: 0,query,query_type,content_id,total_click,position,score,search_text,search_result,is_tp
10031,bidadari surga,search,2098,399,1,4,bidadari surga,"[2098, 7617, 2716, 6431, 359, 1778, 3699, 9557...",1
10032,bidadari surga,search,535,41,2,3,bidadari surga,"[2098, 7617, 2716, 6431, 359, 1778, 3699, 9557...",0
10033,bidadari surga,search,6301,15,3,2,bidadari surga,"[2098, 7617, 2716, 6431, 359, 1778, 3699, 9557...",0
10034,bidadari surga,search,4065,10,4,1,bidadari surga,"[2098, 7617, 2716, 6431, 359, 1778, 3699, 9557...",0


In [24]:
film_df[film_df['id'] == '2098']

Unnamed: 0,id,description,release_date,total_watchers,age_rating,image_portrait,content_url,image_url,is_premium,title,group_l1,group_l2,genres,actors,directors,country,release_year,popularity,search_text
1986,2098,Mamak Lainuri (Meriam Bellina) beserta putra-p...,2013-10-07,211,13 or more,bidadari-bidadari-surga-82933d.jpg,https://www.vidio.com/premier/2098,https://thumbor.prod.vidiocdn.com/uKnm8vsfh9MF...,True,bidadari bidadari surga,movies,indonesia,"drama, romance","christ laurent, kevin julio, kimberly ryder, m...",sony gaokasak,indonesia,2013,below average,title: bidadari bidadari surga\nactors: chris...


In [22]:
final_df.drop(columns=['search_result', 'is_tp']).to_json('data/benchmark_vertex_search.json', orient='records', lines=True)