In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel
from tqdm._tqdm_notebook import tqdm_notebook

aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
def preprocess_film_metadata(df):
    df.fillna('', inplace=True)
    df['id'] = df['id'].astype(str)
    df['title'] = df['film_title'].str.lower()
    df['group_l1'] = df['group_name_l1'].str.lower()
    df['group_l2'] = df['group_name_l2'].str.lower()
    df['genres'] = df['film_genres'].apply(lambda x: x.split(','))
    df['actors'] = df['film_actors'].apply(lambda x: x.split(','))
    df['directors'] = df['film_directors'].apply(lambda x: x.split(','))
    # df.loc[df['actors'] == "various", 'actors'] = ""
    df['country'] = df['country_group'].str.lower()
    # df['total_watchers'] = df['total_watchers'].astype('int')
    df['release_date'] = df['release_date'].astype(str)
    df['release_date'] = df['release_date'].str.replace(" 00:00:00", "")
    df['release_year'] = df['release_date'].progress_apply(lambda x: str(datetime.strptime(str(x), "%Y-%m-%d").year) if x != '' else '')

    def popularity(total_watchers):
        if total_watchers >= 50000:
            return "trending"
        elif total_watchers < 50000 and total_watchers >= 500:
            return "average"
        else:
            return "below average"

    # df['popularity'] = df['total_watchers'].apply(lambda x: popularity(x))
    search_text_columns = ['title', 'description', 'group_l1', 'group_l2', 'film_main_genre', 'genres', 'directors', 'actors', 'country', 'release_year', 'age_rating']
    df['search_text'] = df[search_text_columns].apply(lambda row: search_text(*row), axis=1)
    df.rename(columns={'search_text': 'content'}, inplace=True)
    df = df.loc[:,~df.columns.duplicated()]
    df.drop(columns=['film_title', 'group_name_l1', 'group_name_l2', 'film_main_genre', 'film_genres', 'film_directors', 'film_actors', 'country_group'], inplace=True)
    return df


def search_text(title, description, group_l1, group_l2, main_genre, genres, directors, actors, country, release_year, age_rating):
    genres = ', '.join(genres)
    actors = ', '.join(actors)
    directors = ', '.join(directors)
    return f"""title: {title}
actors: {actors}
group: {group_l1} > {group_l2}
genres: {main_genre}, {genres}
directors: {directors}
description: {description}
country: {country}
release year: {release_year}
age rating: {age_rating}"""


In [6]:
from trino.dbapi import connect

conn = connect(
    host="localhost",
    port=8080,
    user="public_adhoc",
    catalog="hive",
)
cur = conn.cursor()

In [7]:
def generate_sql_query():
    return f"""
SELECT 
    DISTINCT(dayshift.vod_metadata.film_id) as id, 
    dayshift.vod_metadata.film_title, 
    group_name_l1, 
    group_name_l2, 
    film_main_genre, 
    film_genres, 
    film_directors, 
    film_actors, 
    country_group, 
    films.description, 
    films.release_date, 
    film_rating as age_rating,
    image_portrait,
    concat('https://www.vidio.com/premier/', cast(films.id as varchar)) as content_url,
    concat('https://thumbor.prod.vidiocdn.com/', to_base64url(hmac_sha1(cast(concat('223x332/filters:quality(75)/vidio-web-prod-film/uploads/film/image_portrait/', cast(films.id as varchar), '/', films.image_portrait) as varbinary), cast('cheeky rando' as varbinary))), '/', concat('223x332/filters:quality(75)/vidio-web-prod-film/uploads/film/image_portrait/', cast(films.id as varchar), '/', films.image_portrait)) as image_url
    -- CASE WHEN premium_contents.premiumable_id IS NULL THEN FALSE ELSE TRUE END as is_premium
FROM dayshift.vod_metadata
LEFT JOIN vidio_production.films as films ON dayshift.vod_metadata.film_id = films.id
-- LEFT JOIN vidio_web.public.premium_contents as premium_contents ON dayshift.vod_metadata.film_id = premium_contents.premiumable_id AND premium_contents.premiumable_type = 'Film'
WHERE
dayshift.vod_metadata.film_id IS NOT NULL
AND film_published = true
AND film_deleted = false
AND video_published = true
"""

In [8]:
film_df = pd.read_sql_query(generate_sql_query(), conn)
film_df = preprocess_film_metadata(film_df)
film_df.head()

  film_df = pd.read_sql_query(generate_sql_query(), conn)


  0%|          | 0/5755 [00:00<?, ?it/s]

Unnamed: 0,id,description,release_date,age_rating,image_portrait,content_url,image_url,title,group_l1,group_l2,genres,actors,directors,country,release_year,content
0,5820,Hal-hal berikut ini sangat menarik yang dapat ...,2022-07-20,13 or more,keluyuran-tips-pengalaman-jalan-jalan-b3fc63.png,https://www.vidio.com/premier/5820,https://thumbor.prod.vidiocdn.com/wiwa3U8SYPb8...,keluyuran - tips pengalaman jalan-jalan,entertainment,lifestyle,"[hobbies, travel]",[various],[various],indonesia,2022,title: keluyuran - tips pengalaman jalan-jalan...
1,5691,Fun food with Pororo! Encourage healthy eating...,2022-07-08,less than 7,pororo-yum-yum-song-4b7a9a.jpg,https://www.vidio.com/premier/5691,https://thumbor.prod.vidiocdn.com/Ckr3-eIblgz1...,pororo yum yum song,entertainment,education,"[animation, cartoon]",[various],[various],korea,2022,title: pororo yum yum song\nactors: various\ng...
2,7657,Perjalanan hidup untuk berusaha istiqomah dari...,2023-03-10,13 or more,belok-kanan-jalan-terus-fb3bb1.jpg,https://www.vidio.com/premier/7657,https://thumbor.prod.vidiocdn.com/DXV2Xk8TkJ9Y...,belok kanan jalan terus,series,tv sinetron,"[comedy, drama, religi]","[cut meyriska, roger danuarta]",[agus elias],indonesia,2023,title: belok kanan jalan terus\nactors: cut me...
3,362,Candra Kirana merupakan kakak beradik yang mem...,2016-04-04,13 or more,candra-kirana-28f941.jpg,https://www.vidio.com/premier/362,https://thumbor.prod.vidiocdn.com/PBs7Y6hCZASU...,candra kirana,series,indonesia,"[drama, family, romance]","[marsha aurelia, masayu anastasia, nadila erne...",[sony gaokasak],indonesia,2016,"title: candra kirana\nactors: marsha aurelia, ..."
4,7884,Dokumenter yang menceritakan kehidupan dari ti...,2023-04-19,13 or more,horse-racing-on-the-cloud-588f9c.jpg,https://www.vidio.com/premier/7884,https://thumbor.prod.vidiocdn.com/nZKITQZCGpcz...,horse racing on the cloud,series,mandarin,"[dokumenter, education, historical]","[caijia, nanjie]",[],mandarin,2023,title: horse racing on the cloud\nactors: caij...


In [9]:
# film_df['embedding'] = film_df.progress_apply(lambda x: embedding_text(model, x['content']), axis=1)
# film_df.rename(columns={"embedding": "embedding_vector", "content_url": "uri"}, inplace=True)
film_df.rename(columns={"content_uri": "uri"}, inplace=True)

In [12]:
film_df.to_json('data/film_metadata.json', orient='records', lines=True)

In [13]:
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
  storage.blob._DEFAULT_CHUNKSIZE = 35 * 1024 * 1024  # 35 MB
  storage.blob._MAX_MULTIPART_SIZE = 35 * 1024 * 1024  # 35 MB
  
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_name)
  blob = bucket.blob(destination_blob_name)

  blob.upload_from_filename(source_file_name)

  print(
      "File {} uploaded to {}.".format(
          source_file_name, destination_blob_name
      )
  )

upload_blob("genai_hackathon_2024", "data/film_metadata.json", "data/film_metadata.json")

File data/film_metadata.json uploaded to data/film_metadata.json.


In [22]:
from base64 import b64encode
import json
import google.auth
import google.auth.transport.requests
creds, project = google.auth.default()

# creds.valid is False, and creds.token is None
# Need to refresh credentials to populate those

auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
token = creds.token


project_id = "vidio-quiz-prod"
location = "global"
data_store_id = "film-metadata-202403191330_1710829784824"
gcs_url = "gs://genai_hackathon_2024/data/film_metadata.json"

In [23]:
import requests


def reindex_from_gcs(token, project_id, location, data_store_id, gcs_url):
    headers={
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    data={
      "reconciliationMode": "FULL",
      "autoGenerateIds": True,
      "gcsSource":{
        "inputUris": [
          gcs_url
        ],
        "dataSchema": "custom"
      }
    }
    response = requests.post(
        f"https://discoveryengine.googleapis.com/v1beta/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{data_store_id}/branches/0/documents:import",
        headers=headers,
        json=data
    )
    return response

In [24]:
reindex_from_gcs(token, project_id, location, data_store_id, gcs_url)

<Response [200]>