In [7]:
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
# `tqdm._tqdm_notebook` is a deprecated private alias that emits a
# TqdmDeprecationWarning; `tqdm.notebook` is the public home of the same class.
from tqdm.notebook import tqdm_notebook

# Configure the Vertex AI SDK once for the whole notebook.
aiplatform.init(
    project='vidio-quiz-prod',
    location='asia-southeast1',
    staging_bucket='gs://genai_hackathon_2024',
)
# Registers `progress_apply` on pandas objects (used further down to embed
# rows one at a time with a progress bar).
tqdm_notebook.pandas()

In [8]:
# Load the pretrained multilingual text-embedding model. It is kept as a
# notebook-level global: `embedding_texts_dataframe` reads it directly and
# later cells pass it to `embedding_text`.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual")

def embedding_text(model, text):
    """Embed a single string and return its vector.

    Args:
        model: Object exposing ``get_embeddings(list_of_texts)``; each returned
            item must carry its vector in a ``values`` attribute.
        text: The string to embed.

    Returns:
        The ``values`` of the embedding returned for ``text``.

    Raises:
        ValueError: If the model returns no embedding at all.
    """
    embeddings = list(model.get_embeddings([text]))
    if not embeddings:
        # Without this guard an empty response would surface as an opaque
        # UnboundLocalError from a never-assigned local.
        raise ValueError("model returned no embeddings for the given text")
    # Exactly one text was sent, so the response holds exactly one embedding.
    return embeddings[0].values

def embedding_texts(model, texts):
    """Embed a batch of texts with a single ``get_embeddings`` call.

    Args:
        model: Object exposing ``get_embeddings(texts)``; each returned item
            must carry its vector in a ``values`` attribute.
        texts: The batch handed to ``model.get_embeddings`` unchanged.

    Returns:
        A list with the ``values`` of every returned embedding, in the order
        the model returned them.
    """
    return [result.values for result in model.get_embeddings(texts)]

def embedding_texts_dataframe(df, chunk_size=5):
    """Embed ``df['search_text']`` in batches and store the vectors in ``df['embedding']``.

    The frame is modified in place: its index is reset to ``0..n-1`` (the old
    index is dropped) and an ``embedding`` column is added or overwritten.
    Vectors are obtained from the notebook-level ``model`` via
    ``embedding_texts``.

    Args:
        df: DataFrame containing a ``search_text`` column.
        chunk_size: Number of texts sent per ``embedding_texts`` call.
            Defaults to 5, the batch size previously hard-coded here.

    Returns:
        None. The result is written to ``df['embedding']``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or if the number of
            embeddings returned does not match the number of rows.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Kept for backward compatibility: callers see a fresh 0..n-1 index afterwards.
    df.reset_index(drop=True, inplace=True)

    texts = df['search_text'].tolist()
    embeds = []
    for start in tqdm_notebook(range(0, len(texts), chunk_size)):
        # List slicing clamps at the end, so the final batch may simply be shorter.
        batch = texts[start:start + chunk_size]
        # extend() appends in place; `embeds = embeds + ...` re-copied the list every batch.
        embeds.extend(embedding_texts(model, batch))

    if len(embeds) != len(texts):
        raise ValueError(
            f"expected {len(texts)} embeddings but received {len(embeds)}"
        )

    # Positional assignment: one vector per row, in row order. This also works
    # for an empty frame, where a row-wise `apply` would not yield a column.
    df['embedding'] = embeds

def df_id_in(df, result, ascending=False):
    """Return the rows of ``df`` whose ``id`` appears in a search result, ranked by distance.

    The input frame is left untouched; the returned frame is a filtered copy
    with a ``distance`` column added (or overwritten if already present).

    Args:
        df: DataFrame with an ``id`` column.
        result: Pair ``(ids, distances)`` of equal-length sequences, where
            ``distances[i]`` belongs to ``ids[i]``.
        ascending: Sort direction for ``distance``. Defaults to ``False``
            (largest value first), which is the ordering this function has
            always produced. Pass ``True`` when a smaller value means a
            closer match.

    Returns:
        A new DataFrame holding only the matching rows, sorted by ``distance``.
    """
    ids = result[0]
    distances = result[1]

    # Build the lookup once instead of calling ids.index() for every row.
    # setdefault keeps the first distance for a repeated id, like list.index did.
    distance_by_id = {}
    for item_id, dist in zip(ids, distances):
        distance_by_id.setdefault(item_id, dist)

    matched = df[df['id'].isin(ids)]
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    matched = matched.assign(distance=matched['id'].map(distance_by_id))
    return matched.sort_values(['distance'], ascending=ascending)

        

In [9]:
# Switch to pandas xlsx: the film metadata is read from the Excel file with pandas.
import pandas as pd
from datetime import datetime
import numpy as np

# Every `release_date` cell is passed through `str` while loading.
original_df = pd.read_excel(
    'film_metadata.xlsx',
    converters={'release_date': str},
)

# Work on a sample: the first 200 rows, rows 2000-2199 and the last 100 rows.
first = original_df.head(200)
middle = original_df.iloc[2000:2200]
last = original_df.tail(100)

# The concatenated frame keeps the row labels of `original_df`;
# `embedding_texts_dataframe` resets the index later.
df = pd.concat([first, middle, last])

  warn("Workbook contains no default style, apply openpyxl's default")


In [10]:
def _space_after_commas(value):
    """Turn 'a,b,c' into 'a, b, c'."""
    return value.replace(',', ', ')


def _year_of(date_text):
    """Return the year of a 'YYYY-MM-DD' string, or '' when the value is blank."""
    if date_text == '':
        return ''
    return datetime.strptime(str(date_text), "%Y-%m-%d").year


# Replace every missing value with an empty string (in place).
df.fillna('', inplace=True)

# Lower-cased copies of the title and group columns.
for new_col, source_col in (
    ('title', 'film_title'),
    ('group_l1', 'group_name_l1'),
    ('group_l2', 'group_name_l2'),
):
    df[new_col] = df[source_col].str.lower()

# Comma-separated lists get a space after each comma.
for new_col, source_col in (
    ('genres', 'film_genres'),
    ('actors', 'film_actors'),
    ('directors', 'film_directors'),
):
    df[new_col] = df[source_col].apply(_space_after_commas)

# The placeholder "various" is blanked out of the actors column.
df.loc[df['actors'] == "various", 'actors'] = ""

df['country'] = df['country_group'].str.lower()
df['total_watchers'] = df['total_watchers'].astype('int')

# Strip the midnight time suffix, then derive the release year from the date.
df['release_date'] = df['release_date'].str.replace(" 00:00:00", "")
df['release_year'] = df['release_date'].apply(_year_of)

def popularity(total_watchers):
    """Bucket a watcher count into a popularity label.

    Args:
        total_watchers: Number of watchers.

    Returns:
        "trending" for 50000 or more, "average" for 500 up to 49999,
        otherwise "below average".
    """
    if total_watchers >= 50000:
        return "trending"
    # Reaching this point already means the count is not >= 50000.
    if total_watchers >= 500:
        return "average"
    return "below average"

# Label every row with its popularity bucket, derived from the watcher count.
df['popularity'] = df['total_watchers'].apply(popularity)

def search_text(title, description, group_l1, group_l2, main_genre, genres, directors, actors, country, release_year, age_rating, popularity):
    """Render one film's metadata as the multi-line text that gets embedded.

    Each argument is interpolated as-is into a ``label: value`` line; the
    lines are joined with newlines and there is no trailing newline.

    Returns:
        A string of ten lines: title, description, group, genres, directors,
        actors, country, release year, age rating, popularity.
    """
    labelled_values = (
        ("title", title),
        ("description", description),
        ("group", f"{group_l1} > {group_l2}"),
        ("genres", f"{main_genre}, {genres}"),
        ("directors", directors),
        ("actors", actors),
        ("country", country),
        ("release year", release_year),
        ("age rating", age_rating),
        ("popularity", popularity),
    )
    return "\n".join(f"{label}: {value}" for label, value in labelled_values)

# Column order must line up with the positional parameters of `search_text`.
search_text_columns = [
    'title',
    'description',
    'group_l1',
    'group_l2',
    'film_main_genre',
    'genres',
    'directors',
    'actors',
    'country',
    'release_year',
    'age_rating',
    'popularity',
]


def _row_to_search_text(row):
    """Unpack one row's values, in column order, into `search_text`."""
    return search_text(*row)


df['search_text'] = df[search_text_columns].apply(_row_to_search_text, axis=1)

In [11]:
# Embed every row's `search_text`; the function writes the vectors into
# `df['embedding']` and resets the index of `df` in place.
embedding_texts_dataframe(df)

  0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
from annoy import AnnoyIndex

# `final_df` is another name for the same frame, not a copy.
final_df = df

# The index is created for 768-dimensional vectors with the 'dot' metric.
annoy_index = AnnoyIndex(768, 'dot')

# Each film is stored under its `id`, so search results come back as film ids.
for item_id, vector in zip(final_df['id'], final_df['embedding']):
    annoy_index.add_item(item_id, vector)

annoy_index.build(10)

True

In [83]:

# Embed a free-text query by passing the notebook-level `model` to `embedding_text`.
query =  "actors: ammar zoni"
query_vector = embedding_text(model, query)
# Ask the index for 10 neighbours; with include_distances=True the result is
# consumed by `df_id_in` as an (ids, distances) pair.
results = annoy_index.get_nns_by_vector(query_vector, 10, search_k=-1, include_distances=True)
# Display the matching rows together with their `distance`, largest first.
df_id_in(final_df, results)

Unnamed: 0,id,film_title,group_name_l1,group_name_l2,film_main_genre,film_genres,film_directors,film_actors,country_group,description,...,genres,actors,directors,country,release_year,popularity,search_text,embedding,distance,embedding_satuan
340,1426,Cinta Suci (Extras),Series,Indonesia,drama,"comedy,drama,romance",m. abdullah,"ammar zoni,asmirandah,haico van der veken,iris...",Indonesia,"Kumpulan highlight-highlight sinetron ""Cinta S...",...,"comedy, drama, romance","ammar zoni, asmirandah, haico van der veken, i...",m. abdullah,indonesia,,below average,title: cinta suci (extras)\ndescription: Kumpu...,"[0.01523171178996563, 7.786622518324293e-06, -...",0.719207,"[0.015318725258111954, 0.00034956587478518486,..."
179,312,Cinta Suci,Series,Indonesia,drama,"drama,romance",m. abdullah,"ammar zoni,asmirandah,dinda kanya dewi,irish b...",Indonesia,Suci (Irish Bella) yang terjerat masalah harus...,...,"drama, romance","ammar zoni, asmirandah, dinda kanya dewi, iris...",m. abdullah,indonesia,2018.0,average,title: cinta suci\ndescription: Suci (Irish Be...,"[0.018569303676486015, -0.016310125589370728, ...",0.700998,"[0.018290605396032333, -0.016663607209920883, ..."
295,687,Essentials: Wali,Music,Music Video,pop,"alternative,melayu,pop","ronny djalil,yogi yose","aan kurnia,farhan zainal muttaqin,hamzah shopi...",Indonesia,Ibu-ibu bapak-bapak siapa yang suka lagu-lagu ...,...,"alternative, melayu, pop","aan kurnia, farhan zainal muttaqin, hamzah sho...","ronny djalil, yogi yose",indonesia,,below average,title: essentials: wali\ndescription: Ibu-ibu ...,"[0.0296808909624815, -0.010143663734197617, -0...",0.667141,"[0.029998313635587692, -0.010133866220712662, ..."
86,9478,Jason Bourne,Movies,Western,action,"action,adaptation,mystery,thriller",paul greengrass,"alicia vikander,julia stiles,matt damon,tommy ...",Western,Mantan agen CIA yang paling berbahaya keluar d...,...,"action, adaptation, mystery, thriller","alicia vikander, julia stiles, matt damon, tom...",paul greengrass,western,2016.0,average,title: jason bourne\ndescription: Mantan agen ...,"[-0.0222869161516428, -0.04716699197888374, 0....",0.666957,"[-0.022403016686439514, -0.04704789072275162, ..."
41,7131,Switchover,Series,Vidio Original,romance,"action,adaptation,mystery,romance",angling sagaran,"adhisty zara,alika jantinia,emir mahira,fadly ...",Indonesia,Kembalinya Anna ke Jakarta membawa kebahagiaan...,...,"action, adaptation, mystery, romance","adhisty zara, alika jantinia, emir mahira, fad...",angling sagaran,indonesia,2023.0,average,title: switchover\ndescription: Kembalinya Ann...,"[-0.039473164826631546, 0.004840779583901167, ...",0.66481,"[-0.03927639126777649, 0.004825425334274769, -..."
322,2340,Pesan Dari Hati - Catatan Harianku,Movies,FTV,drama,"drama,romance",anika marani,"ibrahim risyad,marsha aruan,omar daniel",Indonesia,Ceza terpukul saat Aldi meninggal mendadak tep...,...,"drama, romance","ibrahim risyad, marsha aruan, omar daniel",anika marani,indonesia,2021.0,below average,title: pesan dari hati - catatan harianku\ndes...,"[0.01011586468666792, -0.008755766786634922, -...",0.664457,"[0.010142894461750984, -0.008770398795604706, ..."
303,7610,Aroma Cinta Ikan Asin,Movies,FTV,drama,"comedy,drama,romance",harris fabillah,"adinda azani,rizky alatas",Indonesia,Zakki (23th) seorang pemuda yang sukses namun ...,...,"comedy, drama, romance","adinda azani, rizky alatas",harris fabillah,indonesia,2018.0,below average,title: aroma cinta ikan asin\ndescription: Zak...,"[0.0014968585455790162, -0.0014408841961994767...",0.664148,"[0.0017264126800000668, -0.0010246782330796123..."
156,285,Mr. Bean,Series,Western,comedy,"comedy,sitcom","john birkin,john howard davies,paul weiland","matilda ziegler,rowan atkinson",Western,Acara ini mengikuti Mr Bean saat ia mencoba un...,...,"comedy, sitcom","matilda ziegler, rowan atkinson","john birkin, john howard davies, paul weiland",western,1990.0,average,title: mr. bean\ndescription: Acara ini mengik...,"[-0.0173481535166502, -0.00361390458419919, 0....",0.663463,"[-0.01741158775985241, -0.003733104793354869, ..."
19,8066,Criminal,Movies,Western,action,"action,sci-fi,thriller",ariel vromen,"alice eve,amaury nolasco,antje traue,gal gadot...",Western,Dalam upaya terakhir untuk menghentikan plot j...,...,"action, sci-fi, thriller","alice eve, amaury nolasco, antje traue, gal ga...",ariel vromen,western,2016.0,average,title: criminal\ndescription: Dalam upaya tera...,"[0.01395654771476984, -0.05585877597332001, 0....",0.653036,"[0.013694354332983494, -0.05607256293296814, 0..."
118,9483,The Bourne Legacy,Movies,Western,action,"action,adaptation,adventure,thriller",tony gilroy,"edward norton,jeremy renner,rachel weisz,stacy...",Western,"Setelah kegagalan Jason Bourne, CIA kembali me...",...,"action, adaptation, adventure, thriller","edward norton, jeremy renner, rachel weisz, st...",tony gilroy,western,2012.0,average,title: the bourne legacy\ndescription: Setelah...,"[-0.03363339602947235, -0.04320288076996803, 0...",0.652283,"[-0.03351672366261482, -0.04327357932925224, 0..."


In [None]:
# Look up 10 neighbours of the item stored under id 9372, with distances.
# NOTE(review): presumably 9372 is a film id that was added to the index — confirm
# it is part of the sampled rows before relying on this cell.
results = annoy_index.get_nns_by_item(9372, 10, -1, include_distances=True)
df_id_in(final_df, results)

In [None]:
# "satuan" = "single": embed each row's text with its own request so the
# vectors can be compared against the batched `embedding` column.
# The lambda parameter is named `text` so it does not shadow the
# `search_text` function defined earlier in the notebook.
df['embedding_satuan'] = df['search_text'].progress_apply(lambda text: embedding_text(model, text))

# Compare the two vectors of the first row component by component.
# The row is fetched once rather than twice per component, and the loop
# follows the vectors' own length instead of a hard-coded dimension.
first_row = df.iloc[0]
print("satuan\t\t\tbatch\t\t\tdiff")
for satuan, batch in zip(first_row['embedding_satuan'], first_row['embedding']):
    print(satuan,"\t", batch,"\t", satuan - batch)

In [None]:
# Displays the frame without the `embedding_satuan` column.
# NOTE(review): `drop` returns a new frame and the result is not assigned, so
# `df` itself still contains `embedding_satuan` after this cell. Assign the
# result back if the column is meant to be removed — confirm intent.
df.drop('embedding_satuan', axis=1)


In [67]:
df2 = pd.DataFrame([[2,3,1], [3,2,2], [2,4,4]], columns=list("ABC"))
df2.style.apply(lambda x: ["background: red" if v > x.iloc[0] else "" for v in x], axis = 1)

# df3 = pd.DataFrame(np.random.rand(4,3))
# df3.style.applymap(lambda x: 'background-color : yellow' if x>df3.iloc[0,0] else '')




Unnamed: 0,A,B,C
0,2,3,1
1,3,2,2
2,2,4,4
