In [1]:
import pandas as pd
# `tqdm._tqdm_notebook` is a deprecated private path; `tqdm.notebook` is the
# supported public module that exports the same `tqdm_notebook` class.
from tqdm.notebook import tqdm_notebook

# Enable the `progress_apply` method on pandas objects so the row-wise
# LLM / embedding calls later in the notebook display a progress bar.
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
# Labelled routing dataset. The `query` and `human` columns are used below;
# `human` appears to be the manually assigned route label (TODO confirm).
routing_dataset_path = 'data/chat_routing_fix.xlsx'
df = pd.read_excel(routing_dataset_path)
df.head()

Unnamed: 0,query,route,human
0,dian sastro membintangi film apa saja?,vidio-info,recommendation
1,halo,vidio-info,vidio-info
2,kenapa kurma?,vidio-info,vidio-info
3,Carikan saya 1 series terbaik yang ada di vidio,recommendation,recommendation
4,reza rahardian,vidio-info,vidio-info


In [42]:
from langchain_google_vertexai import VertexAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate


# Routing prompt, spelled out line by line so that whitespace (including the
# trailing spaces on the first line) is visible and preserved exactly.
route_prompt_text = (
    "You are chatbot for Vidio OTT platform        \n"
    "Given the user question below, classify it as either being about\n"
    "`recommendation` if question related to user want film recommendation\n"
    "`schedule` if question related to sport match schedule or sport team\n"
    "`vidio-info` otherwise\n"
    "\n"
    "Do not respond with other word than specified.\n"
    "\n"
    "<user_query>\n"
    "{user_query}\n"
    "</user_query>\n"
    "\n"
    "Classification:\n"
)

# NOTE(review): temperature=0.4 makes the classification non-deterministic, so
# the accuracy figures below can vary between runs; consider 0 if this does not
# need to mirror another configuration.
model = VertexAI(
    model_name="gemini-pro",
    project="vidio-quiz-prod",
    location="asia-southeast1",
    temperature=0.4,
)

# prompt -> LLM -> plain string
route_prompt = PromptTemplate.from_template(route_prompt_text)
route_chain = route_prompt | model | StrOutputParser()

def chain_route(user_query):
    """Classify a user query into a route label with the LLM routing chain.

    Args:
        user_query: Raw user message to classify.

    Returns:
        The text produced by ``route_chain``, normalised so it can be compared
        by exact equality with the expected labels: surrounding whitespace and
        wrapping backticks/quotes are removed and the text is lower-cased.
        Returns ``""`` when the chain raises ``IndexError``.
    """
    try:
        raw_label = route_chain.invoke({"user_query": user_query})
    except IndexError:
        # Deliberate best-effort: keep the batch evaluation running and record
        # an empty label for this row.
        # NOTE(review): presumably raised when the model returns no candidate
        # (e.g. a blocked response) — confirm.
        return ""
    # The prompt shows labels wrapped in backticks and ends with a newline, so
    # the model may echo either; strip them before the equality check.
    return raw_label.strip().strip("`'\"").strip().lower()

In [43]:
# One LLM call per query; the callable is passed directly, no wrapper needed.
df['llm'] = df['query'].progress_apply(chain_route)

  0%|          | 0/224 [00:00<?, ?it/s]

In [73]:
# 1 when the LLM route equals the human label for that row, else 0.
df['recall'] = (df['human'] == df['llm']).astype('int64')

In [74]:
# `recall` is a per-row 0/1 flag (llm == human), so `mean` here is the share of
# all rows where the LLM route matches the human label.
df['recall'].describe()

count    224.000000
mean       0.821429
std        0.383851
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: recall, dtype: float64

In [75]:
# Restrict to rows labelled `vidio-info` by humans; the mean is that class's recall.
df.loc[df['human'] == 'vidio-info', 'recall'].describe()

count    127.000000
mean       0.685039
std        0.466340
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: recall, dtype: float64

In [76]:
# Rows humans labelled `vidio-info` that the LLM routed elsewhere.
labelled_vidio_info = df['human'] == 'vidio-info'
llm_missed = df['recall'] == 0
df[labelled_vidio_info & llm_missed]

Unnamed: 0,query,route,human,llm,recall,similarity
4,reza rahardian,vidio-info,vidio-info,recommendation,0,recommendation
8,Apa sinopsis dari serial Happiness di atas?,vidio-info,vidio-info,recommendation,0,recommendation
12,apa yang bagus dari film ratu adil di vidio?,vidio-info,vidio-info,recommendation,0,recommendation
20,tapi film-film ini tidak sama dengan sebelumnya,vidio-info,vidio-info,recommendation,0,recommendation
27,film yang dimainkan idan sastro,vidio-info,vidio-info,recommendation,0,recommendation
40,itu sudah tamat?,vidio-info,vidio-info,recommendation,0,recommendation
41,berikan link ratu adil,vidio-info,vidio-info,recommendation,0,recommendation
44,virio original series apa yang sedang berjalan...,vidio-info,vidio-info,recommendation,0,vidio-info
57,di vidio ada film bokep gak?,vidio-info,vidio-info,,0,vidio-info
60,rekomendasikan konten porno dong,vidio-info,vidio-info,,0,recommendation


In [50]:
from vertexai.language_models import TextEmbeddingModel
# NOTE(review): this rebinds the global `model`, which the routing cell earlier
# assigned to the VertexAI LLM. Every later use of `model` therefore gets the
# embedding model; a distinct name would avoid order-dependent surprises.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual")

def embedding_text(model, text):
    """Return the embedding vector for a single piece of text.

    Args:
        model: Object exposing ``get_embeddings(list_of_texts)`` whose returned
            items each expose a ``.values`` attribute.
        text: The text to embed.

    Returns:
        The ``.values`` of the embedding returned for ``text``.

    Raises:
        ValueError: If the model returns no embeddings (previously this
            surfaced as an ``UnboundLocalError`` on the return statement).
    """
    embeddings = model.get_embeddings([text])
    if not embeddings:
        raise ValueError("embedding model returned no embeddings for the given text")
    # Exactly one text is sent, so one embedding is expected; taking the last
    # item keeps the result identical to the previous loop-based version.
    return embeddings[-1].values

In [64]:
import numpy as np

# Prompt text for the `recommendation` route. Placeholders: {format_instructions},
# {user_profile}, {user_query}, {vector_search_result}, {user_history}.
# In this notebook the template is embedded as-is (placeholders unfilled) and
# used as the reference vector for the route.
template_recommendation = """You are content expert from OTT company.
Your task is to give recommendation based on User Query, Movie Data retrieved from Vector Search and User History.
The answer in **Bahasa Indonesia**, give **5 (five)** recommendation, re-rank the recommendation based on User Query and User History relevancy.
{format_instructions}
Always use title case for title field.
Do not recommend content from User History unless it really relevant for them based on their query.
Give the explanation for each content why it relevant for the user based on the User Query, User Profile, Vector Search Result and User History. The explanation should show only if it is relevant to be shown to user and summarize it based on User Query and User History.

Here is the context.
{user_profile}

User Query: {user_query}

Vector Search Result:
{vector_search_result}

User History:
{user_history}"""

# Prompt text for the `schedule` route. Placeholders: {context}, {user_question}.
# The literal previously opened with four quote characters, which put a stray
# `"` at the start of the prompt (and therefore into its embedding).
template_schedule = """Based on context answer following question, answer with human readable form.
Context:
{context}
Question: {user_question}
Answer:"""

# Prompt text for the `vidio-info` route. Placeholders: {information}, {user_query}.
# Embedded as-is below to act as the reference vector for this route; the text
# (including the trailing space after {information}) is left untouched.
template_vidio_info = """You are chatbot for Vidio OTT platform. Your task is to answer to user question
Question can is related to vidio. Like about vidio product, and film.
Or question can be unrelated to vidio

Here are the useful information:
<information>
{information} 
</information>

User Question:
<user_query>
{user_query}
</user_query>"""

# Order matters: position i here must correspond to answers[i] in prompt_router
# (recommendation, schedule, vidio-info).
prompt_templates = [
    template_recommendation,
    template_schedule,
    template_vidio_info,
]

# One embedding request per template, computed once and reused for every query.
prompt_embeddings = []
for template_text in prompt_templates:
    prompt_embeddings.append(embedding_text(model, template_text))

def prompt_router(input):
    """Route a query to the label whose prompt template it is most similar to.

    The query is embedded with ``embedding_text(model, input)`` and compared
    against the precomputed ``prompt_embeddings`` using cosine similarity.

    Args:
        input: The user query text.

    Returns:
        One of ``'recommendation'``, ``'schedule'`` or ``'vidio-info'``,
        chosen by the highest cosine similarity.
    """
    # Label order must stay aligned with the order of `prompt_embeddings`.
    answers = ['recommendation', 'schedule', 'vidio-info']

    query_vec = np.asarray(embedding_text(model, input), dtype=float)
    prompt_mat = np.asarray(prompt_embeddings, dtype=float)

    # Cosine similarity rather than a raw dot product: a raw dot product lets
    # vector magnitude influence the ranking. For unit-length embeddings the
    # two give the same ranking, so this is backward-compatible.
    query_norm = np.linalg.norm(query_vec) or 1.0  # guard against a zero vector
    prompt_norms = np.linalg.norm(prompt_mat, axis=1)
    prompt_norms[prompt_norms == 0] = 1.0
    similarity = (prompt_mat @ query_vec) / (prompt_norms * query_norm)

    return answers[int(np.argmax(similarity))]


In [77]:
# Work on a copy so the LLM evaluation columns in `df` are left untouched.
similarity_df = df.copy()

# Route every query with the embedding-similarity router.
similarity_df['similarity'] = similarity_df['query'].progress_apply(prompt_router)

# 1 when the similarity route equals the human label for that row, else 0.
similarity_df['recall'] = (
    similarity_df['human'] == similarity_df['similarity']
).astype('int64')

  0%|          | 0/224 [00:00<?, ?it/s]

In [78]:
# `recall` is a per-row 0/1 flag (similarity == human), so `mean` here is the
# share of all rows where the embedding router matches the human label.
similarity_df['recall'].describe()

count    224.000000
mean       0.549107
std        0.498697
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: recall, dtype: float64

In [79]:
# Restrict to rows labelled `vidio-info` by humans; the mean is that class's recall.
similarity_df.loc[similarity_df['human'] == 'vidio-info', 'recall'].describe()

count    127.000000
mean       0.322835
std        0.469412
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: recall, dtype: float64

In [80]:
# Rows humans labelled `vidio-info` that the embedding router sent elsewhere.
sim_labelled_vidio_info = similarity_df['human'] == 'vidio-info'
sim_missed = similarity_df['recall'] == 0
similarity_df[sim_labelled_vidio_info & sim_missed]

Unnamed: 0,query,route,human,llm,recall,similarity
1,halo,vidio-info,vidio-info,vidio-info,0,recommendation
2,kenapa kurma?,vidio-info,vidio-info,vidio-info,0,recommendation
4,reza rahardian,vidio-info,vidio-info,recommendation,0,recommendation
8,Apa sinopsis dari serial Happiness di atas?,vidio-info,vidio-info,recommendation,0,recommendation
10,how long to beat sekiro,other,vidio-info,vidio-info,0,recommendation
...,...,...,...,...,...,...
214,siapa presiden indonesia,vidio-info,vidio-info,vidio-info,0,recommendation
217,apple,vidio-info,vidio-info,vidio-info,0,schedule
220,generenya apa saja itu,vidio-info,vidio-info,vidio-info,0,recommendation
221,selamat siang,vidio-info,vidio-info,vidio-info,0,recommendation
