In [46]:
import pandas
from tqdm.auto import tqdm
import json

In [47]:
df = pandas.read_csv('../data/random_200_movies.csv', keep_default_na=False)

In [48]:
df['release_date'].replace('', '1970-01-01', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['release_date'].replace('', '1970-01-01', inplace=True)


In [49]:
documents = df.to_dict(orient='records')

In [50]:
len(documents)

200

In [51]:
prompt_template = """
You emulate a user of our movies assistant application.
Formulate 5 questions this user might ask based on a provided movies.
Make the questions specific to this movies.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

title: {title}
genres: {genres}
original_language: {original_language}
overview: {overview}
popularity: {popularity}
production_companies: {production_companies} 
release_date: {release_date}
budget: {budget}
revenue: {revenue}
runtime: {runtime}
status: {status}
tagline: {tagline}
vote_average: {vote_average}
vote_count: {vote_count}
credits: {credits}
keywords: {keywords}

Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question5"]}}

please dont add any extra information, dont add any node, dont use any special characters like, and dont use any abbreviations.
""".strip()

In [19]:
from huggingface_hub import InferenceClient
import os

HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

def llm(prompt, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    client = InferenceClient(
        model,
        token=HUGGINGFACE_TOKEN,
    )

    response = ""
    
    for message in client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=700,
        stream=True,
    ):
         response += message.choices[0].delta.content
    return response

In [32]:
results = {}

In [33]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    return llm(prompt)

In [52]:

for doc in tqdm(documents): # hugging face limit 50 requests per minute so i need 
                            # to wait, or save the already data first then concat with the new come late 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

100%|██████████| 73/73 [00:42<00:00,  1.71it/s]


In [53]:
results

{483836: ['What is the title of the drama movie about an intense bromance between a gay football player and a straight military guy?',
  "What is the original language of the movie 'Forces' released in 2016?",
  "Can you provide a summary of the movie 'Forces'?",
  "How popular is the movie 'Forces' based on its popularity score?",
  "Who are the credits for the movie 'Forces' released in 2016?"],
 372628: ['What is the title of the 10 year anniversary celebration film featuring fan favorites from Celtic Woman?',
  'What type of genre does Celtic Woman: Fan Favorites belong to?',
  'What is the original language of the movie Celtic Woman: Fan Favorites?',
  'Can you describe the overview of the movie Celtic Woman: Fan Favorites?',
  'What is the runtime of the movie Celtic Woman: Fan Favorites?'],
 633439: ['What is the title of the Chinese thriller movie that was released in 2018 and deals with a fan abducting a writer?',
  "Who are the directors of the movie 'The Very Last Day' that 

In [54]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [55]:
final_results[0]

(483836,
 'What is the title of the drama movie about an intense bromance between a gay football player and a straight military guy?')

In [56]:
df_results = pandas.DataFrame(final_results, columns=['id', 'question'])


In [58]:
df = pandas.read_csv('../data/ground-truth-retrieval.csv')


In [59]:
df = pandas.concat([df, df_results])

In [60]:
df.count()

id          1000
question    1000
dtype: int64

In [62]:
df.to_csv('../data/ground-truth-retrieval.csv', index=False)