In [16]:
# wget for Windows PowerShell
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py -OutFile minsearch.py 


'wget' is not recognized as an internal or external command,
operable program or batch file.


In [9]:
import pandas
from tqdm.auto import tqdm

In [10]:
df = pandas.read_csv('../data/movies_filter.csv', keep_default_na=False, dtype= str)

In [11]:
documents = df.to_dict(orient='records')

In [12]:
import minsearch

In [13]:
index = minsearch.Index(
    text_fields=['title', 'genres', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 
       'runtime', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords'],
    keyword_fields=['id', 'original_language','status']
)

In [14]:
index.fit(documents)

<minsearch.Index at 0x2952e258790>

In [30]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [31]:
prompt_template = """
You're a movie assistant. Answer the QUESTION based on the CONTEXT from our movies data.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


entry_template = """
id: {id}
title: {title}
genres: {genres}
original_language: {original_language}
overview: {overview}
popularity: {popularity}
production_companies: {production_companies} 
release_date: {release_date}
budget: {budget}
revenue: {revenue}
runtime: {runtime}
status: {status}
tagline: {tagline}
vote_average: {vote_average}
vote_count: {vote_count}
credits: {credits}
keywords: {keywords}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [54]:
# import os
# HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

In [51]:
from huggingface_hub import InferenceClient

def llm(prompt, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    client = InferenceClient(
        model,
        token=HUGGINGFACE_TOKEN,
    )

    response = ""
    
    for message in client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500,
        stream=True,
    ):
         response += message.choices[0].delta.content
    return response

In [52]:
def rag(query, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [55]:
question = 'how do you know about before sunrise, before sunset, before midnight?'
answer = rag(question)
print(answer)

 Based on the context provided, I can see that there are several movies with the word "Before" in the title. However, the specific movies you're asking about are "Before Sunrise", "Before Sunset", and "Before Midnight". These movies are part of a trilogy directed by Richard Linklater and star Ethan Hawke and Julie Delpy. The films explore the relationship between two characters, Celine and Jesse, as they meet on a train and spend a day together in Vienna (Before Sunrise), reunite nine years later in Paris (Before Sunset), and take a trip to Greece with their families (Before Midnight).

There are also behind-the-scenes documentaries for "Before Sunrise" and "Before Sunset," which provide insight into the making of those films. None of the movies or documentaries are directly related to the data you provided for movies with the keyword "before dawn" or "before" in the title.


model available in HF:
- mistralai/Mistral-Nemo-Instruct-2407
- google/gemma-2-2b-it
- meta-llama/Meta-Llama-3-8B-Instruct
- mistralai/Mistral-7B-Instruct-v0.3
- mistralai/Mixtral-8x7B-Instruct-v0.1
