In [48]:
import pandas
from tqdm.auto import tqdm

In [49]:
df = pandas.read_csv('../data/movies.csv', keep_default_na=False)

In [50]:
df['release_date'].replace('', '1970-01-01', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['release_date'].replace('', '1970-01-01', inplace=True)


In [51]:
documents = df.to_dict(orient='records')

In [52]:
from elasticsearch import Elasticsearch, helpers
es_client = Elasticsearch('http://localhost:9200')

index_name = "movies"

In [53]:
if es_client.indices.exists(index='movies'):
    es_client.indices.delete(index='movies')

In [54]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "id": {"type": "integer", "null_value": 0},
            "title": {"type": "text"},
            "genres": {"type": "text"},
            "original_language": {"type": "keyword"},
            "overview": {"type": "text"},
            "popularity": {"type": "float"},
            "production_companies": {"type": "text"},
            "release_date": {"type": "date", "format": "yyyy-MM-dd", "null_value": "1970-01-01"},
            "budget": {"type": "float"},
            "revenue": {"type": "float"},
            "runtime": {"type": "float"},
            "status": {"type": "keyword"},
            "tagline": {"type": "text"},
            "vote_average": {"type": "float"},
            "vote_count": {"type": "float"},
            "credits": {"type": "text"},
            "keywords": {"type": "text"},
        }
    }
}

es_client.indices.create(index=index_name, body=index_settings) 

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})

In [55]:
# #  took so long to index, about 20it with 560k rows after filter, ~ 450 minutes
# for doc in tqdm(documents):
#     es_client.index(index=index_name, document=doc)

In [59]:
es_client.count(index=index_name).get('count')

722359

In [57]:
# Function to create the bulk actions
def generate_actions():
    for doc in documents:
        yield {
            "_index": index_name,  # Replace with your index name
            "_source": doc
        }

# Bulk indexing function with progress bar
def bulk_index(batch_size=2000):
    total_documents = len(documents)
    progress_bar = tqdm(total=total_documents, desc="Indexing Progress")
    
    for success, info in helpers.parallel_bulk(
        es_client,
        generate_actions(),
        chunk_size=batch_size
    ):
        if not success:
            print('A document failed:', info)
        progress_bar.update(batch_size)
    
    progress_bar.close()

In [None]:
bulk_index()

## RAG flow

In [8]:
import os
from dotenv import load_dotenv

load_dotenv()
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

In [9]:
def elastic_search(query):

    search_query = {
        "size": 10,
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^3", "description^2", "overview^1.5", "genres", "keywords"],
                "type": "best_fields",
                "fuzziness": "AUTO"
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = [hit['_source'] for hit in response['hits']['hits']]
    
    return result_docs

In [10]:
prompt_template = """
You're a movie assistant. Answer the QUESTION based on the CONTEXT from our movies data.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


entry_template = """
title: {title}
genres: {genres}
original_language: {original_language}
overview: {overview}
popularity: {popularity}
production_companies: {production_companies} 
release_date: {release_date}
budget: {budget}
revenue: {revenue}
runtime: {runtime}
status: {status}
tagline: {tagline}
vote_average: {vote_average}
vote_count: {vote_count}
credits: {credits}
keywords: {keywords}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [48]:
from huggingface_hub import InferenceClient

def llm(prompt, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    client = InferenceClient(
        model,
        token=HUGGINGFACE_TOKEN,
    )

    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500
    )
    
    return response.choices[0].message.content

In [49]:
def rag(query, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [50]:
question = 'Julie Delpy and Ethan Hawke starred in which movies?'
answer = rag(question)
print(answer)

 Based on the provided context, Julie Delpy and Ethan Hawke starred in "Before Midnight" and "The Space in Between". "Before Midnight" is not included in the context, but it is part of the trilogy discussed in "The Space in Between" where both actors are listed in the credits.


In [None]:
query = 'Julie Delpy and Ethan Hawke starred in which movies?'

prompt = build_prompt(query, elastic_search(query))

In [37]:

client = InferenceClient(
        'mistralai/Mixtral-8x7B-Instruct-v0.1',
        token=HUGGINGFACE_TOKEN,
    )
response = ""


test  = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500
        # stream=True,
    )

    
        # response += message.choices[0].delta.content

In [41]:
test.choices[0].message.content

' Based on the provided context, Julie Delpy and Ethan Hawke starred in "Before Midnight" and "The Space in Between". "Before Midnight" is not included in the context, but it is part of the trilogy discussed in "The Space in Between" where both actors are listed in the credits.'

In [47]:
test.usage

ChatCompletionOutputUsage(completion_tokens=68, prompt_tokens=2644, total_tokens=2712)