In [12]:
import pandas
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
df = pandas.read_csv('../data/movies_filter.csv', keep_default_na=False)

In [14]:
df['release_date'].replace('', '1970-01-01', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['release_date'].replace('', '1970-01-01', inplace=True)


In [15]:
documents = df.to_dict(orient='records')

In [16]:
documents[500]

{'id': 779112,
 'title': "BTS World Tour 'Love Yourself - Speak Yourself' London",
 'genres': 'Music',
 'original_language': 'English',
 'overview': "At last it's Wembley! The milestone concert of 2 hours and 30 minutes that filled the immense stadium returns to ARMY all around the world!",
 'popularity': 114.009,
 'production_companies': '',
 'release_date': '2020-09-24',
 'budget': 0,
 'revenue': 0,
 'runtime': 226,
 'status': 'Released',
 'tagline': '',
 'vote_average': 8.8,
 'vote_count': 5,
 'credits': 'Kim Nam-joon-Kim Seok-jin-Min Yoon-gi-Jung Ho-seok-Park Ji-min-Kim Tae-hyung-Jeon Jung-kook',
 'keywords': ''}

In [17]:
len(documents)

560462

In [18]:
from elasticsearch import Elasticsearch, helpers
es_client = Elasticsearch('http://localhost:9200')

In [7]:
index_name = "movies"

In [69]:
if es_client.indices.exists(index='movies'):
    es_client.indices.delete(index='movies')

In [70]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "id": {"type": "integer", "null_value": 0},
            "title": {"type": "text"},
            "genres": {"type": "text"},
            "original_language": {"type": "keyword"},
            "overview": {"type": "text"},
            "popularity": {"type": "float"},
            "production_companies": {"type": "text"},
            "release_date": {"type": "date", "format": "yyyy-MM-dd", "null_value": "1970-01-01"},
            "budget": {"type": "float"},
            "revenue": {"type": "float"},
            "runtime": {"type": "float"},
            "status": {"type": "keyword"},
            "tagline": {"type": "text"},
            "vote_average": {"type": "float"},
            "vote_count": {"type": "float"},
            "credits": {"type": "text"},
            "keywords": {"type": "text"},
        }
    }
}

es_client.indices.create(index=index_name, body=index_settings) 

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})

In [71]:
# #  took so long to index, about 20it with 560k rows after filter, ~ 450 minutes
# for doc in tqdm(documents):
#     es_client.index(index=index_name, document=doc)

In [76]:
es_client.count(index=index_name)

ObjectApiResponse({'count': 560462, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [73]:
# Function to create the bulk actions
def generate_actions():
    for doc in documents:
        yield {
            "_index": index_name,  # Replace with your index name
            "_source": doc
        }

# Bulk indexing function with progress bar
def bulk_index(batch_size=1000):
    total_documents = len(documents)
    progress_bar = tqdm(total=total_documents, desc="Indexing Progress")
    
    for success, info in helpers.parallel_bulk(
        es_client,
        generate_actions(),
        chunk_size=batch_size
    ):
        if not success:
            print('A document failed:', info)
        progress_bar.update(batch_size)
    
    progress_bar.close()

In [74]:
bulk_index()

Indexing Progress:   0%|          | 0/560462 [04:13<?, ?it/s]             
Indexing Progress: 560462000it [00:36, 15487886.62it/s]


In [107]:
def elastic_search(query):

    search_query = {
        "size": 10,
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^3", "description^2", "overview^1.5", "genres", "keywords"],
                "type": "best_fields",
                "fuzziness": "AUTO"
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = [hit['_source'] for hit in response['hits']['hits']]
    
    return result_docs

In [113]:
prompt_template = """
You're a movie assistant. Answer the QUESTION based on the CONTEXT from our movies data.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


entry_template = """
title: {title}
genres: {genres}
original_language: {original_language}
overview: {overview}
popularity: {popularity}
production_companies: {production_companies} 
release_date: {release_date}
budget: {budget}
revenue: {revenue}
runtime: {runtime}
status: {status}
tagline: {tagline}
vote_average: {vote_average}
vote_count: {vote_count}
credits: {credits}
keywords: {keywords}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [102]:
import os
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

In [114]:
from huggingface_hub import InferenceClient

def llm(prompt, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    client = InferenceClient(
        model,
        token=HUGGINGFACE_TOKEN,
    )

    response = ""
    
    for message in client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500,
        stream=True,
    ):
         response += message.choices[0].delta.content
    return response

In [115]:
def rag(query, model='mistralai/Mixtral-8x7B-Instruct-v0.1'):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [116]:
question = 'how do you know about before sunrise, before sunset, before midnight?'
question = 'Julie Delpy and Ethan Hawke starred in which movies?'
answer = rag(question)
print(answer)

 Julie Delpy and Ethan Hawke starred in "The Space in Between" and are also cowriters for this movie. They are well known for their work together in the "Before" film series, including "Before Sunrise", "Before Sunset", and "Before Midnight". However, based on the provided context, they starred in "The Space in Between", a documentary where Linklater, Delpy, and Hawke discuss the "Before" trilogy.
