In [None]:
!pip install indexify langchain langchain_openai

# Accurate Image RAG using YOLO and GPT-3.5

LLMs don't produce accurate results when they are asked to count objects. In this example we show how to create a pipeline that ingests images, extracts structured data from them, and has the LLM use SQL queries for retrieval.

This allows using much smaller LLMs which are not capable of vision but can generate good SQL and reason on data.

### Download Indexify and the Yolo Extractor
```bash
curl https://www.tensorlake.ai | sh
```

1. Start the Server
   ```bash
    ./indexify server -d
    ```

2. Download and start the extractor
   ```bash
    indexify-extractor download hub://image/yolo
    indexify-extractor join-server yolo.yolo_extractor:YoloExtractor
    ```



In [100]:
from indexify import IndexifyClient
# Client handle shared by every later cell. It is constructed with no arguments,
# so it relies on the library's default connection settings (presumably the
# local server started in the setup steps above — confirm).
client = IndexifyClient()

In [101]:
# Register an extraction policy called "object_detection" that is backed by
# the 'tensorlake/yolo-extractor' extractor.
client.add_extraction_policy(
    name="object_detection",
    extractor='tensorlake/yolo-extractor',
)

In [102]:
# Pick the "ingestion" entry out of the "ddls" section of the schema listing.
# This value is embedded verbatim in the SQL-generation prompt further down.
schema = client.list_schemas()["ddls"]["ingestion"]

In [130]:
# Ingest one sample photo by URL, labelled with its location.
# The remote file is a JPEG (".jpg"), so it is declared as "image/jpeg" rather
# than "image/png" to keep the stored MIME type consistent with the content.
response = client.ingest_remote_file(
    "https://extractor-files.diptanu-6d5.workers.dev/images/Central_Park_Lake.jpg",
    "image/jpeg",
    {"location": "central park"},
)
# Keep the id of the ingested content; a later cell asks a question about it.
content_id = response['content_id']

In [122]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI


In [126]:
def ask(prompt, question):
    """Run ``question`` through ``prompt`` and a chat model, returning plain text.

    Args:
        prompt: Prompt template placed in the chain right after the input
            mapping; it receives the question under the key ``"question"``.
        question: Value passed to ``invoke`` and forwarded unchanged.

    Returns:
        The model response after it has gone through ``StrOutputParser``.
    """
    llm = ChatOpenAI()
    # Same pipeline as before, written on one line: map input -> prompt -> model -> text.
    pipeline = {"question": RunnablePassthrough()} | prompt | llm | StrOutputParser()
    return pipeline.invoke(question)
    
    

In [127]:
def generatate_sql_from_question(question):
    """Ask the LLM to write a SQL query for ``question``.

    The notebook-level ``schema`` value is embedded in the prompt so the model
    can see how the images are stored.

    Args:
        question: Natural-language question to be turned into SQL.

    Returns:
        The text returned by ``ask`` for the generated prompt.
    """
    # The schema is spliced into text that is subsequently parsed as a prompt
    # template. Any literal brace in it would be read as a template variable,
    # so braces are doubled to keep them as plain text in the rendered prompt.
    schema_text = str(schema).replace("{", "{{").replace("}", "}}")
    template = f"""
    Images are stored in the database with the following schema:
    {schema_text}

    fyi. 

    Generate the SQL query based on the following question below:

    """ + "Question: {question}"
    prompt = ChatPromptTemplate.from_template(template)
    generated_sql = ask(prompt, question)
    return generated_sql

def run_sql(query):
    """Execute ``query`` through the Indexify client and return the result as text.

    Args:
        query: SQL string passed to ``client.sql_query``.

    Returns:
        The pretty-printed ``.result`` of the query with every ``{`` and ``}``
        character removed.
    """
    # pformat is not imported anywhere else in the notebook, so without this
    # import the function raises NameError on a fresh kernel.
    from pprint import pformat

    query_result = client.sql_query(query)
    # Braces are stripped from the formatted text; presumably because it is
    # later spliced into a prompt template, where braces mark variables.
    query_result = pformat(query_result.result).replace('{', '').replace('}', '')
    return query_result


def answer_from_results(question, generated_sql, query_result):
    """Ask the LLM to answer ``question`` given the SQL that ran and its result.

    Args:
        question: The user's original question.
        generated_sql: The SQL query that was executed.
        query_result: Text form of what the query returned.

    Returns:
        The text returned by ``ask`` for the generated prompt.
    """
    def _literal(value):
        # Double the braces so the prompt-template parser keeps them as plain
        # text instead of treating them as template variables.
        return str(value).replace("{", "{{").replace("}", "}}")

    # All three values are baked into the template text itself; the rendered
    # prompt is unchanged for inputs that contain no braces.
    template = f"""
    The question user asked is:
    {_literal(question)}
    We ran a database query:  {_literal(generated_sql)}
    The query returned the result: {_literal(query_result)}

    FYI. 

    """
    prompt = ChatPromptTemplate.from_template(template)
    return ask(prompt, question)

def ask_question(question):
    """Answer ``question`` in three steps: generate SQL, run it, phrase the answer.

    Args:
        question: Natural-language question about the ingested images.

    Returns:
        The value returned by ``answer_from_results``.
    """
    generated_query = generatate_sql_from_question(question)
    rows_text = run_sql(generated_query)
    return answer_from_results(question, generated_query, rows_text)
    


In [131]:
# Ask about the single image ingested earlier, identified by its content_id.
response = ask_question(f"how many people are in content_id: {content_id}?")
# Bare expression so the notebook displays the answer as the cell output.
response


"There are 14 people in content_id '5rBcNSgU4YqAxoiu'."

In [113]:
# Ingest a batch of additional sample photos from the same host, with no labels.
file_names=["skate.jpg", "congestion.jpg", "bushwick-bred.jpg", "141900.jpg", "132500.jpg", "123801.jpg","120701.jpg", "103701.jpg"]
file_urls = [f"https://extractor-files.diptanu-6d5.workers.dev/images/{file_name}" for file_name in file_names]
for file_url in file_urls:
    # Every file is a ".jpg", so declare "image/jpeg" rather than "image/png"
    # to keep the stored MIME type consistent with the content.
    client.ingest_remote_file(file_url, "image/jpeg", {})



In [132]:
# Ask a question that spans all ingested images rather than a single content_id.
response = ask_question("List all the content_id with boat and also print the number of boats in each of the photos?")
# print() is used so the multi-line answer is shown with real line breaks.
print(response)

Here are the content_id's with the number of boats in each of the photos:
- Content ID: uqGr1N8l42o9kfDI, Number of boats: 8
- Content ID: Li50p9XKlhfLs9ke, Number of boats: 15
- Content ID: -K7LcspteAvmoO3s, Number of boats: 6
- Content ID: gBqaqxd7anhHY-yU, Number of boats: 18
- Content ID: pibDeT6mnJCdKlOA, Number of boats: 8
- Content ID: ZHRMX0HgjSQVybaR, Number of boats: 8
