In [None]:
!pip install indexify langchain langchain_community

# Accurate Image RAG using Yolo and CodeGemma

Most language models (especially smaller ones) don't have vision capabilities. In this example, we augment them with vision capabilities by automatically injecting structured data extracted from images. The pipeline is tested to work at any scale: on laptops, and with tens of thousands of images in the cloud.

What happens behind the scenes:
1. Indexify extracts structured data from images and automatically populates it into a table as they are ingested.
2. The LLM is presented with the schema of the underlying table so that it can generate a SQL query for the question.
3. The Indexify client runs the generated SQL query and provides the results to the LLM, which uses them to answer the question.

### Download Indexify and the Yolo Extractor
```bash
curl https://www.tensorlake.ai | sh
```

1. Start the Server
   ```bash
    ./indexify server -d
    ```

2. Download and start the extractor
   ```bash
    indexify-extractor download hub://image/yolo
    indexify-extractor join-server yolo.yolo_extractor:YoloExtractor
    ```

3. We use Ollama to run small models locally (this notebook uses `codegemma`); you can point LangChain at any other Ollama model or at a hosted model.

In [221]:
from indexify import IndexifyClient
# Create the client with its default arguments.
# NOTE(review): presumably this targets the local Indexify server started in
# the setup steps above — confirm.
client = IndexifyClient()

In [223]:
# Register an extraction policy named "object_detection" that uses the
# 'tensorlake/yolo-extractor' extractor.
client.add_extraction_policy(extractor='tensorlake/yolo-extractor', name="object_detection")

In [224]:
# Look up the DDL of the "ingestion" table. This string is later interpolated
# into the SQL-generation prompt so the LLM knows which columns exist.
schema = client.list_schemas()["ddls"]["ingestion"]
schema

'CREATE TABLE IF NOT EXISTS "ingestion" ("content_id" TEXT NULL, "bounding_box" LIST NULL, "object_name" TEXT NULL);'

In [225]:
# The remote file is a .jpg, so declare the matching JPEG MIME type.
response = client.ingest_remote_file("https://extractor-files.diptanu-6d5.workers.dev/images/Central_Park_Lake.jpg", "image/jpeg", {"location": "central park"})
# Keep the content id so that later questions can be scoped to this image.
content_id = response['content_id']

In [226]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama


In [227]:
def ask(prompt, question):
    """Run `question` through `prompt` and the CodeGemma model.

    Args:
        prompt: A prompt object that is piped between the input mapping and
            the model.
        question: The value handed to the chain; it is forwarded unchanged
            under the "question" key.

    Returns:
        The result of invoking the chain, after `StrOutputParser`.
    """
    llm = Ollama(model="codegemma")
    inputs = {"question": RunnablePassthrough()}
    # Same left-to-right composition: inputs -> prompt -> model -> parser.
    pipeline = inputs | prompt | llm | StrOutputParser()
    return pipeline.invoke(question)
    
    

In [228]:
def generatate_sql_from_question(question):
    """Ask the LLM to translate a natural-language question into SQL.

    The table DDL held in the notebook-level `schema` variable is baked into
    the prompt text, while `{question}` is left as a template placeholder that
    is filled in when the chain runs.

    Args:
        question: The user's question.

    Returns:
        The SQL text produced by `ask`; it is also printed.
    """
    # Only the schema is interpolated here. The question placeholder is added
    # as a plain (non-f) string so its braces survive for the prompt template.
    instructions = f"""
    Images are stored in the database with the following schema:
    {schema}

    fyi. 

    Generate the SQL query as raw text, without any explanation, based on the following question below:
    1. Generate only the raw SQL statement as text
    2. Please don't add any backticks in the response 
    3. The object_name column has entity name of the object detected in the content. The values can be person, boat, bulb, etc.
    4. Add predicates appropiately

    """
    sql_prompt = ChatPromptTemplate.from_template(instructions + "Question: {question}")
    sql_text = ask(sql_prompt, question)
    print(f"Generated SQL: {sql_text}")
    return sql_text

def run_sql(query):
    """Run a SQL query through the Indexify client and return the result as text.

    Args:
        query: The SQL statement to execute.

    Returns:
        The pretty-printed `result` of the query with all curly braces removed.
    """
    # Imported here so the function works on a fresh kernel; the notebook's
    # import cells do not bring `pformat` into scope.
    from pprint import pformat

    query_result = client.sql_query(query)
    # Braces are stripped because this text is later embedded in a string that
    # is parsed as a prompt template, where braces mark template variables.
    return pformat(query_result.result).replace('{', '').replace('}', '')


def answer_from_results(question, generated_sql, query_result):
    """Ask the LLM to phrase a final answer from the SQL query and its result.

    The question, the SQL and the query result are baked into the prompt text.
    Literal curly braces in any of them are doubled first, so that
    `ChatPromptTemplate.from_template` treats them as plain text instead of
    as template variables that nothing would fill in.

    Args:
        question: The question the user asked.
        generated_sql: The SQL statement that was executed.
        query_result: The textual result returned by that statement.

    Returns:
        Whatever `ask` returns for the assembled prompt.
    """
    def _escape(value):
        # `{{` and `}}` render as literal braces in f-string style templates.
        return str(value).replace("{", "{{").replace("}", "}}")

    template = f"""
    The question user asked is:
    {_escape(question)}
    We ran a database query:  {_escape(generated_sql)}
    The query returned the result: {_escape(query_result)}

    FYI. 

    """
    prompt = ChatPromptTemplate.from_template(template)
    return ask(prompt, question)

def ask_question(question):
    """Answer a natural-language question about the ingested images.

    Generates SQL for the question, runs it, and then asks the LLM to turn
    the query result into an answer.

    Args:
        question: The user's question.

    Returns:
        The value returned by `answer_from_results`.
    """
    generated_sql = generatate_sql_from_question(question)
    rows_text = run_sql(generated_sql)
    return answer_from_results(question, generated_sql, rows_text)
    


In [229]:
# Embed the content id of the image ingested above in the question, so the
# generated SQL can filter on that specific image.
response = ask_question(f"how many people are in content_id: {content_id}?")
print(response)

Generated SQL: SELECT COUNT(*)
FROM ingestion
WHERE content_id = 'mbntU1flW0qZFZSl' AND object_name = 'person';
The query returned a result of 13, indicating that there are 13 people in content_id: mbntU1flW0qZFZSl.


In [230]:
# Ingest a batch of additional images so that questions can span several photos.
file_names = ["skate.jpg", "congestion.jpg", "bushwick-bred.jpg", "141900.jpg", "132500.jpg", "123801.jpg", "120701.jpg", "103701.jpg"]
file_urls = [f"https://extractor-files.diptanu-6d5.workers.dev/images/{file_name}" for file_name in file_names]
for file_url in file_urls:
    # Every file in the batch is a .jpg, so declare the matching JPEG MIME type.
    client.ingest_remote_file(file_url, "image/jpeg", {})



In [233]:
# This question is not scoped to one content id; it asks about every image
# that contains a boat.
response = ask_question("List all the content_id with boat and also print the number of boats in each of the photos?")
print(response)

Generated SQL: SELECT content_id, COUNT(*) AS num_boats
FROM ingestion
WHERE object_name = 'boat'
GROUP BY content_id;
Based on the provided query results, the content_id with boat and the number of boats in each photo are:

| content_id | num_boats |
|---|---|
| mbntU1flW0qZFZSl | 8 |
| B4jdKmmKzlg3buza | 15 |
| 8mtV1EXpnLl4_1jS | 6 |
| E3eZbhOSRhWQ6aCE | 18 |
