### Information retrieval from text and images

In [1]:
%load_ext autoreload
%autoreload 2

import os
# Switch the working directory to everything that precedes the first "core" in
# the current path (presumably the project root, so the `core.*` imports in the
# next cell resolve — confirm). If "core" is absent the directory is unchanged.
curpath = os.getcwd()
project_root = curpath.partition("core")[0]
os.chdir(project_root)

In [2]:
import json
import time
import logging
logging.basicConfig(level=logging.INFO)


from typing import List, Dict, Any, Tuple, Union
from tqdm import tqdm

from prompts import SIMPLE_INFO_RETRIEVAL_TEXT_PROMPT, SIMPLE_INFO_RETRIEVAL_IMAGE_PROMPT
from core.features.topic_segregation.utils import add_dicts, calculate_cost_gpt4_8k, calculate_cost_gpt4_turbo
from core.features.provider import creator

from dotenv import load_dotenv
load_dotenv()

# Default keyword arguments forwarded to `creator` for each kind of request.
# The text defaults request a JSON-object response format; the vision defaults
# set only the model name and temperature.
text_model_defaults = dict(
    model="gpt-4-1106-preview",
    temperature=0.1,
    response_format={"type": "json_object"},
)
vision_model_defaults = dict(
    model="gpt-4-vision-preview",
    temperature=0.1,
)

### From text

In [7]:

def information_retrieval_from_text(raw_text: str, queries: List):
    """Run a list of queries against a block of raw text using the text model.

    Args:
        raw_text: Text to search. Inputs longer than 100000 characters are rejected.
        queries: Non-empty list of queries. They are numbered starting at 1
            before being placed in the prompt.

    Returns:
        A dict with two keys: "output" holds the content of the first choice's
        message exactly as returned, and "total_usage" holds the result of
        merging the response's usage into zeroed counters via ``add_dicts``.

    Raises:
        ValueError: If ``raw_text`` is longer than 100000 characters or
            ``queries`` is empty.
    """
    # Guard clauses: reject oversized text first, then an empty query list.
    if len(raw_text) > 100000:
        raise ValueError("Raw Text is too long. Should be less than 100000 characters")

    if len(queries) == 0:
        raise ValueError("Atleast one query should be present")

    # Key each query by its 1-based position; the dict's repr is what gets
    # interpolated into the prompt below.
    queries = dict(enumerate(queries, start=1))

    user_prompt = f"""
    RAW_TEXT:

    //raw_text//

    {raw_text}

    //raw_text//

    Queries:

    {queries}
    
    OUTPUT:
    """

    # System instructions first, then the single user turn carrying text + queries.
    conversation = [
        {"role": "system", "content": SIMPLE_INFO_RETRIEVAL_TEXT_PROMPT},
        {'role': 'user', "content": user_prompt},
    ]

    response = creator(**text_model_defaults, messages=conversation)

    zero_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    total_usage = add_dicts(zero_usage, dict(response.usage))

    return {
        "output": response.choices[0].message.content,
        "total_usage": total_usage,
    }


In [8]:
# Sample document for the text-retrieval demo. The path is relative to the
# working directory selected by the os.chdir call in the first cell (assumed to
# be the project root that contains "core" — confirm), instead of an absolute
# path that only exists on one machine.
path = os.path.join("core", "features", "information_retrieval", "test_news1.txt")

# Read as bytes and decode explicitly so the result does not depend on the
# platform's default encoding or newline translation.
with open(path, 'rb') as f:
    raw_text = f.read().decode('utf-8')

queries = ["Major Dates with Events"]

# information_retrieval_from_text raises ValueError when raw_text is longer
# than 100000 characters.
res = information_retrieval_from_text(raw_text, queries)

In [14]:
# Number of characters in the loaded document.
# NOTE(review): the output recorded below (215827) is above the 100000-character
# limit checked by information_retrieval_from_text, so the recorded result was
# presumably produced with a different version of that function — confirm.
len(raw_text)

215827

In [11]:
# Show the model's reply for the text queries (the "output" entry of the result dict).
print(res["output"])

{
    "queries": [
        {
            "query": "1",
            "answers": [
                "November 09, 2023 06:20 pm - Mandal Education Officers in Andhra Pradesh told to engage with local community",
                "November 10, 2023 - IIT Kanpur, IIS Kanpur to Establish Laboratories and Develop curriculum",
                "November 10, 2023 - K12 Education Market to Reach $525.7 Billion by 2031",
                "November 10, 2023 - NC State College of Education to House the Educational Opportunities Program for Individuals With Intellectual Disabilities",
                "November 10, 2023 - Clean California Transforms Blighted Vacant Lot into Nursery and Education Center in the Heart of San Francisco",
                "November 10, 2023 - Four public secondary schools have been recognised in the WA Education Awards for the exceptional way they prepare students for life beyond school"
            ]
        }
    ]
}


In [15]:
# Token usage reported for the text request, followed by the value that
# calculate_cost_gpt4_turbo derives from it (presumably a cost in USD — confirm
# against the helper).
print(res["total_usage"])
print(calculate_cost_gpt4_turbo(res["total_usage"]))

{'completion_tokens': 200, 'total_tokens': 43490, 'prompt_tokens': 43290}
0.4389


### From images

In [9]:
import base64
# import openai

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    # b64encode yields bytes; decode so the result can be embedded in a message.
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def information_retrieval_from_image(image_paths: List[str], query: List[str], max_tokens: int = 3000):
    """Run queries against a set of images using the vision model.

    Args:
        image_paths: List of image file paths; each file is base64-encoded with
            ``encode_image`` and attached to the user message.
        query: Queries to answer. The string representation of this value is
            sent as the first item of the user message content.
        max_tokens: Value forwarded to ``creator`` as ``max_tokens``
            (defaults to 3000, the value previously hard-coded).

    Returns:
        A tuple ``(answers, usage)``: the first choice's message content parsed
        with ``json.loads``, and the "usage" entry of the dumped response.

    Raises:
        ValueError: If ``image_paths`` is not a list, or if the request or the
            extraction of the message content fails (the underlying error is
            attached as ``__cause__``). ``json.loads`` raises
            ``json.JSONDecodeError`` (a ``ValueError`` subclass) when the
            content is not valid JSON.
    """
    if not isinstance(image_paths, list):
        raise ValueError("image_paths should be a list of image paths")

    encoded = [encode_image(image_path) for image_path in image_paths]
    logging.info("encoded the images")

    # User content: the stringified queries followed by one entry per image.
    image_parts = [{"image": image_b64, "resize": 1024} for image_b64 in encoded]
    conversation = [
        {"role": "system", "content": SIMPLE_INFO_RETRIEVAL_IMAGE_PROMPT},
        {"role": "user", "content": [f"{query}", *image_parts]},
    ]

    try:
        response = creator(
            **vision_model_defaults,
            messages=conversation,
            max_tokens=max_tokens,
        )
        payload = response.model_dump()
        text = payload["choices"][0]["message"]["content"]
    except Exception as exc:
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # and SystemExit still propagate, and chain the original error so the
        # real failure stays visible in the traceback.
        raise ValueError("Error in the response") from exc

    return json.loads(text), payload["usage"]

In [10]:
# Folder of images for the image-retrieval demo. The path is relative to the
# working directory selected by the os.chdir call in the first cell (assumed to
# be the project root that contains "core" — confirm), instead of an absolute
# path that only exists on one machine.
dir_path = os.path.join("core", "test_data", "mmh_english")
paths = sorted(os.path.join(dir_path, name) for name in os.listdir(dir_path))

# Only the first 19 images (in sorted path order) are sent with the queries.
t = information_retrieval_from_image(
    paths[:19],
    query=[
        "teacher of qutubbudiin aibak",
        "slave dynasty years",
        "who died in 1236",
        "first to fix the prices of commodities",
    ],
)

INFO:root:encoded the images
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [11]:
# First element of the returned tuple: the model's answers parsed from JSON.
print(t[0])

{'queries': [{'query': '1', 'answers': ['Qutbuddin Aibak']}, {'query': '2', 'answers': ['1206-1526']}, {'query': '3', 'answers': ['Iltutmish']}, {'query': '4', 'answers': ['Alauddin Khilji']}]}


In [12]:
# Second element of the returned tuple: the usage section of the response.
print(t[1])

{'completion_tokens': 117, 'prompt_tokens': 1835, 'total_tokens': 1952}
