### Information retrieval from text and images

In [18]:
%load_ext autoreload
%autoreload 2

import os
# Switch the working directory to whatever precedes the first occurrence of
# "core" in the current path. If "core" does not occur, the directory is unchanged.
# NOTE(review): presumably this is the project root, so that the `core.*`
# imports in the next cell resolve — confirm.
curpath = os.getcwd()
os.chdir(curpath.split("core")[0])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import re
import json
import time
# import logging
# logging.basicConfig(level=logging.INFO)

from typing import List, Dict, Any, Tuple, Union
from tqdm import tqdm

from prompts import SIMPLE_INFO_RETRIEVAL_TEXT_PROMPT, SIMPLE_INFO_RETRIEVAL_IMAGE_PROMPT
from core.features.topic_segregation.utils import add_dicts, calculate_cost_gpt4_8k, calculate_cost_gpt4_turbo
from core.features.provider import creator

from dotenv import load_dotenv
load_dotenv()

# Default keyword arguments unpacked into `creator(...)` for text requests.
text_model_defaults = {"model" : "gpt-4-1106-preview", "temperature" : 0.1, "response_format" : {"type": "json_object"}}
# Default keyword arguments unpacked into `creator(...)` for image requests.
vision_model_defaults = {"model" : "gpt-4-vision-preview", "temperature" : 0.1}

In [20]:
def get_unique_from_raw_text(raw_text, sep = "\n"):
    """
    Split raw text into stripped, de-duplicated fragments.

    :param raw_text: Either a single string (which is split into fragments) or a
        list of already-split strings.
    :param sep: Separator applied when ``raw_text`` is a string. With the default
        (a newline) the text is split on every line boundary via
        ``str.splitlines``; any other value is handed to ``str.split``.
    :return: List of unique fragments, whitespace-stripped, each longer than two
        characters, in order of first appearance.
    :raises ValueError: If ``raw_text`` is neither a string nor a list.
    """
    if isinstance(raw_text, str):
        fragments = raw_text.splitlines() if sep == "\n" else raw_text.split(sep)
    elif isinstance(raw_text, list):
        fragments = raw_text
    else:
        raise ValueError("raw_text must be a string or a list")

    stripped = (fragment.strip() for fragment in fragments)

    # dict.fromkeys removes duplicates while keeping first-seen order. A plain
    # set would return the fragments in an order that can change between runs,
    # making the downstream chunks (and model calls) non-reproducible.
    return list(dict.fromkeys(item for item in stripped if len(item) > 2))

def chunk_raw_text_list(raw_text_list, max_len=3000):
    """
    Greedily group consecutive text strings into chunks of bounded size.

    Strings are appended to the current chunk until the next one would push the
    chunk's combined character count above max_len; that string then opens a
    new chunk. A chunk is never left empty, so a single string longer than
    max_len ends up alone in an oversize chunk.

    :param raw_text_list: List of text strings to be chunked.
    :param max_len: Character budget per chunk (sum of the string lengths).
    :return: List of chunks, each chunk being a list of text strings.
    """
    chunks = []
    bucket, bucket_size = [], 0

    for item in raw_text_list:
        item_size = len(item)
        overflows = bucket_size + item_size > max_len

        # Close the running bucket only if it already holds something.
        if overflows and bucket:
            chunks.append(bucket)
            bucket, bucket_size = [], 0

        bucket.append(item)
        bucket_size += item_size

    # Flush whatever is left over after the final item.
    if bucket:
        chunks.append(bucket)

    return chunks


def text_path_to_chunk(path, max_len=20000, verbose=False):
    """
    Load a UTF-8 text file and return its de-duplicated lines grouped into chunks.

    :param path: Path of the text file; it is read as bytes and decoded as UTF-8.
    :param max_len: Character budget per chunk, forwarded to chunk_raw_text_list.
    :param verbose: When true, print the file size, the number of unique
        fragments, the number of chunks and the size of every chunk.
    :return: List of chunks, each chunk being a list of text strings.
    """
    with open(path, 'rb') as handle:
        content = handle.read().decode('utf-8')

    # De-duplicate the lines first, then pack them into size-bounded chunks.
    unique_lines = get_unique_from_raw_text(content)
    chunks = chunk_raw_text_list(unique_lines, max_len=max_len)

    if verbose:
        print("Total chars:", len(content))
        print("Unique Text Lists:", len(unique_lines))
        print("Text List chunks:", len(chunks))
        for chunk in chunks:
            # Size counts the characters of the strings only (no separators).
            print("Chunk of size", sum(map(len, chunk)), "characters")

    return chunks


### from text

In [21]:

def information_retrieval_from_text(raw_text: str, queries: List):
    """
    Send one block of text together with the queries to the text model.

    :param raw_text: Text to search; at most 100000 characters.
    :param queries: Queries; their string form is interpolated into the prompt.
    :return: Dict with "output" (content of the first choice, returned unparsed)
        and "total_usage" (the response usage merged into a zeroed counter via
        add_dicts).
    :raises ValueError: If raw_text is longer than 100000 characters.
    """
    # Guard clause: refuse oversized input before building any request.
    if len(raw_text) > 100000:
        raise ValueError("Raw Text is too long. Should be less than 100000 characters")

    user_prompt = f"""
    RAW_TEXT:

    //raw_text//

    {raw_text}

    //raw_text//

    Queries:

    {queries}
    
    OUTPUT:
    """

    # System instructions first, then the single user turn carrying text + queries.
    conversation = [
        {"role": "system", "content": SIMPLE_INFO_RETRIEVAL_TEXT_PROMPT},
        {'role': 'user', "content": user_prompt},
    ]

    response = creator(**text_model_defaults, messages=conversation)

    zero_usage = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    total_usage = add_dicts(zero_usage, dict(response.usage))
    output = response.choices[0].message.content

    return {"output": output, "total_usage": total_usage}


In [22]:
# final batch processing function
def ir_batch_process(queries: Dict, input_text: List[List]) -> Tuple[Dict, Dict]:
    """
    Run the text retrieval over every chunk and merge the per-chunk results.

    :param queries: Dict of queries; its keys seed the result dict with empty lists.
    :param input_text: List of chunks, each chunk being a list of strings that is
        joined with newlines before being sent to the model.
    :return: Tuple ``(final_dict, total_tokens)``: the per-chunk JSON outputs and
        token usages, each accumulated across chunks with add_dicts.
    :raises ValueError: If the model output for a chunk cannot be parsed as JSON;
        the underlying parsing error is attached as the cause.
    """
    final_dict = {key: [] for key in queries}
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    for raw_text_list in tqdm(input_text):
        raw_text = "\n".join(raw_text_list)
        result = information_retrieval_from_text(raw_text=raw_text, queries=queries)

        try:
            json_out = json.loads(result['output'])
        except (TypeError, ValueError) as exc:
            # Only parsing failures are translated (JSONDecodeError is a
            # ValueError; TypeError covers a missing/None output). Interrupts
            # and unrelated errors are no longer swallowed by a bare except.
            raise ValueError("Output is not a valid JSON") from exc

        final_dict = add_dicts(final_dict, json_out)
        total_tokens = add_dicts(total_tokens, result['total_usage'])

    return final_dict, total_tokens


In [24]:
# Demo: run a single query over a sample news file.
# The path is relative to the working directory set in the first cell, instead of
# a machine-specific absolute path, so the cell also runs on other checkouts.
SAMPLE_TEXT_PATH = os.path.join("core", "features", "information_retrieval", "test_news1.txt")

chunked_text_list = text_path_to_chunk(path=SAMPLE_TEXT_PATH, verbose=True)
queries = ["All College Names"]
# Key each query by its 1-based position (as a string); the results are looked
# up under the same keys below.
indexed_queries = {str(position): query for position, query in enumerate(queries, start=1)}
res, tokens = ir_batch_process(queries=indexed_queries, input_text=chunked_text_list)

print(tokens)
print(calculate_cost_gpt4_turbo(tokens))
for k, query in indexed_queries.items():
    print(f"Query: {query}")
    print(res[k])
    print()

# for i in res[queries[0]]:
#     print(text_dict[int(i)])
#     print()

Total chars: 215819
Unique Text Lists: 108
Text List chunks: 13
Chunk of size 2810 characters
Chunk of size 17233 characters
Chunk of size 16352 characters
Chunk of size 19999 characters
Chunk of size 13598 characters
Chunk of size 15513 characters
Chunk of size 15487 characters
Chunk of size 19877 characters
Chunk of size 18857 characters
Chunk of size 17578 characters
Chunk of size 16973 characters
Chunk of size 17918 characters
Chunk of size 19369 characters


100%|██████████| 13/13 [01:13<00:00,  5.63s/it]

{'prompt_tokens': 45694, 'total_tokens': 47322, 'completion_tokens': 1628}
0.50578
Query: All College Names
['universities', 'NC State College of Education', 'The Academy School', 'Applecross Senior High School', 'Ashdale Secondary College', 'Manea Senior College', 'Western Australian College of Agriculture Cunderdin', 'IIT Kanpur', 'IIS Kanpur', 'Singapore Institute of Management Global Education', 'Garodia International College', 'Jagannath Institute of Management School', 'University at Buffalo, State University of New York', 'University of London', 'University of Birmingham', 'RMIT', 'University of Wollongong', 'Sydney University', 'Indian Institute of Technology Kanpur', 'Indian Institute of Skills', 'Antai College of economics and management', 'Shanghai Jiao Tong University', 'New Saraswati House', 'USC', 'Northern Virginia Community College', 'George Mason University', 'Irvine Valley College', 'California State University-Fullerton', 'Heartland Community College', 'Illinois Stat




In [25]:
{'prompt_tokens': 45694, 'total_tokens': 47322, 'completion_tokens': 1628}
0.50578

190


### from images

In [28]:
import base64
# import openai

# Function to encode the image
def encode_image(image_path):
    """
    Read a file and return its contents as a base64 string.

    :param image_path: Path of the file to read (opened in binary mode).
    :return: Base64 encoding of the file's bytes, as a str.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

def information_retrieval_from_image(image_paths: List[str], query: List[str]):
    """
    Ask the vision model to answer the queries from a set of images.

    :param image_paths: List of image file paths; every file is base64-encoded
        and attached to the user message.
    :param query: Queries; their string form is sent as the text part of the
        user message.
    :return: Tuple of (model reply parsed with json.loads, usage entry of the
        dumped response).
    :raises ValueError: If image_paths is not a list, or if the request or the
        extraction of the reply text fails (the original error is attached as
        the cause). json.loads raises its own error if the reply is not JSON.
    """
    # Validate before touching the filesystem or the network.
    if not isinstance(image_paths, list):
        raise ValueError("image_paths should be a list of image paths")

    encoded = [encode_image(image_path) for image_path in image_paths]

    conversation = [
        {"role": "system", "content": SIMPLE_INFO_RETRIEVAL_IMAGE_PROMPT},
        {
            "role": "user",
            "content": [f"{query}", *({"image": image, "resize": 1024} for image in encoded)],
        },
    ]

    try:
        response = creator(
            **vision_model_defaults,
            messages=conversation,
            max_tokens=3000,
        ).model_dump()
        text = response["choices"][0]["message"]["content"]
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate; `from exc` keeps the real failure visible in the traceback.
        raise ValueError("Error in the response") from exc

    return json.loads(text), response["usage"]

In [29]:
# Demo: answer four questions from the first 19 files (sorted by path) of the
# sample image directory. The directory is relative to the working directory set
# in the first cell, instead of a machine-specific absolute path.
dir_path = os.path.join("core", "test_data", "mmh_english")
paths = sorted(os.path.join(dir_path, name) for name in os.listdir(dir_path))

t = information_retrieval_from_image(paths[:19], query = ["teacher of qutubbudiin aibak", "slave dynasty years", "who died in 1236", "first to fix the prices of commodities"])

INFO:root:encoded the images
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [30]:
# First element of the returned tuple: the model reply parsed from JSON.
print(t[0])

{'1': ['Qutbuddin Aibak'], '2': ['1206-1526'], '3': ['Iltutmish'], '4': ['Alauddin Khilji']}


In [31]:
# Second element of the returned tuple: the usage entry of the response.
print(t[1])

{'completion_tokens': 66, 'prompt_tokens': 1808, 'total_tokens': 1874}
