# Evaluation
Evaluate accuracy of vision model in identifying the items correctly present in an image

Steps
- Pass image (file name being the ground truth) to vision model with prompt
- Vision model outputs json with a list of item names present in the image (predictions)
- Use an LLM to compute a similarity score between the predictions and the ground truth
- Manually evaluate the csv results (e.g. using Excel in .xlsx format) the similarity score, the ground truth and the predictions as to whether the item identification is correct or not

In [59]:
from langchain_openai import ChatOpenAI

vision_model = ChatOpenAI(model="gpt-4o", temperature=0)


In [60]:
# util function to base64 encode an image
import base64

def encode_image(image_path):
    """Getting the base64 string"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


# base64_image = encode_image(
#     "../images/test/download.jpeg"
# )
# # Write the base64 string to a text file
# with open("encoded_image.txt", "w") as file:
#     file.write(img_base64)


In [61]:
# json schema for the model output
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

class ItemNames(BaseModel):
    items: List[str] = Field(
        description="List of item names. Each item name is either an item name from NEA_ITEM_NAMES, or the name of the item that the vision model have identified in the image"
    )

In [62]:
import os
IMAGE_DIR = '../images/test/batch2' # to change this accordingly where the images are stored

filenames = [file for file in os.listdir(IMAGE_DIR) if file.endswith('jpeg') or file.endswith('jpg') or file.endswith('png') or file.endswith('webp')]
filenames = [os.path.join(IMAGE_DIR, filename) for filename in filenames]
print(f"Number of images: {len(filenames)}")
filenames[:5]


Number of images: 179


['../images/test/batch2/Luggage bag (5).jpg',
 '../images/test/batch2/Glass cup (3).jpg',
 '../images/test/batch2/Glass bakeware (2).jpg',
 '../images/test/batch2/DVD player (3).jpg',
 '../images/test/batch2/Plastic egg trays (3).jpg']

In [63]:
# function to calculate approximately number tokens for an image sent to openai, to be used in deciding  whether to send image or not to the openai api (to avoid token limit)
from math import ceil
from PIL import Image

def calculate_openai_image_tokens(image_filepath: str):
    image = Image.open(image_filepath)
    width, height = image.size

    if width > 2048 or height > 2048:
        aspect_ratio = width / height
        if aspect_ratio > 1:
            width, height = 2048, int(2048 / aspect_ratio)
        else:
            width, height = int(2048 * aspect_ratio), 2048
            
    if width >= height and height > 768:
        width, height = int((768 / height) * width), 768
    elif height > width and width > 768:
        width, height = 768, int((768 / width) * height)

    tiles_width = ceil(width / 512)
    tiles_height = ceil(height / 512)
    total_tokens = 85 + 170 * (tiles_width * tiles_height)
    
    return total_tokens



In [64]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


In [65]:
NEA_ITEM_NAMES = [
    "9-volt battery", "aerosol cans", "air conditioner", "alkaline battery", "ang pow", "anniversary card", "assessment books", "atm card", "audio sound system", "bag", "baking paper", "bank card", "basketball", "battery charger", "bed frame", "bedsheet", "beer can", "beverage carton", "beverage glass bottle", "beverage metal can", "bill", "bio-degradable bag", "birthday card", "biscuit tin", "biscuits", "blanket", "blister pack", "blouse", "books", "boots", "borosilicate glassware", "briefcase", "brochure (glossy and non-glossy)", "bubble tea cups", "bubble wrap", "bulky waste", "business card", "button cell battery", "cabinet", "calendar", "camisole", "candy", "canned food", "cardboard box", "carton box", "cassette", "cd", "cd casing", "cd player", "cellphone", "ceramic plate", "ceramic products", "cereal box", "chair", "chairs", "child seat", "chilli sauce bottle", "chocolate", "christmas card", "cigarettes", "clean aluminium foil", "clean aluminium tray", "cling film", "clothes", "coffee capsules", "coffee table", "comic books", "compact disc", "computer battery", "computer mouse", "containers", "cookies", "cooking pot", "correction liquid", "correction tape", "cosmetic glass bottle", "cotton bud", "cotton wool", "couch", "crackers", "crayon drawing", "crocs", "crystal glass", "cupboard", "curtains", "cushion", "debit card", "desktop computer", "desktop monitor", "detergent bottle", "diaper", "diary", "dining table", "dirty aluminium foil", "dirty aluminium tray", "disposable bowl", "disposable cup", "disposable cutlery", "disposable fork", "disposable knife", "disposable plate", "disposable shaver", "disposable spoon", "disposable wooden chopsticks", "docking station", "dress shoes", "drink packet", "drinking glass", "drinking straw", "drumstick", "dvd", "dvd casing", "dvd player", "e-scooter", "electric bicycle", "electric fan", "electric kettle", "electric mobility devices", "electric mobility scooter", "electric scooter", "electric shaver", "electronic cables", "electronic waste", "envelope (with and without plastic window)", "eraser", "exam papers", "exercise book", "expired credit cards", "fabric", "fan", "fire wire", "fish", "fluorescent bulb", "fluorescent lamp", "flyer (glossy and non-glossy)", "food blender", "food glass bottle", "food jars", "food metal can", "food peels", "food processor", "food tin", "food waste", "foolscap paper", "football", "football shoes (without metal studs)", "fridge", "fruit box", "furniture", "gaming console", "gift", "gift bag", "gift wrapping paper", "gifts", "glad wrap", "glass bakeware", "glass beer bottle", "glass bottle", "glass condiment bottle", "glass cup", "glass plate", "glass sauce bottle", "glass tube", "glass wine bottle", "glass with metal wires", "glasses", "glassware", "glitter paper", "golf ball", "graph paper", "green packet", "greeting card", "gummy bears", "hairdryer", "handphone", "hard disk drive", "heels", "highlighter", "hong bao", "horticultural waste", "household battery", "incandescent bulb", "incandescent lamp", "iron", "ironing board", "jacket", "jam spread bottle", "joss sticks", "jotter book", "journal", "juice packet", "ketchup bottle", "keyboard", "kitchen roll", "kitchen towel", "lamp", "lamp fixture", "lamp stand", "laptop", "large household appliances", "leaflet", "led bulb", "led lamp", "leftover medicine", "leftovers", "letter", "light bulb", "liquor bottle", "luggage bag", "magazine (glossy and non-glossy)", "mail", "make up", "marker", "masks", "mason jar", "mattress", "meat", "medals", "medicine glass bottle", "medicine pack", "melamine cups", "melamine plates", "melamine products", "metal accessories", "metal bottle cap", "metal container", "metal cutlery", "metal paint cans", "metal paint container", "microwave oven", "milk carton", "mineral water bottle", "mirror", "mobile phone", "mobile phone battery", "modem", "mouth wash", "mouthwash bottle", "music player", "namecard", "newsletter", "newspaper", "nintendo", "non-food metal container", "noodles", "notebook", "notes", "notice", "novel", "oven-safe food containers", "oxo-degradable bag", "packaging", "pampers", "pamphlet", "pants", "paper", "paper bag", "paper box", "paper cup", "paper disposables", "paper egg trays", "paper packaging (printed paper box etc)", "paper packaging contaminated with food", "paper packaging with food", "paper plate", "paper receipt", "paper towel", "paper towel tube", "parchment paper", "pen", "pencil", "perfume glass bottle", "permanent marker", "personal mobility devices", "pizza", "pizza boxes", "plant waste", "plaster", "plastic bag", "plastic bodywash bottle", "plastic bottle", "plastic bottle cap", "plastic bowl", "plastic bread bag", "plastic bubble tea carrier", "plastic carbonated drink bottle", "plastic clothes hanger", "plastic container", "plastic crockery", "plastic cup lids", "plastic cups", "plastic cutlery", "plastic disposables", "plastic egg trays", "plastic envelope", "plastic facial cleanser bottle", "plastic film", "plastic food wrap", "plastic fork", "plastic fruit box", "plastic fruit container", "plastic knife", "plastic magazine wrapper", "plastic medicine bottle", "plastic milk bottles", "plastic packaging", "plastic packaging contaminated with food/oil stains", "plastic packaging for packet drink", "plastic packaging with foil", "plastic pill bottle", "plastic plate", "plastic saucer", "plastic serving bowl", "plastic serving plate", "plastic soft drink bottle", "plastic spoon", "plastic spork", "plastic takeaway food container", "plastic tupperware container", "plastic water bottle", "playstation", "polystyrene foam product", "porcelain", "portable charger", "post-it", "potato chip bags", "power bank", "power-assisted bicycle", "present", "presents", "printed paper (glossy and non-glossy)", "printed paper box", "printer", "publication", "pumps", "pyrex glassware", "radio", "rechargeable battery", "red packet", "red wine bottle", "refrigerator", "ribbons", "rice", "rice cooker", "robot cleaner", "robot vacuum", "roller blades", "roller skates", "rotten food", "router", "ruler", "rusty metal cans", "sandles", "sanitary pad", "saran wrap", "sauce bottle", "school shoes", "seafood", "seal bag", "shampoo", "shampoo bottle", "shirts", "shoe rack", "shoes", "shorts", "shot glass", "shredded paper", "singlet", "skirt", "sneakers", "soap bottle", "sofa", "soft drink can", "speaker", "spectacles", "spoiled food", "sports shoes", "spray cans", "stamps", "standing fan", "stationery", "steel wool", "stool", "storybook", "styrofoam", "styrofoam clamshell container", "styrofoam cup", "supplement glass bottle", "sweets", "switch", "t-shirt", "table", "tablet computer", "takeaway", "tea pot", "telephone directory", "television", "tempered glass", "tennis ball", "textbooks", "textile", "tie", "tissue box", "tissue box packaging", "tissue paper", "toaster oven", "toilet paper", "toilet paper packaging", "toilet roll tube", "toys", "tv", "umbrella", "vacuum cleaner", "vase", "vegetables", "video tape", "washing machine", "wax paper", "wet wipes", "white wine bottle", "whiteboard marker", "whskey glass", "wii", "windows", "wine bottle", "wine glass", "wooden chopsticks", "wrapper", "writing paper", "ziplock bag"
    ]

In [66]:
system_message = "You are an expert on answering questions briefly and accurately about recycling in Singapore. Users may send you images of items to check if the items can be recycled, and your task is to correctly identify what are the items in the image, and provide the recycling instructions of the items."

In [67]:
from langchain_core.prompts import ChatPromptTemplate

image_prompt = """
I have an image containing items that I am unsure of whether they are recyclable. Please help me to identify the item(s) in the image.
For each of the unique items, find the best or closest matching item from the following NEA_ITEM_NAMES, and return it. If there is no best match for the item, return the item according to the name that you have identified.
The number of items returned should be the same as the number of unique items identified in the image.
Each item name must be unique. Do not repeat the item names if there are multiple items of the same item name in the image.

NEA_ITEM_NAMES: \n\n {NEA_ITEM_NAMES} \n\n

Return the answer as JSON output according to the following schema:
{schema}

"""
image_prompt = image_prompt.format(
    NEA_ITEM_NAMES=NEA_ITEM_NAMES, schema=ItemNames.schema_json()
)

image_prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_message,
        ),
        (
            "human",
            [
                {"type": "text", "text": "{image_prompt}"},
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,{base64_image}"},
                },
            ],
        ),
    ]
)

vision_model_json_output = vision_model.with_structured_output(schema=ItemNames)

vision_chain = image_prompt_template | vision_model_json_output


In [68]:
encoding = tiktoken.encoding_name_for_model('gpt-4o')

image_prompt_tokens_count = num_tokens_from_string(image_prompt, encoding)
system_message_tokens_count = num_tokens_from_string(system_message, encoding)
print(f"""
      System message token count: {system_message_tokens_count}
      Image prompt token count: {image_prompt_tokens_count}
      Total prompt token count: {image_prompt_tokens_count + system_message_tokens_count}""")



      System message token count: 55
      Image prompt token count: 2218
      Total prompt token count: 2273


In [69]:
# imports
import random
import time
import openai

# define a retry decorator
def retry_after_delay(
    func,
    errors: tuple = (openai.RateLimitError,),
):

    def wrapper(*args, delay: float = 65, max_retries: int = 3, **kwargs):
        # Initialize variables
        num_retries = 0

        # Loop until a successful response or max_retries is hit or an exception is raised
        while True:
            try:
                return func(*args, **kwargs)

            # Retry on specified errors
            except errors as e:
                print(e, end='\n\n')
                # Increment retries
                num_retries += 1

                # Check if max retries has been reached
                if num_retries > max_retries:
                    raise Exception(
                        f"Maximum number of retries ({max_retries}) exceeded."
                    )

                # Sleep for the delay
                time.sleep(delay)

            # Raise exceptions for any errors not specified
            except Exception as e:
                raise e

    return wrapper


@retry_after_delay
def batch_image_prediction_with_retry(filenames, **kwargs):
    batch_res = vision_chain.batch([{'image_prompt': image_prompt, 'base64_image': encode_image(filename)} for filename in filenames]) # batch_res is List[ItemNames]

    return batch_res


In [70]:
# Checkpoint save to csv
import csv
def save_to_csv(csv_filename: str, filenames: List[str], res: List[ItemNames]):
    file_exists = os.path.exists(csv_filename)
    new_rows = [(filename, res.items) for filename, res in zip(filenames, res)]
    with open(csv_filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(['filename', 'predictions'])
        writer.writerows(new_rows)

    print(f'Saved {len(filenames)} rows to {csv_filename}')


In [71]:
# Batch invoke the model, retrying after 65 seconds on rate limit error

start_index = 0
end_index = 0
curr_tokens = 0

# obtained from openai project limits based on project tier
TPM = 30_000

res = []

vision_model_preds_checkpoint = '../evaluation/checkpoints/batch2/vision_model-preds.csv'

while end_index < len(filenames[:57]):
    image_path = filenames[end_index]
    image_tokens_count = calculate_openai_image_tokens(image_path)
    curr_tokens += system_message_tokens_count + image_prompt_tokens_count + image_tokens_count + 1024 # last constant at the end is buffer for input and output formatting + the response from the model

    if curr_tokens < TPM:
        end_index += 1

    else:
        print(f'Exceeding {TPM} TPM. Current tokens {curr_tokens}')
        base64_image = encode_image(image_path)
        filenames_to_invoke = filenames[start_index: end_index]

        print("batch invoking for filenames:\n", ',\n'.join(filenames_to_invoke), end='\n----------------------------\n')

        batch_res = batch_image_prediction_with_retry(filenames_to_invoke) # batch_res is List[ItemNames]
        save_to_csv(vision_model_preds_checkpoint, filenames_to_invoke, batch_res)

        start_index = end_index
        res.extend(batch_res)

        curr_tokens = 0 # Rest token count for next batch

    if end_index == len(filenames):
        print('Last file reached.')
        filenames_to_invoke = filenames[start_index: end_index]
        print("batch invoking for filenames:\n", ',\n'.join(filenames_to_invoke), end='\n----------------------------\n')

        batch_res = batch_image_prediction_with_retry(filenames_to_invoke) # batch_res is List[Items]
        save_to_csv(vision_model_preds_checkpoint, filenames_to_invoke, batch_res)
        res.extend(batch_res)

res



Exceeding 30000 TPM. Current tokens 32666
batch invoking for filenames:
 ../images/test/batch2/Luggage bag (5).jpg,
../images/test/batch2/Glass cup (3).jpg,
../images/test/batch2/Glass bakeware (2).jpg,
../images/test/batch2/DVD player (3).jpg,
../images/test/batch2/Plastic egg trays (3).jpg,
../images/test/batch2/Metal paint container (2).jpg,
../images/test/batch2/Computer battery (4).jpg
----------------------------
Saved 7 rows to ../evaluation/checkpoints/batch2/vision_model-preds.csv
Exceeding 30000 TPM. Current tokens 32326
batch invoking for filenames:
 ../images/test/batch2/Luggage bag (2).jpg,
../images/test/batch2/Plastic egg trays (2).jpg,
../images/test/batch2/Tea pot (6).jpg,
../images/test/batch2/Writing paper (2).jpg,
../images/test/batch2/Stationery (3).jpg,
../images/test/batch2/Paper packaging with food (2).jpg,
../images/test/batch2/Music player (5).jpg
----------------------------
Saved 7 rows to ../evaluation/checkpoints/batch2/vision_model-preds.csv
Exceeding 300

[ItemNames(items=['luggage bag']),
 ItemNames(items=['glass cup']),
 ItemNames(items=['borosilicate glassware']),
 ItemNames(items=['dvd player', 'electronic cables', 'cardboard box']),
 ItemNames(items=['plastic egg trays']),
 ItemNames(items=['metal paint container']),
 ItemNames(items=['rechargeable battery']),
 ItemNames(items=['luggage bag']),
 ItemNames(items=['blister pack']),
 ItemNames(items=['tea pot']),
 ItemNames(items=['post-it']),
 ItemNames(items=['correction tape']),
 ItemNames(items=['pizza', 'pizza boxes']),
 ItemNames(items=['cd player']),
 ItemNames(items=['air conditioner']),
 ItemNames(items=['plastic container']),
 ItemNames(items=['aerosol cans']),
 ItemNames(items=['tea pot']),
 ItemNames(items=['chairs']),
 ItemNames(items=['washing machine', 'fridge', 'cardboard box']),
 ItemNames(items=['plant', 'chopping board', 'baking paper']),
 ItemNames(items=['highlighter']),
 ItemNames(items=['glass bakeware']),
 ItemNames(items=['cd player']),
 ItemNames(items=['hair

## Use LLM to Evaluate
Based on the grouth truth (file name), use LLM to output a similarity score (similar to the ground truth) for each prediction


Evaluation steps
- Read the results csv into dataframe
- Extract the `ground truth` field from each image filename
- Extract the fields `ground_truth` and `predictions` from the dataframe as json
- Pass the json to LLM to compute similarity score between each `prediction` and `ground_truth`
- Save the results into dataframe
- Export dataframe to csv


In [84]:
# put results into dataframe
import pandas as pd
df = pd.read_csv(vision_model_preds_checkpoint)
print(df.shape)
df.head()



(179, 2)


Unnamed: 0,filename,predictions
0,../images/test/batch2/Luggage bag (5).jpg,['luggage bag']
1,../images/test/batch2/Glass cup (3).jpg,['glass cup']
2,../images/test/batch2/Glass bakeware (2).jpg,['borosilicate glassware']
3,../images/test/batch2/DVD player (3).jpg,"['dvd player', 'electronic cables', 'cardboard..."
4,../images/test/batch2/Plastic egg trays (3).jpg,['plastic egg trays']


In [85]:
import os

df['ground_truth'] = df['filename'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
df.head()

Unnamed: 0,filename,predictions,ground_truth
0,../images/test/batch2/Luggage bag (5).jpg,['luggage bag'],Luggage bag (5)
1,../images/test/batch2/Glass cup (3).jpg,['glass cup'],Glass cup (3)
2,../images/test/batch2/Glass bakeware (2).jpg,['borosilicate glassware'],Glass bakeware (2)
3,../images/test/batch2/DVD player (3).jpg,"['dvd player', 'electronic cables', 'cardboard...",DVD player (3)
4,../images/test/batch2/Plastic egg trays (3).jpg,['plastic egg trays'],Plastic egg trays (3)


In [86]:
# remove the parentheses from the ground truth e.g. `Newsletter (4)` becomes `Newsletter`
import re

def clean_text(text):
    # This regex removes a trailing space followed by parentheses containing only digits
    return re.sub(r'\s+\(\d+\)$', '', text)

# Example usage:
df['ground_truth'] = df['ground_truth'].apply(clean_text)
df.head()

Unnamed: 0,filename,predictions,ground_truth
0,../images/test/batch2/Luggage bag (5).jpg,['luggage bag'],Luggage bag
1,../images/test/batch2/Glass cup (3).jpg,['glass cup'],Glass cup
2,../images/test/batch2/Glass bakeware (2).jpg,['borosilicate glassware'],Glass bakeware
3,../images/test/batch2/DVD player (3).jpg,"['dvd player', 'electronic cables', 'cardboard...",DVD player
4,../images/test/batch2/Plastic egg trays (3).jpg,['plastic egg trays'],Plastic egg trays


In [87]:
ground_truth_prediction_json_list = df[['ground_truth', 'predictions']].to_dict(orient='records')
ground_truth_prediction_json_list

[{'ground_truth': 'Luggage bag', 'predictions': "['luggage bag']"},
 {'ground_truth': 'Glass cup', 'predictions': "['glass cup']"},
 {'ground_truth': 'Glass bakeware',
  'predictions': "['borosilicate glassware']"},
 {'ground_truth': 'DVD player',
  'predictions': "['dvd player', 'electronic cables', 'cardboard box']"},
 {'ground_truth': 'Plastic egg trays', 'predictions': "['plastic egg trays']"},
 {'ground_truth': 'Metal paint container',
  'predictions': "['metal paint container']"},
 {'ground_truth': 'Computer battery',
  'predictions': "['rechargeable battery']"},
 {'ground_truth': 'Luggage bag', 'predictions': "['luggage bag']"},
 {'ground_truth': 'Plastic egg trays', 'predictions': "['blister pack']"},
 {'ground_truth': 'Tea pot', 'predictions': "['tea pot']"},
 {'ground_truth': 'Writing paper', 'predictions': "['post-it']"},
 {'ground_truth': 'Stationery', 'predictions': "['correction tape']"},
 {'ground_truth': 'Paper packaging with food',
  'predictions': "['pizza', 'pizza bo

Use chat model or LLM to evaluate the predictions accuracy

In [88]:
# Define the json output
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

class Similarity(BaseModel):
    similarity: float = Field(description="List of similarity score of the pair of prediction and the corresponding ground truth")


In [89]:
from langchain_openai import ChatOpenAI

qa_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa_model_json_output = qa_model.with_structured_output(
    Similarity, method="json_mode"
)


In [78]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

prompt = """ 
You are given a JSON input with the following schema:
{{
    "ground_truth": str,
    "prediction": List[str]
}}

Please evaluate the similarity of the ground truth and the model's prediction and return the similarity as a float between 0 and 1.0. 

Return the result as a JSON output according to the following schema:
{{
    "similarity": float
}}

Examples of how to calculate similarity:
Example 1
Given:
{{
    "ground_truth": "Shampoo bottle",
    "prediction": ["shampoo bottle", "conditioner Bottle"]
}}

Expected output:
{{
    "similarity": 1.0
}}

Example 2
Given:
{{
    "ground_truth": "Paper cup",
    "prediction": ["paper cup"]
}}

Expected output:
{{
    "similarity": 1.0
}}

Example 3
Given:
{{
    "ground_truth": "Paper cup",
    "prediction": ["plastic cup"]
}}

Expected output:
{{
    "similarity": 0.0
}}

Example 4
Given:
{{
    "ground_truth": "Milk bottle",
    "prediction": ["bottled milk"]
}}

Expected output:
{{
    "similarity": 1.0
}}

Example 5
Given:
{{
    "ground_truth": "Plastic packaging with bubble wrap",
    "prediction": ["bubble wrap"]
}}

Expected output:
{{
    "similarity": 0.5
}}


Here is the JSON input: {ground_truth_prediction_json}
"""

prompt_template = ChatPromptTemplate.from_template(prompt)

chain = prompt_template | qa_model_json_output

In [79]:
# testing out with small sample
res = chain.batch([{'ground_truth_prediction_json': ground_truth_prediction_json} for ground_truth_prediction_json in ground_truth_prediction_json_list[:5]])
res

[Similarity(similarity=1.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=0.3333333333333333),
 Similarity(similarity=1.0)]

In [80]:
@retry_after_delay
def batch_invoke_evaluate(ground_truth_prediction_json_list, **kwargs):
    batch_res = chain.batch([{'ground_truth_prediction_json': ground_truth_prediction_json} for ground_truth_prediction_json in ground_truth_prediction_json_list])

    return batch_res

In [81]:
# Checkpoint save to csv
import csv
def save_to_csv(csv_filename: str, ground_truth_prediction_json_list: List[str], res: List[Similarity]):
    file_exists = os.path.exists(csv_filename)
    new_rows = [(ground_truth_prediction_json, res.similarity) for ground_truth_prediction_json, res in zip(ground_truth_prediction_json_list, res)]
    with open(csv_filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(['ground_truth_prediction_json', 'similarity'])
        writer.writerows(new_rows)

    print(f'Saved {len(ground_truth_prediction_json_list)} rows to {csv_filename}')


In [82]:
# Batch invoke the model, retrying after 65 seconds on rate limit error
import tiktoken
encoding = tiktoken.encoding_name_for_model('gpt-3.5-turbo')

start_index = 0
end_index = 0
curr_tokens = 0

# obtained from openai project limits based on project tier
TPM = 60_000

res = []

checkpoint_csv = '../evaluation/checkpoints/batch2/llm_evaluation_results.csv' # to change accordingly

while end_index < len(ground_truth_prediction_json_list):
    ground_truth_prediction_json = ground_truth_prediction_json_list[end_index]
    prompt_tokens_count = num_tokens_from_string(prompt_template.format(ground_truth_prediction_json=ground_truth_prediction_json), encoding)

    curr_tokens += prompt_tokens_count + 1024 # last constant at the end is buffer for input and output formatting + the response from the model

    if curr_tokens < TPM:
        end_index += 1

    else:
        print(f'Exceeding {TPM} TPM. Current tokens {curr_tokens}. Start index {start_index}. End index {end_index}')
        batch_to_invoke = ground_truth_prediction_json_list[start_index: end_index]

        batch_res = batch_invoke_evaluate(batch_to_invoke) # batch_res is List[Items]
        save_to_csv(checkpoint_csv, batch_to_invoke, batch_res)

        start_index = end_index
        res.extend(batch_res)

        curr_tokens = 0 # Rest token count for next batch

    if end_index == len(ground_truth_prediction_json_list):
        print('Last file reached.')
        batch_to_invoke = ground_truth_prediction_json_list[start_index: end_index]
        batch_res = batch_invoke_evaluate(batch_to_invoke) # batch_res is List[Items]
        save_to_csv(checkpoint_csv, batch_to_invoke, batch_res)
        res.extend(batch_res)

res



Exceeding 60000 TPM. Current tokens 60929. Start index 0. End index 44
Saved 44 rows to ../evaluation/checkpoints/batch2/llm_evaluation_results.csv
Last file reached.
Saved 12 rows to ../evaluation/checkpoints/batch2/llm_evaluation_results.csv


[Similarity(similarity=1.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=0.3333333333333333),
 Similarity(similarity=1.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=0.0),
 Similarity(similarity=0.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=1.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.33),
 Similarity(similarity=0.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=0.0),
 Similarity(similarity=1.0),
 Similarity(similarity=1.0),
 Similarity(similarity=1.0)

In [83]:
len(res)

56

In [90]:
df_with_similarity = pd.read_csv("../evaluation/checkpoints/batch2/llm_evaluation_results.csv")

In [93]:
df['similarity'] = df_with_similarity['similarity']

put similarity into dataframe

In [30]:
df['similarity'] = [r.similarity for r in res]
df.head()


Unnamed: 0,filename,predictions,ground_truth,similarity
0,../images/test/batch1/Canned food (5).jpg,['canned food'],Canned food,1.0
1,../images/test/batch1/Melamine plates (5).jpg,['ceramic plate'],Melamine plates,0.0
2,../images/test/batch1/Plastic takeaway food co...,['plastic takeaway food container'],Plastic takeaway food container,1.0
3,../images/test/batch1/Namecard.jpg,['business card'],Namecard,0.0
4,../images/test/batch1/Cardboard box (2).jpg,['cardboard box'],Cardboard box,1.0


Output to csv for manual evaluation

In [96]:
# Change the output file name accordingly
df[['filename', 'ground_truth', 'predictions', 'similarity']].to_csv('../evaluation/batch2_evaluation.csv', index=False)

