## Setup

In [1]:
!aws codeartifact login --tool pip --repository shared --domain amazon --domain-owner 149122183214 --region us-west-2

Successfully configured pip to use AWS CodeArtifact repository https://amazon-149122183214.d.codeartifact.us-west-2.amazonaws.com/pypi/shared/ 
Login expires in 12 hours at 2025-04-24 20:43:15+00:00


In [2]:
!pip install  -U -q transformers==4.46.3 trl==0.12.1 datasets bitsandbytes peft accelerate

In [3]:
!pip install -q flash-attn --no-build-isolation

In [4]:
!pip install -q tensorboard

In [5]:
import torch 
import time

## Base model

In [6]:
# --- Resources & Precision ---
# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Half precision only makes sense on the GPU: bfloat16 where supported, float16
# otherwise. On CPU use float32 instead of the previous float16 fallback.
if DEVICE.type == "cuda":
    DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    DTYPE = torch.float32

# Attention backend requested when loading the model.
ATTN_IMPLEMENTATION = "flash_attention_2"

In [7]:
# Hugging Face transformers
from transformers import (
    AutoProcessor,
    Idefics3ForConditionalGeneration,
    BitsAndBytesConfig,
    TrainingArguments,
    AutoConfig
)


MODEL_ID = "HuggingFaceTB/SmolVLM-Base" # specify model to use here

base_model_config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
base_model_config.use_cache = True  # keep the KV cache on for generation

base_load_kwargs = {"device_map": "auto", "torch_dtype": DTYPE}

# The attention backend has to be requested through from_pretrained(); the
# previous `hasattr(config, "attn_implementation")` guard did not select it,
# so ATTN_IMPLEMENTATION was silently ignored.
# Only request it on CUDA; on CPU let transformers pick its default backend.
if DEVICE.type == "cuda":
    base_load_kwargs["attn_implementation"] = ATTN_IMPLEMENTATION

# `trust_remote_code` is intentionally not passed here: it is only meaningful
# for Auto* classes and transformers warns that it is ignored on a concrete class.
base_model_eval = Idefics3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    config=base_model_config,
    **base_load_kwargs
)

base_model_eval.eval()
print("Base model loaded successfully.")

2025-04-24 08:43:24.033468: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-24 08:43:24.049194: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-24 08:43:24.054127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-24 08:43:24.065549: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The argument `trust_remote_code` is

Base model loaded successfully.


## Load data

In [8]:
import os

# Root of the working tree; every output path below is built relative to it.
WORKING_DIR = "./"
# Folder (under WORKING_DIR) that collects this notebook's evaluation artifacts.
OUTPUT_DIR_BASE_NAME = "base_evaluation"

OUTPUT_DIR = os.path.join(WORKING_DIR, OUTPUT_DIR_BASE_NAME) # full output path under WORKING_DIR

In [9]:
# --- Evaluation Config ---
# Hugging Face Hub dataset id and the split that is evaluated.
CHARTQA_DATASET_ID = "HuggingFaceM4/ChartQA"
EVAL_SPLIT = "test"
EVAL_LIMIT = None # intended cap on the number of ChartQA samples to evaluate (None = full split)
# Passed as `max_new_tokens` to model.generate() for every sample.
MAX_NEW_TOKENS_EVAL = 32
# MODEL_ID contains a "/", so this resolves to a nested <org>/<model>/ folder under OUTPUT_DIR.
EVAL_OUTPUT_FILE = os.path.join(OUTPUT_DIR, MODEL_ID, "chartqa_evaluation_results_comparison.json")

In [10]:
from datasets import load_dataset

chartqa_test_iterable = load_dataset(CHARTQA_DATASET_ID, split=EVAL_SPLIT, streaming=False)

# Honour EVAL_LIMIT (it used to be ignored). The subset is taken before the
# split is turned into a list so that only the requested rows are materialised.
if EVAL_LIMIT is not None:
    n_keep = max(0, min(EVAL_LIMIT, len(chartqa_test_iterable)))
    chartqa_test_iterable = chartqa_test_iterable.select(range(n_keep))

chartqa_test_dataset = list(chartqa_test_iterable)
if EVAL_LIMIT is None:
    print(f"Loaded {len(chartqa_test_dataset)} samples for evaluation (full {EVAL_SPLIT} set).")
else:
    print(f"Loaded {len(chartqa_test_dataset)} samples for evaluation (EVAL_LIMIT={EVAL_LIMIT}, {EVAL_SPLIT} set).")

# Give every sample a stable positional id unless the dataset already provides one;
# the prediction loop uses `img_idx` as the record id.
if chartqa_test_dataset and isinstance(chartqa_test_dataset[0], dict) and 'img_idx' not in chartqa_test_dataset[0]:
    chartqa_test_dataset = [dict(sample, img_idx=i) for i, sample in enumerate(chartqa_test_dataset)]

Loaded 2500 samples for evaluation (full test set).


In [11]:
# Define Evaluation helper functions
print("\n--- Defining Evaluation Helper Functions ---")

# System message for evaluation prompting (provided by SmolVLM)
EVAL_SYSTEM_MESSAGE = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

def create_inference_chat_messages(sample):
    """Build the system + user chat turns for one ChartQA sample.

    The user turn carries an image placeholder followed by the sample's
    'query' text. Returns None when 'query' is missing or empty.
    """
    query_text = sample.get("query")
    if query_text:
        system_turn = {"role": "system", "content": [{"type": "text", "text": EVAL_SYSTEM_MESSAGE}]}
        user_turn = {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query_text}]}
        return [system_turn, user_turn]
    return None


print("Evaluation helper functions defined.")


--- Defining Evaluation Helper Functions ---
Evaluation helper functions defined.


In [12]:
# --- Generate Predictions and Save Results ---

print("\n--- Running Evaluation Generation & Saving ---")

# --- Define Concise Generation Function ---
def generate_prediction_concise(model, processor, sample, max_tokens):
    """Greedy-decode a short answer for one ChartQA sample.

    Args:
        model: model exposing ``parameters()`` and ``generate()``.
        processor: processor exposing ``apply_chat_template``, ``decode``,
            a ``tokenizer`` attribute, and being callable on text + images.
        sample: mapping with a 'query' string and a PIL 'image'.
        max_tokens: forwarded to ``generate`` as ``max_new_tokens``.

    Returns:
        The decoded, stripped continuation; "[NO_TOKENS]" when nothing was
        generated; or an "ERROR:<tag>" string ("ERROR:Input", "ERROR:Convert",
        "ERROR:Messages", or "ERROR:<ExceptionClass>") instead of raising, so a
        long evaluation loop is not aborted by a single bad sample.

    Note: relies on ``Image`` (PIL) and ``torch`` being imported at notebook scope.
    """
    question, image = sample.get("query"), sample.get("image")
    if not (question and isinstance(image, Image.Image)):
        return "ERROR:Input"
    if image.mode != 'RGB':
        try:
            image = image.convert('RGB')
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts the run.
            return "ERROR:Convert"

    messages = create_inference_chat_messages(sample)
    if not messages:
        return "ERROR:Messages"

    prediction = "ERROR:Generate"
    inputs = None
    generated_ids = None
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        # Send inputs to wherever the model's first parameter lives; a model
        # without parameters falls back to the device of an empty tensor.
        device = next(iter(model.parameters()), torch.tensor([])).device

        inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False,
                                       pad_token_id=processor.tokenizer.pad_token_id,
                                       eos_token_id=processor.tokenizer.eos_token_id)
        # Drop the prompt tokens so only the newly generated continuation is decoded.
        gen_tokens = generated_ids[0, inputs["input_ids"].shape[1]:]
        if gen_tokens.numel() > 0:
            prediction = processor.decode(gen_tokens, skip_special_tokens=True).strip()
        else:
            prediction = "[NO_TOKENS]"
    except Exception as e:
        prediction = f"ERROR:{e.__class__.__name__}"
    finally:
        # Release the references to the per-sample tensors.
        del inputs, generated_ids
    return prediction
# --- End Generation Function ---


--- Running Evaluation Generation & Saving ---


In [13]:
# Load the processor that matches MODEL_ID
print(f"Loading processor for model: {MODEL_ID}")
eval_processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# generate() is called with the tokenizer's pad_token_id, so reuse EOS as the
# padding token when the tokenizer ships without one.
if eval_processor.tokenizer.pad_token_id is None:
    eval_processor.tokenizer.pad_token = eval_processor.tokenizer.eos_token
    # Mirror the change on the processor itself, but only if it exposes an
    # unset `pad_token` attribute of its own.
    if getattr(eval_processor, 'pad_token', False) is None:
        eval_processor.pad_token = eval_processor.tokenizer.eos_token
print("Processor loaded.")

Loading processor for model: HuggingFaceTB/SmolVLM-Base


Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


Processor loaded.


## Generate prediction

In [None]:
from tqdm.notebook import tqdm
from PIL import Image
import pandas as pd

# Models to evaluate, keyed by the suffix used in the `predicted_answer_<name>` columns.
models_to_eval = {"base": base_model_eval}

results_list = []
print(f"Generating predictions for {len(models_to_eval)} model(s) on {len(chartqa_test_dataset)} samples...")
# --- Generate Predictions ---
for sample in tqdm(chartqa_test_dataset, desc="Generating Predictions"):
    sample_id = sample.get("img_idx", "N/A")
    question = sample.get("query")
    label = sample.get("label")

    # Validate the raw label: checking after str() never failed, because a
    # missing label became the string "None".
    sample_ok = (
        sample_id != "N/A"
        and bool(question)
        and label is not None
        and isinstance(sample.get("image"), Image.Image)
    )
    if not sample_ok:
        print(f"Warning: Skipping sample {sample_id} due to missing/invalid core data.")
        continue

    entry = {"id": sample_id,
             "question": question,
             "ground_truth": str(label)}  # stored as a string

    # One prediction column per evaluated model.
    for name, model_obj in models_to_eval.items():
        entry[f"predicted_answer_{name}"] = generate_prediction_concise(model_obj, eval_processor, sample, MAX_NEW_TOKENS_EVAL)
    results_list.append(entry)
# --- End Prediction Loop ---

# --- Save results_df to JSON ---
if results_list:
    results_df = pd.DataFrame(results_list)
    # globals().get() tolerates EVAL_OUTPUT_FILE being undefined or empty
    # (the original probed locals(), which is the same dict at notebook top level).
    if globals().get("EVAL_OUTPUT_FILE"):
        try:
            os.makedirs(os.path.dirname(EVAL_OUTPUT_FILE), exist_ok=True)
            results_df.to_json(EVAL_OUTPUT_FILE, orient="records", indent=2)
            print(f"\nEvaluation results (raw predictions) saved to: {EVAL_OUTPUT_FILE}")
        except Exception as e_save:
            # Best effort: keep results_df in memory and report the failure.
            print(f"\nERROR saving evaluation results to {EVAL_OUTPUT_FILE}: {e_save}")
    else:
        print("\nWarning: EVAL_OUTPUT_FILE not defined, results not saved.")
else:
    print("No valid evaluation results were generated to save.")

print("\n--- Evaluation Generation & Saving Finished ---")

Generating predictions for 1 model(s) on 2500 samples...


Generating Predictions:   0%|          | 0/2500 [00:00<?, ?it/s]

In [None]:
from pathlib import Path
from functools import reduce

### Final results compilation
print("--- Aggregating Evaluation Results ---")

BASE_OUTPUT_DIR = Path(WORKING_DIR)
OUTPUT_FOLDER_PREFIX = OUTPUT_DIR_BASE_NAME
EVAL_FILENAME = "chartqa_evaluation_results_comparison.json"

# Result files live at <prefix>/<MODEL_ID>/<EVAL_FILENAME> below BASE_OUTPUT_DIR.
search_pattern = f"{OUTPUT_FOLDER_PREFIX}/{MODEL_ID}/{EVAL_FILENAME}"
result_files = list(BASE_OUTPUT_DIR.glob(search_pattern))

aggregated_dfs = []
base_models_added = set()

for file_path in result_files:
    # Derive run/base labels from the folder name: everything before the last
    # '-' is taken as the base model name.
    run_label = file_path.parent.name.replace(f"{OUTPUT_FOLDER_PREFIX}-", "")
    parts = run_label.split('-')
    base_model_name = '-'.join(parts[:-1]) if len(parts) >= 2 else "UnknownBase"
    base_label = f"{base_model_name}-Original"

    df = pd.read_json(file_path)

    # Only the first frame contributes question/ground_truth; later ones are joined on id.
    columns = ['id', 'question', 'ground_truth'] if not aggregated_dfs else ['id']
    rename_dict = {}

    if 'predicted_answer_finetuned' in df:
        columns.append('predicted_answer_finetuned')
        rename_dict['predicted_answer_finetuned'] = f"Pred_{run_label}"

    # Add each base model's predictions only once, even if several runs share it.
    if base_model_name not in base_models_added and 'predicted_answer_base' in df:
        columns.append('predicted_answer_base')
        rename_dict['predicted_answer_base'] = f"Pred_{base_label}"
        base_models_added.add(base_model_name)

    df_subset = df[columns].rename(columns=rename_dict)
    aggregated_dfs.append(df_subset)

if aggregated_dfs:
    # Deliberately no ffill/bfill after the outer merge: filling gaps would copy
    # another sample's question / ground truth / prediction into rows that have
    # no data of their own. Missing values stay NaN and are dropped when scoring.
    final_comparison_df = reduce(lambda left, right: pd.merge(left, right, on='id', how='outer'), aggregated_dfs)
else:
    print("⚠️ No evaluation results found to aggregate.")
    final_comparison_df = pd.DataFrame()


print("Aggregated Results Preview:")
display(final_comparison_df.head())

output_file = BASE_OUTPUT_DIR / OUTPUT_DIR_BASE_NAME / MODEL_ID /f"{OUTPUT_FOLDER_PREFIX}-ALL_RUNS_COMPARISON_WITH_BASE.csv"
# Make sure the target folder exists so the write cannot fail on a fresh checkout.
output_file.parent.mkdir(parents=True, exist_ok=True)
final_comparison_df.to_csv(output_file, index=False)

print(f"Aggregated results saved to {output_file}")

## Evaluation

In [None]:
import re
from word2number import w2n

# --- Advanced accuracy metrics -------------------------------

# Spellings that are normalised to the canonical answers "yes" / "no".
YES, NO      = {"yes","y","true","correct"}, {"no","n","false","incorrect"}
# TOL is the relative tolerance for numeric answers; EPS is the absolute
# tolerance used instead when the ground truth is exactly 0.
TOL, EPS     = 0.05, 1e-9                  # ±5 %, tiny tolerance for 0
# Columns whose name starts with this prefix are scored as model predictions.
PRED_PREFIX  = "Pred_"                     # rename if your columns differ

# 2.  Helpers -------------------------------------------------------------------
def to_scalar(v):
    """Normalise a raw answer to a float, 'yes', 'no', or None.

    Steps: missing values give None; a single pair of surrounding brackets
    (with optional quotes) is unwrapped; the text is lower-cased and stripped
    of ',', '$' and '%'; yes/no synonyms map to 'yes'/'no'; otherwise the text
    is parsed as a number, first with float() and then as a number word.
    Anything that cannot be parsed gives None.
    """
    if pd.isna(v):
        return None

    text = str(v).strip()
    bracketed = re.fullmatch(r"\[['\"]?(.*?)['\"]?\]", text)
    if bracketed is not None:
        text = bracketed.group(1)

    text = text.lower()
    for symbol in (",", "$", "%"):
        text = text.replace(symbol, "")
    text = text.strip()

    if text in YES:
        return "yes"
    if text in NO:
        return "no"

    # Plain numeric literal first, spelled-out number second.
    try:
        return float(text)
    except Exception:
        pass
    try:
        return float(w2n.word_to_num(text))
    except Exception:
        return None

def relaxed(gt, pred):
    """Return True when `pred` is within the relative tolerance TOL of `gt`.

    When `abs(gt) * TOL` is zero (ground truth of 0) the absolute tolerance
    EPS is used instead.
    """
    allowed_error = abs(gt) * TOL
    if not allowed_error:
        allowed_error = EPS
    return abs(gt - pred) <= allowed_error

# 3.  Basic checks --------------------------------------------------------------
# Raise instead of calling sys.exit(): `sys` was never imported in this notebook,
# so the original guard failed with a NameError rather than this message.
if not isinstance(globals().get("final_comparison_df"), pd.DataFrame):
    raise RuntimeError("  `final_comparison_df` is missing.")
if "ground_truth" not in final_comparison_df.columns:
    raise RuntimeError("  `final_comparison_df` has no 'ground_truth' column (no results were aggregated).")

df = final_comparison_df.copy()
df["GT_proc"] = df["ground_truth"].map(to_scalar)

# Sanity check: at least one ground truth must normalise to yes/no or a number.
keep = df["GT_proc"].isin(["yes","no"]) | df["GT_proc"].apply(lambda x: isinstance(x,(int,float)))
df_filt = df[keep]
if df_filt.empty:
    raise RuntimeError("  No yes/no/numeric ground‑truth rows after processing.")

# Metrics per run -----------------------------------------------------------
results = {}
for col in [c for c in df.columns if c.startswith(PRED_PREFIX)]:
    proc = f"{col}_proc"
    df[proc] = df[col].map(to_scalar)

    # NOTE(review): rows whose prediction cannot be normalised are dropped here,
    # i.e. excluded from the denominator rather than counted as wrong. Compare
    # "# Valid" across runs before comparing EM values.
    valid   = df[["GT_proc", proc]].dropna()
    numeric = valid[valid["GT_proc"].apply(lambda x:isinstance(x,(int,float))) &
                    valid[proc].apply(lambda x:isinstance(x,(int,float)))]

    # EM: exact equality of the normalised values; Relaxed: numeric pairs within tolerance.
    em  = 100 * (valid["GT_proc"] == valid[proc]).mean() if not valid.empty else 0
    rel = 100 * numeric.apply(lambda r: relaxed(r["GT_proc"], r[proc]), axis=1).mean() if not numeric.empty else 0

    run = col.replace(PRED_PREFIX, "")
    results[run] = {"EM (%)": round(em,2),
                    "Relaxed Num (%)": round(rel,2),
                    "# Numeric": len(numeric),
                    "# Valid": len(valid)}

    print(f"{run:<15} EM={em:6.2f}%  Relaxed={rel:6.2f}%  ({len(numeric)} num / {len(valid)} valid)")

# Without prediction columns the summary frame would be empty and sort_values would fail.
if not results:
    raise RuntimeError(f"  No prediction columns starting with '{PRED_PREFIX}' were found.")

# Summary & preview ---------------------------------------------------------
summary = pd.DataFrame(results).T.sort_values("EM (%)", ascending=False)
display(summary)
display(df.head())

# Save to CSV ---------------------------------------------------------------
BASE_OUTPUT_DIR = Path(globals().get("BASE_OUTPUT_DIR", "./outputs"))
prefix = globals().get("OUTPUT_FOLDER_PREFIX", "RUNS")
out_file = BASE_OUTPUT_DIR / OUTPUT_DIR_BASE_NAME / MODEL_ID / f"{prefix}-ALL_RUNS_COMPARISON_WITH_PROCESSED.csv"
# Create the folder the file is actually written to (not just BASE_OUTPUT_DIR).
out_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_file, index=False)
print("💾  Saved:", out_file)
