### Vision Fine-tuning on GPT-4.1

In [3]:




import base64, cv2, json, random, os
from pathlib import Path
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def build_df(img_dir: Path):
    records = []
    
    # We now know the folder structure is more complex with nested folders
    for label_dir_name in ["claimed", "unclaimed"]:
        label_dir = img_dir / label_dir_name
        if not label_dir.exists():
            print(f"Directory {label_dir} doesn't exist")
            continue
            
        # Recursively find all image files under this label directory
        for root, dirs, files in os.walk(label_dir):
            root_path = Path(root)
            for file in files:
                if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                    file_path = root_path / file
                    records.append({"path": str(file_path), "label": label_dir_name})
    
    return pd.DataFrame(records)

# Using Path to handle relative paths correctly - going up one directory then to claim-img
script_dir = Path.cwd()  # Use current working directory in notebooks
img_dir = Path(r"d:\github-repo-tkhongsap\vision-fine-tuning\claim-img")
df = build_df(img_dir)

# Debug print to see if we're getting data
print(f"Found {len(df)} images")
if len(df) > 0:
    print("Sample data:")
    print(df.head())
    
    # Print label distribution
    label_counts = df["label"].value_counts()
    print("\nLabel distribution:")
    for label, count in label_counts.items():
        percentage = count/len(df)*100
        print(f"{label}: {count} images ({percentage:.1f}%)")

    # Limit to only 30 images, but maintain class balance
    if len(df) > 30:
        # Stratified sampling to maintain label distribution
        df = df.groupby("label", group_keys=False).apply(
            lambda x: x.sample(min(len(x), int(30 * len(x) / len(df))), random_state=42)
        )
        # If we don't have exactly 30 due to rounding, adjust
        if len(df) > 30:
            df = df.sample(30, random_state=42)
        elif len(df) < 30:
            # This is unlikely but just in case
            remaining = 30 - len(df)
            excluded = pd.concat([df, df]).drop_duplicates(keep=False)
            if len(excluded) >= remaining:
                df = pd.concat([df, excluded.sample(remaining, random_state=42)])
        
        print(f"\nLimited to 30 images")
        
        # Print updated label distribution
        limited_label_counts = df["label"].value_counts()
        print("\nLimited dataset label distribution:")
        for label, count in limited_label_counts.items():
            percentage = count/len(df)*100
            print(f"{label}: {count} images ({percentage:.1f}%)")

    # 70 / 20 / 10 split – but ALWAYS stratified by label
    train_df, tmp_df = train_test_split(df, test_size=0.30, stratify=df["label"], random_state=42)
    val_df, test_df  = train_test_split(tmp_df, test_size=1/3, stratify=tmp_df["label"], random_state=42)

    print(f"\nData split statistics:")
    print(f"train: {len(train_df)} images ({len(train_df)/len(df)*100:.1f}%)")
    print(f"val: {len(val_df)} images ({len(val_df)/len(df)*100:.1f}%)")
    print(f"test: {len(test_df)} images ({len(test_df)/len(df)*100:.1f}%)")
    
    # Print label distribution in each split
    print("\nTrain set label distribution:")
    train_label_counts = train_df["label"].value_counts()
    for label, count in train_label_counts.items():
        percentage = count/len(train_df)*100
        print(f"{label}: {count} images ({percentage:.1f}%)")
    
    print("\nValidation set label distribution:")
    val_label_counts = val_df["label"].value_counts()
    for label, count in val_label_counts.items():
        percentage = count/len(val_df)*100
        print(f"{label}: {count} images ({percentage:.1f}%)")
    
    print("\nTest set label distribution:")
    test_label_counts = test_df["label"].value_counts()
    for label, count in test_label_counts.items():
        percentage = count/len(test_df)*100
        print(f"{label}: {count} images ({percentage:.1f}%)")

    # --- Add filename and base64_uri columns ---
    def get_mime_type(filename):
        ext = filename.lower().split('.')[-1]
        if ext in ["jpg", "jpeg"]:
            return "image/jpeg"
        elif ext == "png":
            return "image/png"
        else:
            return "application/octet-stream"

    def add_filename_and_base64(df):
        df = df.copy()
        df["filename"] = df["path"].apply(lambda p: Path(p).name)
        base64_uris = []
        for path in tqdm(df["path"], desc="Encoding images to base64"):
            try:
                with open(path, "rb") as f:
                    img_bytes = f.read()
                mime = get_mime_type(path)
                b64 = base64.b64encode(img_bytes).decode("utf-8")
                uri = f"data:{mime};base64,{b64}"
                base64_uris.append(uri)
            except Exception as e:
                print(f"Error encoding {path}: {e}")
                base64_uris.append("")
        df["base64_uri"] = base64_uris
        return df

    train_df = add_filename_and_base64(train_df)
    val_df = add_filename_and_base64(val_df)
    test_df = add_filename_and_base64(test_df)

    print("\nSample train row with new columns:")
    print(train_df.iloc[0][["filename", "base64_uri"]])
    print("\nSample val row with new columns:")
    print(val_df.iloc[0][["filename", "base64_uri"]])
    print("\nSample test row with new columns:")
    print(test_df.iloc[0][["filename", "base64_uri"]])
else:
    print("No images found. Check the directory path and structure.")
    print(f"Looking for images in: {img_dir}")
    print(f"Does directory exist: {img_dir.exists()}")
    if img_dir.exists():
        print("Directory contents:")
        for item in img_dir.iterdir():
            print(f"  - {item.name} ({'dir' if item.is_dir() else 'file'})")




  df = df.groupby("label", group_keys=False).apply(


Found 262 images
Sample data:
                                                path    label
0  d:\github-repo-tkhongsap\vision-fine-tuning\cl...  claimed
1  d:\github-repo-tkhongsap\vision-fine-tuning\cl...  claimed
2  d:\github-repo-tkhongsap\vision-fine-tuning\cl...  claimed
3  d:\github-repo-tkhongsap\vision-fine-tuning\cl...  claimed
4  d:\github-repo-tkhongsap\vision-fine-tuning\cl...  claimed

Label distribution:
unclaimed: 190 images (72.5%)
claimed: 72 images (27.5%)

Limited to 30 images

Limited dataset label distribution:
unclaimed: 21 images (72.4%)
claimed: 8 images (27.6%)

Data split statistics:
train: 20 images (69.0%)
val: 6 images (20.7%)
test: 3 images (10.3%)

Train set label distribution:
unclaimed: 14 images (70.0%)
claimed: 6 images (30.0%)

Validation set label distribution:
unclaimed: 5 images (83.3%)
claimed: 1 images (16.7%)

Test set label distribution:
unclaimed: 2 images (66.7%)
claimed: 1 images (33.3%)


Encoding images to base64: 100%|██████████| 20/20 [00:00<00:00, 77.46it/s]
Encoding images to base64: 100%|██████████| 6/6 [00:00<00:00, 82.83it/s]
Encoding images to base64: 100%|██████████| 3/3 [00:00<00:00, 78.61it/s]


Sample train row with new columns:
filename                                           IMG_8873.JPG
base64_uri    data:image/jpeg;base64,/9j/4S7jRXhpZgAATU0AKgA...
Name: 173, dtype: object

Sample val row with new columns:
filename                                           IMG_9272.JPG
base64_uri    data:image/jpeg;base64,/9j/4SjFRXhpZgAATU0AKgA...
Name: 128, dtype: object

Sample test row with new columns:
filename                                           IMG_9281.JPG
base64_uri    data:image/jpeg;base64,/9j/4S6XRXhpZgAATU0AKgA...
Name: 137, dtype: object





In [4]:
# Display the columns and their data types for each split
print("\nTrain DataFrame Info:")
print(train_df.info())

print("\nValidation DataFrame Info:")
print(val_df.info())

print("\nTest DataFrame Info:")
print(test_df.info())

# Show sample rows from each split to verify data
print("\nSample rows from Train DataFrame:")
print(train_df.head())

print("\nSample rows from Validation DataFrame:")
print(val_df.head())

print("\nSample rows from Test DataFrame:")
print(test_df.head())




Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 173 to 167
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   path        20 non-null     object
 1   label       20 non-null     object
 2   filename    20 non-null     object
 3   base64_uri  20 non-null     object
dtypes: object(4)
memory usage: 800.0+ bytes
None

Validation DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 128 to 181
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   path        6 non-null      object
 1   label       6 non-null      object
 2   filename    6 non-null      object
 3   base64_uri  6 non-null      object
dtypes: object(4)
memory usage: 240.0+ bytes
None

Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 137 to 10
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  -----

## Constructing the Training, Validation, and Test datasets

In [5]:

SYSTEM_PROMPT = """
You are an assistant that decides whether a bottle can be returned for a deposit refund.
Look at the image and answer with exactly one word: “claimable” or “non-claimable”.
"""

def row_to_chat_json(row: pd.Series) -> dict:
    """
    Map one DataFrame row to the 3-turn chat format:
      system  - your fixed instructions
      user    - ALWAYS the same question + the image
      assistant - ground-truth (claimable / non-claimable)
    """
    return {
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_PROMPT.strip()
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Is the bottle claimable?"},
                    {"type": "image_url", "image_url": {"url": row["base64_uri"]}}
                ]
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": row["label"]}]
            }
        ]
    }

def df_to_jsonl(df: pd.DataFrame, out_path: Path):
    with out_path.open("w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            json_line = row_to_chat_json(row)
            f.write(json.dumps(json_line, ensure_ascii=False) + "\n")
    print(f"Wrote {len(df):>4} lines  →  {out_path}")

data_dir = Path("data_jsonl"); data_dir.mkdir(exist_ok=True)

df_to_jsonl(train_df, data_dir / "train.jsonl")
df_to_jsonl(val_df,   data_dir / "val.jsonl")
df_to_jsonl(test_df,  data_dir / "test.jsonl")   # keep only locally



Wrote   20 lines  →  data_jsonl\train.jsonl
Wrote    6 lines  →  data_jsonl\val.jsonl
Wrote    3 lines  →  data_jsonl\test.jsonl


In [None]:
from openai import OpenAI, ChatCompletion
import json
import os
import dotenv

dotenv.load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


client = OpenAI(api_key=openai_api_key)


# upload training file
train_file = client.files.create(
  file=open("data_jsonl/train.jsonl", "rb"),
  purpose="fine-tune"
)

# upload validation file
val_file = client.files.create(
  file=open("data_jsonl/val.jsonl", "rb"),
  purpose="fine-tune"
)