# Validate the holdout image dataset

In [1]:
from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder
from embetter.grab import ColumnGrabber

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import pandas as pd
from pathlib import Path
from joblib import load

In [2]:
def create_filepaths_df() -> pd.DataFrame:
    dirs = ['cats', 'dogs']
    data = []
    for dir in dirs:
        for file in Path(f'holdout_data/{dir}').glob('*.jpg'):
            row_data = {
                'filepath': file,
                'target': dir
            }
            data.append(row_data)
    files_df = pd.DataFrame(data, columns=["filepath", "target"])
    return files_df

In [3]:
image_embedding_pipeline = make_pipeline(
   ColumnGrabber("filepath"),
  ImageLoader(convert="RGB"),
  ClipEncoder(),
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
holdout_files_df = create_filepaths_df()


In [5]:
holdout_files_df.head()

Unnamed: 0,filepath,target
0,holdout_data/cats/cat.6232.jpg,cats
1,holdout_data/cats/cat.46.jpg,cats
2,holdout_data/cats/cat.9589.jpg,cats
3,holdout_data/cats/cat.67.jpg,cats
4,holdout_data/cats/cat.9396.jpg,cats


In [6]:
%%time

X = image_embedding_pipeline.fit_transform(holdout_files_df)

CPU times: user 124 ms, sys: 62.2 ms, total: 186 ms
Wall time: 231 ms


In [7]:
y = holdout_files_df['target']

In [8]:
model = load("models/log_regression_baseline.joblib")

In [9]:
y_pred = model.predict(X)

In [10]:
y_pred

array(['cats', 'cats', 'cats', 'cats', 'cats', 'cats', 'cats', 'cats',
       'cats', 'cats', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs',
       'dogs', 'dogs', 'dogs', 'dogs'], dtype=object)

In [11]:
y

0     cats
1     cats
2     cats
3     cats
4     cats
5     cats
6     cats
7     cats
8     cats
9     cats
10    dogs
11    dogs
12    dogs
13    dogs
14    dogs
15    dogs
16    dogs
17    dogs
18    dogs
19    dogs
Name: target, dtype: object

In [12]:
accuracy_score(y, y_pred)

1.0