# Validate the holdout image dataset

In [22]:
from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder
from embetter.grab import ColumnGrabber

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import pandas as pd
from pathlib import Path
from joblib import load

In [4]:
def create_filepaths_df() -> pd.DataFrame:
    dirs = ['cats', 'dogs']
    data = []
    for dir in dirs:
        for file in Path(f'holdout_data/{dir}').glob('*.jpg'):
            row_data = {
                'filepath': file,
                'target': dir
            }
            data.append(row_data)
    files_df = pd.DataFrame(data, columns=["filepath", "target"])
    return files_df

In [9]:
image_embedding_pipeline = make_pipeline(
   ColumnGrabber("filepath"),
  ImageLoader(convert="RGB"),
  ClipEncoder(),
)

In [12]:
holdout_files_df = create_filepaths_df()


In [13]:
holdout_files_df.head()

Unnamed: 0,filepath,target
0,holdout_data/cats/cat.6232.jpg,cats
1,holdout_data/cats/cat.46.jpg,cats
2,holdout_data/cats/cat.9589.jpg,cats
3,holdout_data/cats/cat.67.jpg,cats
4,holdout_data/cats/cat.9396.jpg,cats


In [15]:
%%time

X = image_embedding_pipeline.fit_transform(holdout_files_df)

CPU times: user 134 ms, sys: 125 ms, total: 259 ms
Wall time: 252 ms


In [16]:
y = holdout_files_df['target']

In [18]:
model = load("models/log_regression_baseline.joblib")

In [19]:
y_pred = model.predict(X)

In [20]:
y_pred

array(['cats', 'cats', 'cats', 'cats', 'cats', 'cats', 'cats', 'cats',
       'cats', 'cats', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs',
       'dogs', 'dogs', 'dogs', 'dogs'], dtype=object)

In [21]:
y

0     cats
1     cats
2     cats
3     cats
4     cats
5     cats
6     cats
7     cats
8     cats
9     cats
10    dogs
11    dogs
12    dogs
13    dogs
14    dogs
15    dogs
16    dogs
17    dogs
18    dogs
19    dogs
Name: target, dtype: object

In [23]:
accuracy_score(y, y_pred)

1.0