# Validate the baseline model on the holdout image dataset

In [15]:
from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder
from embetter.grab import ColumnGrabber

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import pandas as pd
from pathlib import Path
from joblib import load

In [16]:
# Class sub-folders to scan; each folder name doubles as the label
# assigned to the images found inside it.
dirs = ["cats", "dogs"]

# Base directory of the dataset. The per-split image folders and the
# saved models are looked up underneath it, so point this at your
# local copy of the data.
root_dir = "/Volumes/TheVault/ml_datasets/kaggle/cats-vs-dogs"

In [17]:
def create_filepaths_df(dir_name:str, class_dirs=None, root=None) -> pd.DataFrame:
    """Build a DataFrame listing the ``*.jpg`` files of one dataset split.

    The files are looked up in ``<root>/<dir_name>/<class_dir>`` for every
    class directory, and each file is labelled with the name of the class
    directory it was found in.

    Parameters
    ----------
    dir_name : str
        Name of the split folder under the root directory (e.g. ``"holdout"``).
    class_dirs : sequence of str, optional
        Class sub-folder names to scan. Defaults to the notebook-level
        ``dirs`` list.
    root : str or path-like, optional
        Base directory of the dataset. Defaults to the notebook-level
        ``root_dir``.

    Returns
    -------
    pd.DataFrame
        One row per image with columns ``filepath`` (a ``pathlib.Path``)
        and ``target`` (the class folder name). The frame has these two
        columns even when no file matches.
    """
    # Fall back to the notebook configuration only when no override is given,
    # so existing calls such as create_filepaths_df(dir_name="holdout") still work.
    labels = dirs if class_dirs is None else class_dirs
    base_root = root_dir if root is None else root

    rows = [
        {'filepath': image_path, 'target': label}
        for label in labels
        # glob() yields entries in a filesystem-dependent order; sorting keeps
        # the row order reproducible across runs and machines.
        for image_path in sorted(Path(f'{base_root}/{dir_name}/{label}').glob('*.jpg'))
    ]
    return pd.DataFrame(rows, columns=["filepath", "target"])


In [18]:
# Pipeline that turns the file-path DataFrame into image embeddings.
# The steps run in order: pick the "filepath" column, load each image
# (converted to RGB), then encode it with the CLIP encoder.
image_embedding_pipeline = make_pipeline(
    ColumnGrabber("filepath"),
    ImageLoader(convert="RGB"),
    ClipEncoder(),
)

In [19]:
# Collect every holdout image path together with its class label.
holdout_files_df = create_filepaths_df(dir_name="holdout")

# Fail fast with a clear message: a wrong or unavailable root_dir makes the
# glob match nothing, which would otherwise only surface later as an obscure
# error inside the embedding pipeline or the model.
assert not holdout_files_df.empty, (
    f"No .jpg files found under {root_dir}/holdout - check that root_dir "
    "points at the dataset and that the location is available."
)


In [20]:
# Preview the first rows to confirm the file paths and labels look as expected.
holdout_files_df.head()

Unnamed: 0,filepath,target
0,/Volumes/TheVault/ml_datasets/kaggle/cats-vs-d...,cats
1,/Volumes/TheVault/ml_datasets/kaggle/cats-vs-d...,cats
2,/Volumes/TheVault/ml_datasets/kaggle/cats-vs-d...,cats
3,/Volumes/TheVault/ml_datasets/kaggle/cats-vs-d...,cats
4,/Volumes/TheVault/ml_datasets/kaggle/cats-vs-d...,cats


In [21]:
%%time

# Compute the feature matrix for the holdout images by running them through
# the embedding pipeline.
# NOTE(review): fit_transform is called on holdout data; this is only safe if
# every pipeline step is stateless (nothing learned in fit) - confirm against
# the embetter documentation.
X = image_embedding_pipeline.fit_transform(holdout_files_df)

CPU times: user 145 ms, sys: 91.2 ms, total: 237 ms
Wall time: 779 ms


In [22]:
# Ground-truth labels: the class folder each holdout image was found in.
y = holdout_files_df['target']

In [23]:
# Load the previously saved classifier from the models/ folder under root_dir.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading - only load model files that come from a trusted source.
model = load(f"{root_dir}/models/log_regression_baseline.joblib")

In [24]:
# Predict a class label for every holdout embedding.
y_pred = model.predict(X)

In [25]:
# Show the predicted labels for inspection.
y_pred

array(['cats', 'cats', 'cats', 'cats', 'cats', 'cats', 'cats', 'cats',
       'cats', 'cats', 'cats', 'cats', 'cats', 'dogs', 'dogs', 'dogs',
       'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs',
       'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs', 'dogs',
       'dogs'], dtype=object)

In [26]:
# Show the ground-truth labels for comparison with the predictions.
y

0     cats
1     cats
2     cats
3     cats
4     cats
5     cats
6     cats
7     cats
8     cats
9     cats
10    cats
11    cats
12    cats
13    dogs
14    dogs
15    dogs
16    dogs
17    dogs
18    dogs
19    dogs
20    dogs
21    dogs
22    dogs
23    dogs
24    dogs
25    dogs
26    dogs
27    dogs
28    dogs
29    dogs
30    dogs
31    dogs
32    dogs
Name: target, dtype: object

In [27]:
# Fraction of holdout images whose predicted label matches the true label.
accuracy_score(y_true=y, y_pred=y_pred)

1.0