# Cats vs Dogs Image Classification Kaggle Notebook

This notebook pulls in the working cells from the 01, 02, 03 notebooks into a single notebook to publish on Kaggle. 

See the other notebooks for an explanation and detailed steps.


In [17]:
import os
from pathlib import Path

import pandas as pd
from joblib import load

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder
from embetter.grab import ColumnGrabber

In [6]:
# Class sub-directory names; each name is also used as the target label.
dirs = ['cats', 'dogs']

# Root directory of the dataset. Set the CATS_VS_DOGS_ROOT environment variable
# to point at another copy of the data (for example when running on Kaggle)
# without editing the notebook; the original local path stays the default.
root_dir = os.environ.get(
    'CATS_VS_DOGS_ROOT',
    '/Volumes/TheVault/ml_datasets/kaggle/cats-vs-dogs',
)

In [7]:
def create_filepaths_df(dir_name: str, root=None, class_names=None) -> pd.DataFrame:
    """Build a DataFrame of image file paths and their class labels.

    Scans ``<root>/<dir_name>/<class_name>`` for every class name and collects
    the ``*.jpg`` files found directly inside each of those folders.

    Parameters
    ----------
    dir_name : str
        Sub-directory of the dataset root to scan.
    root : str or Path, optional
        Dataset root directory. Defaults to the notebook-level ``root_dir``.
    class_names : iterable of str, optional
        Class folder names to scan. Defaults to the notebook-level ``dirs``.

    Returns
    -------
    pd.DataFrame
        Columns ``filepath`` (``pathlib.Path``) and ``target`` (the class
        folder name). Rows follow the order of ``class_names`` and, within a
        class, the sorted file paths. If nothing matches, the frame is empty
        but still has both columns.
    """
    if root is None:
        root = root_dir
    if class_names is None:
        class_names = dirs

    rows = []
    for class_name in class_names:
        class_dir = Path(f'{root}/{dir_name}/{class_name}')
        # Path.glob yields entries in filesystem order, which differs between
        # machines; sorting keeps the row order (and therefore any seeded
        # shuffle/split applied later) reproducible.
        for image_path in sorted(class_dir.glob('*.jpg')):
            rows.append({'filepath': image_path, 'target': class_name})

    return pd.DataFrame(rows, columns=["filepath", "target"])


In [8]:
# Index the training images found under the 'data' sub-directory.
files_df = create_filepaths_df('data')

In [9]:
# Image embedding pipeline: take the "filepath" column, load each image as RGB,
# then pass it through ClipEncoder.
embedding_steps = [
    ColumnGrabber("filepath"),
    ImageLoader(convert="RGB"),
    ClipEncoder(),
]
image_embedding_pipeline = make_pipeline(*embedding_steps)

In [10]:
%%time

# Fit the embedding pipeline on the training file table and transform it into
# X, the feature input used for cross-validation and model fitting.
X = image_embedding_pipeline.fit_transform(files_df)



CPU times: user 1min 9s, sys: 6.4 s, total: 1min 15s
Wall time: 1min 50s


In [12]:
# Training labels: the class folder name recorded for each file.
y = files_df["target"]

In [15]:
# Classifier that is evaluated with cross-validation on the embeddings.
model = LogisticRegression(max_iter=1000, solver="liblinear")

In [18]:
%%time

# Five stratified, shuffled folds; the fixed seed keeps the split repeatable.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# One score per fold, displayed as the cell output.
scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)
scores

CPU times: user 6.27 s, sys: 10.4 s, total: 16.7 s
Wall time: 4.98 s


array([0.99439103, 0.99639423, 0.99599359, 0.99559207, 0.99519134])

In [20]:
# Average of the per-fold cross-validation scores.
scores.mean()

np.float64(0.995512451258419)

In [22]:
# Build a fresh classifier and fit it on the full training embeddings; this is
# the model used for the holdout predictions.
model = LogisticRegression(max_iter=1000, solver="liblinear")
model.fit(X, y)

In [23]:
# Index the held-out images found under the 'holdout' sub-directory.
holdout_files_df = create_filepaths_df('holdout')


In [24]:
%%time

# The embedding pipeline was already fitted on the training files; only
# transform the holdout files so nothing is re-fitted on evaluation data.
holdout_X = image_embedding_pipeline.transform(holdout_files_df)

CPU times: user 180 ms, sys: 85.8 ms, total: 266 ms
Wall time: 445 ms


In [25]:
# True labels for the holdout images (their class folder names).
holdout_y = holdout_files_df["target"]

In [26]:
# Predict a class label for every holdout embedding with the fitted model.
y_pred = model.predict(holdout_X)

In [27]:
# Accuracy of the holdout predictions against the true labels.
accuracy_score(y_true=holdout_y, y_pred=y_pred)

1.0