# Model Training on the Cats vs. Dogs Image Dataset

In [72]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

## Load the Dataset

In [58]:
# Location of the precomputed image embeddings, relative to the notebook.
EMBEDDINGS_CSV = "preprocessed_data/image_embeddings.csv"
df = pd.read_csv(EMBEDDINGS_CSV)

In [59]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,filepath,target,image_embedding
0,data/cats/cat.5077.jpg,cats,"[-0.27597397565841675, -0.3609091341495514, -0..."
1,data/cats/cat.2718.jpg,cats,"[-0.2487901747226715, 0.06151339039206505, 0.2..."
2,data/cats/cat.10151.jpg,cats,"[-0.20199576020240784, -0.08704289048910141, 0..."
3,data/cats/cat.3406.jpg,cats,"[-0.2417299449443817, -0.42382076382637024, -0..."
4,data/cats/cat.4369.jpg,cats,"[0.11845763772726059, 0.021671850234270096, 0...."


In [60]:
# Peek at the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,filepath,target,image_embedding
24975,data/dogs/dog.9316.jpg,dogs,"[-0.007655435241758823, 0.3041452467441559, 0...."
24976,data/dogs/dog.6025.jpg,dogs,"[0.12140308320522308, 0.28480660915374756, 0.0..."
24977,data/dogs/dog.8008.jpg,dogs,"[0.2268548309803009, 0.021711133420467377, -0...."
24978,data/dogs/dog.1992.jpg,dogs,"[0.0660947933793068, -0.3919999301433563, 0.12..."
24979,data/dogs/dog.12412.jpg,dogs,"[-0.18819163739681244, -0.33489400148391724, -..."


In [61]:
# Each embedding is stored in the CSV as a JSON string; decode every entry
# back into a Python object (a list, as checked in the next cell).
image_embeddings = df["image_embedding"].map(json.loads)

In [62]:

# Sanity check: json.loads should have turned the first entry into a list.
# Positional access (.iloc) is used so the check does not depend on the
# Series index happening to contain the label 0.
type(image_embeddings.iloc[0])

list

In [63]:
# Expand the per-row embedding lists into a feature matrix with one column per
# embedding dimension. Building the frame from a list of lists does this in a
# single pass; `Series.apply(pd.Series)` constructs a separate Series object
# for every row, which is much slower on a dataset of this size. The original
# row index is carried over so X_df stays aligned with `df`.
X_df = pd.DataFrame(image_embeddings.tolist(), index=image_embeddings.index)

In [64]:
# Sanity check: the expanded embeddings should be a pandas DataFrame.
type(X_df)

pandas.core.frame.DataFrame

In [65]:
# Inspect the first five rows of the expanded feature matrix.
X_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.275974,-0.360909,-0.0805,-0.043479,-0.233582,-0.186832,0.082863,0.319521,0.318694,-0.142109,...,0.104367,-0.044334,-0.055113,-0.100325,0.080443,-0.124107,0.216158,0.747369,-0.154848,0.224701
1,-0.24879,0.061513,0.265697,-0.099238,-0.079052,-0.199255,-0.277647,-0.212935,0.095857,0.232316,...,-0.078692,0.205512,0.213015,-0.356619,-0.262643,-0.269416,-0.150575,1.244251,0.104376,0.274662
2,-0.201996,-0.087043,0.028204,0.498045,0.389393,-0.112062,0.285145,-0.049174,1.146117,0.148908,...,0.107276,0.021671,-0.16483,-0.450092,-0.142347,0.307071,0.527308,0.52803,-0.163767,0.327534
3,-0.24173,-0.423821,-0.03827,0.009968,0.134129,-0.467065,0.00611,0.045,0.551778,0.151405,...,-0.256871,-0.084569,0.216711,-0.069324,0.073171,-0.018208,-0.005185,0.989258,-0.290039,0.239592
4,0.118458,0.021672,0.094224,0.141429,0.021709,-0.364074,-0.116778,-0.434064,0.554343,-0.194686,...,-0.475455,0.079406,0.412562,-0.271615,0.085459,-0.40815,0.13863,1.212937,0.081448,0.04398


In [66]:
# Target labels: the `target` column of the loaded frame.
y = df["target"]

In [67]:
# Baseline classifier: logistic regression using the liblinear solver, with
# the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000, solver="liblinear")

In [68]:
%%time

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(model, X_df, y, cv=cv)
scores

CPU times: user 6.67 s, sys: 10.2 s, total: 16.8 s
Wall time: 5.18 s


array([0.99359488, 0.99619696, 0.99539632, 0.99619696, 0.99439552])

In [69]:
# Mean of the per-fold cross-validation scores.
np.mean(scores)

np.float64(0.99515612489992)

## Re-fit the Model on the Full Dataset and Save It


In [71]:
# Train the final model on the full dataset. Cloning `model` yields a fresh,
# unfitted estimator with exactly the hyperparameters that were passed to
# cross_val_score, so the saved model cannot drift from the evaluated
# configuration through a copy-paste edit.
model2 = clone(model)
model2.fit(X_df, y)

In [73]:
# Persist the fitted model with joblib. Create the target directory first so
# the save does not fail with FileNotFoundError on a fresh checkout where
# `models/` does not exist yet. dump() returns the list of files it wrote.
MODEL_PATH = "models/log_regression_baseline.joblib"
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
dump(model2, MODEL_PATH)

['models/log_regression_baseline.joblib']