In [1]:
import json
import os
import pickle

import numpy as np
from skimage import feature, io, transform, util
from sklearn import model_selection

In [2]:
# Root folder of the dataset; the per-class image folders and the label file
# all live directly underneath it.
DATA_DIRECTORY = "data"
MEN_DIRECTORY, WOMEN_DIRECTORY, LABELS_FILENAME = (
    os.path.join(DATA_DIRECTORY, leaf) for leaf in ("men", "women", "labels.jsonl")
)

In [3]:
# Read the JSON-lines label file: every non-empty line is one JSON object
# whose "image_url" entry is kept as the image location and whose "label"
# entry is converted to an int.
paths: list[str] = []
labels: list[int] = []
with open(LABELS_FILENAME, "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (typically a trailing newline at the end of the file)
        # are not valid JSON and would make json.loads raise, so skip them.
        if not line.strip():
            continue
        entry = json.loads(line)
        paths.append(entry["image_url"])
        labels.append(int(entry["label"]))

In [4]:
N = 20
# Keep at most N images per class: the first N entries of each class in the
# order they appear in the label file (this is a deterministic head, not a
# random sample).
classes = np.unique(labels)
sampled_paths = []
sampled_labels = []
for c in classes:
    class_paths = [p for p, l in zip(paths, labels) if l == c][:N]
    sampled_paths.extend(class_paths)
    # Emit exactly one label per kept path. A class with fewer than N images
    # would otherwise contribute more labels than paths and misalign the two
    # lists for every class that follows.
    sampled_labels.extend([c] * len(class_paths))

paths = sampled_paths
labels = sampled_labels

In [5]:
from hog import hog

# Two interchangeable HOG implementations: the local `hog` module and
# scikit-image's reference version.
myhog, skhog = hog, feature.hog


def get_features(
    img_path: str,
    scale: float = 1 / 16,
    orientations: int = 8,
    pixels_per_cell: tuple[int, int] = (16, 16),
    cells_per_block: tuple[int, int] = (1, 1),
) -> np.ndarray:
    """Load an image as grayscale, downscale it and compute its HOG descriptor.

    Args:
        img_path: Location of the image, handed to ``io.imread``.
        scale: Factor handed to ``transform.rescale`` before the descriptor
            is computed.
        orientations: Forwarded to ``skhog``.
        pixels_per_cell: Forwarded to ``skhog``.
        cells_per_block: Forwarded to ``skhog``.

    Returns:
        Whatever ``skhog`` returns for the downscaled image.
    """
    img = util.img_as_float(io.imread(img_path, as_gray=True))

    # NOTE(review): the descriptor length presumably depends on the size of the
    # rescaled image, so inputs are assumed to share one resolution - confirm.
    img = transform.rescale(img, scale)

    # Alternative using the local implementation: features = myhog(img)
    features = skhog(
        img,
        orientations=orientations,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
    )

    return features

In [6]:
# One descriptor per selected image, in the same order as `paths`.
features = list(map(get_features, paths))

In [7]:
# Persist the extracted descriptors so later sessions can skip the extraction.
with open("features.pkl", mode="wb") as f:
    pickle.dump(features, file=f)

In [8]:
# with open("features.pkl", "rb") as f:
#     features = pickle.load(f)

In [9]:
# Fixed seed so the hold-out split is reproducible.
random_state = 312

# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features,
    labels,
    random_state=random_state,
    test_size=0.2,
)

In [10]:
# Shuffled, stratified 5-fold splitter, seeded with `random_state`.
cv = model_selection.StratifiedKFold(
    shuffle=True,
    n_splits=5,
    random_state=random_state,
)

In [11]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier evaluated on the training portion
# with the splitter `cv`, reporting plain and balanced accuracy.
clf = SVC(kernel="linear")

result = model_selection.cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring=("accuracy", "balanced_accuracy"),
    cv=cv,
)

result

{'fit_time': array([0.00369215, 0.00295591, 0.00278711, 0.00277948, 0.00268126]),
 'score_time': array([0.00222325, 0.00094128, 0.00097966, 0.00109577, 0.00088787]),
 'test_accuracy': array([0.4       , 0.36842105, 0.26315789, 0.42105263, 0.52631579]),
 'test_balanced_accuracy': array([0.36111111, 0.33333333, 0.23611111, 0.41666667, 0.5       ])}

In [12]:
# Fit the classifier on the training portion.
clf.fit(X=X_train, y=y_train)

In [13]:
# Persist the fitted classifier to disk.
with open("model.pkl", mode="wb") as f:
    pickle.dump(clf, file=f)

In [14]:
# with open("model.pkl", "rb") as f:
#     clf = pickle.load(f)

In [15]:
# Score the fitted classifier on the held-out test portion.
clf.score(X=X_test, y=y_test)

0.4583333333333333