In [74]:
import json
import os
import pickle

import numpy as np
from skimage import feature, io, transform, util, filters, morphology, measure, color, exposure
from sklearn import model_selection

In [75]:
DATA_DIRECTORY = "data"
MEN_DIRECTORY = os.path.join(DATA_DIRECTORY, "men")
WOMEN_DIRECTORY = os.path.join(DATA_DIRECTORY, "women")
LABELS_FILENAME = os.path.join(DATA_DIRECTORY, "labels.jsonl")

In [76]:
paths: list[str] = []
labels: list[int] = []
with open(LABELS_FILENAME, "r") as f:
    for line in f:
        entry = json.loads(line)
        path = entry["image_url"]
        label = int(entry["label"])
        paths.append(path)
        labels.append(label)

In [77]:
def get_features(path: str):
    image = io.imread(path)
    
    grayscale = color.rgb2gray(image)
    
    corrected = exposure.adjust_gamma(grayscale, gamma=0.5)
    
    resized = transform.rescale(corrected, 1 / 24, anti_aliasing=True, preserve_range=True)
    
    features = feature.hog(
        resized,
        orientations=8,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm="L2-Hys",
        feature_vector=True,
    )

    return features

In [78]:
get_features(paths[0])

array([0.30851812, 0.30851812, 0.18436151, ..., 0.        , 0.        ,
       0.        ])

In [79]:
# N = 20
# samples_per_class = {label: 0 for label in set(labels)}
# unsampled_paths = paths
# unsampled_labels = labels
# paths = []
# labels = []

# count = len(unsampled_paths)
# for i in range(count):
#     label = unsampled_labels[i]
#     if samples_per_class[label] < N:
#         samples_per_class[label] += 1
#         paths.append(unsampled_paths[i])
#         labels.append(unsampled_labels[i])

In [61]:
features = [get_features(path) for path in paths]

In [62]:
with open("features_paper.pkl", "wb") as f:
    pickle.dump(features, f)

In [63]:
# with open("features_paper.pickle", "rb") as f:
# features = pickle.load(f)

In [64]:
random_state = 312

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features, labels, test_size=0.2, random_state=random_state
)

In [65]:
cv = model_selection.StratifiedKFold(
    n_splits=5, shuffle=True, random_state=random_state
)

In [73]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# clf = SVC(kernel="linear")
clf = RandomForestClassifier(n_estimators=1000, random_state=random_state)

result = model_selection.cross_validate(
    clf,
    X_train,
    y_train,
    cv=cv,
    scoring=("accuracy", "balanced_accuracy"),
)

result

{'fit_time': array([2.65560389, 2.7510004 , 2.4602654 , 2.50384307, 2.21533847]),
 'score_time': array([0.05157614, 0.06212592, 0.09504008, 0.0472734 , 0.04770827]),
 'test_accuracy': array([0.45      , 0.42105263, 0.36842105, 0.52631579, 0.42105263]),
 'test_balanced_accuracy': array([0.41666667, 0.38888889, 0.36111111, 0.5       , 0.43055556])}

In [67]:
clf.fit(X_train, y_train)

In [68]:
with open("model_paper.pkl", "wb") as f:
    pickle.dump(clf, f)

In [69]:
# with open("model_paper.pkl", "rb") as f:
#     clf = pickle.load(f)

In [70]:
clf.score(X_test, y_test)

0.375