In [17]:
from io import BytesIO
import numpy as np
import polars as pl
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [18]:
# Read the training-features parquet file through the pyarrow reader,
# with memory mapping enabled.
train_features_file = Path("data") / "train_features.parquet"
train_df = pl.read_parquet(
    train_features_file,
    memory_map=True,
    use_pyarrow=True,
)

In [19]:
def sample_from_group(sub_df, samples_per_class=200, random_seed=42):
    """Draw a capped random sample of rows from a single group.

    Intended to be passed to ``group_by(...).map_groups``, which calls it
    with the group frame as the only argument; the keyword parameters then
    take their defaults. Use ``functools.partial`` to override them.

    Args:
        sub_df: Frame holding the rows of one group. Must support ``len()``
            and ``.sample(n, with_replacement=..., seed=...)``.
        samples_per_class: Upper bound on the number of rows returned.
            Groups with fewer rows are returned in full (sampled without
            replacement, so no row is duplicated).
        random_seed: Seed forwarded to ``sub_df.sample``.

    Returns:
        The result of ``sub_df.sample`` with at most ``samples_per_class``
        rows.
    """
    # Sampling without replacement cannot request more rows than exist,
    # so cap the request at the group size.
    n_rows = min(samples_per_class, len(sub_df))
    return sub_df.sample(n_rows, with_replacement=False, seed=random_seed)


# Down-sample every class with `sample_from_group`.
# maintain_order=True keeps the groups in the order they appear in the input;
# without it polars does not guarantee group order, so the row order of the
# result (and therefore the seeded train/test split that follows) could change
# from run to run even though every seed is fixed.
train_sampled_df = (
    train_df
    .group_by("ClassId", maintain_order=True)
    .map_groups(sample_from_group)
)
# Drop the reference to the full frame; only the sampled frame is used below.
del train_df

In [20]:
# Feature matrix built from the per-row HOG feature lists.
# assumes every "HOG_Features" entry has the same length, giving a 2-D array
# — TODO confirm
X = np.array(train_sampled_df["HOG_Features"].to_list())
# Class labels, row-aligned with X.
y = train_sampled_df["ClassId"].to_numpy()

# Hold out 20% of the sampled rows for evaluation (fixed seed).
# NOTE(review): the split is not stratified by class — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [31]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [38]:
# Kernels tried here: linear, poly, rbf.
# `degree` only applies to kernel="poly" and is ignored by the other kernels;
# it is kept so the kernel can be switched without touching the other arguments.
svm_classifier = SVC(C=1.0, kernel="rbf", degree=4, random_state=42).fit(
    X_train, y_train
)
# Predicted class labels for the held-out split.
y_pred = svm_classifier.predict(X_test)

In [39]:
# Fraction of held-out samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9796511627906976
