In [13]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def _load_array(npy_path):
    """Read a single .npy file from disk and return the stored array.

    NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    which can execute arbitrary code -- only point this at trusted files.
    """
    return np.load(npy_path, allow_pickle=True)


# Training split: embeddings and their (raw) labels.
emb = _load_array("data/image_emb.npy")
labels_raw = _load_array("data/image_labels.npy")

print("emb:", emb.shape)
print("labels_raw:", labels_raw.shape)

# Test split, loaded the same way.
emb_test = _load_array("data/image_emb_test.npy")
labels_test = _load_array("data/image_labels_test.npy")

print("emb_test:", emb_test.shape)
print("labels_test:", labels_test.shape)


emb: (42500, 512)
labels_raw: (42500,)
emb_test: (7500, 512)
labels_test: (7500,)


In [14]:
# Keep only the training rows whose raw label is a non-empty string,
# remembering both the row position and the class name.
labeled_pairs = [
    (position, name)
    for position, name in enumerate(labels_raw)
    if name != ''
]
labeled_idx = [position for position, _ in labeled_pairs]
labeled_classes = [name for _, name in labeled_pairs]

print("Liczba oznaczonych przykładów:", len(labeled_idx))
print("Przykładowe indeksy:", labeled_idx[:10])
print("Przykładowe klasy:", labeled_classes[:10])

Liczba oznaczonych przykładów: 50
Przykładowe indeksy: [447, 582, 1104, 1527, 2474, 2640, 3345, 3694, 4078, 4797]
Przykładowe klasy: ['truck', 'automobile', 'dog', 'deer', 'truck', 'automobile', 'airplane', 'ship', 'horse', 'dog']


In [15]:
# The distinct class names seen among the labeled samples decide how many
# clusters k-means is asked to find.
class_names = np.unique(labeled_classes)
num_classes = len(class_names)
print("Klasy:", class_names)
print("Liczba klas:", num_classes)

# Fit on all training embeddings (labeled or not); the fitted model exposes
# the cluster id assigned to every training row through `labels_`.
kmeans = KMeans(n_clusters=num_classes, random_state=42)
kmeans.fit(emb)
clusters_train = kmeans.labels_

Klasy: ['airplane' 'automobile' 'bird' 'cat' 'deer' 'dog' 'frog' 'horse' 'ship'
 'truck']
Liczba klas: 10


In [None]:
# Name every k-means cluster after a class, using only the labeled training
# samples as voters.
index_to_class = dict(zip(labeled_idx, labeled_classes))

# Array views of the labeled subset so membership tests are vectorized.
labeled_idx_arr = np.asarray(labeled_idx, dtype=int)
labeled_classes_arr = np.asarray(labeled_classes)
labeled_clusters = np.asarray(clusters_train)[labeled_idx_arr]

cluster_to_class = {}

for c in range(num_classes):
    votes = labeled_classes_arr[labeled_clusters == c]

    if votes.size:
        # Majority vote. np.unique returns the names sorted and argmax picks
        # the first maximum, so ties are broken deterministically (the
        # previous max(set(...)) depended on hash-randomized set order).
        names, counts = np.unique(votes, return_counts=True)
        majority = names[np.argmax(counts)]
    else:
        # No labeled sample landed in this cluster (the previous code raised
        # ValueError on the empty set here). Fall back to the class of the
        # labeled sample whose embedding is closest to the cluster centroid.
        labeled_emb = np.asarray(emb[labeled_idx_arr], dtype=float)
        distances = np.linalg.norm(
            labeled_emb - kmeans.cluster_centers_[c], axis=1
        )
        majority = labeled_classes_arr[np.argmin(distances)]

    cluster_to_class[c] = majority

print("klaster -> klasa:")
for c in range(num_classes):
    print(c, "->", cluster_to_class[c])


klaster -> klasa:
0 -> deer
1 -> ship
2 -> truck
3 -> cat
4 -> frog
5 -> dog
6 -> horse
7 -> airplane
8 -> bird
9 -> automobile


In [17]:
def predict_classes(embeddings):
    """Predict a class name for every row of `embeddings`.

    Each row is assigned to a cluster by the module-level `kmeans` model and
    the cluster id is translated to a class name through the module-level
    `cluster_to_class` dict.

    Returns:
        A numpy array with one class name per input row.
    """
    assigned_clusters = kmeans.predict(embeddings)
    predicted_names = list(map(cluster_to_class.__getitem__, assigned_clusters))
    return np.array(predicted_names)

In [18]:
# Evaluate the cluster-based classifier on the test split.
y_true = labels_test
y_pred = predict_classes(emb_test)

# Compute all metrics first, then report them in the same order as before.
test_accuracy = accuracy_score(y_true, y_pred)
per_class_report = classification_report(y_true, y_pred)
confusion = confusion_matrix(y_true, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)
print(confusion)

Accuracy: 0.7693333333333333
              precision    recall  f1-score   support

    airplane       0.97      0.54      0.69       692
  automobile       0.94      0.94      0.94       759
        bird       0.95      0.79      0.86       761
         cat       0.32      0.88      0.46       746
        deer       0.92      0.76      0.83       761
         dog       0.86      0.75      0.80       758
        frog       0.97      0.76      0.85       754
       horse       0.97      0.50      0.66       760
        ship       0.95      0.89      0.92       739
       truck       0.98      0.88      0.93       770

    accuracy                           0.77      7500
   macro avg       0.88      0.77      0.79      7500
weighted avg       0.88      0.77      0.80      7500

[[372   9  17 263   2   0   1   0  24   4]
 [  0 710   0  36   0   0   0   1   2  10]
 [  8   0 599 118  30   3   3   0   0   0]
 [  0   0   3 656   5  81   1   0   0   0]
 [  0   0   4 164 578   0  13   0   2   