In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.linalg import norm
from PIL import Image
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from script.tool import *

In [2]:
def filter_data(data, minimum_data_class=5):
    """Flag which samples belong to rare classes and which to well-populated ones.

    Parameters
    ----------
    data : array-like
        One class name per sample.
    minimum_data_class : int, default 5
        A class with fewer samples than this is treated as rare.

    Returns
    -------
    df_pd : pandas.DataFrame
        Columns ``classes`` (the input) and ``classes_labeled`` (integer codes
        from ``pd.factorize``).
    index_less_than_n : pandas.Series of bool
        True for samples whose class has fewer than ``minimum_data_class`` samples.
    index_greater_than_or_equal_to_n : pandas.Series of bool
        Exact complement of ``index_less_than_n`` (class size >= ``minimum_data_class``).

    Notes
    -----
    The third printed line says "more than", but the number it reports also
    includes classes with exactly ``minimum_data_class`` samples.
    """
    df_pd = pd.DataFrame(data, columns=['classes'])
    df_pd['classes_labeled'], _ = pd.factorize(df_pd['classes'])

    counts = df_pd['classes_labeled'].value_counts()
    classes_less_than_n = counts[counts < minimum_data_class].index

    # Build the rare-class mask once; the other mask is simply its negation.
    index_less_than_n = df_pd['classes_labeled'].isin(classes_less_than_n)
    index_greater_than_or_equal_to_n = ~index_less_than_n

    # Vectorised counts instead of iterating the Series with the builtin sum().
    n_less = int(index_less_than_n.sum())
    n_greater_or_equal = int(index_greater_than_or_equal_to_n.sum())

    print("amount of all image :", len(df_pd))
    print(f"amount of image that less than {minimum_data_class} in that class : {n_less}")
    print(f"amount of image that more than {minimum_data_class} in that class : {n_greater_or_equal}")
    return df_pd, index_less_than_n, index_greater_than_or_equal_to_n

In [3]:
def scan_directory(path):
    """Return the ``classes`` column of the CSV file at ``path`` as a NumPy array.

    Despite its name, this function reads a single CSV file; it does not walk
    a directory.
    """
    class_column = pd.read_csv(path)['classes']
    return np.array(class_column)

In [6]:
# Number of stratified folds. The same value is passed to filter_data as the
# minimum number of images a class must have to take part in the split.
n_cv = 5 
# CSV whose `classes` column is used to build the rare-class masks.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# unless this is changed.
path_dataset = '/home/music/Desktop/measure_model/feature_map/vit_b_p16_224_last_hidden_trained_lr2e_05_cos.csv'

In [7]:
# NOTE: despite its name, `df` is the NumPy array of class names returned by
# scan_directory, not a DataFrame.
df = scan_directory(path_dataset)
# Boolean row masks: samples whose class has fewer than n_cv images, and the complement.
df_pd, index_less_than_n, index_greater_than_or_equal_to_n = filter_data(df, minimum_data_class=n_cv)

amount of all image : 15524
amount of image that less than 5 in that class : 116
amount of image that more than 5 in that class : 15408


In [None]:
def StratifiedKFold_score(X, y, n_cv=5, index_filter = False):
    """Cross-validate a 1-nearest-neighbour classifier based on cosine similarity.

    For every stratified fold, each test sample receives the label of the
    training sample with the highest cosine similarity, and the fold accuracy
    is recorded.

    Parameters
    ----------
    X : array-like
        One feature vector per row.
    y : array-like
        Labels, one per row of ``X``.
    n_cv : int, default 5
        Number of folds given to ``StratifiedKFold``.
    index_filter : False, None or (keep_mask, rare_mask), default False
        When a pair of boolean masks is given, only the rows selected by
        ``keep_mask`` are split into folds; the rows selected by ``rare_mask``
        are appended to the training side of every fold and never tested.

    Returns
    -------
    result : list of float
        Accuracy of every fold, in fold order.
    [train_index, test_index] : list of ndarray
        Indices of the last fold, relative to the (filtered) data that was
        split; the appended rare rows are not part of ``train_index``.
    res : ndarray
        Cosine-similarity matrix of the last fold, shape (n_test, n_train).
    ranking : ndarray
        ``np.argsort(res, axis=1)`` for the last fold (ascending similarity).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # `index_filter != False` breaks on array-like input; test identity instead.
    use_filter = index_filter is not None and index_filter is not False
    if use_filter:
        index_greater_filtered, index_less_filtered = index_filter
        keep_mask = np.asarray(index_greater_filtered)
        rare_mask = np.asarray(index_less_filtered)
        X_less, y_less = X[rare_mask], y[rare_mask]
        X, y = X[keep_mask], y[keep_mask]

    result = []
    skf = StratifiedKFold(n_splits=n_cv)

    # All folds are evaluated: the previous stray `break` stopped after the
    # first fold, which made the per-fold append unreachable.
    for train_index, test_index in skf.split(X, y):
        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test = X[test_index], y[test_index]

        if use_filter:
            # Rare classes cannot be stratified, so they only ever serve as
            # reference samples on the training side.
            x_train = np.concatenate((x_train, X_less))
            y_train = np.concatenate((y_train, y_less))

        dot_product = np.dot(x_test, x_train.T)             # (n_test, n_train)
        norm_test = norm(x_test, axis=1).reshape(-1, 1)     # (n_test, 1)
        norm_train = norm(x_train, axis=1).reshape(1, -1)   # (1, n_train)
        res = dot_product / (norm_test * norm_train)        # cosine similarity

        y_pred = y_train[np.argmax(res, axis=1)]
        result.append(accuracy_score(y_test, y_pred))

    # Only the last fold's ranking is returned, so sort once after the loop.
    ranking = np.argsort(res, axis=1)
    return result, [train_index, test_index], res, ranking

In [6]:
# Load features and class names for vit_base_patch16_224_in21k (ONNX, last hidden state).
# This is one of several alternative loader cells; each one overwrites x_gg / y_gg.
# NOTE(review): load_feature is not defined in this notebook — presumably it comes
# from `script.tool`; confirm. The rare-class masks are built from path_dataset, so this
# CSV is assumed to list the same images in the same order — confirm.
x_gg, y_gg = load_feature("vit_base_patch16_224_in21k_last_hidden_state_onnx.csv")
# Integer-encode the class names.
y_gg_label_encode, _ = pd.factorize(y_gg)

In [6]:
# Load features and class names for the fine-tuned ViT-B/16 (lr 2e-05) feature file.
# Alternative loader cell: running it overwrites x_gg / y_gg.
# NOTE(review): assumes the rows match, in order, the CSV at path_dataset that the
# rare-class masks were built from — confirm.
x_gg, y_gg = load_feature("vit_b_p16_224_last_hidden_trained_lr2e_05.csv")
# Integer-encode the class names.
y_gg_label_encode, _ = pd.factorize(y_gg)

In [8]:
# Load features and class names for the fine-tuned ViT-B/16 (lr 2e-05, "cos") feature file.
# Its file name matches the one path_dataset points to, which the masks were built from.
# Alternative loader cell: running it overwrites x_gg / y_gg.
x_gg, y_gg = load_feature("vit_b_p16_224_last_hidden_trained_lr2e_05_cos.csv")
# Integer-encode the class names.
y_gg_label_encode, _ = pd.factorize(y_gg)

In [44]:
# Load features and class names for EfficientNet-B1.
# Alternative loader cell: running it overwrites x_gg / y_gg.
# NOTE(review): assumes the rows match, in order, the CSV at path_dataset that the
# rare-class masks were built from — confirm.
x_gg, y_gg = load_feature("efficientnet_b1.csv")
# Integer-encode the class names.
y_gg_label_encode, _ = pd.factorize(y_gg)

In [53]:
# Load features and class names for EfficientNet-B5.
# Alternative loader cell: running it overwrites x_gg / y_gg.
# NOTE(review): assumes the rows match, in order, the CSV at path_dataset that the
# rare-class masks were built from — confirm.
x_gg, y_gg = load_feature("efficientnet_b5.csv")
# Integer-encode the class names.
y_gg_label_encode, _ = pd.factorize(y_gg)

In [None]:
# Evaluate the currently loaded features with the cosine-similarity 1-NN cross-validation.
# The fourth value returned by StratifiedKFold_score is the argsort ranking of the
# similarity matrix; the existing variable name `result_in_n` is kept unchanged.
result_gg, ls_gg, res_t_gg, result_in_n = StratifiedKFold_score(
    x_gg,
    y_gg_label_encode,
    n_cv=n_cv,
    index_filter=(index_greater_than_or_equal_to_n, index_less_than_n),
)
# Report and average over the folds that were actually scored, rather than
# assuming there are exactly n_cv entries in result_gg.
print(f"scores {len(result_gg)} fold : {result_gg}")
print(f"average score : {sum(result_gg)/len(result_gg)}")
print(result_in_n)

In [9]:
# Top-n retrieval evaluation over all stratified folds.
# For every fold this records the 1-NN accuracy (`result`) and, for each test sample,
# the `top_n` distinct classes of its most similar training samples (`result_in_n`).
X = np.asarray(x_gg)
y = np.asarray(y_gg_label_encode)
y_gg_un = np.asarray(y_gg)  # un-encoded class labels, indexed with the same masks as X
index_filter=(index_greater_than_or_equal_to_n, index_less_than_n)
top_n = 5  # number of distinct candidate classes kept per test sample

result = []       # per-fold top-1 accuracy
result_in_n = []  # per-fold array of shape (n_test, top_n) with the retrieved classes
score_top_n = []  # never filled in this cell; kept as-is
skf = StratifiedKFold(n_splits=n_cv)

# `index_filter != False` is fragile for array-like values; test identity instead.
use_filter = index_filter is not None and index_filter is not False
if use_filter:
    index_greater_filtered, index_less_filtered = index_filter
    keep_mask = np.asarray(index_greater_filtered)
    rare_mask = np.asarray(index_less_filtered)
    X_less = X[rare_mask]
    y_less = y[rare_mask]
    y_gg_un_less = y_gg_un[rare_mask]
    X = X[keep_mask]
    y = y[keep_mask]
    y_gg_un = y_gg_un[keep_mask]

for train_index, test_index in skf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    y_gg_un_train, y_gg_un_test = y_gg_un[train_index], y_gg_un[test_index]

    if use_filter:
        # Rare classes are only used as reference samples on the training side.
        x_train = np.concatenate((x_train, X_less))
        y_train = np.concatenate((y_train, y_less))
        y_gg_train = np.concatenate((y_gg_un_train, y_gg_un_less))
    else:
        # Previously undefined on this path, which raised NameError further down.
        y_gg_train = y_gg_un_train

    dot_product = np.dot(x_test, x_train.T)             # (n_test, n_train)
    norm_test = norm(x_test, axis=1).reshape(-1, 1)     # (n_test, 1)
    norm_train = norm(x_train, axis=1).reshape(1, -1)   # (1, n_train)
    res = dot_product / (norm_test * norm_train)        # cosine similarity

    ranking = np.argsort(res, axis=1)                   # ascending similarity
    top_rows = []
    for order in ranking:
        # Labels of the training samples, most similar first.
        row = y_gg_train[order[::-1]]
        # Keep each class at its first (best-ranked) occurrence, then cut to top_n.
        _, first_seen = np.unique(row, return_index=True)
        top_rows.append(row[np.sort(first_seen)[:top_n]])
    # Stack once instead of concatenating row by row (which is quadratic).
    rank_top_n = np.vstack(top_rows)

    y_pred = y_train[np.argmax(res, axis=1)]
    acc = accuracy_score(y_test, y_pred)
    result.append(acc)
    result_in_n.append(rank_top_n)

In [43]:
# Top-5 accuracy on the last CV fold — features: vit_base_patch16_224_in21k_last_hidden_state_onnx
# A test sample is a hit when its true class is among the classes retrieved for it.
np.mean((result_in_n[-1] == y_gg_un_test[:, np.newaxis]).any(axis=1))

0.8231093800714054

In [8]:
# Top-5 accuracy on the last CV fold — features: vit_b_p16_224_last_hidden_trained_lr2e_05
# A test sample is a hit when its true class is among the classes retrieved for it.
np.mean((result_in_n[-1] == y_gg_un_test[:, np.newaxis]).any(axis=1))

0.8805582603050958

In [10]:
# Top-5 accuracy on the last CV fold — features: vit_b_p16_224_last_hidden_trained_lr2e_05_cos
# A test sample is a hit when its true class is among the classes retrieved for it.
np.mean((result_in_n[-1] == y_gg_un_test[:, np.newaxis]).any(axis=1))

0.8620577734501785

In [46]:
# Top-5 accuracy on the last CV fold — features: efficientnet_b1
# A test sample is a hit when its true class is among the classes retrieved for it.
np.mean((result_in_n[-1] == y_gg_un_test[:, np.newaxis]).any(axis=1))

0.881207400194742

In [55]:
# Top-5 accuracy on the last CV fold — features: efficientnet_b5
# A test sample is a hit when its true class is among the classes retrieved for it.
np.mean((result_in_n[-1] == y_gg_un_test[:, np.newaxis]).any(axis=1))

0.9409282700421941