### データセット読み込みとdataset_table_indexesの初期化

In [1]:
from collections import defaultdict
import json
import pandas as pd
import random
import sqlite3
from typing import List, Tuple, TypeVar, Dict

# Annotation alias for pandas DataFrames. A TypeVar declares a generic type
# parameter; it is not a way to name a concrete class, so alias the class itself.
Dataframe = pd.DataFrame

def load_dataset(dbpath="./ft.db", tablename="feature_table") -> Dataframe:
    """Load every row of one SQLite table into a DataFrame.

    Args:
        dbpath: path of the SQLite database file.
        tablename: table to read. It is spliced into the SQL text verbatim
            (identifiers cannot be bound as query parameters), so it must come
            from trusted code, never from user input.

    Returns:
        The result of ``SELECT *`` on the table.
    """
    conn = sqlite3.connect(dbpath)
    try:
        return pd.read_sql('SELECT * FROM ' + tablename, conn)
    finally:
        # Release the database handle even when the query fails.
        conn.close()

def blank_dt_indexes(dt: Dataframe) -> Dict[str, Dict]:
    """Build an empty selection record for the labels found in ``dt``.

    Returns:
        ``{"selected_data": {label: []}}`` with one fresh, independent list
        per distinct value of the ``label`` column, labels in ascending order.
    """
    unique_labels = list(dt["label"].unique())
    unique_labels.sort()
    return {"selected_data": {label: [] for label in unique_labels}}

# Initialisation of feature_table_indexes (blank variant: no queries are chosen here)
def blank_ft_indexes(ft: Dataframe) -> Dict[str, Dict]:
    """Build an empty feature-table record for the labels found in ``ft``.

    Returns:
        A dict with the sections ``"queries"``, ``"used_queries"`` and
        ``"selected_data"`` (in that order), each mapping every distinct
        ``label`` value, in ascending order, to its own empty list.
    """
    unique_labels = list(ft["label"].unique())
    unique_labels.sort()
    return {
        section: {label: [] for label in unique_labels}
        for section in ("queries", "used_queries", "selected_data")
    }

def randomly_init_ft_indexes(ft: Dataframe, queryN=1, seed=0) -> Dict[str, Dict]:
    """Build a feature-table record whose queries are sampled at random.

    For every distinct ``label`` value (ascending), ``queryN`` rows of that
    label are drawn with ``DataFrame.sample`` and the values of their
    ``index`` column become the label's queries. The same ``seed`` is passed
    to every label's draw.

    Returns:
        A dict with the sections ``"queries"``, ``"used_queries"`` and
        ``"selected_data"``; the latter two map every label to an empty list.
    """
    rows_by_label = ft.groupby("label")
    record = {"queries": {}, "used_queries": {}, "selected_data": {}}
    for label in sorted(ft["label"].unique()):
        record["used_queries"][label] = []
        record["selected_data"][label] = []
        sampled = rows_by_label.get_group(label).sample(n=queryN, random_state=seed)
        record["queries"][label] = list(sampled["index"].values)
    return record

def save_dt_indexes(dt_indexes: Dict[int, Dict], savepath="./dt_indexes.json"):
    """Write a selection record to ``savepath`` as JSON (indent of 4).

    The record is a two-level mapping ``section -> label -> list``. Labels are
    written as strings and every list element is converted with ``int`` so the
    result is JSON-serialisable.
    """
    serialisable = {
        section: {
            str(label): [int(value) for value in values]
            for label, values in by_label.items()
        }
        for section, by_label in dt_indexes.items()
    }
    with open(savepath, "w") as out_file:
        json.dump(serialisable, out_file, indent=4)
        
def load_dt_indexes(path="./dt_indexes.json"):
    """Read a selection record from a JSON file.

    The file holds a two-level mapping ``section -> label -> list``; the
    label keys (strings in JSON) are converted back to ``int``.
    """
    with open(path, "r") as in_file:
        raw = json.load(in_file)
    return {
        section: {int(label): values for label, values in by_label.items()}
        for section, by_label in raw.items()
    }

### DataSelector

In [2]:
from collections import defaultdict
import copy
import json
import numpy as np
import pandas as pd
import torch
from typing import List, Tuple, TypeVar, Dict

# Annotation aliases. A TypeVar declares a generic type parameter; it is not a
# way to name a concrete class, so alias the classes themselves.
Dataframe = pd.DataFrame
Tensor = torch.Tensor
NpInt64 = np.int64

class DataSelector:
    """Random per-label sampler over a dataset table that records what it hands out.

    The methods read the columns ``index``, ``image`` and ``label`` of the
    table. Rows are identified by the value of the ``index`` column
    everywhere (selection record, lookups and removals), never by the
    DataFrame's own row labels.
    """

    def __init__(self, dt: Dataframe, dt_indexes: Dict[str, Dict[int, List]]):
        """Set up the selector.

        Args:
            dt: the full dataset table.
            dt_indexes: selection record; ``dt_indexes["selected_data"][label]``
                lists the ``index`` values already selected for that label.
                It is deep-copied, so the caller's dict is never mutated.
        """
        self.default_dt = dt
        self.dt_indexes = copy.deepcopy(dt_indexes)
        self.labels = sorted(dt["label"].unique())
        # Pool of rows still available: the table minus already-selected rows.
        self.dt = self.__init_dt(dt, dt_indexes)

    def __init_dt(self, dt: Dataframe, dt_indexes: Dict[str, Dict[int, List]]) -> Dataframe:
        """Return ``dt`` without the rows listed under ``selected_data``."""
        drop_indexes = [
            index
            for indexes in dt_indexes["selected_data"].values()
            for index in indexes
        ]
        # Filter on the "index" column rather than DataFrame.drop(index=...):
        # drop() matches row labels, which are not guaranteed to equal the
        # "index" column values that the record stores.
        return dt[~dt["index"].isin(drop_indexes)]

    def __convert_to_tensor_image(self, json_image) -> Tensor:
        """Decode a JSON-encoded nested list into a float32 tensor."""
        image = json.loads(json_image)
        image = np.array(image)
        image = torch.from_numpy(image.astype(np.float32)).clone()
        return image

    def drop_data(self, indexes: List):
        """Remove from the pool every row whose ``index`` value is in ``indexes``.

        Values that are not present in the pool are ignored.
        """
        self.dt = self.dt[~self.dt["index"].isin(indexes)]

    def get_dt_indexes(self) -> Dict[str, Dict[int, List]]:
        """Return the selector's own selection record (not a copy)."""
        return self.dt_indexes

    def get_dataset(self, indexes_labelby: Dict[int, List]) -> List[Tuple[Tensor, NpInt64]]:
        """Materialise rows of the full table as ``(image tensor, label)`` pairs.

        Args:
            indexes_labelby: for every label of the table, the ``index``
                values to fetch. A missing label raises ``KeyError``.

        Returns:
            The pairs grouped by label (ascending); within a label the rows
            keep the table's order, not the order of ``indexes_labelby``.
        """
        dataset = []
        dt_labelby = self.default_dt.groupby("label")
        for label in self.labels:
            group = dt_labelby.get_group(label)
            rows = group[group["index"].isin(indexes_labelby[label])]
            for json_image, row_label in zip(rows["image"].values, rows["label"].values):
                dataset.append((self.__convert_to_tensor_image(json_image), row_label))
        return dataset

    def randomly_select_dt_indexes(self, dataN: int, seed=0) -> Dict[int, List]:
        """Draw ``dataN`` not-yet-selected rows per label at random.

        The drawn ``index`` values are appended to the selection record and
        removed from the pool, so a later call cannot return them again.
        The same ``seed`` is used for every label's draw. Sampling is done
        by ``DataFrame.sample`` without replacement, so it fails when a label
        has fewer than ``dataN`` rows left.

        Returns:
            The ``index`` values drawn in this call, keyed by label.
        """
        indexes_labelby = {}
        # Grouped once up front; removing one label's rows below does not
        # affect the groups of the labels still to be processed.
        dt_labelby = self.dt.groupby("label")
        for label in self.labels:
            sampled = dt_labelby.get_group(label).sample(n=dataN, random_state=seed)
            selected_indexes = list(sampled["index"].values)
            indexes_labelby[label] = selected_indexes
            self.dt_indexes["selected_data"][label] += selected_indexes
            self.drop_data(selected_indexes)
        return indexes_labelby

In [49]:
# Load the test dataset table from SQLite, print its row count and preview the first five rows.
dt = load_dataset(dbpath="./assets/test_dt.db")
print("データ数:  {0}".format(len(dt)))
dt[:5]

データ数:  3000


Unnamed: 0,index,image,label
0,0,"[[[0.2549020051956177, 0.45098042488098145, 0....",0
1,1,"[[[0.7019608020782471, 0.6470588445663452, 0.6...",0
2,2,"[[[1.0, 1.0, 1.0, 0.9607843160629272, 0.960784...",0
3,3,"[[[-0.1450980305671692, -0.34117645025253296, ...",0
4,4,"[[[-0.9843137264251709, -1.0, -1.0, -0.9921568...",0


In [50]:
# Start from an empty selection record: one empty list per label found in dt.
dt_indexes1 = blank_dt_indexes(dt=dt)
dt_indexes1

{'selected_data': {0: [], 1: [], 2: []}}

#### 1反復目

In [86]:
# Iteration 1: draw 200 rows per label twice. Each draw is removed from the selector's pool,
# so the second call (same seed) samples from the rows that remain.
selector = DataSelector(dt, dt_indexes1)
indexes_labelby1 = selector.randomly_select_dt_indexes(dataN=200, seed=2)
indexes_labelby2 = selector.randomly_select_dt_indexes(dataN=200, seed=2)
dt_indexes2 = selector.get_dt_indexes()

In [87]:
# Count the distinct selected indexes for labels 0-2; a count below the number drawn would reveal duplicates.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(dt_indexes2["selected_data"][i]))))

ラベル0  の重複なしデータ数:  400
ラベル1  の重複なしデータ数:  400
ラベル2  の重複なしデータ数:  400


#### 2反復目

In [88]:
# Iteration 2: rebuild the selector from the iteration-1 record (its rows are excluded from the pool)
# and draw two more batches of 200 rows per label.
selector = DataSelector(dt, dt_indexes2)
indexes_labelby3 = selector.randomly_select_dt_indexes(dataN=200, seed=0)
indexes_labelby4 = selector.randomly_select_dt_indexes(dataN=200, seed=0)
dt_indexes3 = selector.get_dt_indexes()

In [89]:
# Count the distinct selected indexes for labels 0-2 after the second iteration.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(dt_indexes3["selected_data"][i]))))

ラベル0  の重複なしデータ数:  800
ラベル1  の重複なしデータ数:  800
ラベル2  の重複なしデータ数:  800


In [71]:
# Materialise every selected row as an (image tensor, label) pair, then print the count and the first pair.
dataset = selector.get_dataset(dt_indexes3["selected_data"])
print("データ数:  {0}".format(len(dataset)))
print(dataset[0])

データ数:  2400
(tensor([[[ 0.2549,  0.4510,  0.6392,  ..., -0.2078, -0.2863, -0.2627],
         [ 0.7647,  0.8745,  0.8980,  ..., -0.1608, -0.2392, -0.2157],
         [ 0.9765,  0.9529,  0.8902,  ..., -0.0980, -0.1765, -0.1765],
         ...,
         [-0.3725, -0.3020, -0.3647,  ..., -0.9137, -0.8353, -0.7412],
         [-0.4275, -0.3804, -0.4275,  ..., -0.8824, -0.7882, -0.7569],
         [-0.4588, -0.4353, -0.4039,  ..., -0.7961, -0.7647, -0.7725]],

        [[-0.7098, -0.6157, -0.5529,  ..., -0.8588, -0.8353, -0.8510],
         [-0.4745, -0.4353, -0.3961,  ..., -0.8353, -0.8275, -0.8588],
         [-0.4353, -0.4667, -0.4353,  ..., -0.8431, -0.8510, -0.8667],
         ...,
         [-0.7412, -0.7098, -0.6941,  ..., -0.8902, -0.8118, -0.7725],
         [-0.7255, -0.7098, -0.6784,  ..., -0.8431, -0.7569, -0.7647],
         [-0.6706, -0.6706, -0.5373,  ..., -0.7804, -0.7490, -0.7647]],

        [[-0.8980, -0.9137, -0.8902,  ..., -0.9373, -0.9529, -0.9608],
         [-0.8980, -0.9059, -0.9

In [90]:
# Persist the selection record as JSON.
save_dt_indexes(dt_indexes=dt_indexes3, savepath="./assets/dt_indexes.json")

In [102]:
# Reload the record (rebinding dt_indexes3) and show its keys; load_dt_indexes turns the label keys back into ints.
dt_indexes3 = load_dt_indexes(path="assets/dt_indexes.json")
print(dt_indexes3.keys())
print(dt_indexes3["selected_data"].keys())

dict_keys(['selected_data'])
dict_keys([0, 1, 2])


### FeatureSelector

In [12]:
import faiss
from collections import defaultdict
import copy
import json
import numpy as np
import pandas as pd
import torch
from typing import List, Tuple, TypeVar, Dict

# Annotation aliases. A TypeVar declares a generic type parameter; it is not a
# way to name a concrete class, so alias the classes themselves.
Dataframe = pd.DataFrame
Tensor = torch.Tensor
NpArrayFloat32 = np.ndarray  # dtype (float32) is by convention only
NpInt64 = np.int64
# Opaque placeholder for the faiss index type used in annotations; the
# TypeVar is now named after the variable it is bound to.
FaissIndexFlatL2 = TypeVar("FaissIndexFlatL2")

class FeatureSelector(DataSelector):
    """DataSelector that picks rows by neighbour search in feature space.

    On top of the columns DataSelector reads, the table needs a ``feature``
    column holding one JSON-encoded vector per row. The selection record has
    two sections next to ``selected_data``: ``queries`` (the current query
    rows per label) and ``used_queries`` (queries already searched with).
    Every search is an exhaustive L2 search (``faiss.IndexFlatL2``) over the
    rows of a single label. Rows are always identified by their ``index``
    column value.
    """

    def __init__(self, ft: Dataframe, ft_indexes: Dict[str, Dict[int, List]]):
        super().__init__(ft, ft_indexes)

    def __ft_to_features(self, ft: Dataframe) -> NpArrayFloat32:
        """Decode the ``feature`` column of ``ft`` into a float32 matrix (one row per table row)."""
        features = [json.loads(f) for f in ft["feature"]]
        return np.array(features).astype("float32")

    def __indexes_to_features(self, ft: Dataframe, indexes: List[int]) -> NpArrayFloat32:
        """Return the float32 feature vectors of the rows whose ``index`` is in ``indexes``.

        Raises:
            IndexError: if an index has no row in ``ft``.
        """
        features = []
        for index in indexes:
            matches = ft.loc[ft["index"] == index, "feature"]
            if matches.empty:
                # Fail with the offending index instead of an anonymous
                # positional-indexer error.
                raise IndexError(
                    "There is a feature that cannot be obtained: index {0}".format(index)
                )
            features.append(json.loads(matches.iloc[0]))
        return np.array(features).astype("float32")

    def __generate_faiss_index(self, vectors: NpArrayFloat32) -> FaissIndexFlatL2:
        """Build an exhaustive L2 index over ``vectors``."""
        dim = len(vectors[0])
        faiss_index = faiss.IndexFlatL2(dim)
        faiss_index.add(vectors)
        return faiss_index

    def __collect_indexes(self, section: str) -> set:
        """Return every index stored under ``self.dt_indexes[section]``, across all labels."""
        return {
            index
            for indexes in self.dt_indexes[section].values()
            for index in indexes
        }

    def __rank_rows(self, ft: Dataframe, query_ft_indexes: List[int]) -> Tuple:
        """Rank all rows of ``ft`` by distance to each query.

        Returns:
            ``(ranking, ft_index_values)``. ``ranking[q]`` holds the row
            positions of ``ft`` ordered from nearest to farthest for query
            ``q``; ``ft_index_values[position]`` is that row's ``index`` value.
        """
        queries = self.__indexes_to_features(ft, query_ft_indexes)
        faiss_index = self.__generate_faiss_index(self.__ft_to_features(ft))
        k = faiss_index.ntotal  # search over every indexed row
        _, ranking = faiss_index.search(queries, k)
        return ranking, ft["index"].values

    def __search_NN_ft_indexes(self, ft: Dataframe, query_ft_indexes: List[int], dataN: int) -> List[int]:
        """Collect the ``dataN`` nearest eligible rows of every query.

        A row is eligible if it is not a current query (of any label) and has
        not already been picked in this call.

        Raises:
            IndexError: if a query runs out of eligible rows before ``dataN``
                have been collected.
        """
        if len(query_ft_indexes) == 0:
            return []
        ranking, ft_index_values = self.__rank_rows(ft, query_ft_indexes)

        NN_ft_indexes = []
        excluded = self.__collect_indexes("queries")  # queries are never selected as data
        for positions in ranking:
            cnt = 0
            for position in positions:
                if cnt >= dataN:
                    break
                ft_index = ft_index_values[position]
                if ft_index in excluded:
                    continue
                excluded.add(ft_index)  # also blocks re-selection by later queries
                NN_ft_indexes.append(ft_index)
                cnt += 1
            if cnt < dataN:
                raise IndexError(
                    "only {0} of the requested {1} neighbours are available".format(cnt, dataN)
                )
        return NN_ft_indexes

    def __search_FP_ft_indexes(self, ft: Dataframe, query_ft_indexes: List[int]) -> List[int]:
        """Find, for every query, the farthest eligible row.

        A row is eligible if it has never been used as a query and has not
        already been picked in this call.

        Raises:
            IndexError: if a query has no eligible row at all.
        """
        if len(query_ft_indexes) == 0:
            return []
        ranking, ft_index_values = self.__rank_rows(ft, query_ft_indexes)

        FP_ft_indexes = []
        excluded = self.__collect_indexes("used_queries")
        for positions in ranking:
            for position in positions[::-1]:  # farthest first
                ft_index = ft_index_values[position]
                if ft_index in excluded:
                    continue
                excluded.add(ft_index)
                FP_ft_indexes.append(ft_index)
                break
            else:
                # Every row was excluded; do not fall through and append
                # the last (ineligible) candidate.
                raise IndexError("no eligible far point is left for a query")
        return FP_ft_indexes

    def __search_ft_indexes_with_rate(self, ft: Dataframe, query_ft_indexes: List[int], rate=1/2) -> List[int]:
        """Find, for every query, the first eligible row at or after a relative rank.

        The scan starts at position ``round(k * rate)`` of the nearest-first
        ranking (``k`` = number of rows), clamped to the valid range, and
        moves towards the farthest row. Eligibility is the same as in the
        far-point search.

        Raises:
            IndexError: if no eligible row exists from the start position on.
        """
        if len(query_ft_indexes) == 0:
            return []
        ranking, ft_index_values = self.__rank_rows(ft, query_ft_indexes)
        k = ranking.shape[1]
        # Clamp so that rate=1 means "the farthest row" instead of indexing
        # one past the end.
        start = min(max(round(k * rate), 0), k - 1)

        MP_ft_indexes = []
        excluded = self.__collect_indexes("used_queries")
        for positions in ranking:
            for position in positions[start:]:
                ft_index = ft_index_values[position]
                if ft_index in excluded:
                    continue
                excluded.add(ft_index)
                MP_ft_indexes.append(ft_index)
                break
            else:
                raise IndexError("no eligible row is left at or beyond the requested rate")
        return MP_ft_indexes

    def init_ft_indexes(self, queryN=1, seed=0) -> Dict[str, Dict[int, List]]:
        """Build a fresh record whose queries are spread over each label's feature space.

        Per label, one row of the full table is sampled (same ``seed`` for
        every label), all rows of the label are ranked by distance to it, and
        up to ``queryN`` rows are taken from that ranking at a regular stride
        (the sampled row itself is normally the first). The selector's own
        record is neither read nor modified.

        Returns:
            A dict with the sections ``"queries"``, ``"used_queries"`` and
            ``"selected_data"``; the latter two map every label to ``[]``.
        """
        ft_indexes = {"queries": {}, "used_queries": {}, "selected_data": {}}
        ft_labelby = self.default_dt.groupby("label")

        for label in self.labels:
            ft_indexes["selected_data"][label] = []
            ft_indexes["used_queries"][label] = []
            ft = ft_labelby.get_group(label)
            query = json.loads(ft.sample(n=1, random_state=seed)["feature"].iloc[0])
            query = np.array([query]).astype("float32")
            faiss_index = self.__generate_faiss_index(self.__ft_to_features(ft))
            k = faiss_index.ntotal
            # A stride of 0 (queryN > k) is not a valid slice step.
            step = max(k // queryN, 1)
            _, ranking = faiss_index.search(query, k)

            ft_index_values = ft["index"].values
            ft_indexes["queries"][label] = [
                ft_index_values[position] for position in ranking[0][::step][:queryN]
            ]

        return ft_indexes

    def select_NN_ft_indexes(self, dataN: int) -> Dict[int, List]:
        """Select, per label, the ``dataN`` nearest unselected rows of every current query.

        The selected rows are appended to ``selected_data`` and removed from
        the pool; the queries searched with are appended to ``used_queries``.

        Returns:
            The ``index`` values selected in this call, keyed by label.
        """
        indexes_labelby = {}
        ft_labelby = self.dt.groupby("label")

        for label in self.labels:
            ft = ft_labelby.get_group(label)
            query_ft_indexes = self.dt_indexes["queries"][label]
            NN_ft_indexes = self.__search_NN_ft_indexes(ft, query_ft_indexes, dataN)

            indexes_labelby[label] = NN_ft_indexes
            self.dt_indexes["selected_data"][label] += NN_ft_indexes
            self.dt_indexes["used_queries"][label] += query_ft_indexes
            self.drop_data(NN_ft_indexes)

        return indexes_labelby

    # Replace the queries with their far points (FP)
    def update_to_FP_queries(self) -> Dict[int, List]:
        """Replace every query with the farthest eligible unselected row of its label.

        Returns:
            The new queries keyed by label (the same list objects that are
            stored in the record).
        """
        indexes_labelby = {}
        ft_labelby = self.dt.groupby("label")

        for label in self.labels:
            ft = ft_labelby.get_group(label)
            query_ft_indexes = self.dt_indexes["queries"][label]
            FP_ft_indexes = self.__search_FP_ft_indexes(ft, query_ft_indexes)

            indexes_labelby[label] = FP_ft_indexes
            self.dt_indexes["queries"][label] = FP_ft_indexes

        return indexes_labelby

    # Move the queries by the given rate
    def update_queries(self, rate=1/2) -> Dict[int, List]:
        """Replace every query with the eligible row at relative rank ``rate`` of its distance ordering.

        ``rate`` close to 0 stays near the old query; ``rate`` close to 1
        moves towards the farthest row.

        Returns:
            The new queries keyed by label (the same list objects that are
            stored in the record).
        """
        indexes_labelby = {}
        ft_labelby = self.dt.groupby("label")

        for label in self.labels:
            ft = ft_labelby.get_group(label)
            query_ft_indexes = self.dt_indexes["queries"][label]
            ft_indexes = self.__search_ft_indexes_with_rate(ft, query_ft_indexes, rate)

            indexes_labelby[label] = ft_indexes
            self.dt_indexes["queries"][label] = ft_indexes

        return indexes_labelby

In [13]:
# Load the feature table from SQLite, print its row count and preview the first five rows.
ft = load_dataset(dbpath="./assets/ft.db")
print("データ数:  {0}".format(len(ft)))
ft[:5]

データ数:  15000


Unnamed: 0,index,feature,image,label
0,0,"[0.30711856484413147, 0.19312363862991333, 0.0...","[[[-0.3176470398902893, -0.29411762952804565, ...",1
1,1,"[0.4214461147785187, 1.198604702949524, 0.9510...","[[[-0.9764705896377563, -0.9686274528503418, -...",1
2,2,"[0.2851516008377075, 0.20933431386947632, 0.07...","[[[0.13725495338439941, 0.13725495338439941, 0...",2
3,3,"[0.6752024292945862, 0.7612708806991577, 0.712...","[[[-0.24705880880355835, -0.27843135595321655,...",2
4,4,"[0.3068203926086426, 0.6951863169670105, 0.444...","[[[0.30980396270751953, 0.30980396270751953, 0...",2


In [14]:
# Initialise the record with 5 randomly sampled query rows per label (seed=1).
ft_indexes1 = randomly_init_ft_indexes(ft=ft, queryN=5, seed=1)
ft_indexes1

{'queries': {0: [8389, 14320, 11549, 10570, 8310],
  1: [8221, 14297, 11383, 10409, 8145],
  2: [8261, 14291, 11433, 10524, 8169]},
 'selected_data': {0: [], 1: [], 2: []},
 'used_queries': {0: [], 1: [], 2: []}}

In [52]:
# Alternative initialisation: build a selector on a blank record and let init_ft_indexes choose
# 5 queries per label. This rebinds ft_indexes1, replacing the randomly initialised record above.
blank_ft_indexes1 = blank_ft_indexes(ft=ft)
selector = FeatureSelector(ft=ft, ft_indexes=blank_ft_indexes1)
ft_indexes1 = selector.init_ft_indexes(queryN=5, seed=1)
ft_indexes1

{'queries': {0: [8389, 278, 3498, 4078, 3161],
  1: [8221, 14026, 5401, 4736, 845],
  2: [8261, 2578, 10993, 8561, 1692]},
 'selected_data': {0: [], 1: [], 2: []},
 'used_queries': {0: [], 1: [], 2: []}}

#### 1反復目

In [43]:
# Iteration 1: select the 50 nearest unselected rows of every query, twice. The first batch is
# removed from the pool, so the second call returns the next-nearest rows.
selector = FeatureSelector(ft, ft_indexes1)
indexes_labelby1 = selector.select_NN_ft_indexes(dataN=50)
indexes_labelby2 = selector.select_NN_ft_indexes(dataN=50)
ft_indexes2 = selector.get_dt_indexes()

In [44]:
# Count the distinct selected indexes for labels 0-2 after iteration 1.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(ft_indexes2["selected_data"][i]))))

ラベル0  の重複なしデータ数:  500
ラベル1  の重複なしデータ数:  500
ラベル2  の重複なしデータ数:  500


#### 2反復目

In [45]:
# Iteration 2: rebuild the selector from the iteration-1 record and select two more batches
# of 50 neighbours per query with the same queries.
selector = FeatureSelector(ft, ft_indexes2)
indexes_labelby3 = selector.select_NN_ft_indexes(dataN=50)
indexes_labelby4 = selector.select_NN_ft_indexes(dataN=50)
ft_indexes3 = selector.get_dt_indexes()

In [46]:
# Count the distinct selected indexes for labels 0-2 after iteration 2.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(ft_indexes3["selected_data"][i]))))

ラベル0  の重複なしデータ数:  1000
ラベル1  の重複なしデータ数:  1000
ラベル2  の重複なしデータ数:  1000


#### 3反復目

In [33]:
# Iteration 3: move every query to a far point of its label first, then select 100 neighbours
# per (new) query.
selector = FeatureSelector(ft, ft_indexes3)
indexes_labelby5 = selector.update_to_FP_queries() # update the queries
indexes_labelby6 = selector.select_NN_ft_indexes(dataN=100)
ft_indexes4 = selector.get_dt_indexes()

In [34]:
# Count the distinct selected indexes for labels 0-2 after iteration 3.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(ft_indexes4["selected_data"][i]))))

ラベル0  の重複なしデータ数:  1500
ラベル1  の重複なしデータ数:  1500
ラベル2  の重複なしデータ数:  1500


In [35]:
# Compare the queries before and after the update. ft_indexes3 still holds the old queries
# because the selector works on a deep copy of the record it was given.
print(ft_indexes3["queries"])
print(ft_indexes4["queries"])

{0: [1347, 2238, 2070, 8954, 4165], 1: [7522, 6188, 14867, 5218, 14887], 2: [916, 14267, 3882, 10627, 9697]}
{0: [4353, 8787, 107, 11956, 10470], 1: [8833, 10055, 5380, 4324, 5368], 2: [1609, 10722, 3546, 4383, 6056]}


In [36]:
# Materialise every selected row as an (image tensor, label) pair, then print the count and the first pair.
dataset = selector.get_dataset(ft_indexes4["selected_data"])
print("データ数:  {0}".format(len(dataset)))
print(dataset[0])

データ数:  4500
(tensor([[[1.0000, 0.9843, 0.9843,  ..., 0.9843, 0.9843, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 1.0000],
         ...,
         [1.0000, 0.9843, 0.9529,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 0.9922, 0.9922,  ..., 1.0000, 0.9922, 1.0000],
         [1.0000, 0.9922, 0.9843,  ..., 1.0000, 1.0000, 1.0000]],

        [[1.0000, 0.9843, 0.9843,  ..., 0.9843, 0.9843, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 1.0000],
         ...,
         [1.0000, 0.9843, 0.9451,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 0.9922, 0.9922,  ..., 1.0000, 0.9922, 1.0000],
         [1.0000, 0.9922, 0.9843,  ..., 1.0000, 1.0000, 1.0000]],

        [[1.0000, 0.9843, 0.9843,  ..., 0.9843, 0.9843, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 0.9922, 0.9922,  ..., 0.984

### update_queries (MP クエリへの更新)

In [16]:
# Start over with 5 randomly sampled queries per label (seed=2); this rebinds ft_indexes1.
ft_indexes1 = randomly_init_ft_indexes(ft=ft, queryN=5, seed=2)
ft_indexes1

{'queries': {0: [10765, 12835, 5776, 12445, 4431],
  1: [10596, 12717, 5751, 12319, 4413],
  2: [10741, 12690, 5740, 12227, 4400]},
 'selected_data': {0: [], 1: [], 2: []},
 'used_queries': {0: [], 1: [], 2: []}}

#### 1反復目

In [17]:
# Iteration 1: move every query to the row found about halfway (rate=1/2) through its
# nearest-first ranking, then select 50 neighbours per (new) query.
selector = FeatureSelector(ft, ft_indexes1)
indexes_labelby1 = selector.update_queries(rate=1/2)
indexes_labelby2 = selector.select_NN_ft_indexes(dataN=50)
ft_indexes2 = selector.get_dt_indexes()

In [18]:
# Count the distinct selected indexes for labels 0-2 after iteration 1.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(ft_indexes2["selected_data"][i]))))

ラベル0  の重複なしデータ数:  250
ラベル1  の重複なしデータ数:  250
ラベル2  の重複なしデータ数:  250


#### 2反復目

In [19]:
# Iteration 2: rebuild the selector from the iteration-1 record, move every query a quarter
# (rate=1/4) of the way through its ranking, then select 50 neighbours per (new) query.
selector = FeatureSelector(ft, ft_indexes2)
indexes_labelby3 = selector.update_queries(rate=1/4)
indexes_labelby4 = selector.select_NN_ft_indexes(dataN=50)
ft_indexes3 = selector.get_dt_indexes()

In [20]:
# Count the distinct selected indexes for labels 0-2 after iteration 2.
for i in range(3):
    print("ラベル{0}  の重複なしデータ数:  {1}".format(i, len(set(ft_indexes3["selected_data"][i]))))

ラベル0  の重複なしデータ数:  500
ラベル1  の重複なしデータ数:  500
ラベル2  の重複なしデータ数:  500
