# Prepare data

In [1]:
from script.func_split_data import split_data

In [2]:
# Build the splitter for the dataset folder / CSV named below, run the split,
# then print the two summaries (train/test, then train/test/validation).
split_df = split_data(data_path='Cosmenet_product_20231018', data_csv='datas_20231018.csv')
split_df.split_data()
split_df.report_train_test_split()
print()  # blank line between the two reports
split_df.report_train_test_val_split()

amount of all data : 60196
amount of all class : 4178
amount of data 2-8 img : 1548
amount of 2-8 img class : 278
amount of data more 8 img : 58631
amount of more 8 img class : 3883
amount of data & class only one : 17

amount of train split : 38474
amount of train split class : 3184
amount of test split : 9620
amount of test split class : 3184
amount of train val : 8430
amount of train val class : 699
amount of test val : 2107
amount of test val class : 699
amount of train val mix : 9204
amount of train val mix class : 977
amount of test val mix : 2881
amount of test val mix class : 977


In [3]:
# Pull the split tables out of the splitter; each getter returns a (train, test) pair.
df_train_split, df_test_split = split_df.get_train_test()
df_train_val_mix, df_test_val_mix = split_df.get_validate()

In [4]:
# Peek at one row to confirm the columns (file_names, labels, images_path).
df_train_split.head(1)

Unnamed: 0,file_names,labels,images_path
0,11596_2.jpg,11596,/app/nfs_clientshare/Datasets/Cosmenet_product...


In [5]:
# Mapping of split tag -> table. Later cells index it with 'train_split',
# 'test_split', 'train_val' and 'test_val' and read the 'images_path',
# 'labels' and 'file_names' columns of each table.
df = split_df.get_dict()

# Crop product

In [6]:
import numpy as np
from PIL import Image, ImageOps
import onnxruntime as rt

In [7]:
# Ask onnxruntime which device this build targets and prefer the CUDA
# provider (with CPU fallback) only when it reports 'GPU'.
# NOTE(review): `device` is rebound to a torch.device further down the
# notebook; a distinct name here would avoid the two meanings.
device = rt.get_device()
# NOTE(review): absolute, user-specific path; consider a configurable model directory.
w = '/app/nfs_clientshare/mew/project/Similarity_model/models/yolov7-cosme.onnx'
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if device == 'GPU' else ['CPUExecutionProvider']
session = rt.InferenceSession(w, providers=providers)

In [8]:
# Cache the graph's output / input tensor names; `detector` passes them to session.run.
outname = [i.name for i in session.get_outputs()]
inname = [i.name for i in session.get_inputs()]

In [9]:
def add_border(image, border_color=(0, 0, 0), target_size=640):
    """Letterbox ``image`` onto a square ``target_size`` x ``target_size`` canvas.

    The image is scaled so that its longer side equals ``target_size`` while
    keeping the aspect ratio, then padded equally on both sides of the shorter
    axis with ``border_color``.

    Args:
        image: PIL image (must expose ``.size`` and ``.resize``).
        border_color: Fill colour used for the padding.
        target_size: Side length of the square result, in pixels.

    Returns:
        Tuple ``(bordered_image, ratio, border_size)``:
        ``ratio`` is the new/old scale factor and ``border_size`` is the
        ``(x, y)`` padding, in pixels, added on each side of the resized image.
    """
    # Get the original image size
    width, height = image.size

    # Scale ratio (new / old)
    ratio = min(target_size / height, target_size / width)

    # Fit the longer side to target_size. The shorter side is truncated to an
    # int and clamped to at least 1 px: an extreme aspect ratio would otherwise
    # truncate to 0 and make resize() fail.
    if width >= height:
        new_width = target_size
        new_height = max(1, int(height * target_size / width))
    else:
        new_height = target_size
        new_width = max(1, int(width * target_size / height))
    resized_image = image.resize((new_width, new_height))

    # Pad symmetrically; an odd remainder leaves the canvas one pixel short.
    border_size = ((target_size - new_width) // 2, (target_size - new_height) // 2)
    bordered_image = ImageOps.expand(resized_image, border=border_size, fill=border_color)
    # Final resize absorbs that possible one-pixel shortfall.
    bordered_image = bordered_image.resize((target_size, target_size))

    return bordered_image, ratio, border_size

In [10]:
def preprocess_yolo(img, target_size=640):
    """Turn an image into the float32 batch tensor fed to the detector.

    The image is letterboxed with ``add_border``, converted to an array,
    reordered so the last axis comes first, given a leading batch axis of
    length 1, and divided by 255.

    Args:
        img: Image accepted by ``add_border``.
        target_size: Side length passed through to ``add_border``.

    Returns:
        Tuple ``(tensor, ratio, dwdh)`` where ``ratio`` and ``dwdh`` are the
        scale factor and padding reported by ``add_border``.
    """
    padded, ratio, dwdh = add_border(img, target_size=target_size)

    # Move axis 2 to the front (channels first for an H x W x C array),
    # then prepend the batch axis.
    channels_first = np.array(padded).transpose((2, 0, 1))
    batch = np.ascontiguousarray(channels_first[np.newaxis, ...])

    # Scale pixel values by 1/255 in float32.
    tensor = batch.astype(np.float32) / 255
    return tensor, ratio, dwdh

In [11]:
def detector(img, target_size=640, thresh=0.9):
    """Crop ``img`` to the product box predicted by the ONNX detector.

    The image is prepared by ``preprocess_yolo`` and run through the
    module-level ``session``. The first detection is mapped back to the
    original pixel grid (padding subtracted, then divided by the scale ratio)
    and clamped to the image bounds.

    A crop is returned only when exactly one detection is present, its score
    is at least ``thresh`` and the clamped box has a non-zero area. In every
    other case the full image is returned with ``bbox=None``:

    * no detections              -> ``score`` is ``0.0``
    * first score below thresh   -> ``score`` of that detection
    * more than one detection    -> ``score`` of the second detection
    * clamped box has zero area  -> ``score`` of the detection (rounded)

    NOTE(review): rejecting every multi-detection image mirrors the original
    loop (``i > 0`` returned the uncropped image); confirm this is intended
    rather than "use the first detection".

    Args:
        img: RGB image (PIL image or array convertible with ``np.array``).
        target_size: Detector input side length, forwarded to ``preprocess_yolo``.
        thresh: Minimum score for the first detection to be accepted.

    Returns:
        dict with keys ``'img'`` (ndarray), ``'score'`` (Python float) and
        ``'bbox'`` (``[x0, y0, x1, y1]`` in original pixels, or ``None``).
    """
    im, ratio, dwdh = preprocess_yolo(img, target_size=target_size)
    original = np.array(img)

    def _uncropped(score):
        # Fallback result: whole image, no box. The score is converted to a
        # plain float so every branch returns the same type.
        return {
            'img': original,
            'score': float(score),
            'bbox': None
        }

    # ONNX inference; each row of the first output is unpacked as
    # (batch_id, x0, y0, x1, y1, cls_id, score).
    outputs = session.run(outname, {inname[0]: im})[0]

    if len(outputs) == 0:
        return _uncropped(0.0)

    _, x0, y0, x1, y1, _, score = outputs[0]
    if score < thresh:
        return _uncropped(score)
    if len(outputs) > 1:
        # Preserved behaviour: a second detection rejects the crop and its
        # score is the one reported.
        return _uncropped(outputs[1][6])

    h_image, w_image = original.shape[:2]

    # Undo the letterboxing: remove the (x, y) padding, then the scaling.
    box = np.array([x0, y0, x1, y1])
    box -= np.array(dwdh * 2)
    box /= ratio
    box = box.round().astype(np.int32).tolist()

    # Limit the bounding box coordinates to the image bounds
    limits = (w_image, h_image, w_image, h_image)
    box = [max(0, min(value, limit)) for value, limit in zip(box, limits)]

    score = round(float(score), 3)
    img_crop = original[box[1]:box[3], box[0]:box[2]]

    # A box entirely outside the image (or inverted) clamps to zero area;
    # returning the empty crop would break Image.fromarray downstream.
    if img_crop.size == 0:
        return _uncropped(score)

    return {
        'img': img_crop,
        'score': score,
        'bbox': box
    }

# Extract features and index them into Elasticsearch

In [12]:
from PIL import Image
import numpy as np
import pandas as pd

import torch
from transformers import ViTImageProcessor, ViTModel

from script.func_extract_feature import select_transformers_model, pipeline_transformer
from script.tool import ROOT_NFS_TEST, standardize_feature
from tqdm.notebook import tqdm

2023-11-02 16:26:24.200631: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-02 16:26:24.849905: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-02 16:26:24.849981: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-02 16:26:24.854020: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-02 16:26:25.189031: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

In [13]:
# Feature extraction runs on CPU. This rebinds `device`, which the ONNX
# cell above set to the onnxruntime device string.
device = torch.device('cpu')

# Build the ViT model + image processor from the pretrained directory, then
# overwrite its weights with the fine-tuned checkpoint's 'model_state_dict'.
model, preprocess = select_transformers_model(ViTModel, ViTImageProcessor, 
                                              pretrain=ROOT_NFS_TEST / 'weights/vit_gg_lr2e-05_eu_9ep_0_95099acc')
# NOTE(review): relative checkpoint path; depends on the notebook's working directory.
model.load_state_dict(torch.load('weights/temp_epoch/vitgg_lr2e05_ep4_loss0.0.pth')['model_state_dict'])
# Pipeline reads row 0 of "last_hidden_state" (presumably the CLS token; confirm in pipeline_transformer).
vit_gg_trained_lr2e_05_pipe = pipeline_transformer(layer="last_hidden_state", row=0, device=device)
vit_gg_trained_lr2e_05_pipe.selct_model(model, preprocess)
vit_gg_trained_lr2e_05_pipe.report_test()

runtime : 1959.691047668457 ms
outputs layers : odict_keys(['last_hidden_state', 'pooler_output'])
shape last_hidden_state : torch.Size([1, 197, 768])
shape pooler_output : torch.Size([1, 768])


In [14]:
from elasticsearch import Elasticsearch

In [15]:
def to_unit_len(vector):
    """Return ``vector`` divided by its norm (``np.linalg.norm``).

    There is no guard for a zero norm: an all-zero input divides by zero.
    """
    norm = np.linalg.norm(vector)
    return vector / norm

In [16]:
class ES_access:
    """Thin wrapper around an Elasticsearch client bound to a single index.

    Attributes are set in ``__init__``:
        es: the ``Elasticsearch`` client.
        name_index: name of the index this object works on.
        name_doc: document type name kept for callers (not used in this class).
    """

    def __init__(self, name_index, name_doc, host="http://localhost", port=9200):
        """Connect to the node at ``{host}:{port}``.

        Args:
            name_index: Index name used by every operation.
            name_doc: Stored as ``self.name_doc``.
            host: Node URL including the scheme, without the port.
            port: Node port.
        """
        # Build the node URL from the arguments. They used to be ignored in
        # favour of hard-coded HOST=/PORT= keywords, which are not client options.
        self.es = Elasticsearch(hosts=[f"{host}:{port}"])
        self.name_index = name_index
        self.name_doc = name_doc

    def check_index_exist(self, dims=768):
        """Ensure the index exists, creating it with the feature mapping if missing.

        Args:
            dims: Dimensionality of the ``features`` dense vector.

        Returns:
            ``True`` if the index already existed, ``False`` if it was just
            created (the create response is printed).
        """
        if self.es.indices.exists(index=self.name_index):
            print(f"index {self.name_index} already exists")
            return True

        body_product = {
            "mappings": {
                "properties": {
                    "tag": {"type": "keyword"},
                    "labels": {"type": "keyword"},
                    "file_names": {"type": "text"},
                    "images_path": {"type": "text"},
                    # dot_product similarity: callers store unit-length vectors.
                    "features": {
                        "type": "dense_vector",
                        "dims": dims,
                        "index": True,
                        "similarity": "dot_product"
                    },
                    "id": {"type": "keyword"}
                }
            }
        }
        response = self.es.indices.create(index=self.name_index, body=body_product)
        print(response)
        return False

In [17]:
class extract_to_es(ES_access):
    """Extract image features with a model pipeline and index them.

    Each document id is ``"{tag}_{n}"`` where ``n`` is the row position in the
    table passed to ``put_to_es``, so skipping with ``replace=False`` relies on
    the table order being the same between runs.
    """

    def __init__(self, name_index, name_doc='_doc', host="http://localhost", port=9200):
        super().__init__(name_index, name_doc, host, port)

    def check_data_exist(self, data, n):
        """Tell whether document ``"{tag}_{n}"`` is stored with matching metadata.

        Args:
            data: dict with keys 'tag', 'labels', 'file_names', 'images_path', 'id'.
            n: Row position used to build the document id.

        Returns:
            ``True`` when the document exists in this object's index and all
            five metadata fields equal those in ``data``; otherwise ``False``
            (the first mismatching pair is printed).
        """
        doc_id = data['tag'] + "_" + str(n)
        if not self.es.exists(index=self.name_index, id=doc_id):
            return False

        # Read from this object's own index. The lookup used to go through
        # the notebook global `ext_ep2_crop`, i.e. a different index.
        stored = self.es.get(index=self.name_index, id=doc_id)['_source']
        for key in ['tag', 'labels', 'file_names', 'images_path', 'id']:
            if data[key] != stored[key]:
                print("================")
                print(stored[key])
                print(data[key])
                return False
        return True

    def put_to_es(self, model, dataframe, tag="train_split", replace=True, crop=False):
        """Extract features for every row of ``dataframe`` and index them under ``tag``.

        Args:
            model: Pipeline exposing ``extract(img)``; its result is flattened.
            dataframe: Table with 'images_path', 'labels' and 'file_names' columns.
            tag: Split name stored in each document and used as the id prefix.
            replace: When ``False``, rows already stored with matching
                metadata are skipped.
            crop: When ``True``, the image is cropped with ``detector``
                (threshold 0.5) and letterboxed to 224 px before extraction.
        """
        for n, img_path in enumerate(tqdm(dataframe['images_path'], leave=False)):
            doc_id = tag + "_" + str(n)
            data = {
                "tag": tag,
                "labels": dataframe['labels'].iloc[n],
                "file_names": dataframe['file_names'].iloc[n],
                "images_path": img_path,
                "id": doc_id
            }
            if not replace and self.check_data_exist(data, n):
                continue

            # Close the file handle as soon as the pixels are decoded.
            with Image.open(img_path) as opened:
                img = opened.convert('RGB')
            if crop:
                detection = detector(img, thresh=0.5)
                img_crop = Image.fromarray(detection['img'])
                img = add_border(img_crop, target_size=224)[0]

            output = model.extract(img).flatten()
            # Unit length is required by the index's dot_product similarity.
            data["features"] = to_unit_len(output)
            self.es.index(index=self.name_index, id=doc_id, body=data)
        print(f"put tag {tag} success")

    def put_all_tag(self, model, df, replace=True, crop=False):
        """Index the four splits held in ``df`` under their own tag names.

        Args:
            model: Pipeline forwarded to ``put_to_es``.
            df: Mapping with keys 'train_split', 'test_split', 'train_val', 'test_val'.
            replace: Forwarded to ``put_to_es``.
            crop: Forwarded to ``put_to_es``.
        """
        for tag in ("train_split", "test_split", "train_val", "test_val"):
            self.put_to_es(model, df[tag], tag=tag, replace=replace, crop=crop)

In [None]:
# Index uncropped ViT features for all four splits, overwriting existing documents.
# NOTE(review): the index name says "ep3" while the checkpoint loaded above is
# 'vitgg_lr2e05_ep4_loss0.0.pth'; confirm which weights were active.
ext_ep3 = extract_to_es('vitgg_lr2e05_ep3_loss0.0')
ext_ep3.check_index_exist()
ext_ep3.put_all_tag(vit_gg_trained_lr2e_05_pipe, df, replace=True, crop=False)

In [None]:
# Index uncropped ViT features into the "ep4" index, overwriting existing documents.
# NOTE(review): the variable is named `ext_ep3_crop` but the index is "ep4"
# and crop=False; the name looks stale.
ext_ep3_crop = extract_to_es('vitgg_lr2e05_ep4_loss0.0')
ext_ep3_crop.check_index_exist()
ext_ep3_crop.put_all_tag(vit_gg_trained_lr2e_05_pipe, df, replace=True, crop=False)

  err = self.es.indices.create(index=self.name_index, body=body_product)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'vitgg_lr2e05_ep4_loss0.0'}


  0%|          | 0/38474 [00:00<?, ?it/s]

  self.es.index(index=self.name_index, id=tag+"_"+str(n), body=data)


In [18]:
# Index features of detector-cropped images; replace=False skips documents
# already stored with matching metadata.
# NOTE(review): the index name says "ep2" while the checkpoint loaded above is
# the "ep4" file; confirm the weights match the index being filled.
ext_ep2_crop = extract_to_es('vitgg_lr2e05_ep2_loss0.02346_crop')
ext_ep2_crop.check_index_exist()
ext_ep2_crop.put_all_tag(vit_gg_trained_lr2e_05_pipe, df, replace=False, crop=True)

index vitgg_lr2e05_ep2_loss0.02346_crop already exists


  0%|          | 0/38474 [00:00<?, ?it/s]

put tag train_split success


  0%|          | 0/9620 [00:00<?, ?it/s]

put tag test_split success


  0%|          | 0/9204 [00:00<?, ?it/s]

  self.es.index(index=self.name_index, id=tag+"_"+str(n), body=data)


put tag train_val success


  0%|          | 0/2881 [00:00<?, ?it/s]

put tag test_val success


# Extract EfficientNet-B1 features

In [39]:
from script.func_extract_feature import select_timm_model, pipeline_timm

In [40]:
# Build a pretrained EfficientNet-B1 with num_classes=0 and wrap it in the timm
# pipeline; report_test prints the runtime and output shape shown below.
# This rebinds `model` / `preprocess`, which previously held the ViT objects.
model, preprocess = select_timm_model('efficientnet_b1', num_classes=0, pretrain=True)
eff_pipe = pipeline_timm(device=device)
eff_pipe.selct_model(model, preprocess)
eff_pipe.report_test()

runtime : 179.80599403381348 ms
Output shape at layer : torch.Size([1, 1280])


In [41]:
# Index uncropped EfficientNet-B1 features; dims=1280 matches the output
# shape reported by the pipeline test above. replace=False skips documents
# already stored with matching metadata.
eff = extract_to_es('efficientnet_b1')
eff.check_index_exist(dims=1280)
eff.put_all_tag(eff_pipe, df, replace=False, crop=False)

index efficientnet_b1 already exists


  0%|          | 0/38474 [00:00<?, ?it/s]

put tag train_split success


  0%|          | 0/9620 [00:00<?, ?it/s]

put tag test_split success


  0%|          | 0/9204 [00:00<?, ?it/s]

  self.es.index(index=self.name_index, id=tag+"_"+str(n), body=data)


put tag train_val success


  0%|          | 0/2881 [00:00<?, ?it/s]

put tag test_val success


# query score