In [39]:
from PIL import Image
import numpy as np
import pandas as pd

import torch

from script.tool import ROOT_NFS_TEST, ROOT_NFS_DATA, standardize_feature
from tqdm.notebook import tqdm
from pathlib import Path
import time, os

In [14]:
# All tensors and the model in this notebook are placed on the CPU.
device = torch.device("cpu")

# Load the image index CSV from the 'Cosmenet_product_20231018' folder under ROOT_NFS_DATA.
path_dataset = ROOT_NFS_DATA / 'Cosmenet_product_20231018'
df_pd = pd.read_csv(path_dataset / 'datas_20231018.csv')
df_pd.head(n=1)

Unnamed: 0,file_names,labels,images_path
0,14624_14.jpg,14624,/app/nfs_clientshare/Datasets/Cosmenet_product...


In [3]:
# Number of rows per label, ordered from the most to the least frequent label.
group_df = (
    df_pd.groupby(['labels'])['labels']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)
group_df.head(n=1)

Unnamed: 0,labels,count
4172,50348,100


In [352]:
# Labels that have at most 8 rows are collected in filter_img;
# df_not_contain keeps only the rows whose label is NOT in that set.
filter_img = group_df.loc[group_df['count'] <= 8, 'labels'].values
df_not_contain = df_pd.loc[~df_pd['labels'].isin(filter_img)]

In [354]:
# Row counts: full table, rows kept after filtering, and rows whose label was filtered out.
print(f"amount of all data : {len(df_pd)}")
print(f"amount of data not contain : {len(df_not_contain)}")
print(f"amount of data filter image : {len(df_pd[df_pd['labels'].isin(filter_img)])}")

amount of all data : 60196
amount of data not contain : 58631
amount of data filter image : 1565


In [355]:
from sklearn.model_selection import StratifiedShuffleSplit

In [356]:
# A single stratified shuffle split holding out 20% of the rows, seeded for repeatability.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)

In [357]:
# Take the first (train, test) pair of positional indices produced by the splitter,
# stratified on the label column, and materialise both subsets.
train_idx, test_idx = next(sss.split(df_not_contain, df_not_contain['labels']))
df_train, df_test = df_not_contain.iloc[train_idx], df_not_contain.iloc[test_idx]

In [367]:
# Display the first row of the training subset.
df_train.head(1)

Unnamed: 0,file_names,labels,images_path
3177,11596_2.jpg,11596,/app/nfs_clientshare/Datasets/Cosmenet_product...


In [365]:
# Summary of the split: class counts and row counts on each side, plus the
# smallest per-label row count found in the test subset.
# NOTE(review): "class two img" reports len(filter_img), i.e. the labels removed by
# the `count <= 8` filter — the wording does not match that threshold; confirm.
print(f"amount of all class : {df_pd['labels'].nunique()}")
print(f"amount of class two img : {len(filter_img)}")
print(f"amount of train class : {df_not_contain.iloc[train_idx]['labels'].nunique()}")
print(f"amount of test class : {df_not_contain.iloc[test_idx]['labels'].nunique()}")
print(f"amount of train data : {len(df_not_contain.iloc[train_idx])}")
print(f"amount of test data : {len(df_not_contain.iloc[test_idx])}")
print("minimum count :", df_not_contain['labels'].iloc[test_idx].value_counts().min())

amount of all class : 4178
amount of class two img : 295
amount of train class : 3883
amount of test class : 3883
amount of train data : 46904
amount of test data : 11727
minimum count : 2


In [10]:
def select_transformers_model(model, processor, pretrain="google/vit-base-patch16-224-in21k"):
    """Instantiate a model and its processor from the same pretrained source.

    Args:
        model: class (or object) exposing ``from_pretrained``; called first.
        processor: class (or object) exposing ``from_pretrained``; called second.
        pretrain: identifier or path forwarded unchanged to both ``from_pretrained`` calls.

    Returns:
        Tuple ``(loaded_model, loaded_processor)``.
    """
    loaded_model = model.from_pretrained(pretrain)
    loaded_processor = processor.from_pretrained(pretrain)
    return loaded_model, loaded_processor

In [11]:
# pipeline for transformer library
class pipeline_transformer:
    """Feature-extraction wrapper around a transformers-style model/processor pair.

    Args:
        layer: key of the model output to read (e.g. ``"last_hidden_state"``).
        row: index taken along dim 1 of the selected output. The default
            ``False`` keeps the whole output. ``0`` is a valid index, so the
            "no row" case is detected by identity with ``False``, not truthiness.
        device: device the model and the processed inputs are moved to.
    """

    def __init__(self, layer, row=False, device='cuda:0'):
        self.device = device
        self.layer = layer
        self.row = row

    def select_model(self, model, processor):
        """Attach ``model`` and ``processor``; put the model in eval mode on ``self.device``."""
        self.model = model
        self.processor = processor
        self.model.eval().to(self.device)

    # Backward-compatible alias: existing callers use the original (misspelled) name.
    selct_model = select_model

    def process_model(self, img):
        """Preprocess ``img`` with the processor and return the raw model outputs."""
        inputs = self.processor(images=img, return_tensors="pt").to(self.device)
        # Inference only: disable autograd so no graph is built and retained per image.
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs

    def extract(self, img):
        """Return the configured output of the model for ``img`` as a NumPy array.

        The selected output (optionally restricted to ``self.row`` along dim 1) is
        flattened, given a leading dimension of size 1, passed through
        ``standardize_feature``, moved to the CPU and converted to NumPy.
        """
        outputs = self.process_model(img)
        if self.row is False:
            outputs = outputs[self.layer]
        else:
            outputs = outputs[self.layer][:, self.row]
        outputs = outputs.flatten().unsqueeze(0)
        outputs = standardize_feature(outputs).to('cpu').detach().numpy()
        return outputs

    def report_test(self):
        """Run one blank 224x224 RGB image through the model and print timing and shapes.

        Expects the model output to expose ``keys()``, ``last_hidden_state`` and
        ``pooler_output``.
        """
        img = Image.new('RGB', (224, 224))
        # perf_counter is monotonic and high-resolution, which suits interval timing.
        start_time_torch = time.perf_counter()
        outputs = self.process_model(img)
        delta_time_torch = time.perf_counter() - start_time_torch
        print("runtime :", delta_time_torch*1000, "ms")
        print(f"outputs layers : {outputs.keys()}")
        print(f"shape last_hidden_state : {outputs.last_hidden_state.shape}")
        print(f"shape pooler_output : {outputs.pooler_output.shape}")

In [12]:
from transformers import ViTImageProcessor, ViTModel

2023-10-26 16:37:41.911488: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-26 16:37:42.360946: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-26 16:37:42.360983: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-26 16:37:42.364077: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-26 16:37:42.589494: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

In [16]:
# Load the ViT model and image processor from the local checkpoint folder under ROOT_NFS_TEST.
model, preprocess = select_transformers_model(
    ViTModel,
    ViTImageProcessor,
    pretrain=ROOT_NFS_TEST / 'weights/vit_gg_lr2e-05_eu_9ep_0_95099acc',
)

# The pipeline reads "last_hidden_state" and keeps index 0 along dim 1
# (presumably the [CLS] token of the ViT output — confirm).
vit_gg_trained_lr2e_05_pipe = pipeline_transformer(
    layer="last_hidden_state",
    row=0,
    device=device,
)
vit_gg_trained_lr2e_05_pipe.selct_model(model, preprocess)

# Smoke test: one forward pass with printed runtime and output shapes.
vit_gg_trained_lr2e_05_pipe.report_test()

runtime : 287.43863105773926 ms
outputs layers : odict_keys(['last_hidden_state', 'pooler_output'])
shape last_hidden_state : torch.Size([1, 197, 768])
shape pooler_output : torch.Size([1, 768])


In [17]:
# Show the first row of each subset, one frame after the other.
print(df_train.head(1), df_test.head(1), sep="\n")

        file_names  labels                                        images_path
34244  40959_8.png   40959  /app/nfs_clientshare/Datasets/Cosmenet_product...
        file_names  labels                                        images_path
41380  46687_6.png   46687  /app/nfs_clientshare/Datasets/Cosmenet_product...


In [18]:
from elasticsearch import Elasticsearch

In [19]:
# Connect to the local Elasticsearch node.
# `HOST=` / `PORT=` are not parameters of the Elasticsearch client, so the node
# address is passed as the `hosts` argument (a full URL) instead.
es = Elasticsearch("http://localhost:9200")

In [295]:
def check_index_exist(name="test_product", dims=768):
    """Create the Elasticsearch index ``name`` unless it already exists.

    Uses the module-level ``es`` client. Prints a notice when the index is
    already present; otherwise creates it and prints the create response.

    Args:
        name: name of the index to check / create.
        dims: dimensionality of the ``features`` dense vector field
            (default 768, the value previously hard-coded).

    Returns:
        None.
    """
    if es.indices.exists(index=name):
        print(f"index {name} already exists")
        return

    body_product = {
        "mappings": {
            "properties": {
                "labels": {"type": "keyword"},
                "file_names": {"type": "text"},
                "images_path": {"type": "text"},
                # Indexed dense vector scored with dot product; per the Elasticsearch
                # dense_vector reference this similarity expects unit-length vectors.
                "features": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": "dot_product",
                },
            }
        }
    }
    response = es.indices.create(index=name, body=body_product)
    print(response)

In [309]:
# Index that will hold the features produced by this checkpoint; create it if missing.
name_product = "vit_gg_lr2e_05_eu_9ep_0_95099acc"
check_index_exist(name=name_product)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'vit_gg_lr2e_05_eu_9ep_0_95099acc'}


  err = es.indices.create(index=name, body=body_product)


In [61]:
def to_unit_len(vector):
    """Scale ``vector`` to unit Euclidean (L2) length.

    Args:
        vector: array-like of numbers.

    Returns:
        ``vector`` divided by its L2 norm.

    Raises:
        ValueError: if the norm is zero or not finite, because a unit-length
            version is undefined and plain division would silently yield NaN/inf.
    """
    norm = np.linalg.norm(vector)
    if norm == 0 or not np.isfinite(norm):
        raise ValueError(f"cannot normalise a vector whose norm is {norm}")
    return vector / norm

In [310]:
# Index every training image: extract its feature vector, scale it to unit length
# and store it with its metadata, using the row position as the document id.
for n, img_path in enumerate(tqdm(df_train['images_path'])):
    rgb_image = Image.open(img_path).convert('RGB')
    embedding = vit_gg_trained_lr2e_05_pipe.extract(rgb_image).flatten()
    es.index(
        index=name_product,
        id=n,
        body={
            "labels": df_train['labels'].iloc[n],
            "file_names": df_train['file_names'].iloc[n],
            "images_path": img_path,
            "features": to_unit_len(embedding),
        },
    )

  0%|          | 0/48104 [00:00<?, ?it/s]

  es.index(index=name_product, id=n, body=data)


In [203]:
# Build a query vector from the training image at position 7 (it was indexed with id 7).
img_path_tet = df_train['images_path'].iat[7]
img_test = Image.open(img_path_tet).convert('RGB')

# Same preprocessing as at indexing time: extract, flatten, scale to unit length.
output_test = vit_gg_trained_lr2e_05_pipe.extract(img_test).flatten()
out_unit = to_unit_len(output_test)

In [311]:
# Search request: score documents by cosine similarity to the query vector,
# rescaled from [-1, 1] to [0, 1] by the "/2+0.5" in the script; "boost_mode": "replace"
# makes that value the final score. Results are collapsed on "labels", limited to 5
# and sorted by descending score.
query = {
    "query": {
        "function_score": {
            "functions": [
                {
                    "script_score": {
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, doc['features'])/2+0.5",
                            "params": {"query_vector": out_unit},  # Replace with your query vector
                        }
                    }
                }
            ],
            "boost_mode": "replace",
        }
    },
    # Exclude the "features" field from the search results
    "_source": {"excludes": ["features"]},
    "size": 5,
    "collapse": {"field": "labels"},
    "sort": [{"_score": {"order": "desc"}}],
}

In [312]:
# Run the similarity query against the product index.
results = es.search(index=name_product, body=query)

  results = es.search(index=name_product, body=query)


In [314]:
# One line per returned hit: document id, stored label and similarity score.
for res in results['hits']['hits']:
    line = f"id : {res['_id']} | label : {res['_source']['labels']} | score : {res['_score']}"
    print(line)

id : 7 | label : 47152 | score : 1.0
id : 43651 | label : 16909 | score : 0.8186062
id : 18485 | label : 41276 | score : 0.81833094
id : 27779 | label : 47177 | score : 0.81676626
id : 47321 | label : 45797 | score : 0.81667507


In [296]:
# Create the scratch index "test_product" if it does not exist yet.
check_index_exist(name="test_product")

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'test_product'}


  err = es.indices.create(index=name, body=body_product)


In [307]:
# Remove the scratch index "test_product" (destructive: deletes the index and its documents).
es.indices.delete(index="test_product")

{'acknowledged': True}