# Fetch & Clone Repo

Clone the `kcg-ml` repo, as it contains all the required utilities and code, from preprocessing the image datasets to training the models and using them to classify images.

In [1]:
# Fetch the kcg-ml repository into the current working directory.
!git clone https://github.com/kk-digital/kcg-ml
# Make the clone the notebook's working directory; the relative paths used in
# later cells (./output, ./datasets, ./image_classifier_pipeline, ...) are
# resolved from here.
%cd kcg-ml

Cloning into 'kcg-ml'...
remote: Enumerating objects: 2927, done.[K
remote: Counting objects: 100% (919/919), done.[K
remote: Compressing objects: 100% (405/405), done.[K
remote: Total 2927 (delta 574), reused 795 (delta 509), pack-reused 2008[K
Receiving objects: 100% (2927/2927), 26.55 MiB | 14.21 MiB/s, done.
Resolving deltas: 100% (1721/1721), done.
/content/kcg-ml


In [7]:
# install the required dependencies
!pip3 install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ascii-graph==1.5.1
  Using cached ascii_graph-1.5.1.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting asttokens==2.1.0
  Using cached asttokens-2.1.0-py2.py3-none-any.whl (26 kB)
Collecting charset-normalizer==2.1.1
  Using cached charset_normalizer-2.1.1-py3-none-any.whl (39 kB)
Collecting colorama==0.4.6
  Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting contourpy==1.0.6
  Using cached contourpy-1.0.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (296 kB)
Collecting debugpy==1.6.3
  Using cached debugpy-1.6.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.8 MB)
Collecting decorator==5.1.1
  Using cached decorator-5.1.1-py3-none-any.whl (9.1 kB)
Collecting executing==1.2.0
  Using cached executing-1.2.0-py2.py3-none-any.whl (24 kB)
Collecting filelock==3.8.0
  Using c

# Check the Computing Platform (GPU/CUDA or CPU) and Environment

In [8]:
# Check the CUDA version and the runtime environment.
# Both helpers come from the repository's `utility` package.
from utility.CheckCuda import GetCuda
from utility.envchecker import GetEnvironment

# In the recorded run this printed "CUDA version: 11.8".
GetCuda()
# Last expression of the cell, so its return value is displayed
# ('colab' in the recorded run).
GetEnvironment()

CUDA version: 11.8


'colab'

# Import the Module/Utility 

In [17]:
import sys
# Put the pipeline sub-directories at the front of the module search path so
# the modules inside them can be imported by bare name below. The paths are
# relative, so they resolve against the current working directory.
sys.path.insert(0, './image_classifier_pipeline/data_loader/')
sys.path.insert(0, './image_classifier_pipeline/train/')
sys.path.insert(0, './image_classifier_pipeline/classify/')
sys.path.insert(0, './image_classifier_pipeline/model_api/')
from ImageDatasetProcessor import ImageDatasetProcessor
# Both entry points are called `main`, so alias them to keep them apart.
from classify import main as classify_main
from train import main as train_main
# NOTE(review): wildcard import; the visible cells do not reference any of its
# names explicitly — confirm which helpers are actually needed.
from classify_helper_functions import *
from model_api import ModelApi
import patoolib
import shutil

# Getting Dataset from Mega

### Mount Google Drive
Mount your google drive to be used for storing the dataset into it. Note: This step is optional, the dataset can also be saved to Colab session storage.

In [None]:
# Colab-only: make Google Drive available under /content/drive.
from google.colab import drive
# NOTE(review): sets a private attribute on the mount function, presumably to
# silence debug output — confirm.
drive.mount._DEBUG = False
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

### Mega CMD Installation

In [19]:
# Mega CMD Requirements
!apt install libmms0 libc-ares2 libc6 libcrypto++6 libgcc1 libmediainfo0v5 libpcre3 libpcrecpp0v5 libssl1.1 libstdc++6 libzen0v5 zlib1g apt-transport-https
!apt --fix-broken install

# Mega CMD Download and Installation
!wget https://mega.nz/linux/MEGAsync/xUbuntu_18.04/amd64/megacmd-xUbuntu_18.04_amd64.deb
!sudo dpkg -i megacmd-xUbuntu_18.04_amd64.deb

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libc6 is already the newest version (2.31-0ubuntu9.9).
libpcre3 is already the newest version (2:8.39-12ubuntu0.1).
libpcrecpp0v5 is already the newest version (2:8.39-12ubuntu0.1).
libpcrecpp0v5 set to manually installed.
libssl1.1 is already the newest version (1.1.1f-1ubuntu2.17).
libssl1.1 set to manually installed.
libstdc++6 is already the newest version (10.3.0-1ubuntu1~20.04).
zlib1g is already the newest version (1:1.2.11.dfsg-2ubuntu1.5).
The following additional packages will be installed:
  libtinyxml2-6a
The following NEW packages will be installed:
  apt-transport-https libc-ares2 libcrypto++6 libgcc1 libmediainfo0v5 libmms0
  libtinyxml2-6a libzen0v5
0 upgraded, 8 newly installed, 0 to remove and 24 not upgraded.
Need to get 3,271 kB of archives.
After this operation, 12.5 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal-updates/universe a

### Download Dataset
Set download URL in Mega and destination path (in Google Drive or session storage) and download the file.

In [20]:
import os
import contextlib
from subprocess import Popen, PIPE, STDOUT

# Download URLs on Mega, one per dataset archive
tagged_dataset_url = 'https://mega.nz/file/8RhFRISJ#vlQhjBp5hrNtQzFnRVQtD_ilHfIyLOSrlwVXEb3t1UM'
other_dataset_url  = 'https://mega.nz/file/5FhGkRjD#yFihfhr1RMHfPTffhPB4tQtJsnn_HBYFOSfqdPOrp78'
# Destination path for download (relative to the current working directory)
destination_path = './downloads'
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(destination_path, exist_ok=True)

# Function for printing the download progress
def print_progress(proc, stream='stdout'):
  """Yield the text output of a process one line at a time.

  The pipe is read character by character and a line is emitted as soon as a
  line feed or a carriage return is seen, so progress updates terminated by a
  bare carriage return are reported too.

  End-of-file on the pipe ends the iteration. The previous implementation kept
  polling after EOF until ``proc.poll()`` reported an exit, appending empty
  strings to its buffer in a busy loop in the meantime; here the process is
  simply waited for once the pipe is exhausted.

  Args:
    proc: Process object (e.g. ``subprocess.Popen``) whose pipe was opened in
      text mode. Must expose the attribute named by ``stream`` and ``wait()``.
    stream: Name of the attribute of ``proc`` to read from.

  Yields:
    str: Each line without its terminator. Empty lines are yielded as ``''``;
    a trailing partial line (no terminator before EOF) is yielded as well.
  """
  # A single-character read can only ever equal a one-character terminator.
  line_breaks = ('\n', '\r')
  pipe = getattr(proc, stream)
  pending = []
  with contextlib.closing(pipe):
    while True:
      char = pipe.read(1)
      if char == '':
        # EOF: the writing end of the pipe is closed, nothing more will arrive.
        break
      if char in line_breaks:
        yield ''.join(pending)
        pending = []
      else:
        pending.append(char)
    if pending:
      # Output that ended without a final line terminator.
      yield ''.join(pending)
  # Block (without spinning) until the process has exited, so that it has
  # finished by the time the generator is exhausted, as before.
  proc.wait()

# Download dataset
# Run `mega-get` once per URL. stderr is merged into stdout so a single pipe
# carries all output, and universal_newlines=True opens that pipe in text mode
# so print_progress receives str characters.
for data_url in [tagged_dataset_url, other_dataset_url]:
  cmd = ["mega-get", data_url, destination_path]
  proc = Popen(cmd,stdout=PIPE, stderr=STDOUT, universal_newlines=True)
  # Echo every progress line as soon as it is complete.
  for line in print_progress(proc):
    print(line)
# NOTE(review): the exit status of mega-get is never inspected, so a failed
# download goes unreported here — consider checking proc.returncode.

[Initiating MEGAcmd server in background. Log: /root/.megaCmd/megacmdserver.log]
TRANSFERRING ||#.........................................||(1/929 MB:   0.12 %)  
TRANSFERRING ||#.........................................||(1/929 MB:   0.19 %)  
TRANSFERRING ||#.........................................||(2/929 MB:   0.32 %)  
TRANSFERRING ||#.........................................||(4/929 MB:   0.53 %)  
TRANSFERRING ||#.........................................||(8/929 MB:   0.86 %)  
TRANSFERRING ||#........................................||(10/929 MB:   1.08 %)  
TRANSFERRING ||#........................................||(10/929 MB:   1.12 %)  
TRANSFERRING ||#........................................||(11/929 MB:   1.19 %)  
TRANSFERRING ||#........................................||(11/929 MB:   1.29 %)  
TRANSFERRING ||#........................................||(13/929 MB:   1.42 %)  
TRANSFERRING ||#........................................||(15/929 MB:   1.63 %)  
TRANSFERRING ||#.

# Preprocess the Dataset images (Stage 1)

Processes a tagged dataset, computes the images' metadata along with their CLIP embeddings, and writes the result into a JSON file in the specified output folder. In addition, an SQLite database named `dataset_cache.sqlite`, with a table named `dataset_cache` containing the file name, hash and file path of each dataset image, will be created in the `./output` folder.

### Install Requirements 

In [21]:
%%capture
# %%capture must remain the first line of the cell; it hides pip's output.
# Install the extra packages into the environment of the running kernel.
%pip install ascii_graph open_clip_torch patool fire

### Extract Downloaded ZIP-Archived Data (Optional)
This step is optional. If it is skipped, Stage 1 will perform the extraction itself.

In [22]:
from zipfile import ZipFile

# Specify location of downloaded data (zip file).
# Kept relative to the repository root (the notebook's working directory) so
# the cell also works outside Colab, where /content/kcg-ml does not exist.
downloaded_data_zip = "./downloads/pixel-art-tagged-v3.zip"
# Location to extract the zip file to: ./datasets/<archive name without extension>
dataset_path = f'./datasets/{os.path.splitext(os.path.basename(downloaded_data_zip))[0]}'

# The context manager closes the archive even if extraction fails.
with ZipFile(downloaded_data_zip) as zip_object:
    zip_object.extractall(dataset_path)

### Set Required Variables by the Utility

Initialize the required parameters needed by the dataset preprocessor utility and they are described as follows: 

* `input_folder` _[str]_ -  path to the directory containing sub-folders of each tag.
* `output_folder` _[str]_ - path to the directory where to save the files into it.
* `clip_model` _[str]_ - CLIP model to be used
* `pretrained` _[str]_ - the pre-trained model to be used for CLIP
* `batch_size` _[int]_ -  number of images to process at a time
* `num_threads` _[int]_ - the number of threads to be used in this process
* `device` _[str]_ -  the device to be used in computing the CLIP embeddings, if `None` is provided then `cuda` will be used if available


In [23]:
# Specify the path to the dataset in dataset_path variable.
# NOTE: this assignment replaces any dataset_path set by the optional
# extraction cell; switch to the commented line to use the downloaded archive.
# dataset_path = '/content/kcg-ml/downloads/pixel-art-tagged-v3.zip'
dataset_path = './datasets/testdata.zip'
# Folder that receives the preprocessing results
output_folder = './output'
# The dataset has one sub-folder per tag
tagged_dataset = True
# CLIP architecture and pre-trained weights used for the embeddings
clip_model = 'ViT-L-14'
pretrained = 'laion2b_s32b_b82k'
# Number of images processed at a time
batch_size = 32
num_threads = 4
# None lets the utility pick the device (cuda when available, per the parameter notes)
device = None

### Run the Preprocessor

In [24]:
# Run Stage 1 with the settings defined in the previous cell. The arguments
# are passed positionally, so their order must match the signature of
# process_dataset. In the recorded run the archive was extracted first and
# ./output/dataset_cache.sqlite was written at the end.
ImageDatasetProcessor.process_dataset(
    dataset_path, 
    output_folder,
    tagged_dataset, 
    clip_model, 
    pretrained,
    batch_size, 
    num_threads, 
    device
)

Downloading:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

patool: Extracting ./datasets/testdata.zip ...
patool: running /usr/bin/7z x -o./datasets/testdata-decompressed-tmp -- ./datasets/testdata.zip
patool: ... ./datasets/testdata.zip extracted to `./datasets/testdata-decompressed-tmp'.
is archive dataset
dataset folder path  = ./datasets/testdata-decompressed-tmp/testdata
Processing...


100%|██████████| 1/1 [00:07<00:00,  7.99s/it]


[INFO] Writing to database table in ./output/dataset_cache.sqlite
[INFO] Finished.


# Train the Classifiers (Stage 2)
Given a `metadata` JSON file containing the image embeddings and a `tag-to-image-hash` JSON file mapping tags to image hashes, the script trains binary classification models for every tag (one per model type) and saves them in the output folder.

### Train Script Variables

* `metadata_json` _[string]_ - _[required]_ - The path to the metadata json file. 
* `tag_to_hash_json` _[string]_ - _[required]_ - The path to tag-to-hash json file. 
* `output` _[string]_ - _[optional]_ - The path to the output directory.
* `test_per` _[float]_ - _[optional]_ - The percentage of the test images from the dataset, default = 0.1 


In [25]:
# Paths are relative to the repository root (the kcg-ml directory entered
# with %cd in the first cell).
# NOTE(review): the two JSON files are presumably the ones written by Stage 1
# into ./output — confirm the file names against that stage.
metadata_json = './output/input-metadata.json' 
tag_to_hash_json = './output/input-tag-to-image-hash-list.json'
output_dir = './output'
# Share of the dataset held out as test images
test_per = 0.1

### Run the Training Script

In [26]:
# Run Stage 2 with the settings defined in the previous cell. In the recorded
# run one classifier was initialised per (model type, tag) combination.
train_main(
    metadata_json = metadata_json,
    tag_to_hash_json = tag_to_hash_json,
    output_dir = output_dir,
    test_per = test_per
)

Initialize new classifier model for type: ovr-logistic-regression and tag: not-pixel-art-real-photo
Initialize new classifier model for type: ovr-svm and tag: not-pixel-art-real-photo
Initialize new classifier model for type: torch-logistic-regression and tag: not-pixel-art-real-photo
Initialize new classifier model for type: ovr-logistic-regression and tag: not-pixel-art
Initialize new classifier model for type: ovr-svm and tag: not-pixel-art
Initialize new classifier model for type: torch-logistic-regression and tag: not-pixel-art
[INFO] Finished.


### Get List of Type and Tag Pairs from All Models

In [27]:
# Creating model object
model_api = ModelApi()
# In the recorded run this returned a list of (model_type, tag) tuples.
type_tag_pair = model_api.get_type_tag_pair() 
print (type_tag_pair)

[('ovr-logistic-regression', 'not-pixel-art-real-photo'), ('torch-logistic-regression', 'not-pixel-art'), ('ovr-logistic-regression', 'not-pixel-art'), ('ovr-svm', 'not-pixel-art-real-photo'), ('torch-logistic-regression', 'not-pixel-art-real-photo'), ('ovr-svm', 'not-pixel-art')]


# Classify Data (Stage 3)
Running the classifier. The script will loop over every image and make the classification for it using every binary classification model. Running the classifier can be performed from command line interface (CLI) or within Python runtime as shown in the following examples.

## Running the classifier from Command Line Interface (CLI Version)

In [35]:
# Create an empty __init__.py in the current working directory.
# NOTE(review): presumably required so the CLI script in the next cell can
# resolve the repository's packages — confirm.
!touch ./__init__.py

In [37]:
# Run the classifier as a standalone script for one (model type, tag) pair.
# IPython expands $dataset_path to the value of the notebook variable
# `dataset_path` before the command is handed to the shell.
!python3 ./image_classifier_pipeline/classify/classify.py --directory=$dataset_path --metadata_json=./output/input-metadata.json --output=./output --output_bins=10 --model_type=ovr-logistic-regression --tag=not-pixel-art

[INFO] Output folder ./output/tagging_output_2023_4_11_21_3_36
  0% 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg

1it [00:09,  9.37s/it][A Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg

2it [00:09,  3.92s/it][A Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg

4it [00:09,  1.51s/it][A Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg

6it [00:09, 

### CLI Arguments

* `directory` _[string]_ - _[required]_ - The path to the images' folder or images' .zip file. 
* `metadata_json` _[string]_ - _[required]_ - The path to the metadata json file for CLIP embeddings. 
* `output` _[string]_ - _[optional]_ - The path to the output directory for the inference results. 
* `model_type` _[string]_ - _[required]_ - The type of the model (example: `ovr-logistic-regression`, `ovr-svm`, `torch-logistic-regression`).
* `tag` _[string]_ - _[required]_ - Tag string (example: `pos-character`, `pos-environmental-space`, etc).
* `output_bins` _[int]_ - _[optional]_ -  The number of bins of the results for each model.

If the `--output` argument is not specified, the classification / inference result will be placed at `./output/tagging_output` folder. Time stamp will be appended to folder name (for example: `./output/tagging_output_2023_1_21_0_56`).
In addition, the SQLite database named `score_cache.sqlite` with table named `score_cache` containing file name, file path, file hash, model name, model type, model train date, tag string and tag score for given images will be created in the `./output` folder. For archived (ZIP) files the SQLite database named `zip_score_cache.sqlite` with table named `zip_score_cache` containing file name, file path, archive path, type of file, hash, model type, tag name and tag score for given images will be created in the `output` folder. 

## Running the Classifier (from Python Runtime)

### Variables for Classifier
* `data_path` _[string]_ - _[required]_ - The path to the images data folder or single image file.
* `json_file_path` _[string]_ - _[required]_ - The path to the metadata json file for CLIP embeddings. 
* `output_dir` _[string]_ - _[optional]_ - The path to the output directory for the inference results. 
* `model_type` _[string]_ - _[required]_ - The type of the model (example: `ovr-logistic-regression`, `ovr-svm`, `torch-logistic-regression`)
* `tag` _[string]_ - _[required]_ - Tag string (example: `pos-character`, `pos-environmental-space`, etc).
* `bins_number` _[int]_ - _[optional]_ -  The number of bins of the results for each model.

### Classify Images Data in Folder with Single Model

In [38]:
# Specify path to folder containing images data in data_path variable
# data_path = '../path/to/image/data/folder/'
# Or test with images in dataset (reuses dataset_path from the preprocessing stage)
data_path    = dataset_path
# Folder that receives the inference results
output_dir     = './output/'
# Metadata JSON file with the CLIP embeddings
json_file_path = './output/input-metadata.json'
# Number of bins for the results
bins_number    = 10
# Specify the type of model
model_type = 'ovr-logistic-regression'
# Specify the tag string
tag = 'not-pixel-art'

In [39]:
# Classify everything under data_path with the single model selected above.
# In the recorded run the results went to a time-stamped tagging_output_*
# folder inside output_dir, and zip_score_cache.sqlite was written because
# the input was a ZIP archive.
classify_main(
        folder_path    = data_path, 
        output_dir     = output_dir, 
        json_file_path = json_file_path, 
        bins_number = bins_number,
        model_type = model_type, 
        tag = tag
        )

[INFO] Output folder ./output//tagging_output_2023_4_11_21_5_7


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  8.08it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg



6it [00:00, 28.82it/s][A
9it [00:00, 27.30it/s][A

 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f1_00_9d_f1009d0dd411b52e827bbcd0d30080fa.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg



12it [00:00, 17.19it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg



15it [00:00, 15.65it/s][A
19it [00:01, 17.93it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg



24it [00:01, 20.52it/s]
100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.


### Classify Single Image with Single Model

In [40]:
# Specify path to single image to be classified in data_path
#data_path    = '../path/to/image/file'
# Or test with sample image from dataset
data_path    = './datasets/testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg'
# Dedicated output folder for this single-image example
output_dir     = './output/classification_single_image_single_model'
# Metadata JSON file with the CLIP embeddings
json_file_path = './output/input-metadata.json'
# Number of bins for the results
bins_number    = 10
# Specify the type of model
model_type = 'torch-logistic-regression'
# Specify the tag string
tag = 'not-pixel-art'

In [41]:
# Classify the single image with the single model selected above. The
# parameter is still called folder_path even though a file path is passed.
# In the recorded run score_cache.sqlite was written into output_dir.
classify_main(
        folder_path    = data_path, 
        output_dir     = output_dir, 
        json_file_path = json_file_path, 
        bins_number = bins_number,
        model_type = model_type, 
        tag = tag 
        )

[INFO] Output folder ./output/classification_single_image_single_model/tagging_output_2023_4_11_21_5_28


100%|██████████| 1/1 [00:00<00:00, 10.62it/s]

[INFO] Writing to database table in ./output/classification_single_image_single_model//score_cache.sqlite
[INFO] Finished.





### Classify Images Data in Folder with All Models

In [42]:
# Specify path to folder containing images data in data_path variable
# data_path = '../path/to/image/data/folder/'
# Or test with images in dataset (reuses dataset_path from the preprocessing stage)
data_path    = dataset_path
# Folder that receives the inference results
output_dir     = './output/'
# Metadata JSON file with the CLIP embeddings
json_file_path = './output/input-metadata.json'
# Number of bins for the results
bins_number    = 10

In [43]:
# Creating model object
model_api = ModelApi()

# Get list of model type and tag pair
type_tag_pair = model_api.get_type_tag_pair() 

# Run one full classification pass over data_path per (model_type, tag) pair.
# In the recorded run each pass wrote to its own time-stamped output folder.
for model_type, tag in type_tag_pair:
        classify_main(
                folder_path    = data_path, 
                output_dir     = output_dir, 
                json_file_path = json_file_path, 
                bins_number = bins_number,
                model_type = model_type, 
                tag = tag 
                )

[INFO] Output folder ./output//tagging_output_2023_4_11_21_5_42


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  8.60it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg



6it [00:00, 30.61it/s][A
10it [00:00, 30.39it/s][A

 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f1_00_9d_f1009d0dd411b52e827bbcd0d30080fa.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg



14it [00:00, 17.21it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg



17it [00:00, 18.50it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg



24it [00:01, 21.82it/s]
100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.
[INFO] Output folder ./output//tagging_output_2023_4_11_21_5_50


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2it [00:00, 19.54it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals


8it [00:00, 41.40it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png



13it [00:00, 24.22it/s][A
17it [00:00, 27.68it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png



24it [00:00, 30.49it/s]
100%|██████████| 1/1 [00:00<00:00,  1.24it/s]


 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.
[INFO] Output folder ./output//tagging_output_2023_4_11_21_5_56


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2it [00:00, 15.77it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg



7it [00:00, 29.00it/s][A

 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f1_00_9d_f1009d0dd411b52e827bbcd0d30080fa.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg



11it [00:00, 17.23it/s][A
14it [00:00, 18.20it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg



17it [00:00, 18.35it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg



24it [00:01, 20.65it/s]
100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.
[INFO] Output folder ./output//tagging_output_2023_4_11_21_6_2


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2it [00:00, 19.76it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals


9it [00:00, 42.87it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg



14it [00:00, 26.20it/s][A
18it [00:00, 29.11it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg



24it [00:00, 31.07it/s]
100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.
[INFO] Output folder ./output//tagging_output_2023_4_11_21_6_10


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
3it [00:00, 26.90it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals


8it [00:00, 39.70it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg



13it [00:00, 23.48it/s][A
18it [00:00, 29.62it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png



24it [00:00, 30.79it/s]
100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.
[INFO] Output folder ./output//tagging_output_2023_4_11_21_6_16


  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  9.15it/s][A

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg



6it [00:00, 31.40it/s][A
10it [00:00, 30.38it/s][A

 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f1_00_9d_f1009d0dd411b52e827bbcd0d30080fa.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c925bb36954dc8786c.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c2_a5_f3c2a5e03978093319ba184fb54e009e.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_c3_c2_f3c3c2c7fb350b03dccc99a66d3a3a86.jpg



14it [00:00, 17.59it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f4_d6_97_f4d697934ac2656bcc9bba971262829c.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a3_b8_f6a3b85a6877bd06e0197063a720bcf6.jpg
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_11_f6a411b2046bad82ca94402131e14e67.png
 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_db_f6a4db94afa72c01847cb2c8733d2155.jpg



17it [00:00, 18.93it/s][A
20it [00:00, 19.31it/s][A

 Processing: testdata/not-pixel-art/https___i.pinimg.com_originals_f6_a4_fb_f6a4fbf097fd103d2f8c923016cf1368.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_57_000057f3850f9771864630687c4bdf26.jpg
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_00_e7_0000e7f96134fbd9ea78cca6986be247.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0a_a5_000aa58efa428d0bdf580388a8403a49.png
 Processing: testdata/other-training/https___i.pinimg.com_originals_00_0b_2d_000b2d8351f053664b41ca1024b41b1f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_78_74_0078749a6a33588ea8995080904d96fc.jpg


24it [00:01, 22.98it/s]
100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_2c_00792c0d83d415a707bdacee51646d9e.png
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_79_32_007932f2045adf1d7b785d53021ccd0f.jpg
 Processing: testdata/other-validation/https___i.pinimg.com_originals_00_81_01_0081018e59980d9b1a5cbe03dd9557cf.png
./output///zip_score_cache.sqlite
[INFO] Writing to database table in ./output///zip_score_cache.sqlite
[INFO] Finished.


### Classification for 'other-validation' Folder (pre-computed CLIP embeddings)

In [44]:
# --- Input data ---
# Dataset folder; it must contain an 'other-validation' sub-folder.
dataset_path = 'datasets/testdata'
other_validation_path = os.path.join(dataset_path, 'other-validation')

# --- Classifier selection ---
# Model type and tag string handed to classify_main.
model_type = 'ovr-logistic-regression'
tag = 'not-pixel-art'

# --- Classification settings ---
# Output folder, metadata JSON location and number of score bins.
output_dir = './output/classification_other_validation'
json_file_path = './output/input-metadata.json'
bins_number = 10

In [45]:
# Classify the images in the 'other-validation' folder with the configured
# model type and tag.
classify_main(
    folder_path=other_validation_path,
    output_dir=output_dir,
    json_file_path=json_file_path,
    bins_number=bins_number,
    model_type=model_type,
    tag=tag,
)

[INFO] Output folder ./output/classification_other_validation/tagging_output_2023_4_11_21_7_47


100%|██████████| 4/4 [00:00<00:00, 178.16it/s]

[INFO] Writing to database table in ./output/classification_other_validation//score_cache.sqlite
[INFO] Finished.





### Get Score for Images Data in ZIP Archive with Single Model (Tag) and Run Function Based on Score

In [46]:
from classify_helper_functions import zip_gen, get_single_tag_score, get_clip, get_classifier_model

# ZIP archive holding the images to score
folder_path = './datasets/testdata.zip'

# Classifier selection: model type and tag string
model_type = 'ovr-logistic-regression'
tag = 'not-pixel-art'

# Scores strictly above this threshold trigger the user-defined function
th_score = 0.2


In [47]:
def any_function_to_run(score):
    """Placeholder action to run when an image score meets the threshold.

    Args:
        score: Score that triggered the call. It is accepted but not used
            by this placeholder.

    Returns:
        None. The function only prints 'OK' to standard output.
    """
    print('OK')

In [48]:
# NOTE(review): disabled scratch example, kept for reference only. It computes
# the CLIP embedding of a single image file. It presumably relies on
# `preprocess`, `clip_model`, `device` and `convert_gif_to_image` being
# defined elsewhere in the notebook — confirm before re-enabling.
# import torch
# from PIL import Image

# image_file_name = "/content/kcg-ml/datasets/test_images/example1.jpg"
# img = Image.open(image_file_name)

# with torch.no_grad():

#   if image_file_name.lower().endswith('.gif'):
#     img_obj = convert_gif_to_image(img)
#   else:
#     img_obj = img

#   image = preprocess(img_obj).unsqueeze(0).to(device)
#   model = clip_model.to(device)
#   output = model.encode_image(image).cpu().detach().numpy()

# print(output)

In [49]:
# Load the CLIP model (with its preprocessing step and device) and the
# classifier selected by model_type / tag.
clip_model, preprocess, device = get_clip(
    clip_model_type='ViT-L-14',
    pretrained='laion2b_s32b_b82k',
)
model = get_classifier_model(model_type=model_type, tag=tag)

if model == {}:
    # An empty dict means no classifier was found: nothing to score.
    print ('[INFO]: Model not found. No classification performed.')
else:
    # Each list entry is one ZIP archive to scan.
    for file in [folder_path]:
        # zip_gen yields (image, image file name) pairs.
        for img, img_file_name in zip_gen(file):
            score = get_single_tag_score(
                img, img_file_name, model, clip_model, preprocess, device
            )
            print (f'[INFO] Score: {score}')
            # Run the user-defined action only for scores above the threshold.
            if score > th_score:
                any_function_to_run(score)

print("[INFO] Finished.")

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
[INFO] Score: 0.9013001399782239
OK
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
[INFO] Score: 0.976080593306135
OK
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
[INFO] Score: 0.966614591197742
OK
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
[INFO] Score: 0.990198054228875
OK
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
[INFO] Score: 0.9472227229847028
OK
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg
[INFO] Score: 0.95474768

## ZIP Archive: Reading Image Files, Compute CLIP and Send to Single Classifier (based on Type and Tag)

### Import Required Functions

In [50]:
import sys
sys.path.insert(0, './image_classifier_pipeline/classify/')
import os
import datetime
import numpy as np
from classify import zip_gen as zip_image_iterator
from classify_helper_functions import file_to_hash_zip, load_json, get_clip, get_classifier_model, clip_image_features_zip, classify_image_prob, get_bins_array, find_bin, make_dir

### Specify ZIP Archive Location, Model Type and Tag

In [51]:
# --- Inputs ---
# ZIP archive with the images, and the metadata JSON file location
zip_file_path = './datasets/testdata.zip'
json_file_path = './output/input-metadata.json'

# --- Output ---
# Folder under which classified images are written
output_dir = './output'

# --- Classifier selection ---
# Model type and tag string
model_type = 'ovr-logistic-regression'
tag = 'not-pixel-art'

# --- Binning ---
# Number of classification score (probability) bins
n_bins = 10

### Create Classifier and CLIP Model

In [52]:
# Get classifier model
model = get_classifier_model(model_type = model_type, tag = tag)
if model != {}:
    classifier = model['classifier']
    torch_model = 'torch' in model['model_type']
    # Get CLIP model, to calculate CLIP embeddings if it's not in .json metadata file.
    clip_model , preprocess , device = get_clip(clip_model_type= 'ViT-L-14',pretrained= 'laion2b_s32b_b82k')
    # Creating bins
    bins  = get_bins_array(n_bins)
else:
    print ('[INFO]: Model not found. Unable to perform classification')

### Reading, Compute CLIP and Send to Single Classifier for Each File in ZIP Archive

In [53]:
# Load the .json metadata once. Nothing in the loop below modifies it, so
# re-parsing the file for every image is wasted work.
metadata_json_obj = load_json(json_file_path)

# One timestamped output folder for the whole run. Computing the timestamp per
# image would scatter a single run across several folders whenever the clock
# ticks over to the next second while images are being processed.
timestamp = datetime.datetime.now()
output_sub_dir = (f'tagging_output-{timestamp.year}_{timestamp.month}_{timestamp.day}_{timestamp.hour}_{timestamp.minute}_{timestamp.second}')

# Loop through each zip file.
for file in [zip_file_path]:
    # Reading image files from ZIP archive
    for img, img_file_name in zip_image_iterator(file):

        # Hash used as the lookup key in the metadata
        hash_id = file_to_hash_zip(img, img_file_name)

        # Clip features
        try:
            # Reuse the stored embedding when the hash_id exists in the json file.
            image_features = np.array(metadata_json_obj[hash_id]["embeddings_vector"]).reshape(1, -1)
        except (KeyError, TypeError, ValueError):
            # hash_id (or its embedding) is missing or unusable in the json
            # file. Calculate image features instead. Only lookup/shape errors
            # are caught, so interrupts and real bugs still surface.
            image_features = clip_image_features_zip(img, img_file_name, clip_model, preprocess, device)

        # Calculate probability score
        score = classify_image_prob(image_features, classifier, torch_model=torch_model)
        print (score)

        # Get the bin the score falls into
        tag_bin, _ = find_bin(bins, score)
        print (tag_bin)

        # Create folder for writing classified images:
        # <output_dir>/<run folder>/<model_type>/<tag>/<bin>
        tag_name_out_folder = make_dir([output_dir, output_sub_dir, f'{model_type}', f'{tag}', tag_bin])

        # Saving the image to file
        file_path = os.path.join(tag_name_out_folder, os.path.basename(img_file_name))
        img.save(file_path)
        print (f'[INFO] File {file_path} saved')

print("[INFO] Finished.")

[INFO] Working on ZIP archive: ./datasets/testdata.zip
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
0.9013004690538235
0.9
[INFO] File ./output/tagging_output-2023_4_11_21_8_55/ovr-logistic-regression/not-pixel-art/0.9/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg saved
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
0.9760805726198688
1.0
[INFO] File ./output/tagging_output-2023_4_11_21_8_55/ovr-logistic-regression/not-pixel-art/1.0/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg saved
 Processing: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
0.9666145736725641
1.0
[INFO] File ./output/tagging_output-2023_4_11_21_8_55/ovr-logistic-regression/not-pixel-art/1.0/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1

# Using Classifier Model API

The Model API provides functions for accessing the existing classifier model pickle files.

### Create Classifier Model API Object

In [54]:
# Create model loader object with default model_path='./output/models/'
model_api = ModelApi()
# Or specify model_path explicitly:
# model_api = ModelApi(model_path='./output/models')

### Get List of Model Types

In [55]:
# Ask the model API for the available model type strings and show them.
model_types = model_api.get_model_types()
print(model_types)

['ovr-logistic-regression', 'torch-logistic-regression', 'ovr-logistic-regression', 'ovr-svm', 'torch-logistic-regression', 'ovr-svm']


### Get List of Tags Based on Model Type

In [56]:
# Use the first listed model type and show the tags available for it.
model_type = model_types[0]
tags = model_api.get_tags_by_model_type(model_type)
print(tags)

['not-pixel-art-real-photo', 'not-pixel-art']


### Get List of Type and Tag Pairs from All Models

In [57]:
# type_tag_pair is a list of tuples with the following structure:
# [(<model_type>, <tag>)]
# (Documented as a comment: a bare string literal at the end of the cell is
# the cell's last expression and would be echoed as cell output.)
type_tag_pair = model_api.get_type_tag_pair()
print(type_tag_pair)

[('ovr-logistic-regression', 'not-pixel-art-real-photo'), ('torch-logistic-regression', 'not-pixel-art'), ('ovr-logistic-regression', 'not-pixel-art'), ('ovr-svm', 'not-pixel-art-real-photo'), ('torch-logistic-regression', 'not-pixel-art-real-photo'), ('ovr-svm', 'not-pixel-art')]


'\ntype_tag_pair is a list of tupple with the following structure\n[(<model_type>,<tag>)]\n'

### Get Model Based On Model Type and Tag

In [58]:
# Specify model type and tag
model_type, tag = type_tag_pair[0]
# Get the model dictionary, which has the following structure:
# {'classifier'       : <model object>,
#  'model_type'       : <model type string>,
#  'train_start_time' : <training start time datetime object>,
#  'tag'              : <tag string>}
# (Documented as a comment: a bare string literal at the end of the cell is
# the cell's last expression and would be echoed as cell output.)
model = model_api.get_model_by_type_tag(model_type, tag)
print(model)

{'classifier': LogisticRegression(multi_class='ovr', random_state=0), 'model_type': 'ovr-logistic-regression', 'train_start_time': datetime.datetime(2023, 4, 11, 20, 43, 27, 658810), 'tag': 'not-pixel-art-real-photo'}


"\nModel is a dictionary with the following structure\n{'classifier' : <model object>,\n'model_type' : <model type string>,\n'train_start_time' : <training start time datetime object>\n'tag' : <tag string>\n}\n"

### Get Models Dictionary for All Model Pickle Files

In [59]:
# Example structure of models_dict:
# {<model_name>:
#     {'classifier'       : <model object>,
#      'model_type'       : <model type string>,
#      'train_start_time' : <training start time datetime object>,
#      'tag'              : <tag string>}
# }
# (Documented as a comment: a bare string literal at the end of the cell is
# the cell's last expression and would be echoed as cell output.)
models_dict = model_api.get_models_dict()
print(models_dict)

{'model-ovr-logistic-regression-tag-not-pixel-art-real-photo': {'classifier': LogisticRegression(multi_class='ovr', random_state=0), 'model_type': 'ovr-logistic-regression', 'train_start_time': datetime.datetime(2023, 4, 11, 20, 43, 27, 658810), 'tag': 'not-pixel-art-real-photo'}, 'model-torch-logistic-regression-tag-not-pixel-art': {'classifier': LogisticRegressionPytorch(
  (linear): Linear(in_features=768, out_features=1, bias=True)
), 'model_type': 'torch-logistic-regression', 'train_start_time': datetime.datetime(2023, 4, 11, 20, 43, 27, 658810), 'tag': 'not-pixel-art'}, 'model-ovr-logistic-regression-tag-not-pixel-art': {'classifier': LogisticRegression(multi_class='ovr', random_state=0), 'model_type': 'ovr-logistic-regression', 'train_start_time': datetime.datetime(2023, 4, 11, 20, 43, 27, 658810), 'tag': 'not-pixel-art'}, 'model-ovr-svm-tag-not-pixel-art-real-photo': {'classifier': SVC(decision_function_shape='ovo', probability=True), 'model_type': 'ovr-svm', 'train_start_time'

"\nExample stucture of models_dict\n{<model_name>: \n    {'classifier' : <model object>,\n    'model_type' : <model type string>,\n    'train_start_time' : <training start time datetime object>\n    'tag' : <tag string>\n    }\n}\n"

# Other Examples

#### Get Files List from Folder or ZIP Archive

In [60]:
from zipfile import ZipFile

# Folder or ZIP archive whose file paths will be listed
folder_path = './datasets/testdata.zip'

def fetch_file_paths(data_file):
    '''Yield the paths of the files contained in data_file.

    Args:
        data_file: Path to a single file. A path ending in '.zip'
            (case-insensitive) is treated as a ZIP archive; anything else
            is treated as a plain file.

    Yields:
        str: For a plain file, data_file itself. For a ZIP archive, the
        member name of every non-directory entry. A member that is itself
        a '.zip' file is expanded one level: each of its non-directory
        entries is yielded as f'{data_file}/{entry name}', i.e. prefixed
        with the path of the OUTER archive.
    '''
    if not data_file.lower().endswith('.zip'):
        # Not an archive: should be an image file, reported as-is.
        yield data_file
        return

    # Selected data_file is a zip archive
    with ZipFile(data_file) as archive:
        for entry in archive.infolist():
            if entry.is_dir():
                continue

            if not entry.filename.lower().endswith('.zip'):
                # Should be an image file. Only its name is needed, so the
                # member is never opened.
                yield entry.filename
                continue

            # Another zip file found in the content: list its entries.
            # Directory entries are skipped, as at the top level, and the
            # nested members are not opened since only names are needed.
            with archive.open(entry) as file, ZipFile(file) as sub_archive:
                for sub_entry in sub_archive.infolist():
                    if not sub_entry.is_dir():
                        yield f'{data_file}/{sub_entry.filename}'

def get_file_paths(data_dir):
    '''Collect (and print) the file paths found under data_dir.

    Args:
        data_dir: Path to a directory or to a single file (which could be
            a zip archive or an image).

    Returns:
        list: Every path yielded by fetch_file_paths for each file found.
        Each path is also printed as 'File: <path>'.
    '''
    if os.path.isfile(data_dir):
        # A single file (could be a zip archive or image)
        candidates = [data_dir]
    else:
        # A normal directory: gather every file below it
        candidates = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(data_dir)
            for name in names
        ]

    # Expand each candidate into the file paths it contains
    file_path_list = []
    for candidate in candidates:
        for file_path in fetch_file_paths(candidate):
            print (f'File: {file_path}')
            file_path_list.append(file_path)

    return file_path_list

# List the files under folder_path; as the cell's last expression, the
# returned list is also displayed as the cell output.
get_file_paths(folder_path)


File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg
File: testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f1_00_9d_f1009d0dd411b52e827bbcd0d30080fa.jpg
File: testdata/not-pixel-art/https___i.pinimg.com_originals_f3_b

['testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_ee_d1_f0eed195037b8fc2135a44ceb5ed2044.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_71_f0f071d89545fb378150180d9257f306.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f0_b4_f0f0b4f3f1d172a789589915fc4ad212.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f5_b8_f0f5b8c94169977da31274f36b0aa703.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_f6_08_f0f608ca8c7e5bcd0b3ffc565442d74f.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fc_b5_f0fcb58b15c19afc5fc1f33ff2acda74.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f0_fe_b3_f0feb3bf4202a15d69629e9ddd990809.jpg',
 'testdata/not-pixel-art-real-photo/https___i.pinimg.com_originals_f1_00_9d_f1009d0dd411b52e827bbcd0d30080fa.jpg',
 'testdata/not-pixel-art/https___i.pinimg.com_originals_f3_bb_e6_f3bbe6fa7aa5b9c

#### Get List of File Hash_ID from Tag Cache Based on List of Models or Specific Model

In [62]:
import json
from clip_cache.cache_tag import TagCache

# Specify path to tag cache file
tag_cache_path = './output/tag_cache.sqlite'

# Output placeholder: {<model_name>: <hash IDs returned by the tag cache>}
model_tag_cache_pair = {}

try:
    # Create tag cache object
    tag_cache = TagCache()
    # Create model api object
    model_api = ModelApi()

    # Getting models.
    # Example structure of models_dict:
    # {<model_name>:
    #     {'classifier'       : <model object>,
    #      'model_type'       : <model type string>,
    #      'train_start_time' : <training start time datetime object>,
    #      'tag'              : <tag string>}
    # }
    models_dict = model_api.get_models_dict()

    # Get tags from models_dict. The loop variables are deliberately not named
    # `model`, so the notebook-level `model` dictionary is not overwritten
    # with a string key.
    for model_name, model_info in models_dict.items():
        # Get list of images (hash_IDs) based on each model's tag name
        hash_ids = tag_cache.get_hash_by_tag(db_path=tag_cache_path, tag=model_info['tag'])
        # Store the result under the model name as soon as it is available,
        # so partial results survive a failure later in the loop.
        model_tag_cache_pair[model_name] = hash_ids

    # Output
    print(json.dumps(model_tag_cache_pair, indent=2))

except Exception as e:
    # Deliberate best-effort: report the failure instead of stopping the notebook.
    print (f'[ERROR] {e}: Getting data from tag cache failed')

[ERROR] no such table: tag_cache: Getting tag failed, tag cache database does not exist or might be in use!
[ERROR] no such table: tag_cache: Getting tag failed, tag cache database does not exist or might be in use!
[ERROR] no such table: tag_cache: Getting tag failed, tag cache database does not exist or might be in use!
[ERROR] no such table: tag_cache: Getting tag failed, tag cache database does not exist or might be in use!
[ERROR] no such table: tag_cache: Getting tag failed, tag cache database does not exist or might be in use!
[ERROR] no such table: tag_cache: Getting tag failed, tag cache database does not exist or might be in use!
{
  "model-ovr-logistic-regression-tag-not-pixel-art-real-photo": null,
  "model-torch-logistic-regression-tag-not-pixel-art": null,
  "model-ovr-logistic-regression-tag-not-pixel-art": null,
  "model-ovr-svm-tag-not-pixel-art-real-photo": null,
  "model-torch-logistic-regression-tag-not-pixel-art-real-photo": null,
  "model-ovr-svm-tag-not-pixel-art