**Import the required modules**

In [1]:
import torch
if torch.cuda.is_available():
    device = "cuda"
else:
    print ('[WARNING] CUDA/GPU is not available! Compute-intensive scripts on this notebook will be run on CPU.')
    device =  "cpu"

Clone the kcg-ml repo as it has all the required utils and codes from preprocessing the image datasets to training the models and using them to classify the images.

In [2]:
!git clone https://github.com/kk-digital/kcg-ml
%cd kcg-ml

Cloning into 'kcg-ml'...
remote: Enumerating objects: 3060, done.[K
remote: Counting objects: 100% (1052/1052), done.[K
remote: Compressing objects: 100% (423/423), done.[K
remote: Total 3060 (delta 666), reused 956 (delta 624), pack-reused 2008[K
Receiving objects: 100% (3060/3060), 26.62 MiB | 18.37 MiB/s, done.
Resolving deltas: 100% (1813/1813), done.
/content/kcg-ml


In [3]:
#@title Download Dataset
%%capture
# Mega CMD Requirements
!apt install libmms0 libc-ares2 libc6 libcrypto++6 libgcc1 libmediainfo0v5 libpcre3 libpcrecpp0v5 libssl1.1 libstdc++6 libzen0v5 zlib1g apt-transport-https
!apt --fix-broken install

# Mega CMD Download and Installation
!wget https://mega.nz/linux/MEGAsync/xUbuntu_18.04/amd64/megacmd-xUbuntu_18.04_amd64.deb
!sudo dpkg -i megacmd-xUbuntu_18.04_amd64.deb

import os
import contextlib
from subprocess import Popen, PIPE, STDOUT

# Download URL on Mega
tagged_dataset_url = 'https://mega.nz/file/8RhFRISJ#vlQhjBp5hrNtQzFnRVQtD_ilHfIyLOSrlwVXEb3t1UM'
#other_dataset_url  = 'https://mega.nz/file/5FhGkRjD#yFihfhr1RMHfPTffhPB4tQtJsnn_HBYFOSfqdPOrp78'
# Destination path for download
destination_path = './downloads'
os.makedirs(destination_path, exist_ok=True)

# Function for printing the download progress
def print_progress(proc, stream='stdout'):
  newlines = ['\n', '\r\n', '\r']
  stream = getattr(proc, stream)
  with contextlib.closing(stream):
      while True:
          out = []
          last = stream.read(1)
          # Don't loop forever
          if last == '' and proc.poll() is not None:
              break
          while last not in newlines:
              # Don't loop forever
              if last == '' and proc.poll() is not None:
                  break
              out.append(last)
              last = stream.read(1)
          out = ''.join(out)
          yield out

# Download dataset
for data_url in [tagged_dataset_url]:
  cmd = ["mega-get", data_url, destination_path]
  proc = Popen(cmd,stdout=PIPE, stderr=STDOUT, universal_newlines=True)
  for line in print_progress(proc):
    print(line)


In [None]:
# unzip the dataset
%%capture
!unzip /content/kcg-ml/downloads/pixel-art-tagged-v3.zip

In [20]:
%%capture
# add the required datasets and zip again
!zip -r ./datasets/tagged_dataset.zip ./pixel-art-tagged-v3/pos-pixel-art-environmental ./pixel-art-tagged-v3/other-training ./pixel-art-tagged-v3/other-validation

**Generate the CLIP vectors of the images and train the Logistic Regression Model**

In [7]:
%%capture
%pip install ascii_graph open_clip_torch patool fire

In [8]:
import sys
sys.path.insert(0, './image_classifier_pipeline/data_loader/')
from ImageDatasetProcessor import ImageDatasetProcessor

In [9]:
dataset_path = './datasets/tagged_dataset.zip'
output_folder = './output'
tagged_dataset = True
clip_model = 'ViT-L-14'
pretrained = 'laion2b_s32b_b82k'
batch_size = 32
num_threads = 4
device = None

In [21]:
ImageDatasetProcessor.process_dataset(
    dataset_path, 
    output_folder,
    tagged_dataset, 
    clip_model, 
    pretrained,
    batch_size, 
    num_threads, 
    device
)

# clean the temp storage
!rm -rf ./datasets/tagged_dataset-decompressed-tmp

patool: Extracting ./datasets/tagged_dataset.zip ...
patool: running /usr/bin/7z x -o./datasets/tagged_dataset-decompressed-tmp -- ./datasets/tagged_dataset.zip
patool: ... ./datasets/tagged_dataset.zip extracted to `./datasets/tagged_dataset-decompressed-tmp'.
is archive dataset
dataset folder path  = ./datasets/tagged_dataset-decompressed-tmp/pixel-art-tagged-v3
Processing...


100%|██████████| 25/25 [00:52<00:00,  2.12s/it]


[INFO] Writing to database table in ./output/image_dataset_cache.sqlite
[INFO] Finished.


**K Multilinear Logistic Regression**

In [30]:
import sys
sys.path.append('./image_classifier_pipeline/model_api/')
import warnings
import numpy as np
from datetime import datetime
from logistic_regression_pytorch import LogisticRegressionPytorch
from train_helper_functions import *
from model_api import ModelApi


warnings.filterwarnings('ignore')
# Number of Logistic Regression Models to train
K = 10

In [31]:
# Run from ./image-tagging-tools directory
# parameters required to train the model
metadata_json = './output/input-metadata.json' 
tag_to_hash_json = './output/input-tag-to-image-hash-list.json'
output_dir = './output'
test_per = 0.5 # percentage of the test data

In [36]:
def train(
        metadata_json    : str, 
        tag_to_hash_json : str,
        output_dir : str, 
        test_per : float,
        shuffle : bool = True,
        ):
    """main function to be running, calls other function for making trained models pickle files,
    and making of mapping files.

    :param metadata_json: path to the metadata json file containg embeddings and tags.
    :type metadata_json: str
    :param tag_to_hash_json: path to tag-to-hash json file containg embeddings and tags.
    :type tag_to_hash_json: str
    :param output_dir: directory for the classification models pickle files and mappings jsons. 
    :type output_dir: str
    :param test_per: percentage of the test embeddings 
    :type test_per: float
    :param shuffle: shuffle the data, defaults to True
    :type shuffle: bool, optional
    :rtype: None
    """

    # Classifier Model API opject
    model_api = ModelApi()

    # load tag to hash json and metadata json.
    metadata_dict    = load_json(metadata_json)
    tag_to_hash_json = load_json(tag_to_hash_json)
    if metadata_dict is None or tag_to_hash_json is None : # Problem happened with the json file loading.
        return

    # Get training start time
    t_start = datetime.now()
    # get the two output folder paths (models and reports) with respect to \
    # the output directory provided in the script.
    report_out_folder , models_out_folder = check_out_folder(output_dir) 

    # other training and other validation embeddings lists.
    other_all_emb_list     = [metadata_dict[hash_id]["embeddings_vector"] for hash_id in tag_to_hash_json['other-training']]
    other_val_all_emb_list = [metadata_dict[hash_id]["embeddings_vector"] for hash_id in tag_to_hash_json['other-validation']]

    # check if the shuffle flag is true, shuffle the data.
    if shuffle:
        np.random.shuffle(other_all_emb_list)
        np.random.shuffle(other_val_all_emb_list)

    # get embeddings from tag_emb_dict and make it reay for training the classifier 
    for tag in tag_to_hash_json:

        # make sure that it's a pixel art class tag. 
        if tag in ['other-training' ,'other-validation']:
            continue

        # get embedding list of tag images.
        tag_all_emb_list = [metadata_dict[hash_id]["embeddings_vector"] for hash_id in tag_to_hash_json[tag]]

        # check if the shuffle flag is true, shuffle the data.
        if shuffle:
            np.random.shuffle(tag_all_emb_list)

        # get train test embeddings and labels.
        train_emb, train_labels, test_emb, test_labels , t_n , o_n = get_train_test(tag_all_emb_list, other_all_emb_list , test_per)
        
        # torch-logistic-regression 
        model_type = 'torch-logistic-regression'

        # Check if classifier model with model_type and tag already exist.
        model = model_api.get_model_by_type_tag(model_type, tag)

        if len(model)>0:
                # Existing classifier model with model_type and tag found. Do not create new one
                print (f'Classifier model for type: {model_type} and tag: {tag} already exist. Training will use existing model')
                classifier = model['classifier']
                print (f"MODEL TYPE {model_type}, {model['model_type']}")
        else:
            # No existing classifier model with model_type and tag. Initialize new classifier model
            print (f'Initialize new classifier model for type: {model_type} and tag: {tag}')
            classifier = LogisticRegressionPytorch(output_dim=1)
            
        classifier = train_loop(model = classifier, train_emb=train_emb, train_labels=train_labels)
            

        test_emb = torch.from_numpy(test_emb.astype(np.float32))
        predictions = classifier(test_emb)
        predictions = predictions.round().view(1,-1).squeeze().detach().numpy()
            
        # get histogram data.
        in_tag_tagged  = histogram_list(np.array(tag_all_emb_list), classifier, other=False, using_torch=(model_type == 'torch-logistic-regression')) # histogram data for in-tag images 
        out_tag_tagged = histogram_list(np.array(other_val_all_emb_list), classifier,  other=True, using_torch=(model_type == 'torch-logistic-regression')) # histogram data for out-tag images
            
        # put all lines for text file report in one .
        text_file_lines = [ f"model: {model_type}\n", "task: binary-classification\n",
                                f"tag: [{tag}]\n\n", f"tag-set-image-count:   {len(tag_all_emb_list)} \n",
                                f"other-set-image-count: {len(other_all_emb_list)} \n",
                                f'validation-tag-image-count   : {t_n}  \n',f'validation-other-image-count : {o_n}  \n\n']
        # text_file_lines.extend(calc_confusion_matrix(test_labels ,predictions, tag)) 
        # text_file_lines.extend(histogram_lines(in_tag_tagged, 'in-distribution'))  
        # text_file_lines.extend(histogram_lines(out_tag_tagged,'out-distribution')) 
        # generate report for ovr logistic regression model.
        generate_report(report_out_folder , tag , text_file_lines , model_name=model_type)
        # generate model pickle file.
        generate_model_file(models_out_folder, classifier, model_type, t_start, tag)

    print("[INFO] Finished.")

In [35]:
# train K models
for _ in range(K):
  main(
      metadata_json = metadata_json,
      tag_to_hash_json = tag_to_hash_json,
      output_dir = output_dir,
      test_per = test_per
      shuffle=True
  )

Initialize new classifier model for type: torch-logistic-regression and tag: pos-pixel-art-environmental
[INFO] Finished.
