In [1]:
# !pip install tensorflow tensorlayerx scikit-learn 

# CNN Scratch

In endeavor finding the best model that fit for (Multi Label Image Classification Dataset)[https://www.kaggle.com/datasets/meherunnesashraboni/multi-label-image-classification-dataset/data], building CNN model from scratch was made. The work employing Tensorflow's deep learning backend with Tensorlayerx's API.

The process to make the model is as follow:

1. Dataset Pre-Processing
2. Dataset Loading
3. Model Architecture
4. Training the Model
5. Model Evaluation

## 1. Dataset Pre-Processing

The downloaded dataset structure as follow:

1. multilabel_classification.csv
2. multilabel_classification(2).csv
3. multilabel_classification(6)-reduced_modified.csv
4. multilabel_classification(7).csv
5. /images

During manual exploration, it is found there are 7844 images in total. So it is expected that the csv file, must also consist of only 7844 rows. With this requirement, the `multilabel_classification(6)-reduced_modified.csv` is chosen for its validity. 

In dataset pre-processing endeavor, the more precise data integrity checks is conducted. This to ensure the registered label within csv file is 1:1 with the image within `/images` folder. 

In [2]:
import pandas;
import os;
import pathlib;
from tqdm import tqdm;

In [None]:
# from google.colab import drive;
# drive.mount('/content/drive');

In [3]:
# Load csv file
dataframe = pandas.read_csv("/Users/yosuakristianto/Documents/Dataset Repository/FIN DL/Number 1/multilabel_classification(6)-reduced_modified.csv");
# dataframe = pandas.read_csv("D:\\tDatase\\FIN DL\\Number 1\\multilabel_classification(6)-reduced_modified.csv");
# dataframe = pandas.read_csv("/content/drive/MyDrive/Collab Dataset/FIN DL/Number 1/multilabel_classification(6)-reduced_modified.csv");

In [4]:
# Integrity checking initialization
image_folder_path = "/Users/yosuakristianto/Documents/Dataset Repository/FIN DL/Number 1/images/";
# image_folder_path = "D:\\tDatase\\FIN DL\\Number 1\\images\\";
# image_folder_path = "/content/drive/MyDrive/Collab Dataset/FIN DL/Number 1/images/";

In [5]:
# Starts integrity checking
print("Data integrity checking -", end = " ");

# Check for image data that not exist in the folder but exist in csv 
not_found_image = [];

for idx, row in tqdm(dataframe.iterrows()):
    if(not os.path.exists(path = image_folder_path + row["Image_Name"])):
        not_found_image.append(idx);

# Check for duplicated image name data in csv
dataframe = dataframe.drop_duplicates(subset = ["Image_Name"]);

dataframe.shape, len(not_found_image)

Data integrity checking - 

7943it [00:01, 5963.77it/s]


((7843, 12), 0)

## 2. Data Loading

With no not found image came from csv file, further steps to do data pre-processing is unnecessary. The step continue to data loading. Data loading part consist of these process below:

1. Load image data from disk as NumPy array
Why? Because the deep learning model cannot directly read the image file. Instead, image convertion into HWC (Height-Width-Channel) array format, where every channels contains RGB color value. 

Since the csv and image files are separated entity, sorting image files by its name is conducted for both csv and the image folder during data loading

2. Define feature - label for every loaded image data to replace the image name.

3. Train-Test-Val splits

The splits ratio is 70:30:30

4. Data Loading Pattern
Making data loading pattern for better batch data segmentation and transformation. This is necessary since the minimum image height is 33 and image width was 120. This can be done by standardizing all images to 224 x 224.

In [6]:
from sklearn.model_selection import train_test_split;
import cv2;

In [7]:
# Load image data from disk as NumPy array

# Sort image file by name
list_files = os.listdir(image_folder_path);
list_files.sort()

images = [];

heights, widths = [], [];

for i in tqdm(list_files):
    image = cv2.imread(filename = image_folder_path + i);
    images.append(image);

    height, width, channels = image.shape;

    heights.append(height);
    widths.append(width);

print(f"Minimal size: height: {min(heights)} - width: {min(widths)}");

100%|██████████| 7843/7843 [01:04<00:00, 121.94it/s]

Minimal size: height: 33 - width: 120





In [8]:
# Defining which is label and which is feature

# feature has been declared on the above cell as `images`
images = images;

# Convert dataframe's content by image name alp
sorted_dataframe = dataframe.sort_values(by=["Image_Name"], ascending=[True]);

# label
labels = sorted_dataframe.drop(columns = [" Classes(motorcycle, truck, boat, bus, cycle, , , , , , , sitar, ektara, flutes, tabla, harmonium)", "Image_Name"]);

(len(images)), labels.shape

(7843, (7843, 10))

In [9]:
# Train-Test-Split

# First to split it into 70:30
train_image, test_image, train_label, test_label = train_test_split(images, labels, train_size = 0.7, test_size = 0.3, random_state = 42); 

# Since the test is already 30, then to make it 15:15 with val, we need to split it to 50:50
test_image, val_image, test_label, val_label = train_test_split(test_image, test_label, train_size = 0.5, test_size = 0.5, random_state = 42);

In [10]:
import tensorlayerx;
import numpy;

os.environ["TL_BACKEND"] = "tensorflow";

2024-05-29 20:46:48.565465: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using TensorFlow backend.


In [11]:
from tensorlayerx.dataflow import Dataset, DataLoader;
from tensorlayerx.vision.transforms import Compose, Normalize, Resize;

In [12]:
"""
image_standardizer

This function helps standardize the image before being load to the model. 
All images are standardized to 224 x 224. And all pixel values are standardized into -1 to 1. 
"""
def image_standardizer(image):
    transformer = Compose([
        Resize(size = (224, 224)),
        Normalize(mean=(127.5), std=(127.5), data_format='HWC')
    ]);

    return transformer(image);

# Data Loading Pattern
class MainDataset(Dataset):
    def __init__(self, image, label):
        self.data = image;
        # self.label = tensorlayerx.convert_to_tensor(label, dtype = tensorlayerx.float32);
        self.label = tensorlayerx.convert_to_tensor(label, dtype = tensorlayerx.float32);

    # Take item by index of data
    def __getitem__(self, index):
        return image_standardizer(self.data[index]), self.label[index];

    # Get length of data row
    def __len__(self):
        return len(self.data);

In [13]:
# Data loading and Transformation
train_set = MainDataset(train_image, train_label);
test_set = MainDataset(test_image, test_label);
val_set = MainDataset(val_image, val_label);

# TensorlayerX's Data Loader
train_set_loader = DataLoader(train_set, batch_size = 16);
test_set_loader = DataLoader(test_set);
val_set_loader = DataLoader(val_set);

## 3. Model Architecture

This part contains the model architecture within this code. This is where the model being defined. The structure of this part is as follow:

1. Model Architecture
2. Model Init 
3. Define Propagation

In [14]:
from tensorlayerx.nn import Module, Conv2d, MaxPool2d, Dropout, Flatten, Linear, Input;
from tensorlayerx import LeakyReLU, Softmax;

In [15]:
# Define model architecture
class CNNScratch(Module):
    def __init__(self):
        super(CNNScratch, self).__init__();

        self.input = Input(shape = (16, 256, 256));

        # Convolutional 1 
        self.conv1 = Conv2d(out_channels = 128, kernel_size = (3, 3), stride = (1, 1), act = LeakyReLU, padding = "SAME", name = "conv1");
        self.pool1 = MaxPool2d(kernel_size = (2, 2), name = "pool1");

        # Convolutional 2
        self.conv2 = Conv2d(out_channels = 256, kernel_size = (5, 5), stride = (1, 1), act = LeakyReLU, padding = "SAME", name = "conv2");
        self.pool2 = MaxPool2d(kernel_size = (2, 2), name = "pool2");

        # Convolutional 3
        self.conv3 = Conv2d(out_channels = 128, kernel_size = (3, 3), stride = (1, 1), act = LeakyReLU, padding = "SAME", name = "conv3");
        self.pool3 = MaxPool2d(kernel_size = (3, 3), name = "pool3");

        # Convolutional 4
        self.conv4 = Conv2d(out_channels = 32, kernel_size = (1, 1), stride = (1, 1), act = LeakyReLU, padding = "SAME", name = "conv4");
        self.pool4 = MaxPool2d(kernel_size = (3, 3), name = "pool4");

        # Fully Connected
        self.flat = Flatten(name = "flat");

        self.do1 = Dropout(p = 0.5, name = "do1");

        self.linear1 = Linear(out_features = 128, name = "lin1"); # Males ngitung in jadi suruh tensorflownya ngitung sendiri aja
        self.out = Linear(out_features = 10, in_features = 128, act = Softmax, name = "out");

    def forward(self, x):
        x = self.conv1(x);
        x = self.pool1(x);
        x = self.conv2(x);
        x = self.pool2(x);
        x = self.conv3(x);
        x = self.pool3(x);
        x = self.conv4(x);
        x = self.pool4(x);

        x = self.flat(x);
        x = self.do1(x);
        x = self.linear1(x);
        x = self.out(x);

        return x;

    def construct(self, x):
        x = self.input(x);
        
        x = self.conv1(x);
        x = self.pool1(x);
        x = self.conv2(x);
        x = self.pool2(x);
        x = self.conv3(x);
        x = self.pool3(x);
        x = self.conv4(x);
        x = self.pool4(x);

        x = self.flat(x);
        x = self.do1(x);
        x = self.linear1(x);
        x = self.out(x);

        return x;

In [16]:
class NetWithLoss(Module):
    
    def __init__(self, network: Module, loss_function):
        super(NetWithLoss, self).__init__();
        self.network = network;
        self.loss_fn = loss_function;

    def forward(self, data, ground_truth):
        prediction = self.network(data);

        loss = self.loss_fn(prediction, ground_truth);
        return loss;

In [17]:
from tensorlayerx.optimizers import SGD;
from tensorlayerx.model import Model, TrainOneStep;
from tensorlayerx.losses import sigmoid_cross_entropy, binary_cross_entropy;

In [18]:
# Hyperparameters

print("Start training network \n\n");

epoch = 1;
network = CNNScratch();
# net_W_loss = NetWithLoss(network = network, loss_function = sigmoid_cross_entropy);
net_W_loss = NetWithLoss(network = network, loss_function = binary_cross_entropy);

trainer = TrainOneStep(net_with_loss = net_W_loss, optimizer = SGD(lr = 1e-3), train_weights = network.trainable_weights);

metric_train = tensorlayerx.metrics.Accuracy();
metric_val = tensorlayerx.metrics.Accuracy();

Start training network 


[TLX] Input  _inputlayer_1: (16, 256, 256)
[TLX] Conv2d conv1: out_channels : 128 kernel_size: (3, 3) stride: (1, 1) pad: SAME act: LeakyReLU
[TLX] MaxPool2d pool1: kernel_size: (2, 2) stride: (2, 2) padding: SAME return_mask: False
[TLX] Conv2d conv2: out_channels : 256 kernel_size: (5, 5) stride: (1, 1) pad: SAME act: LeakyReLU
[TLX] MaxPool2d pool2: kernel_size: (2, 2) stride: (2, 2) padding: SAME return_mask: False
[TLX] Conv2d conv3: out_channels : 128 kernel_size: (3, 3) stride: (1, 1) pad: SAME act: LeakyReLU
[TLX] MaxPool2d pool3: kernel_size: (3, 3) stride: (2, 2) padding: SAME return_mask: False
[TLX] Conv2d conv4: out_channels : 32 kernel_size: (1, 1) stride: (1, 1) pad: SAME act: LeakyReLU
[TLX] MaxPool2d pool4: kernel_size: (3, 3) stride: (2, 2) padding: SAME return_mask: False
[TLX] Flatten flat:
[TLX] Dropout do1: p: 0.500000 
[TLX] Linear  lin1: 128 No Activation
[TLX] Linear  out: 10 Softmax


In [19]:
progress_epoch = [];
progress_train_loss = [];
progress_train_acc = [];
progress_val_loss = [];
progress_val_acc = [];

for i in range(epoch):

    print(f"Epoch [{i + 1} / {epoch}] - ", end = " ");

    # Training Phase
    network.set_train();
    train_loss, train_acc, train_n_iter = 0, 0, 0;

    for step, (X_batch, y_batch) in enumerate(tqdm(train_set_loader)):
        
        loss = trainer(X_batch, y_batch);

        train_loss += loss;
        train_n_iter += 1;

        logits = network(X_batch);

        batch_accuracy = 0;
        # Calculate accuracy
        for i in range(len(y_batch)):
            print(logits[i], [y_batch[i]])
            metric_train.update(logits[i], y_batch[i]);
            batch_accuracy += metric_train.result();
    
        train_acc += batch_accuracy / len(y_batch);

     # Validation phase

    network.set_eval();
    val_loss, val_acc, val_n_iter = 0, 0, 0;

    for X_batch, y_batch in enumerate(val_set_loader):
        loss = trainer(X_batch, y_batch);
        val_loss += loss;

        val_n_iter += 1;

        logits = network(X_batch);

        # Calculate accuracy
        metric_val.update(logits, y_batch);
        val_acc += metric_val.result();

    train_loss = train_loss / train_n_iter;
    train_acc = train_acc / train_n_iter;
    val_loss = val_loss / val_n_iter;
    val_acc = val_acc / val_n_iter;

    progress_epoch.append(i+1);
    progress_train_acc.append(train_acc);
    progress_train_loss.append(train_loss);
    progress_val_acc.append(val_acc);
    progress_val_loss.append(val_loss);

    print(f"Epoch {i+1} - train loss: {train_loss} - train acc: {train_acc} - val loss: {val_loss} - val acc: {val_acc}");

Epoch [1 / 1] -  

  0%|          | 0/344 [00:00<?, ?it/s]2024-05-29 20:47:21.458579: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Expected dimension in the range [-1, 1), but got 1
  0%|          | 0/344 [00:25<?, ?it/s]

tf.Tensor(
[0.09987678 0.09986376 0.09985402 0.10020021 0.09995823 0.09997252
 0.09990306 0.10004274 0.10007906 0.10024963], shape=(10,), dtype=float32) [<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32)>]





InvalidArgumentError: {{function_node __wrapped__ArgMax_device_/job:localhost/replica:0/task:0/device:CPU:0}} Expected dimension in the range [-1, 1), but got 1 [Op:ArgMax] name: 