# Pipeline

This notebook combines all pipeline steps in a single file (for Google Colab setup and testing purposes). It is currently a work in progress; for now, please visit ./notebooks for the individual steps.

In [1781]:
import sys

# Decide where the notebook is running: it is treated as Google Colab
# when the `google.colab` module is already loaded in this interpreter,
# and as a local run otherwise.

if "google.colab" in sys.modules:
    CURRENT_ENVIRONMENT = "GOOGLE_COLAB"
else:
    CURRENT_ENVIRONMENT = "LOCAL"

In [1782]:
import os
import shutil

if CURRENT_ENVIRONMENT == "GOOGLE_COLAB":
    # Colab-only bootstrap: fetch utils.zip from Google Drive and unpack it
    # to /content/utils (presumably the `utils` package imported by later
    # cells — confirm the archive layout).
    from google.colab import drive

    # Mount Google Drive (force_remount=True is passed so an existing mount is redone)
    drive.mount('/content/drive', force_remount=True)

    # If the utils directory exists, delete it so the archive is
    # unpacked into a clean directory
    if os.path.exists('/content/utils'):
        shutil.rmtree('/content/utils')

    # Copy the utils.zip file from Drive into /content
    !cp /content/drive/MyDrive/utils.zip /content/utils.zip

    # Unzip the utils.zip file quietly (-q) into /content/utils
    !unzip -q /content/utils.zip -d /content/utils

    # Delete the copied utils.zip file (the original on Drive is not touched)
    !rm /content/utils.zip

In [1783]:
# Set to True if you want to skip all EDA
# and data preprocessing steps, otherwise set to False.
SKIP_TO_TRAINING = True

# Set to True to train every model regardless of its own "train"
# flag in MODELS, otherwise set to False to save time.
TRAIN_ALL_MODELS = False

# Target columns ("Category", "Style") whose train/val/test splits and
# data generators should be built even when no model for that target is
# being trained. Leave the list empty to save time.
# FORCE_GENERATE_SPLITS = ["Category", "Style"]
FORCE_GENERATE_SPLITS = []

# Fraction of the preprocessed dataset that is sampled before splitting.
# 1 uses the full dataset; a smaller fraction gives faster runs, e.g.:
# SAMPLE_FRACTION = 0.5 if CURRENT_ENVIRONMENT == "GOOGLE_COLAB" else 0.2

# Alternative (tiny sample):
# SAMPLE_FRACTION = 0.005
# Currently active: use the full dataset
SAMPLE_FRACTION = 1

# Set the random seed for reproducibility
# (used for both the sampling and the train/val/test splits)
RANDOM_SEED = 42

# Set the test and validation sizes; the remainder is used for training
TEST_SIZE = 0.2  # 20% of the data
VAL_SIZE = 0.1  # 10% of the data

# Training constants
EPOCHS = 200  # forwarded to train_model as `epochs`
# Batch size of the image generators; larger on Google Colab than locally
BATCH_SIZE = 128 if CURRENT_ENVIRONMENT == "GOOGLE_COLAB" else 32
EARLY_STOPPING_PATIENCE = 10  # forwarded to train_model as `early_stopping_patience`
LEARNING_RATE_PATIENCE = 5  # forwarded to train_model as `learning_rate_patience`

In [1784]:
# Registry of the models in this notebook, grouped by task number.
# Each model entry has:
#   "name"  - lower-cased when building the model's save directory name
#   "train" - set to True to train that model when its training cell runs
#             (TRAIN_ALL_MODELS = True trains it regardless of this flag)
# A "save_dir" key is added to every entry by the cell that follows.
MODELS = [
    {
        "task": 1,
        "models": [
            {
                "name": "CNN",
                "train": False,
            }
        ],
    },
    {
        "task": 2,
        "models": [
            {
                "name": "Siamese",
                "train": False,
            }
        ],
    },
    {
        "task": 3,
        "models": [
            {
                "name": "CNN",
                "train": False,
            },
            {
                "name": "ResNet4",
                "train": False,
            },
            {
                "name": "ResNet8",
                "train": False,
            },
        ],
    },
]

In [1785]:
def generate_model_save_dir(group, name, fraction):
    """Build the directory name under which a model is saved.

    Args:
        group: Task number the model belongs to (e.g. 1, 2 or 3).
        name: Model name; it is lower-cased in the result.
        fraction: Fraction of the dataset the model is trained on
            (e.g. 0.2 for 20%, 1 for the full dataset).

    Returns:
        A string of the form ``t{group}_{size}_{percent}_{name}``, where
        ``size`` is "large" for fractions >= 0.5 and "small" otherwise, and
        ``percent`` is the fraction expressed as a percentage: rounded to a
        whole number from 1% upwards, and written as a short decimal
        (e.g. "0.5") below 1%.
    """
    size = "large" if fraction >= 0.5 else "small"
    if fraction >= 0.01:
        percent = str(round(fraction * 100))
    else:
        # Use the "g" format so binary floating-point noise does not leak
        # into the directory name (0.007 * 100 is 0.7000000000000001).
        percent = f"{fraction * 100:g}"
    return f"t{group}_{size}_{percent}_{name.lower()}"


# Attach a save directory to every model configuration
for task_group in MODELS:
    task_id = task_group["task"]
    for model_conf in task_group["models"]:
        model_conf["save_dir"] = generate_model_save_dir(
            task_id, model_conf["name"], SAMPLE_FRACTION
        )

# Report the generated directories, in registry order
all_model_confs = [
    model_conf for task_group in MODELS for model_conf in task_group["models"]
]
for model_conf in all_model_confs:
    print(f"Model save directory: {model_conf['save_dir']}")

Model save directory: t1_large_100_cnn
Model save directory: t2_large_100_siamese
Model save directory: t3_large_100_cnn
Model save directory: t3_large_100_resnet4
Model save directory: t3_large_100_resnet8


In [1786]:
# Load the project constants that match the detected environment.
# NOTE(review): these star-imports presumably provide ENVIRONMENT, DATA_DIR
# and the other upper-case names used by later cells — confirm in the
# utils.constants* modules.
if CURRENT_ENVIRONMENT == "LOCAL":
    from utils.constants import *
elif CURRENT_ENVIRONMENT == "GOOGLE_COLAB":
    from utils.constants_colab import *
# NOTE(review): CURRENT_ENVIRONMENT is only ever assigned "GOOGLE_COLAB" or
# "LOCAL" in this notebook, so the branch below is currently never taken.
elif CURRENT_ENVIRONMENT == "KAGGLE":
    from utils.constants_kaggle import *

In [1787]:
import os
from utils.zipper import read_file, write_file

# Marker file recording what was installed by the last run of this cell
install_file = os.path.join(DATA_DIR, "install.txt")

# Create the directory if it does not exist
os.makedirs(DATA_DIR, exist_ok=True)

# NOTE(review): what read_file returns when the marker does not exist yet is
# defined in utils.zipper — presumably something that differs from any real
# content, so the first run installs. Confirm.
install_content = read_file(install_file)

if ENVIRONMENT == "LOCAL":
    # Local run: install only when requirements.txt differs from the marker
    requirements_file = "requirements.txt"
    requirements_content = read_file(requirements_file)
    if install_content != requirements_content:
        get_ipython().system("pip install -r requirements.txt")
        # Remember the requirements that were just installed
        write_file(install_file, requirements_content)
elif ENVIRONMENT == "GOOGLE_COLAB" and not SKIP_TO_TRAINING:
    # Colab run: only `imagehash` is installed, and only when the
    # EDA/preprocessing steps are not skipped
    if install_content != "imagehash":
        get_ipython().system("pip install imagehash")
        write_file(install_file, "imagehash")

In [1788]:
# Local runs only.
if ENVIRONMENT == "LOCAL":
    from utils.cache import reload_custom_libraries

    # Refresh library cache
    # NOTE(review): presumably re-imports the project's `utils` modules so
    # edits are picked up without restarting the kernel — confirm in utils.cache.
    reload_custom_libraries()

In [1789]:
# Local runs only: package UTILS_DIR as utils.zip inside ZIPPED_RESOURCES_DIR.
if ENVIRONMENT == "LOCAL":
    from utils.zipper import zip_dir

    # Zip the reloaded libraries to be used in Google Colab
    # (the Colab bootstrap cell copies a utils.zip from Google Drive)
    zip_dir(UTILS_DIR, f"{ZIPPED_RESOURCES_DIR}/utils.zip")

In [1790]:
from utils.zipper import unzip_file

# Runs everywhere except on a local run that skips straight to training.
if ENVIRONMENT != "LOCAL" or not SKIP_TO_TRAINING:
    if not os.path.exists(PREPROCESSED_DATASETS_ZIP):
        # No cached archive: nothing to restore
        print("Cached preprocessed datasets not found. Skipping extraction...")
    else:
        print("Cached preprocessed datasets found.")
        if os.path.exists(ROOT_DATASET_DIR):
            # The datasets directory is already in place
            print("Directory with preprocessed datasets found. Skipping extraction...")
        else:
            print("Extracting preprocessed datasets...")
            unzip_file(
                PREPROCESSED_DATASETS_ZIP,
                PREPROCESSED_DATASETS_EXTRACT_DIR,
            )
            print("Preprocessed datasets extracted.")

In [1791]:
# Local EDA/preprocessing runs only: report library versions and GPU status.
if ENVIRONMENT == "LOCAL" and not SKIP_TO_TRAINING:
    import tensorflow as tf
    import tensorflow_hub as hub

    print("TF version:", tf.__version__)
    print("Hub version:", hub.__version__)

    # An empty device list means TensorFlow sees no GPU
    gpu_devices = tf.config.list_physical_devices("GPU")
    gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
    print("GPU is", gpu_status)

In [1792]:
# Extract the raw dataset archive and put it where the later cells expect it.
if not SKIP_TO_TRAINING:
    # NOTE(review): tqdm is imported here but not used in this cell.
    from tqdm.notebook import tqdm
    import zipfile
    import shutil
    import os

    # Unzip when forced, or when DATASET_DIR doesn't exist or is empty
    if FORCE_UNZIP or not os.path.isdir(DATASET_DIR) or not os.listdir(DATASET_DIR):
        # NOTE(review): if an extracted Furniture_Data folder is already
        # present, everything below (including the move/rename steps) is
        # skipped and nothing is printed.
        if not os.path.isdir(f"{DATASET_EXTRACT_DIR}/Furniture_Data"):
            print("Starting unzip process...")
            # Extract the zip file
            with zipfile.ZipFile(RAW_DATASET_ZIP, "r") as zip_ref:
                zip_ref.extractall(DATASET_EXTRACT_DIR)
            print("Unzip process completed.")
            print("Checking for `__MACOSX` directory...")

            # Check if the `__MACOSX` directory exists
            if os.path.isdir(MACOS_DIR):
                # Delete the `__MACOSX` directory
                shutil.rmtree(MACOS_DIR)
                print("`__MACOSX` directory found and deleted.")
            else:
                print("`__MACOSX` directory not found. Skipping deletion.")

            if ENVIRONMENT == "GOOGLE_COLAB":
                # Check if the dataset directory already exists
                print("Checking for existing train dataset directory...")
                if os.path.isdir(DATASET_DIR):
                    # Delete the existing dataset directory
                    shutil.rmtree(DATASET_DIR)
                    print("Existing train dataset directory found and deleted.")
                # Move the extracted dataset to the correct directory
                # NOTE(review): DATASET_DIR is cleared above, but the move
                # targets ROOT_DATASET_DIR — confirm this is intended.
                if os.path.isdir(f"{DATASET_EXTRACT_DIR}/Furniture_Data"):
                    print("Moving extracted dataset to the correct directory...")
                    shutil.move(
                        f"{DATASET_EXTRACT_DIR}/Furniture_Data", ROOT_DATASET_DIR
                    )
                    print("Dataset moved successfully.")
            elif ENVIRONMENT == "LOCAL":
                # Rename the extracted directory to the correct directory
                # NOTE(review): unlike the Colab branch, an existing
                # DATASET_DIR is not removed before the rename.
                if os.path.isdir(f"{DATASET_EXTRACT_DIR}/Furniture_Data"):
                    print("Renaming extracted dataset directory...")
                    os.rename(f"{DATASET_EXTRACT_DIR}/Furniture_Data", DATASET_DIR)
                    print("Dataset renamed successfully.")
    else:
        print(
            "Skipping unzip process as DATASET_DIR exists and FORCE_UNZIP is not set."
        )

In [1793]:
if not SKIP_TO_TRAINING:
    from IPython.display import display
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from PIL import Image
    from utils.image_process import (
        get_category_styles,
        process_images,
        get_category_image_paths,
        resize_images,
        get_majority_class,
        identify_minority_classes,
        calculate_category_oversampling,
        calculate_style_oversampling,
        oversample_minority_classes,
    )

In [1794]:
if not SKIP_TO_TRAINING:
    # Furniture categories handled by this notebook
    categories = ["beds", "chairs", "dressers", "lamps", "sofas", "tables"]

    # Collect the image paths of every category from the raw dataset
    paths = {}
    for category in categories:
        paths[category] = get_category_image_paths(DATASET_DIR, category)

    print("Successfully extracted all paths!")

In [1795]:
if not SKIP_TO_TRAINING:
    # Number of image paths per category, in the order of `categories`
    amounts = [len(paths[category]) for category in categories]

    # Print the amount for each category
    for category, amount in zip(categories, amounts):
        print(f"Amount of {category}: {amount}")

    # Print the total
    total = sum(amounts)
    print("Total:", total)

In [1796]:
if not SKIP_TO_TRAINING:
    # One bar per category; the bar height is the number of image paths
    category_names = list(paths)
    image_counts = [len(paths[name]) for name in category_names]

    plt.bar(category_names, image_counts, width=0.5)
    plt.title("Number of images in each category")
    plt.xlabel("Categories")
    plt.ylabel("Amount")
    plt.show()

## Get duplicate images from each category
### Check for duplicates and remove them from each category

In [1797]:
if not SKIP_TO_TRAINING:
    # Create a new dictionary to store the paths and image sizes for each category
    data = {}
    for category, items in paths.items():
        # NOTE(review): process_images presumably removes duplicates and writes
        # the kept images under CLEANED_DATASET_DIR (the paths are re-read from
        # there right after the call) — confirm in utils.image_process.
        imageSizes = process_images(items, category, DATASET_DIR)
        data[category] = {
            "paths": get_category_image_paths(CLEANED_DATASET_DIR, category),
            "imageSizes": imageSizes,
        }

    # From here on `paths` points at the cleaned dataset instead of the raw one
    paths = {
        category: get_category_image_paths(CLEANED_DATASET_DIR, category)
        for category in categories
    }

    print("Successfully extracted all paths and image sizes!")

### Recheck the categories after removing duplicates

In [1798]:
if not SKIP_TO_TRAINING:
    # Running total of images across all categories
    total = 0

    # Print the amount for each category while accumulating the total
    for category in categories:
        category_paths = data[category]["paths"]
        amount = len(category_paths)
        total += amount
        print(f"Amount of {category}: {amount}")

    # Print the total
    print("Total:", total)

### Visualization

In [1799]:
if not SKIP_TO_TRAINING:
    # Bar heights: number of image paths stored for each category
    images_per_category = [len(data[category]["paths"]) for category in categories]

    plt.bar(categories, images_per_category, width=0.5)
    plt.title("Number of images in each category")
    plt.xlabel("Categories")
    plt.ylabel("Amount")

-   The plot shows that the number of images differs between categories. Because of that, we will need to check our dataset for class imbalance.

## Get the distribution of image sizes in each category

In [1800]:
if not SKIP_TO_TRAINING:
    # Map every category to a DataFrame built from its recorded image sizes,
    # with the two columns labelled "width" and "height"
    df_dims = {}
    for category in categories:
        image_sizes = data[category]["imageSizes"]
        df_dims[category] = pd.DataFrame(image_sizes, columns=["width", "height"])

    print("Successfully extracted all dimensions!")

In [1801]:
if not SKIP_TO_TRAINING:
    import matplotlib.pyplot as plt

    # Subplot grid: 2 rows x 3 columns, one panel per category
    n_rows, n_cols = 2, 3
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 10))

    # Work with a flat sequence of axes so it can be paired with the categories
    axs = axs.flatten()

    # Width-vs-height scatter plot for each category
    for panel_index, category in enumerate(categories[: len(axs)]):
        dims = df_dims[category]
        dims.plot.scatter(
            x="width",
            y="height",
            ax=axs[panel_index],
            title=category.capitalize(),
        )

    # Adjust the layout
    plt.tight_layout()
    plt.show()

In [1802]:
if not SKIP_TO_TRAINING:
    # Summary statistics of width/height per category (transposed so that
    # each dimension is a row)
    for category in categories:
        heading = category.capitalize()
        summary = df_dims[category].describe().T
        print(heading)
        display(summary)

- The dataset contains 6 categories of items, with 90083 items in total
- All files in the dataset are JPEG images
- The images mainly come in two sizes: 224x224 and 350x350. However, there is one anomalous size in the table category: 500x446

## Data Preprocessing

We will start with resizing all images to 350x350. Here are some reasons:
- The vast majority of the images are already at 350x350. Upscaling the smaller images will introduce minimal distortion compared to downscaling the majority to 224x224.
- Upscaling generally retains more information from the original image than downscaling. While some interpolation artifacts might be introduced, they are less likely to significantly impact model performance compared to the information loss from downscaling.
- The single image with a size of 500x448 is an outlier, which we can either resize to match the majority or exclude from our dataset without significant impact.

In [1803]:
if not SKIP_TO_TRAINING:
    from IPython.display import display

    # Preview one sample image (the second "beds" image). The expression sits
    # inside an `if` block, so the notebook does not render it automatically
    # as the cell result; it has to be passed to display() explicitly.
    display(Image.open(data["beds"]["paths"][1]))

In [1804]:
if not SKIP_TO_TRAINING:
    # Number of images per category
    for category in categories:
        print(
            f"Number of images in category '{category}': {len(data[category]['paths'])}"
        )

    # Total number of images, computed and printed once after the loop
    # (it used to be recomputed and printed for every category)
    total = sum(len(data[category]["paths"]) for category in categories)
    print(f"Total number of images: {total}")

In [1805]:
if not SKIP_TO_TRAINING:
    # Check all file extensions in data[*]["paths"]
    valid_extensions = (".jpg", ".jpeg", ".png")

    for category in categories:
        print(f"Checking file extensions for category '{category}'")
        for img_path in data[category]["paths"]:
            if not img_path.endswith(valid_extensions):
                # Report the first offending file and stop scanning this category
                print(f"Invalid file extension: {img_path}")
                break
        else:
            # Reached only when the loop finished without `break`, so the
            # success message is no longer printed after a failure
            print(f"All file extensions are valid for category '{category}'")

In [1806]:
if not SKIP_TO_TRAINING:
    # Target (width, height) chosen in the section above
    target_side = 350
    size = (target_side, target_side)

    # Resize the images of every category present in `data`
    for category in data:
        resize_images(data, category, size)

In [1807]:
if not SKIP_TO_TRAINING:
    # Determine the majority class (with its count) and the minority classes
    majority_class, max_count = get_majority_class(data)
    # NOTE(review): the meaning of the 1.0 argument is defined in
    # utils.image_process.identify_minority_classes — presumably a threshold
    # relative to the majority class; confirm.
    minority_classes = identify_minority_classes(data, 1.0)

    # Display the classes
    print("Majority class:", majority_class)
    print("Minority classes:", minority_classes)

In [1808]:
if not SKIP_TO_TRAINING:
    import seaborn as sns

    # Set the style of seaborn
    sns.set(style="whitegrid")

    # Number of additional images per category computed by the helper
    all_classes = minority_classes + [majority_class]
    oversampling = calculate_category_oversampling(data, all_classes, max_count)

    # Collect the current and the additional counts per category
    categories = list(data.keys())
    original_counts = []
    oversampled_counts = []
    for category in categories:
        original_counts.append(len(data[category]["paths"]))
        oversampled_counts.append(oversampling.get(category, 0))

    # Wide table (one row per category) for seaborn
    df = pd.DataFrame(
        {
            "Category": categories,
            "Original": original_counts,
            "Oversampled": oversampled_counts,
        }
    )

    # Long format: one row per (category, count type)
    df_melt = df.melt(id_vars="Category", var_name="Type", value_name="Count")

    # Grouped bar plot: original vs. oversampled counts
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Category", y="Count", hue="Type", data=df_melt, palette="muted")
    plt.title("Number of images in each category with oversampling")
    plt.xlabel("Categories")
    plt.ylabel("Amount")
    plt.show()

    # Totals before and after adding the oversampled images
    total_before = sum(original_counts)
    print("Total number of images before oversampling:", total_before)
    print(
        "Total number of images after oversampling:",
        total_before + sum(oversampled_counts),
    )

In [1809]:
if not SKIP_TO_TRAINING:
    # Set the style of seaborn
    sns.set(style="whitegrid")

    # One entry per (category, style) pair. Dedicated names are used for these
    # per-row lists and for the loop variable so that the notebook-level
    # `categories` list and `paths` dict are not overwritten by this cell.
    row_categories, row_styles, row_counts = [], [], []

    for category, category_data in data.items():
        style_counts = {}
        for image_path in category_data["paths"]:
            # The style is the name of the folder that directly contains the
            # image; os.path handles the platform's path separator.
            style = os.path.basename(os.path.dirname(image_path))
            style_counts[style] = style_counts.get(style, 0) + 1

        for style, count in style_counts.items():
            row_categories.append(category)
            row_styles.append(style)
            row_counts.append(count)

    # Create DataFrame for seaborn
    df = pd.DataFrame(
        {
            "Category": row_categories,
            "Style": row_styles,
            "Original": row_counts,
        }
    )

    # Melt DataFrame
    df_melt = df.melt(
        id_vars=["Category", "Style"], var_name="Type", value_name="Count"
    )

    # Create barplot
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Category", y="Count", hue="Style", data=df_melt, palette="muted")

    # Add some text for labels, title and axes ticks
    plt.title("Number of images in each style within each category before oversampling")
    plt.xlabel("Categories")
    plt.ylabel("Amount")

    # Place the legend outside the plot area, to the right
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

    plt.show()

In [1810]:
if not SKIP_TO_TRAINING:
    # Set the style of seaborn
    sns.set(style="whitegrid")

    # Calculate how much oversampling is needed per category and style
    oversampling = calculate_style_oversampling(
        data, minority_classes + [majority_class]
    )

    # One entry per (category, style) pair. `original_counts` and
    # `oversampled_counts` keep their names because the totals cell further
    # down reads them; the other per-row lists and the loop variable use
    # dedicated names so the notebook-level `categories` list and `paths`
    # dict are not overwritten by this cell.
    row_categories, row_styles = [], []
    original_counts, oversampled_counts = [], []

    for category, category_data in data.items():
        style_counts = {}
        for image_path in category_data["paths"]:
            # The style is the name of the folder that directly contains the
            # image; os.path handles the platform's path separator.
            style = os.path.basename(os.path.dirname(image_path))
            style_counts[style] = style_counts.get(style, 0) + 1

        for style, count in style_counts.items():
            row_categories.append(category)
            row_styles.append(style)
            original_counts.append(count)
            oversampled_counts.append(oversampling.get(category, {}).get(style, 0))

    # Create DataFrame for seaborn
    df = pd.DataFrame(
        {
            "Category": row_categories,
            "Style": row_styles,
            "Original": original_counts,
            "Oversampled": oversampled_counts,
        }
    )

    # Melt DataFrame
    df_melt = df.melt(
        id_vars=["Category", "Style"], var_name="Type", value_name="Count"
    )

    # Create barplot
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Category", y="Count", hue="Style", data=df_melt, palette="muted")

    # Add some text for labels, title and axes ticks
    plt.title("Number of images in each style within each category with oversampling")
    plt.xlabel("Categories")
    plt.ylabel("Amount")

    # Place the legend outside the plot area, to the right
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

    plt.show()

In [1811]:
if not SKIP_TO_TRAINING:
    # Projected size of every category: its current number of images plus the
    # oversampling planned for its styles. The value is computed per category,
    # so the chart no longer reuses the last category's oversampling count
    # for every bar.
    categories = list(data.keys())
    projected_counts = [
        len(data[category]["paths"]) + sum(oversampling.get(category, {}).values())
        for category in categories
    ]

    # Print all projected sums to check that the categories end up with
    # roughly the same number of images
    for category, projected in zip(categories, projected_counts):
        print(f"Projected sum for category '{category}':", projected)

    # Create a bar chart of the projected number of images in each category
    plt.bar(categories, projected_counts, width=0.5)
    plt.title("Number of images in each category")
    plt.xlabel("Categories")
    plt.ylabel("Amount")
    plt.show()

In [1812]:
if not SKIP_TO_TRAINING:
    # Totals over the per-style counts collected by the plotting cell above
    images_before = sum(original_counts)
    images_after = images_before + sum(oversampled_counts)

    print("Total number of images before oversampling:", images_before)
    print("Total number of images after oversampling:", images_after)

In [1813]:
if not SKIP_TO_TRAINING:
    # Oversample the classes. `data` is replaced by the result, so re-running
    # this cell feeds the already processed data back into the helper.
    classes_to_balance = minority_classes + [majority_class]
    data = oversample_minority_classes(data, classes_to_balance)

    # Amount per category after oversampling
    amounts = [(category, len(data[category]["paths"])) for category in categories]

    total = 0
    for category, amount in amounts:
        print(f"Amount of {category}: {amount}")
        total += amount

    # Print the total
    print("Total:", total)

In [1814]:
if not SKIP_TO_TRAINING:
    # Bar heights: number of image paths per category after oversampling
    bar_heights = []
    for category in categories:
        bar_heights.append(len(data[category]["paths"]))

    plt.bar(categories, bar_heights, width=0.5)
    plt.title("Number of images in each category")
    plt.xlabel("Categories")
    plt.ylabel("Amount")

In [1815]:
if not SKIP_TO_TRAINING:
    # Re-read the image paths, this time from the processed dataset directory
    paths = {
        category: get_category_image_paths(PROCESSED_DATASET_DIR, category)
        for category in categories
    }

    # Rebuild `data` from the processed dataset.
    # NOTE(review): process_images is the same helper used for the duplicate
    # check earlier; what it changes on disk when pointed at
    # PROCESSED_DATASET_DIR is defined in utils.image_process — confirm.
    for category, items in paths.items():
        imageSizes = process_images(items, category, PROCESSED_DATASET_DIR)
        data[category] = {
            "paths": get_category_image_paths(PROCESSED_DATASET_DIR, category),
            "imageSizes": imageSizes,
        }

    print("Successfully extracted all paths and image sizes!")

In [1816]:
from utils.converter import convert_to_df

# Build one DataFrame per dataset stage (raw, cleaned, processed). This cell
# is not guarded by SKIP_TO_TRAINING, so it always runs. Per the output below,
# convert_to_df loads an existing CSV from cache instead of regenerating it.
raw_dataset_df = convert_to_df(DATASET_DIR, TRAIN_DATA_CSV)
cleaned_dataset_df = convert_to_df(CLEANED_DATASET_DIR, CLEANED_TRAIN_DATA_CSV)
# The processed frame is the one sampled and split by the training cells
processed_dataset_df = convert_to_df(PROCESSED_DATASET_DIR, PROCESSED_TRAIN_DATA_CSV)

CSV file already exists at ./data/datasets/raw.csv. Loading from cache...
CSV file already exists at ./data/datasets/cleaned.csv. Loading from cache...
CSV file already exists at ./data/datasets/processed.csv. Loading from cache...


### Task 1:

#### Model 1: Convolutional Neural Network (CNN) Model

Convolutional Neural Networks (CNNs) are a class of deep learning models that are primarily used for analyzing visual data. CNNs are particularly effective for tasks such as image classification, object detection, and facial recognition.

The CNN model in this code is a simple yet effective architecture for image classification. It consists of three convolutional blocks — each made of two convolutional layers, each followed by batch normalization, and then a max pooling layer and a dropout layer — followed by a flattening layer and two dense layers.

The convolutional layers are the first layers of the network, where the model learns several high-level features of the images. Each convolutional layer in this model uses a 3x3 kernel and a ReLU (Rectified Linear Unit) activation function. The 'same' padding is used to preserve the spatial dimensions of the volume such that the output volume size matches the input volume size.

The max pooling layers are used to reduce the spatial dimensions of the output volume. Max pooling is a down-sampling operation that is performed at the end of each convolutional block to reduce the dimensionality and to help prevent overfitting.

The flattening layer is used to convert the 2D matrix into a 1D vector, which can be fed into the dense layers.

The dense layers, also known as fully connected layers, perform classification on the features extracted by the convolutional layers and down-sampled by the max pooling layers. The final dense layer uses a softmax activation function to output a probability distribution over the 6 classes of furniture.

The model is trained using the Adam optimizer and the sparse categorical cross-entropy loss function, which is suitable for multi-class classification problems. The model's performance is evaluated using accuracy as the metric.

The model is trained on a training dataset and validated on a validation dataset. After training, the model is saved to a file and then evaluated on a test dataset to measure its performance.

In [1817]:
# Task 1 Models
# MODELS[0] is the task-1 group; its first (and only) entry is the CNN config.
# The lookup is positional, so it depends on the order of MODELS.
task_1_model_1 = MODELS[0]["models"][0]  # CNN

In [1818]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

TARGET = "Category"

# Data generator that multiplies every pixel value by 1/255
datagen = ImageDataGenerator(rescale=1.0 / 255)

if task_1_model_1["train"] or TRAIN_ALL_MODELS or TARGET in FORCE_GENERATE_SPLITS:
    from sklearn.model_selection import train_test_split

    # Sample SAMPLE_FRACTION of the rows of every TARGET class
    sample_df = processed_dataset_df.groupby(TARGET, group_keys=False).apply(
        lambda x: x.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
    )

    # First split: training rows vs. everything held out (validation + test)
    holdout_fraction = TEST_SIZE + VAL_SIZE
    train_df, temp_df = train_test_split(
        sample_df,
        test_size=holdout_fraction,
        random_state=RANDOM_SEED,
        stratify=sample_df[TARGET],
    )

    # Second split: divide the held-out rows into validation and test
    val_df, test_df = train_test_split(
        temp_df,
        test_size=TEST_SIZE / holdout_fraction,
        random_state=RANDOM_SEED,
        stratify=temp_df[TARGET],
    )

    # Options shared by the three Category generators
    flow_options = dict(
        x_col="Full_Path",
        y_col=TARGET,
        target_size=(350, 350),
        batch_size=BATCH_SIZE,
        class_mode="categorical",
    )

    # Only the training generator shuffles its rows
    train_generator_category = datagen.flow_from_dataframe(
        dataframe=train_df, shuffle=True, **flow_options
    )
    val_generator_category = datagen.flow_from_dataframe(
        dataframe=val_df, shuffle=False, **flow_options
    )
    test_generator_category = datagen.flow_from_dataframe(
        dataframe=test_df, shuffle=False, **flow_options
    )

In [1819]:
from tensorflow.keras import layers, Model, Input

input_layer = Input(shape=(350, 350, 3))

# Three convolutional blocks (16, 32 and 64 filters). Each block is
# conv -> batch norm -> conv -> batch norm -> 2x2 max pooling -> dropout.
x = input_layer
for filters in (16, 32, 64):
    for _ in range(2):
        x = layers.Conv2D(filters, (3, 3), activation="relu", padding="same")(x)
        x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.25)(x)

# Classification head: flatten, one hidden dense layer, softmax over 6 classes
x = layers.Flatten()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
output_layer = layers.Dense(6, activation="softmax")(x)

task_1_cnn = Model(inputs=input_layer, outputs=output_layer)

In [1820]:
target_model = task_1_model_1

from utils.models import train_model

# Train only when this model is flagged for training or all models are requested
if target_model["train"] or TRAIN_ALL_MODELS:
    # Models go to Google Drive on Colab and to MODEL_DIR otherwise
    if ENVIRONMENT == "GOOGLE_COLAB":
        model_root_dir = f"{GOOGLE_DRIVE_ROOT_DIR}/Models"
    else:
        model_root_dir = MODEL_DIR

    train_model(
        model=task_1_cnn,
        model_conf=target_model,
        environment=ENVIRONMENT,
        epochs=EPOCHS,
        sample_fraction=SAMPLE_FRACTION,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        learning_rate_patience=LEARNING_RATE_PATIENCE,
        train_generator=train_generator_category,
        val_generator=val_generator_category,
        test_generator=test_generator_category,
        root_dir=model_root_dir,
    )

### Task 3:

#### Model 1: Convolutional Neural Network (CNN) Model

In [1821]:
# Task 3 Models
# MODELS[2] is the task-3 group; its entries are looked up by position,
# so these aliases depend on the order of MODELS.
task_3_model_1 = MODELS[2]["models"][0]  # CNN
task_3_model_2 = MODELS[2]["models"][1]  # ResNet (4 blocks)
task_3_model_3 = MODELS[2]["models"][2]  # ResNet (8 blocks)

In [1822]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

TARGET = "Style"

# Data generator that multiplies every pixel value by 1/255
datagen = ImageDataGenerator(rescale=1.0 / 255)

# Build the splits when any task-3 model is to be trained, when all models
# are requested, or when the Style splits are explicitly forced
if any(
    (
        task_3_model_1["train"],
        task_3_model_2["train"],
        task_3_model_3["train"],
        TRAIN_ALL_MODELS,
        TARGET in FORCE_GENERATE_SPLITS,
    )
):
    from sklearn.model_selection import train_test_split

    # Sample SAMPLE_FRACTION of the rows of every TARGET class
    sample_df = processed_dataset_df.groupby(TARGET, group_keys=False).apply(
        lambda x: x.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
    )

    # First split: training rows vs. everything held out (validation + test)
    holdout_fraction = TEST_SIZE + VAL_SIZE
    train_df, temp_df = train_test_split(
        sample_df,
        test_size=holdout_fraction,
        random_state=RANDOM_SEED,
        stratify=sample_df[TARGET],
    )

    # Second split: divide the held-out rows into validation and test
    val_df, test_df = train_test_split(
        temp_df,
        test_size=TEST_SIZE / holdout_fraction,
        random_state=RANDOM_SEED,
        stratify=temp_df[TARGET],
    )

    # Options shared by the three Style generators
    flow_options = dict(
        x_col="Full_Path",
        y_col=TARGET,
        target_size=(350, 350),
        batch_size=BATCH_SIZE,
        class_mode="categorical",
    )

    # Load images from dataframes for Style classification;
    # only the training generator shuffles its rows
    train_generator_style = datagen.flow_from_dataframe(
        dataframe=train_df, shuffle=True, **flow_options
    )
    val_generator_style = datagen.flow_from_dataframe(
        dataframe=val_df, shuffle=False, **flow_options
    )
    test_generator_style = datagen.flow_from_dataframe(
        dataframe=test_df, shuffle=False, **flow_options
    )

In [1823]:
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.regularizers import l2

input_layer = Input(shape=(350, 350, 3))


def _conv_bn(tensor, filters, **conv_options):
    """Apply a 3x3 same-padded ReLU convolution followed by batch normalization."""
    tensor = layers.Conv2D(
        filters, (3, 3), activation="relu", padding="same", **conv_options
    )(tensor)
    return layers.BatchNormalization()(tensor)


# Initial layers
# NOTE(review): only this first convolution carries the L2 kernel regularizer.
x = _conv_bn(input_layer, 16, kernel_regularizer=l2(0.001))
x = _conv_bn(x, 16)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Convolution Block 1: two 32-filter convolutions
for _ in range(2):
    x = _conv_bn(x, 32)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Convolution Block 2: three 64-filter convolutions
for _ in range(3):
    x = _conv_bn(x, 64)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Fully-connected layers: two hidden dense stages, then softmax over 17 classes
x = layers.Flatten()(x)
for units in (128, 64):
    x = layers.Dense(units, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)
output_layer = layers.Dense(17, activation="softmax")(x)

task_3_cnn = Model(inputs=input_layer, outputs=output_layer)

In [1824]:
target_model = task_3_model_1

from utils.models import train_model

# Train only when this model is flagged for training or all models are requested
if target_model["train"] or TRAIN_ALL_MODELS:
    # Models go to Google Drive on Colab and to MODEL_DIR otherwise
    if ENVIRONMENT == "GOOGLE_COLAB":
        model_root_dir = f"{GOOGLE_DRIVE_ROOT_DIR}/Models"
    else:
        model_root_dir = MODEL_DIR

    train_model(
        model=task_3_cnn,
        model_conf=target_model,
        environment=ENVIRONMENT,
        epochs=EPOCHS,
        sample_fraction=SAMPLE_FRACTION,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        learning_rate_patience=LEARNING_RATE_PATIENCE,
        train_generator=train_generator_style,
        val_generator=val_generator_style,
        test_generator=test_generator_style,
        root_dir=model_root_dir,
    )

In [1825]:
from tensorflow.keras import layers, Model, Input

input_layer = Input(shape=(350, 350, 3))

# Stem: two 16-filter 3x3 conv + batch-norm pairs, then 2x2 pooling and dropout.
x = input_layer
for stem_idx in range(2):
    x = layers.Conv2D(16, (3, 3), activation="relu", padding="same")(x)
    x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Residual stages, described as (filters, extra conv + BN after the merge?).
# Layers are created in the same order as a hand-unrolled version: main path
# first, then the 1x1 projection of the shortcut, then the addition.
for stage_filters, has_post_merge_conv in ((32, False), (64, True)):
    shortcut = x
    for conv_idx in range(2):
        x = layers.Conv2D(
            stage_filters, (3, 3), activation="relu", padding="same"
        )(x)
        x = layers.BatchNormalization()(x)
    # 1x1 convolution so the shortcut has as many channels as the main path.
    shortcut = layers.Conv2D(stage_filters, (1, 1), padding="same")(shortcut)
    x = layers.Add()([shortcut, x])  # Residual connection
    x = layers.Activation("relu")(x)
    if has_post_merge_conv:
        x = layers.Conv2D(
            stage_filters, (3, 3), activation="relu", padding="same"
        )(x)
        x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.25)(x)

# Plain (non-residual) stage: two 128-filter conv + BN pairs.
for conv_idx in range(2):
    x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Classifier head: flatten -> 256-unit dense block -> 17-way softmax.
x = layers.Flatten()(x)
x = layers.Dense(256, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
output_layer = layers.Dense(17, activation="softmax")(x)

task_3_resnet_4 = Model(inputs=input_layer, outputs=output_layer)

In [1826]:
# Configuration entry that controls this training run (task 3, model 2).
target_model = task_3_model_2

from utils.models import train_model

# Train only when this model is flagged in its config, or when a full retrain
# of every model has been requested.
# NOTE(review): the config cell at the top of the notebook defines
# CURRENT_ENVIRONMENT; confirm ENVIRONMENT is defined before this cell runs.
if target_model["train"] or TRAIN_ALL_MODELS:
    # root_dir handed to train_model: the Drive "Models" folder on Colab,
    # MODEL_DIR everywhere else.
    if ENVIRONMENT == "GOOGLE_COLAB":
        models_root = f"{GOOGLE_DRIVE_ROOT_DIR}/Models"
    else:
        models_root = MODEL_DIR

    train_model(
        # what to train
        model=task_3_resnet_4,
        model_conf=target_model,
        # data (style generators)
        train_generator=train_generator_style,
        val_generator=val_generator_style,
        test_generator=test_generator_style,
        sample_fraction=SAMPLE_FRACTION,
        # schedule
        epochs=EPOCHS,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        learning_rate_patience=LEARNING_RATE_PATIENCE,
        # where it runs / where results go
        environment=ENVIRONMENT,
        root_dir=models_root,
    )

In [1827]:
from tensorflow.keras import layers, Model, Input

input_layer = Input(shape=(350, 350, 3))

# Stem: two 16-filter 3x3 conv + batch-norm pairs, then 2x2 pooling and dropout.
x = input_layer
for stem_idx in range(2):
    x = layers.Conv2D(16, (3, 3), activation="relu", padding="same")(x)
    x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Four residual stages, described as (filters, extra conv + BN after the merge?).
# Only the first stage skips the post-merge convolution. Layers are created in
# the same order as a hand-unrolled version: main path, 1x1 shortcut
# projection, addition, activation, optional extra conv, pooling, dropout.
residual_stages = ((32, False), (64, True), (128, True), (256, True))
for stage_filters, has_post_merge_conv in residual_stages:
    shortcut = x
    for conv_idx in range(2):
        x = layers.Conv2D(
            stage_filters, (3, 3), activation="relu", padding="same"
        )(x)
        x = layers.BatchNormalization()(x)
    # 1x1 convolution so the shortcut has as many channels as the main path.
    shortcut = layers.Conv2D(stage_filters, (1, 1), padding="same")(shortcut)
    x = layers.Add()([shortcut, x])  # Residual connection
    x = layers.Activation("relu")(x)
    if has_post_merge_conv:
        x = layers.Conv2D(
            stage_filters, (3, 3), activation="relu", padding="same"
        )(x)
        x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.25)(x)

# Plain (non-residual) stage: two 128-filter conv + BN pairs.
for conv_idx in range(2):
    x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)

# Classifier head: flatten -> 256-unit dense block -> 17-way softmax.
x = layers.Flatten()(x)
x = layers.Dense(256, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
output_layer = layers.Dense(17, activation="softmax")(x)

task_3_resnet_8 = Model(inputs=input_layer, outputs=output_layer)

In [1828]:
# Configuration entry that controls this training run (task 3, model 3).
target_model = task_3_model_3

from utils.models import train_model

# Train only when this model is flagged in its config, or when a full retrain
# of every model has been requested.
# NOTE(review): the config cell at the top of the notebook defines
# CURRENT_ENVIRONMENT; confirm ENVIRONMENT is defined before this cell runs.
if target_model["train"] or TRAIN_ALL_MODELS:
    # root_dir handed to train_model: the Drive "Models" folder on Colab,
    # MODEL_DIR everywhere else.
    if ENVIRONMENT == "GOOGLE_COLAB":
        models_root = f"{GOOGLE_DRIVE_ROOT_DIR}/Models"
    else:
        models_root = MODEL_DIR

    train_model(
        # what to train
        model=task_3_resnet_8,
        model_conf=target_model,
        # data (style generators)
        train_generator=train_generator_style,
        val_generator=val_generator_style,
        test_generator=test_generator_style,
        sample_fraction=SAMPLE_FRACTION,
        # schedule
        epochs=EPOCHS,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        learning_rate_patience=LEARNING_RATE_PATIENCE,
        # where it runs / where results go
        environment=ENVIRONMENT,
        root_dir=models_root,
    )

In [1829]:
from utils.models import evaluate_models, find_best_model

# evaluate_models(f"{MODEL_DIR}/qchan", test_generator_category, SAMPLE_FRACTION)

# best_small_model = find_best_model([f"{MODEL_DIR}/t1_small_0.5_cnn"])
# best_small_model.summary()

### Task 2

#### Model 1: Siamese Network

In [1830]:
# Task 2 model configurations: take the second entry of MODELS and pick the
# first item of its "models" list (the Siamese network).
task_2_group = MODELS[1]
task_2_model_1 = task_2_group["models"][0]

In [1831]:
# Configuration entry for the Siamese model; the following cells reuse it as
# their training guard.
target_model = task_2_model_1

if target_model["train"] or TRAIN_ALL_MODELS:
    from keras.models import Model, load_model
    from keras.layers import Conv2D

    # Reload two previously saved classifiers from fixed checkpoint files
    # (presumably the Task 1 category model and the Task 3 style model).
    category_model = load_model(f"{MODEL_DIR}/t1_large_100_cnn/epoch_21.h5")
    style_model = load_model(f"{MODEL_DIR}/t3_small_20_resnet/epoch_41.h5")

    # The fourth layer from the end is used as each model's embedding layer
    # (intended to be the last dense layer before the output layer).
    category_embedding_layer, style_embedding_layer = (
        classifier.layers[-4] for classifier in (category_model, style_model)
    )

In [1832]:
if target_model["train"] or TRAIN_ALL_MODELS:
    # Truncate each classifier at its embedding layer: same input as the
    # original model, output taken from the chosen layer.
    category_embedding_model, style_embedding_model = (
        Model(inputs=classifier.input, outputs=embedding_layer.output)
        for classifier, embedding_layer in (
            (category_model, category_embedding_layer),
            (style_model, style_embedding_layer),
        )
    )

In [1833]:
if target_model["train"] or TRAIN_ALL_MODELS:
    import os
    import numpy as np
    from utils.models import get_combined_embeddings, normalize_path

    # Only use 5% of the dataset for training.
    # NOTE(review): this rebinds cleaned_dataset_df, so running the cell a
    # second time samples 5% of the already-sampled frame. Re-run the cell that
    # builds cleaned_dataset_df (or restart the kernel) before repeating it.
    cleaned_dataset_df = cleaned_dataset_df.sample(frac=0.05, random_state=RANDOM_SEED)

    # Combine 'Category' and 'Style' into a single column 'Label'
    cleaned_dataset_df["Label"] = cleaned_dataset_df[["Category", "Style"]].apply(
        lambda x: ",".join(x), axis=1
    )

    cleaned_dataset_df["Full_Path"] = (
        cleaned_dataset_df["Full_Path"].astype(str).apply(normalize_path)
    )

    # Create or load combined embeddings and mapping. The cache is only used
    # when BOTH files are present.
    if not (
        os.path.exists("combined_embeddings.npy")
        and os.path.exists("embedding_index_mapping.npy")
    ):
        result = get_combined_embeddings(
            datagen.flow_from_dataframe(
                dataframe=cleaned_dataset_df,
                x_col="Full_Path",
                y_col="Label",
                target_size=(350, 350),
                batch_size=BATCH_SIZE // 2,
                class_mode="categorical",
            ),
            category_embedding_model,
            style_embedding_model,
            return_index_mapping=True,
            cache_file="combined_embeddings.npy",
        )
        combined_embeddings, embedding_index_mapping = result  # Unpack the result
    else:
        # np.load returns an ndarray, never a tuple, so the mapping cannot be
        # recovered from the embeddings file; it is read from its own cache
        # file instead of silently falling back to an empty dict.
        # allow_pickle=True executes pickled code: only acceptable because these
        # are locally generated cache files, never untrusted downloads.
        combined_embeddings = np.asarray(
            np.load("combined_embeddings.npy", allow_pickle=True)
        )

        # assumes get_combined_embeddings wrote the mapping to this file with
        # np.save (the existence check above implies it) — TODO confirm format.
        mapping_cache = np.load("embedding_index_mapping.npy", allow_pickle=True)
        # np.save wraps a dict in a 0-d object array; .item() unwraps it again.
        embedding_index_mapping = (
            mapping_cache.item() if mapping_cache.ndim == 0 else mapping_cache
        )

In [1834]:
if target_model["train"] or TRAIN_ALL_MODELS:
    from keras.layers import Input, Lambda
    from keras.models import Model
    import keras.backend as K
    import tensorflow as tf  # kept: later cells may rely on `tf` being imported here

    # Widths of the two parts of each combined embedding vector.
    # assumes the layout is [category (128) | style (256)] — TODO confirm
    # against get_combined_embeddings.
    CATEGORY_EMBEDDING_DIM = 128
    STYLE_EMBEDDING_DIM = 256

    # Siamese network inputs: one combined embedding per side of the pair.
    # Input(shape=...) excludes the batch axis, so these tensors are already
    # (batch, features).
    input_a_combined = Input(shape=combined_embeddings.shape[1:])
    input_b_combined = Input(shape=combined_embeddings.shape[1:])

    # Slice the combined embeddings into category and style parts along the
    # LAST (feature) axis. The inputs are sliced directly: prepending an extra
    # axis would shift the batch axis to position 1 and make the slices (and
    # the distance reduction below) operate on samples instead of features.
    encoded_a_category = Lambda(lambda x: x[..., :CATEGORY_EMBEDDING_DIM])(
        input_a_combined
    )
    encoded_b_category = Lambda(lambda x: x[..., :CATEGORY_EMBEDDING_DIM])(
        input_b_combined
    )

    encoded_a_style = Lambda(lambda x: x[..., CATEGORY_EMBEDDING_DIM:])(
        input_a_combined
    )
    encoded_b_style = Lambda(lambda x: x[..., CATEGORY_EMBEDDING_DIM:])(
        input_b_combined
    )

    # Euclidean distance per pair, reduced over the feature axis so each
    # output keeps one distance value per sample.
    distance_category = Lambda(
        lambda x: K.sqrt(K.sum(K.square(x[0] - x[1]), axis=-1, keepdims=True))
    )([encoded_a_category, encoded_b_category])
    distance_style = Lambda(
        lambda x: K.sqrt(K.sum(K.square(x[0] - x[1]), axis=-1, keepdims=True))
    )([encoded_a_style, encoded_b_style])

    # Siamese model with multiple outputs for different distances
    siamese_model = Model(
        inputs=[input_a_combined, input_b_combined],
        outputs=[distance_category, distance_style],
    )

In [1835]:
if target_model["train"] or TRAIN_ALL_MODELS:
    from utils.models import create_image_pairs

    # Build the pairs and their labels from the (sampled) dataset frame,
    # then split the two-part result into its named pieces.
    pairing_result = create_image_pairs(cleaned_dataset_df)
    image_pairs, labels = pairing_result

In [1836]:
if target_model["train"] or TRAIN_ALL_MODELS:
    from utils.models import SiameseDataGenerator, multi_task_contrastive_loss_wrapper
    from keras.callbacks import EarlyStopping, ModelCheckpoint
    from sklearn.model_selection import train_test_split

    # Hold out a TEST_SIZE share of the pairs (and their labels) for validation.
    pair_split = train_test_split(
        image_pairs, labels, test_size=TEST_SIZE, random_state=RANDOM_SEED
    )
    train_pairs, val_pairs, train_labels, val_labels = pair_split

    # Augmentation settings; only the training generator receives them.
    augmentation_params = dict(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
    )

    # Positional arguments shared by both generators.
    shared_generator_args = (BATCH_SIZE, combined_embeddings, embedding_index_mapping)

    train_generator = SiameseDataGenerator(
        train_pairs, train_labels, *shared_generator_args, augmentation_params
    )
    val_generator = SiameseDataGenerator(val_pairs, val_labels, *shared_generator_args)

    # Compile the Siamese model (fitting happens in the next cell).
    siamese_model.compile(
        optimizer="adam",
        loss=multi_task_contrastive_loss_wrapper(batch_size=BATCH_SIZE),
        metrics=["accuracy"],
    )

In [1837]:
if target_model["train"] or TRAIN_ALL_MODELS:
    # Stop once val_loss has not improved for 5 epochs and restore the best weights.
    early_stopping = EarlyStopping(
        restore_best_weights=True, patience=5, monitor="val_loss"
    )

    # Write a checkpoint only when val_loss improves on the best seen so far.
    checkpoint = ModelCheckpoint(
        filepath="best_siamese_model.h5", save_best_only=True, monitor="val_loss"
    )

    training_callbacks = [early_stopping, checkpoint]
    siamese_model.fit(
        train_generator,
        epochs=EPOCHS,
        validation_data=val_generator,
        callbacks=training_callbacks,
    )

In [1838]:
from utils.models import evaluate_models

# evaluate_models(
#     f"{MODEL_DIR}/t1_small_20_cnn", test_generator_category, SAMPLE_FRACTION
# )
# evaluate_models(
#     f"{MODEL_DIR}/t1_large_100_cnn", test_generator_category, SAMPLE_FRACTION
# )
# evaluate_models(
#     f"{MODEL_DIR}/t3_small_20_resnet", test_generator_style, SAMPLE_FRACTION
# )
# evaluate_models(
#     f"{MODEL_DIR}/t3_large_50_resnet", test_generator_style, SAMPLE_FRACTION
# )
# evaluate_models(
#     f"{MODEL_DIR}/t3_large_100_resnet", test_generator_style, SAMPLE_FRACTION
# )

# evaluate_models(f"{MODEL_DIR}/t3_large_100_cnn", test_generator_style, SAMPLE_FRACTION)
# evaluate_models(f"{MODEL_DIR}/t3_small_40_cnn", test_generator_style, SAMPLE_FRACTION)
# evaluate_models(f"{MODEL_DIR}/t3_small_30_cnn", test_generator_style, SAMPLE_FRACTION)
# evaluate_models(f"{MODEL_DIR}/t3_small_20_cnn", test_generator_style, SAMPLE_FRACTION)