# Google Colab Pipeline

## Task 3: CNN Model for Furniture Style Recognition

In [1]:
import sys

# Decide once which runtime the notebook is executing in: the session is
# treated as Google Colab when the `google.colab` module is already loaded
# in this interpreter, and as a local run otherwise.
if "google.colab" in sys.modules:
    CURRENT_ENVIRONMENT = "GOOGLE_COLAB"
else:
    CURRENT_ENVIRONMENT = "LOCAL"

In [2]:
import sys

if CURRENT_ENVIRONMENT == "LOCAL":
    # Add the parent directory (one level up) to the module search path.
    # The membership check keeps the cell idempotent: re-running it does not
    # pile up duplicate ".." entries in sys.path.
    if ".." not in sys.path:
        sys.path.append("..")

In [3]:
import os
import shutil

if CURRENT_ENVIRONMENT == "GOOGLE_COLAB":
    from google.colab import drive

    # Mount Google Drive
    drive.mount('/content/drive', force_remount=True)

    # Remove any previously unpacked copy so stale files cannot linger
    if os.path.exists('/content/utils'):
        shutil.rmtree('/content/utils')

    # Copy the archive from Drive, unpack it into /content/utils, then drop
    # the temporary copy. Pure-Python calls are used instead of `!cp`,
    # `!unzip` and `!rm` so that a missing or corrupt archive raises here,
    # at the failing step, rather than being silently ignored.
    shutil.copy('/content/drive/MyDrive/utils.zip', '/content/utils.zip')
    shutil.unpack_archive('/content/utils.zip', '/content/utils')
    os.remove('/content/utils.zip')

In [4]:
# Load the project constants module that matches the detected runtime.
# NOTE(review): the wildcard imports hide which names the rest of the notebook
# relies on. Names used later without a visible definition (e.g. DATA_DIR,
# MODEL_DIR, ROOT_DATASET_DIR) presumably come from here — confirm, and
# consider importing them explicitly.
if CURRENT_ENVIRONMENT == "LOCAL":
    from utils.constants import *
elif CURRENT_ENVIRONMENT == "GOOGLE_COLAB":
    from utils.constants_colab import *

In [5]:
import os
from utils.zipper import read_file, write_file

# Marker file that stores the requirements text written after the last
# install; it is compared against requirements.txt to decide whether
# dependencies need to be (re)installed.
install_file = os.path.join(DATA_DIR, "install.txt")

# Create the directory if it does not exist
os.makedirs(DATA_DIR, exist_ok=True)

# NOTE(review): assumes read_file tolerates a marker file that does not exist
# yet (first run) — confirm against utils.zipper.
install_content = read_file(install_file)

# Use the same runtime flag as every other cell (CURRENT_ENVIRONMENT).
if CURRENT_ENVIRONMENT == "LOCAL":
    requirements_file = "requirements.txt"
    requirements_content = read_file(requirements_file)
    if install_content != requirements_content:
        # `%pip` installs into the environment of the running kernel, which
        # is not necessarily the `pip` found first on PATH.
        get_ipython().run_line_magic("pip", f"install -r {requirements_file}")
        write_file(install_file, requirements_content)

In [6]:
import tensorflow as tf
import tensorflow_hub as hub

# Report the library versions first, then whether TensorFlow can see a GPU.
print("TF version:", tf.__version__)
print("Hub version:", hub.__version__)

gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

TF version: 2.10.1
Hub version: 0.16.1
GPU is available


In [7]:
# Use the same runtime flag as the other cells (CURRENT_ENVIRONMENT).
if CURRENT_ENVIRONMENT == "LOCAL":
    from utils.cache import reload_custom_libraries

    # Refresh library cache
    reload_custom_libraries()

In [8]:
from utils.zipper import unzip_file

# Extract the zipped preprocessed datasets only when the archive is present
# and the dataset directory has not been created yet.
if not os.path.exists(MODEL_TRAINING_DATA_ZIP):
    print("Cached preprocessed datasets not found. Skipping extraction...")
else:
    print("Cached preprocessed datasets found.")
    if os.path.exists(ROOT_DATASET_DIR):
        print("Directory with preprocessed datasets found. Skipping extraction...")
    else:
        print("Extracting preprocessed datasets...")
        unzip_file(MODEL_TRAINING_DATA_ZIP, MODEL_TRAINING_DATA_EXTRACT_DIR)
        print("Preprocessed datasets extracted.")

Cached preprocessed datasets found.
Directory with preprocessed datasets found. Skipping extraction...


In [9]:
from utils.converter import convert_to_df
import pandas as pd

# Load the preprocessed training metadata (one row per image).
processed_train_df = pd.read_csv(PROCESSED_TRAIN_DATA_CSV)

# Fail early, with a clear message, if the CSV lacks the columns the later
# cells depend on: "Category" for the stratified split, and "Full_Path" /
# "Style" for the image generators.
_required_columns = {"Category", "Style", "Full_Path"}
_missing_columns = _required_columns - set(processed_train_df.columns)
assert not _missing_columns, (
    f"{PROCESSED_TRAIN_DATA_CSV} is missing required columns: "
    f"{sorted(_missing_columns)}"
)

In [10]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# The full processed dataset is used. To experiment on a 20% sample that is
# stratified on "Category", swap in the following instead:
# sample_df = processed_train_df.groupby("Category", group_keys=False).apply(
#     lambda x: x.sample(frac=0.2, random_state=42)
# )
sample_df = processed_train_df

# Two-step split into 70% training / 15% validation / 15% testing.
# Both steps are stratified on "Category" and use the same fixed seed.

# Step 1: keep 70% for training and hold out the remaining 30%.
train_df, temp_df = train_test_split(
    sample_df,
    test_size=0.3,
    stratify=sample_df["Category"],
    random_state=42,
)

# Step 2: cut the held-out 30% in half -> validation and testing.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["Category"],
    random_state=42,
)

# Shared image generator: only rescales pixel values by 1/255.
datagen = ImageDataGenerator(rescale=1.0 / 255)

### Task 3: Style Classification

#### Model 1: Convolutional Neural Network (CNN) Model

In [11]:
def make_style_generator(dataframe, shuffle):
    """Create an image iterator for Style classification from a dataframe.

    Args:
        dataframe: Frame with a "Full_Path" column (image file paths) and a
            "Style" column (class labels).
        shuffle: Whether the iterator shuffles sample order between epochs.

    Returns:
        The iterator produced by ``datagen.flow_from_dataframe``, yielding
        batches of 32 images resized to 350x350 with one-hot encoded labels.
    """
    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        x_col="Full_Path",
        y_col="Style",
        target_size=(350, 350),
        batch_size=32,
        class_mode="categorical",
        shuffle=shuffle,
    )


# Load images from dataframes for Style classification.
# Training data is shuffled. Validation and test data keep a fixed order so
# that per-sample outputs line up with the generator's `.classes` and
# `.filenames` (aggregate loss/accuracy do not depend on the order).
train_generator_style = make_style_generator(train_df, shuffle=True)
val_generator_style = make_style_generator(val_df, shuffle=False)
test_generator_style = make_style_generator(test_df, shuffle=False)

Found 326083 validated image filenames belonging to 17 classes.
Found 69875 validated image filenames belonging to 17 classes.
Found 69876 validated image filenames belonging to 17 classes.


In [13]:
MODEL_NAME = "task_3_CNN_model"
FORCE_TRAIN = True

# Train the Style CNN only when FORCE_TRAIN is enabled. Training resumes from
# the saved checkpoint with the highest epoch number when one exists;
# otherwise a fresh model is built.
if FORCE_TRAIN:
    import os
    import csv
    import tensorflow as tf
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    from tensorflow.keras import layers, Sequential, models
    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
    from tensorflow.keras.models import load_model
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np
    import json
    import re

    def _build_style_cnn(num_classes, input_shape=(350, 350, 3)):
        """Build and compile a fresh CNN classifier.

        The network has three convolutional stages (16, 32 and 64 filters,
        with 2, 2 and 3 Conv2D layers respectively). Every Conv2D is followed
        by BatchNormalization and every stage ends with 2x2 max pooling and
        25% dropout. Two dense layers (128 and 64 units, each with
        BatchNormalization and 50% dropout) feed a softmax output. Only the
        first convolution carries an L2 kernel regularizer.

        Args:
            num_classes: Number of units in the softmax output layer.
            input_shape: Shape of one input image as (height, width, channels).

        Returns:
            A Sequential model compiled with the Adam optimizer,
            categorical cross-entropy loss and the accuracy metric.
        """
        cnn = Sequential(
            [
                # Stage 1: 16 filters
                layers.Conv2D(
                    16,
                    (3, 3),
                    activation="relu",
                    padding="same",
                    input_shape=input_shape,
                    kernel_regularizer=tf.keras.regularizers.l2(0.001),
                ),
                layers.BatchNormalization(),
                layers.Conv2D(16, (3, 3), activation="relu", padding="same"),
                layers.BatchNormalization(),
                layers.MaxPooling2D((2, 2)),
                layers.Dropout(0.25),
                # Stage 2: 32 filters
                layers.Conv2D(32, (3, 3), activation="relu", padding="same"),
                layers.BatchNormalization(),
                layers.Conv2D(32, (3, 3), activation="relu", padding="same"),
                layers.BatchNormalization(),
                layers.MaxPooling2D((2, 2)),
                layers.Dropout(0.25),
                # Stage 3: 64 filters (three convolutions)
                layers.Conv2D(64, (3, 3), activation="relu", padding="same"),
                layers.BatchNormalization(),
                layers.Conv2D(64, (3, 3), activation="relu", padding="same"),
                layers.BatchNormalization(),
                layers.Conv2D(64, (3, 3), activation="relu", padding="same"),
                layers.BatchNormalization(),
                layers.MaxPooling2D((2, 2)),
                layers.Dropout(0.25),
                # Classifier head
                layers.Flatten(),
                layers.Dense(128, activation="relu"),
                layers.BatchNormalization(),
                layers.Dropout(0.5),
                layers.Dense(64, activation="relu"),
                layers.BatchNormalization(),
                layers.Dropout(0.5),
                layers.Dense(num_classes, activation="softmax"),
            ]
        )
        cnn.compile(
            optimizer="adam",
            loss="categorical_crossentropy",
            metrics=["accuracy"],
        )
        return cnn

    # Generators created earlier for Style classification
    train_generator = train_generator_style
    val_generator = val_generator_style
    test_generator = test_generator_style

    # One directory is used for listing, loading, checkpointing and logging,
    # so a checkpoint found on Colab is loaded from the same place it was
    # listed. (The conditional keeps GOOGLE_DRIVE_ROOT_DIR unevaluated on
    # local runs.)
    model_dir = (
        f"{GOOGLE_DRIVE_ROOT_DIR}/Models"
        if CURRENT_ENVIRONMENT == "GOOGLE_COLAB"
        else MODEL_DIR
    )

    # Create the model directory in both environments before it is listed
    os.makedirs(model_dir, exist_ok=True)

    # Base checkpoint path (without the epoch number) and training log path
    base_model_file_path = f"{model_dir}/{MODEL_NAME}_at_epoch_"
    csv_logger_path = f"{model_dir}/{MODEL_NAME}_training_log.csv"

    # When resuming training, list all files in the model directory
    model_files = os.listdir(model_dir)

    # A single anchored pattern both selects checkpoint files and extracts
    # their epoch, so every selected file is guaranteed to yield an epoch.
    checkpoint_pattern = re.compile(
        rf"^{re.escape(MODEL_NAME)}_at_epoch_(\d+)\.h5$"
    )
    checkpoint_epochs = {}
    for file_name in model_files:
        match = checkpoint_pattern.match(file_name)
        if match:
            checkpoint_epochs[file_name] = int(match.group(1))

    # Checkpoint with the highest epoch number, or None when there is none
    latest_model_file = max(
        checkpoint_epochs, key=checkpoint_epochs.get, default=None
    )

    if latest_model_file is not None:
        # Resume: load the checkpoint and continue after its epoch
        model = load_model(f"{model_dir}/{latest_model_file}")
        start_epoch = checkpoint_epochs[latest_model_file]
    else:
        # No checkpoint: build a new model and start from the first epoch
        model = _build_style_cnn(len(train_generator.class_indices))
        start_epoch = 0

    # Callbacks
    early_stopping = EarlyStopping(
        monitor="val_loss", patience=7, restore_best_weights=True
    )
    csv_logger = CSVLogger(csv_logger_path, append=True)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.2, patience=3, min_lr=0.00001
    )

    # Save the full model after every epoch; the epoch number is embedded in
    # the file name so the resume logic above can find the newest one.
    checkpoint = ModelCheckpoint(
        filepath=f"{base_model_file_path}{{epoch:02d}}.h5",
        monitor="val_loss",
        verbose=1,
        save_best_only=False,
        save_weights_only=False,
        mode="auto",
        save_freq="epoch",
    )

    # Class weights for the imbalanced dataset. Each weight is paired with
    # its own class id rather than with its position in the array.
    class_ids = np.unique(train_generator.classes)
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=class_ids,
        y=train_generator.classes,
    )
    class_weight_by_id = {
        int(class_id): float(weight)
        for class_id, weight in zip(class_ids, class_weights)
    }

    # Train the model
    history = model.fit(
        train_generator,
        epochs=100,
        validation_data=val_generator,
        callbacks=[checkpoint, early_stopping, csv_logger, reduce_lr],
        class_weight=class_weight_by_id,
        initial_epoch=start_epoch,
    )

    # Evaluate the model on the test set
    test_loss, test_acc = model.evaluate(test_generator)
    print("Test accuracy:", test_acc)

Epoch 1/100
  217/10191 [..............................] - ETA: 31:42 - loss: 3.5267 - accuracy: 0.0693

KeyboardInterrupt: 