## LAB3 B: Fusion 2D CNN

### Proceedure:

- Make `DataLoader`.
- Build, train, evaluate, and compare early fusion vs late fusion.
- Try with pretrained backbones CNNs, compare.

In [1]:
import os
conda_bin = r"E:\Miniconda\envs\acv_tf\Library\bin"
os.environ['PATH'] = conda_bin + os.pathsep + os.environ['PATH']

import numpy as np
import tensorflow as tf
import warnings 
warnings.filterwarnings('ignore')

print("TensorFlow version:", tf.__version__)
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

TensorFlow version: 2.10.1
Num GPUs Available: 1


In [2]:
class VideoDataset:
    """Video Dataset loader for Fusion experiments (Early & Late)."""

    def __init__(self, root_dir, split="train", num_frames=16):
        self.split = split
        self.num_frames = num_frames

        # Load .npz
        npz_path = os.path.join(root_dir, f"{split}.npz")
        data = np.load(npz_path, allow_pickle=True)
        self.videos = list(data["X"])
        self.labels = np.array(data["y"])
        self.ids = list(data["ids"])

        # Class maps
        all_classes = sorted({vid.split("/")[0] for vid in self.ids})
        self.class_to_idx = {cls: i for i, cls in enumerate(all_classes)}
        self.idx_to_class = {i: cls for cls, i in self.class_to_idx.items()}

        print(f"✅ Loaded {split}.npz: {len(self.videos)} samples, {len(all_classes)} classes")

    def _uniform_sample(self, frames, num_samples):
        n = len(frames)
        if n >= num_samples:
            idxs = np.linspace(0, n - 1, num_samples).astype(int)
            return frames[idxs]
        else:
            pad_len = num_samples - n
            pad = np.zeros((pad_len, *frames.shape[1:]), dtype=frames.dtype)
            return np.concatenate([pad, frames], axis=0)

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx, fusion="late"):
        frames = self.videos[idx].astype(np.float32) / 255.0
        sampled = self._uniform_sample(frames, self.num_frames)
        label = int(self.labels[idx])
        class_name = self.idx_to_class[label]

        if fusion == "early":
            # Concatenate frames along channel axis: (T, H, W, 3) -> (H, W, 3*T)
            H, W, C = sampled.shape[1:]
            sampled = sampled.reshape(H, W, C * self.num_frames)

        return sampled, label, class_name


def make_tf_dataset(dataset: VideoDataset, batch_size=4, shuffle=True, fusion="late"):
    """
    Returns a tf.data.Dataset that yields (X, y) only for Keras.
    """
    def gen():
        for i in range(len(dataset)):
            # ✅ Explicitly call __getitem__ with fusion
            x, y, _ = dataset.__getitem__(i, fusion=fusion)
            yield x, y

    if fusion == "early":
        output_shapes = ((112, 112, 3 * dataset.num_frames), ())
    else:
        output_shapes = ((dataset.num_frames, 112, 112, 3), ())

    output_types = (tf.float32, tf.int32)

    ds = tf.data.Dataset.from_generator(gen, output_types=output_types, output_shapes=output_shapes)
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataset))
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [3]:
root_dir = "data/UCF50_npz"

train_dataset = VideoDataset(root_dir, split="train", num_frames=16)
test_dataset  = VideoDataset(root_dir, split="test", num_frames=16)

batch_size = 8

train_ds_early = make_tf_dataset(train_dataset, batch_size=batch_size, shuffle=True, fusion="early")
test_ds_early  = make_tf_dataset(test_dataset, batch_size=batch_size, shuffle=False, fusion="early")

train_ds_late = make_tf_dataset(train_dataset, batch_size=batch_size, shuffle=True, fusion="late")
test_ds_late  = make_tf_dataset(test_dataset, batch_size=batch_size, shuffle=False, fusion="late")

✅ Loaded train.npz: 524 samples, 5 classes
✅ Loaded test.npz: 132 samples, 5 classes


In [4]:
save_dir = "models/LAB3_B"
os.makedirs(save_dir, exist_ok=True)

num_frames = 16
input_height = 112
input_width = 112
batch_size = 16
num_classes = len(train_dataset.idx_to_class)
epochs = 5

# ---------------------------
# Build 2D CNN Backbone
# ---------------------------
def build_backbone(input_shape, num_classes, first_conv_channels=16):
    model = models.Sequential()
    model.add(layers.Conv2D(first_conv_channels, (3,3), activation='relu', padding='same', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2,2)))
    model.add(layers.Dropout(0.3))

    model.add(layers.Conv2D(32, (3,3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2,2)))
    model.add(layers.Dropout(0.2))

    model.add(layers.Conv2D(64, (3,3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2,2)))
    model.add(layers.Dropout(0.1))

    model.add(layers.Conv2D(128, (3,3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2,2)))

    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    return model

In [5]:
from tensorflow.keras import layers, models, optimizers, losses, metrics

# ---------------------------
# Early Fusion Model
# ---------------------------
early_input_shape = (112, 112, 3*16)
early_model = build_backbone(early_input_shape, num_classes=len(train_dataset.idx_to_class), first_conv_channels=16)

early_model.compile(
    optimizer=optimizers.Adam(),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=[metrics.SparseCategoricalAccuracy()]
)

early_model.summary()

# ---------------------------
# Late Fusion Model
# ---------------------------
num_classes = len(train_dataset.idx_to_class)
late_input_shape = (16, 112, 112, 3)
late_input = layers.Input(shape=late_input_shape)

# Backbone without final Flatten+Dense
backbone_for_late = build_backbone((112, 112, 3), num_classes)

# Grab the last conv layer before Flatten/Dense
feature_extractor = models.Model(
    backbone_for_late.input,
    backbone_for_late.layers[-4].output  # should be (H, W, C)
)

# Apply TimeDistributed to extract features from each frame
td_features = layers.TimeDistributed(feature_extractor)(late_input)

# Aggregate features across time and spatial dimensions
# td_features shape: (B, T, H, W, C)
# reduce_mean across time + spatial dims -> (B, C)
aggregated = layers.Lambda(lambda x: tf.reduce_mean(x, axis=[1, 2, 3]))(td_features)

# Dense layers for final classification
x = layers.Dense(256, activation='relu')(aggregated)
late_output = layers.Dense(num_classes, activation='softmax')(x)

late_model = models.Model(inputs=late_input, outputs=late_output)
late_model.compile(
    optimizer=optimizers.Adam(),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=[metrics.SparseCategoricalAccuracy()]
)

late_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 112, 112, 16)      6928      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 56, 56, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 56, 56, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 56, 56, 32)        4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 28, 28, 32)       0         
 2D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 28, 28, 32)        0

### Early vs Late Fusion 2D CNNs

In [6]:
# ---------------------------
# Train & Save Early Fusion
# ---------------------------
history_early = early_model.fit(
    train_ds_early,
    validation_data=test_ds_early,
    epochs=10
)
early_model.save(os.path.join(save_dir, "early_fusion_model.h5"))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# ---------------------------
# Train & Save Late Fusion
# ---------------------------
history_late = late_model.fit(
    train_ds_late,
    validation_data=test_ds_late,
    epochs=10
)
late_model.save(os.path.join(save_dir, "late_fusion_model.h5"))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# ---------------------------
# Evaluate & Compare
# ---------------------------
early_eval = early_model.evaluate(test_ds_early)
late_eval  = late_model.evaluate(test_ds_late)

print(f"Early Fusion Test Loss: {early_eval[0]:.4f}, Accuracy: {early_eval[1]:.4f}")
print(f"Late Fusion  Test Loss: {late_eval[0]:.4f}, Accuracy: {late_eval[1]:.4f}")

Early Fusion Test Loss: 0.7499, Accuracy: 0.7348
Late Fusion  Test Loss: 0.5032, Accuracy: 0.7727


### Pretrained Backbone Test

Note: Yeah I'm not switching to PyTorch just for this, probably gonna do PyTorch for the next 2 notebooks.

In [10]:
from tensorflow.keras.applications import ResNet50, EfficientNetB0
import numpy as np

# ---------------------------
# Config
# ---------------------------
NUM_CLASSES = len(train_dataset.idx_to_class)  # adjust as needed
EPOCHS = 10
BATCH_SIZE = 8

backbones = {
    "resnet": ResNet50,
    "efficientnet": EfficientNetB0
}

fusion_types = ["early", "late"]

results = {}

# ---------------------------
# Helper: build backbone
# ---------------------------
def build_backbone(backbone_fn, input_shape=(112,112,3), num_classes=NUM_CLASSES):
    base = backbone_fn(
        include_top=False, 
        input_shape=input_shape,
        weights=None,  # training from scratch
        pooling='avg'
    )
    x = layers.Dense(num_classes, activation='softmax')(base.output)
    model = models.Model(inputs=base.input, outputs=x)
    return model

# ---------------------------
# Training & Evaluation Loop
# ---------------------------
for bb_name, bb_fn in backbones.items():
    for fusion in fusion_types:
        print(f"Running {fusion} fusion with {bb_name} backbone...")

        if fusion == "early":
            # Early fusion: stack frames along channels
            early_input_shape = (112, 112, 3*16)
            early_model = build_backbone(bb_fn, input_shape=early_input_shape, num_classes=NUM_CLASSES)

            early_model.compile(
                optimizer=optimizers.Adam(),
                loss=losses.SparseCategoricalCrossentropy(),
                metrics=[metrics.SparseCategoricalAccuracy()]
            )

            # Train
            history = early_model.fit(
                train_ds_early,   # should yield (B,112,112,3*16), y
                validation_data=test_ds_early,
                epochs=EPOCHS
            )

            # Evaluate
            eval_res = early_model.evaluate(test_ds_early)
            results[f"{fusion}_{bb_name}"] = eval_res

        elif fusion == "late":
            # Late fusion: per-frame features
            late_input = layers.Input(shape=(16,112,112,3))
            backbone_model = build_backbone(bb_fn, input_shape=(112,112,3), num_classes=NUM_CLASSES)

            # Remove final Dense to get features
            feature_extractor = models.Model(
                backbone_model.input,
                backbone_model.layers[-2].output  # shape (C,)
            )

            td_features = layers.TimeDistributed(feature_extractor)(late_input)  # (B,T,C)
            aggregated = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(td_features)  # average over time

            x = layers.Dense(256, activation='relu')(aggregated)
            late_output = layers.Dense(NUM_CLASSES, activation='softmax')(x)

            late_model = models.Model(inputs=late_input, outputs=late_output)

            late_model.compile(
                optimizer=optimizers.Adam(),
                loss=losses.SparseCategoricalCrossentropy(),
                metrics=[metrics.SparseCategoricalAccuracy()]
            )

            # Train
            history = late_model.fit(
                train_ds_late,   # should yield (B,16,112,112,3), y
                validation_data=test_ds_late,
                epochs=EPOCHS
            )

            # Evaluate
            eval_res = late_model.evaluate(test_ds_late)
            results[f"{fusion}_{bb_name}"] = eval_res

# ---------------------------
# Comparison
# ---------------------------
print("\n--- Evaluation Comparison ---")
for k,v in results.items():
    print(f"{k}: Loss={v[0]:.4f}, Accuracy={v[1]:.4f}")

Running early fusion with resnet backbone...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Running late fusion with resnet backbone...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Running early fusion with efficientnet backbone...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Running late fusion with efficientnet backbone...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

--- Evaluation Comparison ---
early_resnet: Loss=0.9551, Accuracy=0.7045
late_resnet: Loss=1.5061, Accuracy=0.6061
early_efficientnet: Loss=1.1045, Accuracy=0.7273
late_efficientnet: Loss=0.7243, Accuracy=0.8333
