In [1]:
import tensorflow as tf
import os
import pandas as pd

def load_features_from_csv(csv_path):
    """
    Loads features from a CSV file into a dictionary.

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        dict: Dictionary mapping image filenames to their features.
    """
    df = pd.read_csv(csv_path)
    return {os.path.splitext(row['filename'])[0]: row['label'] for _, row in df.iterrows()}

def load_image_dataset_with_names(data_dir, img_height=224, img_width=224):
    """
    Loads an image dataset with filenames.

    Args:
        data_dir (str): Path to the data directory.
        img_height (int): Height of the images.
        img_width (int): Width of the images.

    Returns:
        tf.data.Dataset: Dataset containing (image, filename).
    """
    files = tf.data.Dataset.list_files(os.path.join(data_dir, '*'), shuffle=False)
    
    def process_path(file_path):
        img = tf.io.read_file(file_path)
        img = tf.image.decode_image(img, channels=3)
        img.set_shape([None, None, 3])
        img = tf.image.resize(img, [img_height, img_width]) / 255.0
        filename = tf.strings.split(file_path, os.sep)[-1]
        filename = tf.strings.regex_replace(filename, b'\..*', b'')

        return img, filename

    return files.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)

def match_images_with_features(dataset, features_dict):
    """
    Matches images with their corresponding features (activity labels).

    Args:
        dataset (tf.data.Dataset): Dataset containing tuples (image, filename).
        features_dict (dict): Dictionary mapping image filenames to their features.

    Returns:
        tf.data.Dataset: Dataset containing tuples (image, feature).
    """
    def add_features(image, filename):
        """
        Adds features (activity labels) to the dataset.

        Args:
            image (tf.Tensor): Image tensor.
            filename (tf.Tensor): Filename tensor.

        Returns:
            tuple: (image, feature)
        """
        # Decode filename tensor to a string and find feature
        filename_str = filename.numpy().decode('utf-8')
        feature = features_dict.get(filename_str, "unknown")  # Default to "unknown" if not found
        return image, feature

    def tf_add_features(image, filename):
        """
        Wraps `add_features` to make it compatible with TensorFlow's `map` function.
        """
        return tf.py_function(
            func=add_features,
            inp=[image, filename],
            Tout=(tf.float32, tf.string)  # Specify output types: image is float32, feature is string
        )

    # Apply the function to the dataset
    return dataset.map(tf_add_features, num_parallel_calls=tf.data.AUTOTUNE)


# Paths
train_dir = '/Users/rafalszulinski/Desktop/developing/ml-zoomcamp-2024/capstone/HAR/train'
test_dir = '/Users/rafalszulinski/Desktop/developing/ml-zoomcamp-2024/capstone/HAR/test'
csv_path = '/Users/rafalszulinski/Desktop/developing/ml-zoomcamp-2024/capstone/HAR/Training_set.csv'

# Load data
features_dict = load_features_from_csv(csv_path)
train_dataset = load_image_dataset_with_names(train_dir)
test_dataset = load_image_dataset_with_names(test_dir)
annotated_train_dataset = []
for image, filename in train_dataset.take(5):
    annotated_train_dataset.append((image, features_dict[filename.numpy().decode('utf-8')]))

annotated_train_dataset

2024-12-11 09:54:30.633032: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[(<tf.Tensor: shape=(224, 224, 3), dtype=float32, numpy=
  array([[[8.94537807e-01, 6.28151298e-01, 3.33893567e-01],
          [9.02941167e-01, 6.46218538e-01, 3.46498579e-01],
          [8.70728314e-01, 6.32913172e-01, 3.21988821e-01],
          ...,
          [1.35434195e-01, 1.40057737e-03, 3.22127994e-03],
          [1.29831955e-01, 0.00000000e+00, 1.44257192e-02],
          [1.21848792e-01, 0.00000000e+00, 1.94677617e-02]],
  
         [[8.74209702e-01, 6.21348560e-01, 3.27090830e-01],
          [8.79731894e-01, 6.38215303e-01, 3.39695871e-01],
          [8.62324953e-01, 6.35714233e-01, 3.35594237e-01],
          ...,
          [1.32793143e-01, 1.00041227e-03, 1.38054858e-03],
          [1.25590265e-01, 0.00000000e+00, 1.01840319e-02],
          [1.19527847e-01, 0.00000000e+00, 1.28251184e-02]],
  
         [[8.63325357e-01, 6.35094047e-01, 3.38755488e-01],
          [8.63685429e-01, 6.46138430e-01, 3.51240516e-01],
          [8.60584259e-01, 6.56942725e-01, 3.72048825e-01],
     

In [2]:
list(train_dataset.as_numpy_iterator())

2024-12-11 09:54:47.957661: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[(array([[[8.94537807e-01, 6.28151298e-01, 3.33893567e-01],
          [9.02941167e-01, 6.46218538e-01, 3.46498579e-01],
          [8.70728314e-01, 6.32913172e-01, 3.21988821e-01],
          ...,
          [1.35434195e-01, 1.40057737e-03, 3.22127994e-03],
          [1.29831955e-01, 0.00000000e+00, 1.44257192e-02],
          [1.21848792e-01, 0.00000000e+00, 1.94677617e-02]],
  
         [[8.74209702e-01, 6.21348560e-01, 3.27090830e-01],
          [8.79731894e-01, 6.38215303e-01, 3.39695871e-01],
          [8.62324953e-01, 6.35714233e-01, 3.35594237e-01],
          ...,
          [1.32793143e-01, 1.00041227e-03, 1.38054858e-03],
          [1.25590265e-01, 0.00000000e+00, 1.01840319e-02],
          [1.19527847e-01, 0.00000000e+00, 1.28251184e-02]],
  
         [[8.63325357e-01, 6.35094047e-01, 3.38755488e-01],
          [8.63685429e-01, 6.46138430e-01, 3.51240516e-01],
          [8.60584259e-01, 6.56942725e-01, 3.72048825e-01],
          ...,
          [1.27451003e-01, 9.00371233e-04, 0.00

In [107]:
import matplotlib.pyplot as plt

In [171]:
# first image
image, label = annotated_train_dataset[0][0]
plt.imshow(image)
plt.title(label)
plt.axis('off')
plt.show()


ValueError: too many values to unpack (expected 2)