# Data pipeline

This data pipeline has the objective to download and transform the original dataset to obtain our final dataset. Moreover, we will use DVC to keep data versioning control in every step.

### 0. Necessary imports and functions

In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
from pathlib import Path
import os
from dotenv import load_dotenv
import yaml
import subprocess
import random
import shutil
from PIL import Image
import cv2

In [2]:
def copy_files(sample_list, src_images_dir, src_masks_dir, dest_images_dir, dest_masks_dir):
    for sample in sample_list:
        # Copy image file
        src_image_path = os.path.join(src_images_dir, sample)
        dest_image_path = os.path.join(dest_images_dir, sample)
        shutil.copyfile(src_image_path, dest_image_path)

        # Copy mask file (assuming the mask file has the same name as the image file)
        sample_mask = sample.replace('jpg', 'png')

        src_mask_path = os.path.join(src_masks_dir, sample_mask)
        dest_mask_path = os.path.join(dest_masks_dir, sample_mask)
        shutil.copyfile(src_mask_path, dest_mask_path)

In [3]:
def from_raw_masks_to_image_masks(input_dirs: list[str], output_dirs: list[str]) -> None:
    for input_dir, output_dir in zip(input_dirs, output_dirs):
        # Process each directories masks
        palette: list[int] = [
                0, 0, 0, # For background -> Black
                255, 0, 0, # For persons -> Red
            ]

        for j in os.listdir(input_dir):
            if j == '.DS_Store':
                continue
            
            image_path = input_dir / j
            mask = Image.open(image_path).convert('P')
            
            # Ensure that all non-zero values are set to 1
            mask_data = mask.load()
            width, height = mask.size
            for y in range(height):
                for x in range(width):
                    if mask_data[x, y] > 0:
                        mask_data[x, y] = 1
            
            mask.putpalette(palette)
            save_path = output_dir / j
            mask.save(save_path, 'PNG')

In [4]:
def from_image_masks_to_labels(input_dirs: list[str], output_dirs: list[str]) -> None:
    for input_dir, output_dir in zip(input_dirs, output_dirs):
        for j in os.listdir(input_dir):
            if j == '.DS_Store':
                continue

            image_path = os.path.join(input_dir, j)
            # load the binary mask and get its contours
            mask = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            _, mask = cv2.threshold(mask, 1, 255, cv2.THRESH_BINARY)

            H, W = mask.shape
            contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # convert the contours to polygons
            polygons = []
            for cnt in contours:
                if cv2.contourArea(cnt) > 200:
                    polygon = []
                    for point in cnt:
                        x, y = point[0]
                        polygon.append(x / W)
                        polygon.append(y / H)
                    polygons.append(polygon)

            # print the polygons
            with open('{}.txt'.format(os.path.join(output_dir, j)[:-4]), 'w') as f:
                for polygon in polygons:
                    for p_, p in enumerate(polygon):
                        if p_ == len(polygon) - 1:
                            f.write('{}\n'.format(p))
                        elif p_ == 0:
                            f.write('0 {} '.format(p))
                        else:
                            f.write('{} '.format(p))

                f.close()

### 1. Varibale definition

All variables are going to be defined based in a configuration file that will ease the process of variable modification.

In [5]:
# Read the YAML configuration file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Declare some configuration variables
DATASET_LINK = config['dataPipelines']['dataDownloading']['datasetLink']
DATA_DIR = Path(config['dataPipelines']['dataDownloading']['dataDirectory'])
TRAIN_SIZE = config['dataPipelines']['splitData']['trainSize']
VAL_SIZE = config['dataPipelines']['splitData']['valSize']
TEST_SIZE = config['dataPipelines']['splitData']['testSize']
SPLIT_DATA_DIR = Path(config['dataPipelines']['splitData']['dataDirectory'])
TRANSFORM_DATA_DIR = Path(config['dataPipelines']['transformMasks']['dataDirectory'])
LABELS_DATA_DIR = Path(config['dataPipelines']['createLabels']['dataDirectory'])

# Create data directory if it does not exist
DATA_DIR.mkdir(parents = True, exist_ok = True)

# Load environment variables from a .env file and set up Kaggle credentials from environment variables
load_dotenv()
os.environ['KAGGLE_USERNAME'] = os.getenv('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = os.getenv('KAGGLE_KEY')

### 2. Working with the original data

The first thing we need to do is to download the original data. To do this we will use `KaggleAPI`.

In [6]:
# Initialize the Kaggle API
api = KaggleApi()
api.authenticate()

# Download the dataset
api.dataset_download_files(DATASET_LINK, path = DATA_DIR, unzip = True)

Dataset URL: https://www.kaggle.com/datasets/mariarisques/dataset-person-yolos


### 3. Split data

In this section we are going to split the data into train, validation and test splits.

In [7]:
# Define directories
images_dir = DATA_DIR / 'dataset_person-yolos/data/images'
masks_dir = DATA_DIR / 'dataset_person-yolos/data/masks'
images_dir_train = SPLIT_DATA_DIR / 'images/train'
masks_dir_train = SPLIT_DATA_DIR / 'masks/train'
images_dir_val = SPLIT_DATA_DIR / 'images/val'
masks_dir_val = SPLIT_DATA_DIR / 'masks/val'
images_dir_test = SPLIT_DATA_DIR / 'images/test'
masks_dir_test = SPLIT_DATA_DIR / 'masks/test'

# Get the list of samples and shuffle them
samples = os.listdir(images_dir)
random.shuffle(samples)

# Calculate split indices
num_samples = len(samples)
train_end = int(TRAIN_SIZE * num_samples)
val_end = train_end + int(VAL_SIZE * num_samples)

# Split samples
train_samples = samples[:train_end]
val_samples = samples[train_end:val_end]
test_samples = samples[val_end:]

# Create necessary directories
SPLIT_DATA_DIR.mkdir(parents = True, exist_ok = True)
images_dir_train.mkdir(parents = True, exist_ok = True)
masks_dir_train.mkdir(parents = True, exist_ok = True)
images_dir_val.mkdir(parents = True, exist_ok = True)
masks_dir_val.mkdir(parents = True, exist_ok = True)
images_dir_test.mkdir(parents = True, exist_ok = True)
masks_dir_test.mkdir(parents = True, exist_ok = True)

# Copy files to respective directories
copy_files(train_samples, images_dir, masks_dir, images_dir_train, masks_dir_train)
copy_files(val_samples, images_dir, masks_dir, images_dir_val, masks_dir_val)
copy_files(test_samples, images_dir, masks_dir, images_dir_test, masks_dir_test)

### 4. Transformations

In this step we need to go from the original masks to ones that can be later transformed to labels that yolo is able to understand.

In [8]:
input_dir_train = SPLIT_DATA_DIR / 'masks/train'
output_dir_train = TRANSFORM_DATA_DIR / 'masks/train'
input_dir_val = SPLIT_DATA_DIR / 'masks/val'
output_dir_val = TRANSFORM_DATA_DIR / 'masks/val'
input_dir_test = SPLIT_DATA_DIR / 'masks/test'
output_dir_test = TRANSFORM_DATA_DIR / 'masks/test'

output_dir_train.mkdir(parents = True, exist_ok = True)
output_dir_val.mkdir(parents = True, exist_ok = True)
output_dir_test.mkdir(parents = True, exist_ok = True)

from_raw_masks_to_image_masks(
    input_dirs = [input_dir_train, input_dir_val, input_dir_test],
    output_dirs = [output_dir_train, output_dir_val, output_dir_test]
)

images_dir_trans_train = TRANSFORM_DATA_DIR / 'images/train'
images_dir_trans_val = TRANSFORM_DATA_DIR / 'images/val'
images_dir_trans_test = TRANSFORM_DATA_DIR / 'images/test'

shutil.copytree(images_dir_train, images_dir_trans_train, dirs_exist_ok = True)
shutil.copytree(images_dir_val, images_dir_trans_val, dirs_exist_ok = True)
shutil.copytree(images_dir_test, images_dir_trans_test, dirs_exist_ok = True)

PosixPath('/Users/nachogris/Desktop/UNI/GCED/QUART/TAED2/LAB/TAED2_YOLOs/data/interim/transformed/images/test')

### 5. Create labels

In this last step we convert the previous transformed masks to some labels, 

In [9]:
input_dir_train = TRANSFORM_DATA_DIR / 'masks/train'
output_dir_train = LABELS_DATA_DIR / 'labels/train'
input_dir_val = TRANSFORM_DATA_DIR / 'masks/val'
output_dir_val = LABELS_DATA_DIR / 'labels/val'
input_dir_test = TRANSFORM_DATA_DIR / 'masks/test'
output_dir_test = LABELS_DATA_DIR / 'labels/test'

output_dir_train.mkdir(parents = True, exist_ok = True)
output_dir_val.mkdir(parents = True, exist_ok = True)
output_dir_test.mkdir(parents = True, exist_ok = True)

from_image_masks_to_labels(
    input_dirs = [input_dir_train, input_dir_val, input_dir_test],
    output_dirs = [output_dir_train, output_dir_val, output_dir_test]
)

images_dir_labels_train = LABELS_DATA_DIR / 'images/train'
images_dir_labels_val = LABELS_DATA_DIR / 'images/val'
images_dir_labels_test = LABELS_DATA_DIR / 'images/test'

shutil.copytree(images_dir_train, images_dir_labels_train, dirs_exist_ok = True)
shutil.copytree(images_dir_val, images_dir_labels_train, dirs_exist_ok = True)
shutil.copytree(images_dir_test, images_dir_labels_train, dirs_exist_ok = True)

PosixPath('/Users/nachogris/Desktop/UNI/GCED/QUART/TAED2/LAB/TAED2_YOLOs/data/processed/images/train')