# **Apply mask to dataset**


## **Upload `kaggle.json` (Kaggle API credentials)**

In [None]:
!pip install -q kaggle
from google.colab import files

# Download API Credentials
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

Saving kaggle.json to kaggle.json
ref                                                     title                                       size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------  -----------------------------------------  -----  -------------------  -------------  ---------  ---------------  
thedrcat/daigt-v2-train-dataset                         DAIGT V2 Train Dataset                      29MB  2023-11-16 01:38:36           3077        252  1.0              
muhammadbinimran/housing-price-prediction-data          Housing Price Prediction Data              763KB  2023-11-21 17:56:32          12966        224  1.0              
henryshan/2023-data-scientists-salary                   2023 Data Scientists Salary                 25KB  2023-12-14 10:14:34           2422         49  1.0              
thedrcat/daigt-external-train-dataset                   DAIGT External Train Dataset               435MB  2023-

## **Download data from Kaggle**

In [None]:
!pip install opendatasets
import opendatasets as od
import os

od.download(
    "https://www.kaggle.com/datasets/kmader/rsna-bone-age")
trainpath = 'rsna-bone-age/boneage-training-dataset/boneage-training-dataset'
testpath = 'rsna-bone-age/boneage-test-dataset/boneage-test-dataset'

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Downloading rsna-bone-age.zip to ./rsna-bone-age


100%|██████████| 9.29G/9.29G [01:42<00:00, 97.4MB/s]





## **Connect to TPU**

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import time
import os

# The `%tensorflow_version 2.x` magic that used to open this cell was dropped:
# Colab reports that it has no effect (only TensorFlow 2.x is available).

# The TPU endpoint is read from the COLAB_TPU_ADDR environment variable.
# Raise an actionable error instead of a bare KeyError when it is missing.
tpu_address = os.environ.get('COLAB_TPU_ADDR')
if not tpu_address:
    raise RuntimeError(
        "COLAB_TPU_ADDR is not set; switch the Colab runtime type to TPU "
        "and re-run this cell.")

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


<tensorflow.python.tpu.topology.Topology at 0x7bb945deb220>

## **Apply the masks to the dataset and save the results to `trainning`**




In [None]:
!unzip unet_mask.zip

In [None]:
import os
import cv2
import numpy as np

# Folder holding the segmentation masks, looked up by the same file name as
# the image they belong to (presumably unpacked from unet_mask.zip -- confirm).
maskspath = '/content/unet'

# Folder where the masked images are written.
# NOTE: the "trainning" spelling is part of the path used by later cells.
outputpath = '/content/trainning'


def apply_mask(original_image, mask):
    """Black out every pixel of ``original_image`` that the mask does not select.

    Args:
        original_image: Image array of shape (H, W) or (H, W, C).
        mask: Single-channel array of shape (H, W) (a trailing axis of length 1
            is accepted). A pixel is kept when its mask value is greater than
            0.5, so for an 8-bit mask any non-zero value selects the pixel.

    Returns:
        A new array with the shape and dtype of ``original_image``: selected
        pixels are copied unchanged, all others are zero. The inputs are not
        modified.

    Raises:
        ValueError: If either input is ``None`` (which is what ``cv2.imread``
            returns for an unreadable file) or if the mask's height/width do
            not match the image's.
    """
    if original_image is None or mask is None:
        raise ValueError(
            "apply_mask() needs both an image and a mask, got None "
            "(cv2.imread returns None when a file cannot be read)")

    original_image = np.asarray(original_image)
    mask = np.asarray(mask)

    # Accept (H, W, 1) masks by dropping the singleton channel axis.
    if mask.ndim == 3 and mask.shape[2] == 1:
        mask = mask[..., 0]

    if mask.ndim != 2 or mask.shape != original_image.shape[:2]:
        raise ValueError(
            f"mask shape {mask.shape} does not match image shape "
            f"{original_image.shape[:2]}")

    # Convert the mask to a boolean "keep this pixel" map.
    keep = mask > 0.5

    # Apply the mask: start from an all-black image and copy the kept pixels.
    result_image = np.zeros_like(original_image)
    result_image[keep] = original_image[keep]

    return result_image

def apply_mask_to_dataset(input_folder, masks_folder, output_folder):
    """Mask every PNG in ``input_folder`` and write the results to ``output_folder``.

    For each ``<name>.png`` in ``input_folder`` the mask with the same file
    name is looked up in ``masks_folder``. Image and mask are read as
    grayscale, combined with ``apply_mask`` and saved as ``<name>_result.png``.
    ``output_folder`` is created if it does not exist.

    Images with no matching mask, files OpenCV cannot read, and results that
    cannot be written are skipped; a summary of what was written and skipped
    is printed at the end instead of dropping those files silently.

    Args:
        input_folder: Directory containing the original PNG images.
        masks_folder: Directory containing the masks, named like the images.
        output_folder: Directory that receives the masked images.
    """
    png_suffix = ".png"
    os.makedirs(output_folder, exist_ok=True)

    written = 0
    skipped = []  # (filename, reason) pairs, reported at the end

    # Sorted so processing order (and the report) is reproducible.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(png_suffix):
            continue

        # Make sure the mask exists before paying for the image read.
        mask_filename = os.path.join(masks_folder, filename)
        if not os.path.exists(mask_filename):
            skipped.append((filename, "no matching mask"))
            continue

        original_image = cv2.imread(os.path.join(input_folder, filename), cv2.IMREAD_GRAYSCALE)
        mask = cv2.imread(mask_filename, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None rather than raising.
        if original_image is None or mask is None:
            skipped.append((filename, "unreadable image or mask"))
            continue

        result_image = apply_mask(original_image, mask)

        # Swap only the trailing extension; str.replace would also rewrite any
        # ".png" that happens to occur earlier in the name.
        output_filename = filename[:-len(png_suffix)] + "_result.png"
        # cv2.imwrite reports failure through its return value.
        if cv2.imwrite(os.path.join(output_folder, output_filename), result_image):
            written += 1
        else:
            skipped.append((filename, "could not write result"))

    print(f"Masked images written: {written}; skipped: {len(skipped)}")
    for filename, reason in skipped:
        print(f"  skipped {filename}: {reason}")

# Mask every training image with its mask and write the results to `outputpath`.
apply_mask_to_dataset(trainpath, maskspath, outputpath)


**Verify the result**

In [None]:
# Entry count of the mask folder first, then of the output folder.
for folder in (maskspath, outputpath):
    print(len(os.listdir(folder)))

14213
12588


## **Save the merged result**

In [None]:
!zip -r /content/merge.zip /content/trainning

In [None]:
from google.colab import files
# Download the zipped masked images from the Colab VM to the local machine.
files.download("/content/merge.zip")