In [None]:
%load_ext watermark
%watermark -v -p numpy,pandas,torch,torchvision,PIL,sklearn,matplotlib,wandb,captum --conda

In [None]:
# Setting the experiment environment

import time
import numpy as np
import pandas as pd
import torch
from utils.set_seed import set_seed
from utils.preprocess_utils import add_augmentation_multiplier, save_augmented_images

# Seed the experiment through the project's set_seed helper for reproducibility
SEED = 0
set_seed(SEED)

# Timestamp that identifies this run (strftime falls back to the current local time)
start_time = time.strftime("%Y-%m-%d_%H-%M-%S")

# Set the device to GPU if available, otherwise fall back to the CPU
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# torch.cuda.get_device_name() only accepts CUDA devices and raises on a
# CPU-only machine, so only query it when a GPU was actually selected.
device_label = torch.cuda.get_device_name(DEVICE) if DEVICE.type == "cuda" else "CPU"
print(f"Device: {device_label}")


In [None]:
# Loading data from csv file
#
# A forward slash is used as the path separator: the previous "Data\ccs_dataset.csv"
# relied on "\c" being an unrecognised escape sequence (a SyntaxWarning on current
# Python versions) and only resolved on Windows. Forward slashes work on every OS.
# NOTE(review): the directory is spelled "Data" here but "data" in SAVE_DIR —
# confirm the intended casing before running on a case-sensitive file system.
df = pd.read_csv("Data/ccs_dataset.csv")
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Splitting data into train (80%), validation (10%) and test (10%) sets with
# scikit-learn, stratified on age_group.
from sklearn.model_selection import train_test_split

# random_state is passed explicitly so the split is identical every time this
# cell is executed. Without it the result depends on the global RNG state, and
# re-running the cell would move rows between train and val/test — which can
# leak evaluation images into the augmented training images written to disk.
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df.age_group, random_state=SEED
)
# The 20% hold-out is halved: first half -> test_df, second half -> val_df
test_df, val_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df.age_group, random_state=SEED
)

# Deleting temp_df to free memory
del temp_df

In [None]:
# Attach an augmentation multiplier to the train set.
# min and max are both 3 here; how the helper picks a value between them
# is defined in utils.preprocess_utils (not shown in this notebook).
train_df = add_augmentation_multiplier(
    train_df,
    min_multiplier=3,
    max_multiplier=3,
)


In [None]:
# Sanity-check the image count after augmentation: per age_group, multiply the
# number of original images by the mean augmentation_multiplier of that group.

calc_df = (
    train_df.groupby("age_group")
    .agg({"age_group": "count", "augmentation_multiplier": "mean"})
    # the count of the grouping column is the number of images before augmentation
    .rename(columns={'age_group': 'images_before_augmentation'})
)
calc_df['images_after_augmentation'] = (
    calc_df['images_before_augmentation'] * calc_df['augmentation_multiplier']
)
print('Total augmented images: ', calc_df['images_after_augmentation'].sum())

# Show the per-group table, smallest augmented group first
calc_df.sort_values('images_after_augmentation').style

In [None]:
# Force a multiplier of 3 for every age_group with more than 400 original images
large_groups = calc_df['images_before_augmentation'] > 400
calc_df.loc[large_groups, 'augmentation_multiplier'] = 3

# Replace the multiplier in train_df with the per-group value held in calc_df.
# The old column is dropped before the merge, so no _x/_y suffixed columns are
# produced; the merged-in value is then cast to integer.
train_df = (
    train_df
    .drop(columns=['augmentation_multiplier'])
    .merge(calc_df['augmentation_multiplier'], on='age_group', how='left')
    .astype({'augmentation_multiplier': int})
)

train_df.head()



In [None]:
# Check new augmentation_multiplier values and total augmented images
#
# images_after_augmentation stored in calc_df was computed before the multiplier
# was overridden, so it is recomputed here from the current multipliers.
# assign() returns a new frame, which keeps this cell free of side effects
# and safe to re-run.
updated_calc_df = calc_df.assign(
    images_after_augmentation=(
        calc_df['images_before_augmentation'] * calc_df['augmentation_multiplier']
    )
)

print('Total augmented images: ', updated_calc_df['images_after_augmentation'].sum())
updated_calc_df.sort_values('images_after_augmentation').style

In [None]:
# Setting the input size and augmentation configuration

# Output directory for the pre-processed images. A forward slash is used as the
# separator: the previous "data\pre_processed" relied on "\p" being an
# unrecognised escape sequence (a SyntaxWarning on current Python versions) and
# only resolved on Windows. Forward slashes work on every OS.
SAVE_DIR = "data/pre_processed"

# Target image size passed to save_augmented_images as input_size
INPUT_SIZE = (299, 299)

# Options passed to save_augmented_images as augment_config. The exact meaning
# and units of each value are defined by that helper (utils.preprocess_utils),
# which is not shown in this notebook.
AUGMENT_CONFIG = {
    "augment_prob": 1,
    "flip_horizontal": True,
    "flip_vertical": False,
    "flip_prob": 0.5,
    "random_brightness": True,
    "brightness_factor": 0.15,
    "random_contrast": True,
    "contrast_factor": 0.15,
    "random_rotation": True,
    "rotation_factor": 3,
    "random_translation": True,
    "translation_factor": (0.05, 0.05),
    "random_zoom": True,
    "zoom_factors": (0.95, 1.05),
    "random_erasing": True,
    "erasing_prob": 0.15,
    "erasing_scale": (0.05, 0.10),
    "erasing_ratio": (0.3, 3.3),
}

In [None]:
# Write the augmented training images to disk using the size, augmentation
# options and output directory configured above.
save_augmented_images(
    train_df,
    input_size=INPUT_SIZE,
    augment_config=AUGMENT_CONFIG,
    save_dir=SAVE_DIR,
)