# Animals 10
Made by Samuel Tvrdoň and Michal Weis

## Import Dataset

In [None]:
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from PIL import Image
import torch
from torch import nn
from torchvision.transforms import v2
from torchvision import datasets
from torch.utils.data import DataLoader
import shutil
import wandb
from wandb.integration.keras import WandbCallback
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
import tensorflow as tf
import time

In [None]:
def make_data_frame(path):
    """Index an image folder laid out as one sub-directory per class.

    Args:
        path: ``pathlib.Path`` of the dataset root. Every immediate
            sub-directory is treated as a class; its name becomes the label.

    Returns:
        ``pandas.DataFrame`` with one row per file and two string columns,
        ``label`` (class folder name) and ``path`` (file path). Rows are
        ordered by class name, then by file path. The columns exist even when
        no files are found, so ``df['label']`` works on an empty dataset.
    """
    rows = []
    # iterdir()/glob() yield entries in arbitrary order; sort for stable output.
    for class_dir in sorted(path.iterdir()):
        if not class_dir.is_dir():
            continue
        for img in sorted(class_dir.glob("*")):
            # Skip nested folders, matching the file filter used for the split.
            if img.is_file():
                rows.append({"label": class_dir.name, "path": str(img)})

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["label", "path"])

In [None]:
# Fetch the Animals-10 archive from Kaggle and have Keras extract it.
dataset_url = "https://www.kaggle.com/api/v1/datasets/download/alessiocorrado99/animals10"
archive = tf.keras.utils.get_file(extract=True, origin=dataset_url)
# The class folders are expected in "raw-img" under the returned path
# with its suffix (if any) removed.
data_dir = Path(archive).with_suffix('').joinpath("raw-img")

## EDA

In [None]:
# Index every image under data_dir into a DataFrame of (label, path) rows.
df = make_data_frame(data_dir)

In [None]:
# Print the columns, dtypes and non-null counts of the image index.
df.info()

In [None]:
# Show summary statistics for the columns of the image index.
df.describe()

Our dataset contains over 26000 pictures of 10 different animals. The class with the most sample images is dog. Next, we will visualise the class distribution of the whole dataset.

In [None]:
# Report the dataset size, then plot how many images each class has.
print("Total images:", len(df))

fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='label', ax=ax)
ax.set_title("Class distribution")
plt.show()

We can see that spider and dog images are the most abundant, while the counts for other animals are mostly the same.

Next we will check whether the dataset contains any corrupted files. (TODO: the check below takes too long to run, so it is currently commented out.)

In [None]:
# from PIL import UnidentifiedImageError

# # Check if all images are valid
# bad_images = []
# for path in df['path']:
#     try:
#         Image.open(path).verify()
#     except UnidentifiedImageError:
#         bad_images.append(path)

# print("Number of bad images:", len(bad_images))

To verify image quality we will inspect random images from each class.

In [None]:
# Show one randomly chosen image per class, with the classes in random order.
classes = df['label'].unique()

# Size the grid from the data instead of assuming exactly 10 classes;
# squeeze=False keeps `axes` two-dimensional even when there is a single class.
fig, axes = plt.subplots(1, len(classes), figsize=(15, 3), squeeze=False)

shuffled_classes = random.sample(list(classes), k=len(classes))
for ax, c in zip(axes[0], shuffled_classes):
    sample_path = random.choice(df[df['label'] == c]['path'].values)
    # Close the file handle as soon as the image has been drawn.
    with Image.open(sample_path) as img:
        ax.imshow(img)
    ax.set_title(c)
    ax.axis('off')

plt.tight_layout()
plt.show()

## Experiment Setup

In [None]:
# Experiment hyperparameters; this dict is also handed to wandb as the run config.
config = dict(
    batch_size=64,
    learning_rate=0.0003,
    epochs=15,
    img_size=224,
)

In [None]:
# TODO: only used by the PyTorch training loop
def get_metrics(total_loss, batch_count, preds, ys):
    """Summarise one epoch as a dictionary of scalar metrics.

    Args:
        total_loss: Loss summed over all batches of the epoch.
        batch_count: Number of batches the loss was summed over.
        preds: Predicted class ids, one per sample.
        ys: True class ids, in the same order as ``preds``.

    Returns:
        Dict with keys ``loss`` (mean loss per batch), ``accuracy``,
        ``f1_score``, ``precision`` and ``recall``. The last three use
        'weighted' averaging because the task is multi-class.
    """
    weighted = {"average": "weighted"}
    return {
        "loss": total_loss / batch_count,
        "accuracy": accuracy_score(ys, preds),
        "f1_score": f1_score(ys, preds, **weighted),
        "precision": precision_score(ys, preds, **weighted),
        "recall": recall_score(ys, preds, **weighted),
    }

## Data split

We need to split the data before starting with the preprocessing and augmentation. We opted for a 70-30 split, creating a new split folder storing both training and validation data.

In [None]:
# Set new folder locations
split_dir = Path('./split')
train_dir = split_dir / 'train'
val_dir = split_dir / 'val'

# Create new folders; exist_ok lets this cell be re-run without FileExistsError
train_dir.mkdir(parents=True, exist_ok=True)
val_dir.mkdir(parents=True, exist_ok=True)

# Set split ratio (fraction of every class that goes to the training set)
ratio = 0.7
random.seed(23)

# Directory listings are returned in arbitrary order, so both the class
# folders and their files are sorted first; otherwise the fixed seed above
# would not reproduce the same split on every run or machine.
for animal_class in sorted(data_dir.iterdir()):
    if not animal_class.is_dir():
        continue

    images = sorted(p for p in animal_class.glob("*") if p.is_file())
    random.shuffle(images)

    # Split each class separately so both sets keep the class proportions
    n_train = int(len(images) * ratio)

    train_images = images[:n_train]
    val_images = images[n_train:]

    # Destination folders for current class
    train_class_dir = train_dir / animal_class.name
    val_class_dir = val_dir / animal_class.name
    train_class_dir.mkdir(parents=True, exist_ok=True)
    val_class_dir.mkdir(parents=True, exist_ok=True)

    # Copy images to destination
    for img in train_images:
        shutil.copy(img, train_class_dir / img.name)

    for img in val_images:
        shutil.copy(img, val_class_dir / img.name)

We will make sure the split worked as intended.

In [None]:
# Index the training split and plot its class counts.
train_df = make_data_frame(train_dir)
print("Total images:", len(train_df))

fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train_df, x='label', ax=ax)
ax.set_title("Training class distribution")
plt.show()

In [None]:
# Index the validation split and plot its class counts.
val_df = make_data_frame(val_dir)
print("Total images:", len(val_df))

fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=val_df, x='label', ax=ax)
ax.set_title("Validation class distribution")
plt.show()


## Image preprocessing and augmentations


In [None]:
# # Training dataset
# train_transform = v2.Compose([
#     v2.Resize((224, 224)),
#     v2.RandomHorizontalFlip(0.5),
#     v2.RandomRotation(15),
#     v2.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
#     v2.ToImage(),
#     v2.ToDtype(torch.float32, scale=True),
#     # TODO:
#     # v2.Normalize(mean=[0.485, 0.456, 0.406],
#     #             std=[0.229, 0.224, 0.225])
# ])

# train_dataset = datasets.ImageFolder(
#     root=train_dir,
#     transform=train_transform
# )

# train_loader = DataLoader(
#     train_dataset,
#     batch_size=config["batch_size"],
#     shuffle=True,
# )

# # Validation dataset
# val_transform = v2.Compose([
#     v2.Resize((224, 224)),
#     v2.ToImage(),
#     v2.ToDtype(torch.float32, scale=True),
#     # v2.Normalize(mean=[0.485, 0.456, 0.406],
#     #             std=[0.229, 0.224, 0.225])
# ])

# val_dataset = datasets.ImageFolder(
#     root=val_dir,
#     transform=val_transform
# )

# val_loader = DataLoader(
#     val_dataset,
#     batch_size=config["batch_size"],
#     shuffle=False,
# )

## Model
We define our model.

In [None]:
# class AnimalModel(nn.Module):
#     def __init__(self, num_classes=10):
#         super().__init__()
#         self.stack = nn.Sequential(
#             nn.Conv2d(3, 64, kernel_size=3, padding=0),
#             nn.ReLU(),
#             nn.MaxPool2d(2),

#             nn.Conv2d(64, 32, kernel_size=3, padding=0),
#             nn.ReLU(),
#             nn.MaxPool2d(2),

#             nn.Conv2d(32, 16, kernel_size=3, padding=0),
#             nn.ReLU(),
#             nn.MaxPool2d(2),

#             nn.Flatten(),
#             # TODO: calculate size after changing conv layers
#             nn.Linear(16 * 26 * 26, 128),
#             nn.ReLU(),
#             nn.Linear(128, num_classes)
#         )

#     def forward(self, x):
#         return self.stack(x)

## Training
We begin training.

In [None]:
# # Init wandb
# with wandb.init(project="zneus-2", config=config) as run:
#   model = AnimalModel()
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   print(f"Running on {device}")
#   model.to(device)

#   # TODO: check other loss functions?
#   loss_fn = nn.CrossEntropyLoss()

#   # TODO: other optimizers?
#   optimizer = torch.optim.Adam(params=model.parameters(), lr=config["learning_rate"])

#   # Log both parameters and gradient
#   run.watch(model, criterion=loss_fn, log_freq=100, log="all")

#   # Start training
#   for epoch_idx in range(config["epochs"]):
#       epoch_start_time = time.time()
#       print(f"epoch {epoch_idx + 1}")

#       # Train
#       train_total_loss = 0
#       train_preds = []
#       train_ys = []

#       model.train()
#       # Include dataloader time
#       batch_start_time = time.time()
#       for batch_idx, (X, y) in enumerate(train_loader):
#           #print(f"batch {batch_idx + 1}")
#           # Move to device
#           X = X.to(device)
#           y = y.to(device)

#           # Predict
#           pred = model(X)
#           loss = loss_fn(pred, y)

#           # Optimize
#           loss.backward()
#           optimizer.step()
#           optimizer.zero_grad()

#           # Save values
#           train_total_loss += loss.item()
#           train_preds.extend(torch.argmax(torch.softmax(pred, dim=1), dim=1).detach().cpu().numpy().ravel())
#           train_ys.extend(y.detach().cpu().numpy().ravel())

#           #print(f"batch took: {time.time() - batch_start_time:.4f}s")
#           #batch_start_time = time.time()

#       train_metrics = get_metrics(
#           train_total_loss, len(train_loader), train_preds, train_ys
#       )
#       print("training:")
#       for k, v in train_metrics.items():
#           print(f"{k}: {v}")
#       print()

#       # Validate
#       val_total_loss = 0
#       val_preds = []
#       val_ys = []

#       model.eval()
#       with torch.no_grad():
#           for X, y in val_loader:
#               # Move to device
#               X = X.to(device)
#               y = y.to(device)

#               # Predict
#               pred = model(X)
#               loss = loss_fn(pred, y)

#               # Save values
#               val_total_loss += loss.item()
#               val_preds.extend(torch.argmax(torch.softmax(pred, dim=1), dim=1).detach().cpu().numpy().ravel())
#               val_ys.extend(y.detach().cpu().numpy().ravel())


#       val_metrics = get_metrics(
#           val_total_loss, len(val_loader), val_preds, val_ys
#       )
#       print("validation:")
#       for k, v in val_metrics.items():
#           print(f"{k}: {v}")
#       print()

#       print(f"epoch took: {time.time() - epoch_start_time:.2f}s")

#       # Wandb logging
#       run.log(
#           {
#               **{f"train_{k}": v for k, v in train_metrics.items()},
#               **{f"val_{k}": v for k, v in val_metrics.items()},
#           }
#       )

#   # TODO: add test set

In [None]:
def _train_only(augment):
    """Wrap an image op so that it is applied only while training.

    A Lambda layer runs its function on every call. Keras forwards the
    current ``training`` flag only when the function declares a ``training``
    argument, so without this wrapper the random ops would also distort
    validation and prediction inputs.
    """
    def apply(images, training=None):
        return augment(images) if training else images
    return apply


# Data augmentations
# NOTE(review): this block is placed before the model's Rescaling layer, so
# these ops presumably see 0-255 pixel values, where a brightness max_delta
# of 0.1 is barely visible. Confirm the intended value range.
data_augmentation = tf.keras.Sequential([
    # The built-in Random* layers already switch themselves off at inference.
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomContrast(0.1),
    tf.keras.layers.Lambda(
        _train_only(lambda x: tf.image.random_brightness(x, max_delta=0.1))
    ),
    tf.keras.layers.Lambda(
        _train_only(lambda x: tf.image.random_saturation(x, lower=0.9, upper=1.1))
    ),
    tf.keras.layers.Lambda(
        _train_only(lambda x: tf.image.random_hue(x, max_delta=0.1))
    ),
], name="data_augmentation")

In [None]:
# Calculate class weights as inverse class frequency:
#   weight = total / (num_classes * images_in_class)
# so rarer classes get a weight above 1 and frequent ones below 1.
labels = df['label']
classes = sorted(labels.unique())
# sort_index() orders the counts alphabetically by class name.
# NOTE(review): presumably this matches the class ids Keras assigns when it
# infers labels from folder names. Confirm against train_data.class_names.
class_counts = labels.value_counts().sort_index()

total = class_counts.sum()
num_classes = len(classes)

# Divide by the per-class `count`, not the whole `class_counts` Series, so
# each weight is a single float.
class_weight = {
    i: float(total / (num_classes * count))
    for i, count in enumerate(class_counts)
}

In [None]:
# Report whether TensorFlow can see a GPU on this runtime.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if not gpus:
    print("No GPU available. Please change the runtime type to GPU.")
else:
    print("GPU is available and being used.")

In [None]:
# Loader settings shared by both splits: square resize and batch size.
# No validation_split is used because the data was already split on disk.
loader_kwargs = {
    "image_size": (config["img_size"], config["img_size"]),
    "batch_size": config["batch_size"],
}

# Training batches are shuffled (the default); the seed makes it reproducible.
train_data = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    seed=123,
    **loader_kwargs,
)

# Validation data keeps a fixed order so that labels and predictions
# collected in separate passes over the dataset stay aligned.
val_data = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    shuffle=False,
    **loader_kwargs,
)

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# TODO: cache on disk, cache in ram crashes session
# (the in-memory variant would be `.cache().prefetch(buffer_size=AUTOTUNE)`)

# Prefetch batches for both splits.
train_ds, val_ds = (
    split.prefetch(buffer_size=AUTOTUNE) for split in (train_data, val_data)
)

In [None]:
# TODO: add wandb logging -> https://docs.wandb.ai/models/tutorials/keras_models
# Derive the output size from the class folders found on disk rather than
# hard-coding 10.
num_classes = len(train_data.class_names)

# Small CNN: augmentation -> rescale to [0, 1] -> three conv/pool stages ->
# dense classifier head.
model = tf.keras.Sequential([
    data_augmentation,
    tf.keras.layers.Rescaling(1./255),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(16, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    # Raw logits; the loss below is configured with from_logits=True.
    tf.keras.layers.Dense(num_classes)
])

model.compile(
    # Use the learning rate from `config` (the value passed to wandb) rather
    # than the default that the 'adam' string shortcut would select.
    optimizer=tf.keras.optimizers.Adam(learning_rate=config["learning_rate"]),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

In [None]:
# Initiate the wandb run for project "zneus-2", passing `config` as the run config
wandb.init(project="zneus-2", config=config)

In [None]:
# Train on the prefetched pipelines (train_ds / val_ds). The raw
# train_data / val_data objects have no prefetching.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=config["epochs"],
    # Optional extras, currently disabled:
    # class_weight=class_weight,
    # callbacks=[WandbCallback(
    #     save_model=True,
    #     log_weights=True,
    #     log_evaluation=True
    # )]
)

In [None]:
# Create confusion matrix
# Labels and predictions are gathered in ONE pass over val_data. The dataset
# may be shuffled, so two separate passes can yield different orders and
# pair predictions with the wrong labels.
true_batches = []
pred_batches = []
for images, targets in val_data:
    logits = model.predict_on_batch(images)
    true_batches.append(targets.numpy())
    pred_batches.append(np.argmax(logits, axis=1))

y_true = np.concatenate(true_batches)
y_pred = np.concatenate(pred_batches)

labels = val_data.class_names
# Explicit label ids guarantee one row/column per class, even if a class
# never occurs in y_true or y_pred.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(labels)))

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
# The display object draws nothing until plot() is called.
disp.plot(ax=ax, xticks_rotation=45)
ax.set_title("Confusion Matrix")
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.tight_layout()
plt.show()


TODO: add early stopping, and make meaningful changes based on convergence, overfitting, and loss.