In [1]:
import os
import sys
import numpy as np
import pandas as pd
import math
from tqdm.notebook import tqdm
import random
import matplotlib.pyplot as plt
import cv2
import imagesize

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import timm
from sklearn.manifold import TSNE
import wandb
import IPython.display as ipd

## Notebook Config

In [2]:
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.seed``)."""

    # Seed handed to the random number generators.
    seed = 42

    # Root folder of the competition data.
    base_path = "../data/happy-whale-and-dolphin"
    # `None` means the embeddings are created; otherwise they are loaded from here.
    embed_path = "../data/embedding-dataset"
    # Checkpoint for the finetuned model by debarshichanda (currently disabled):
    # ckpt_path = '../input/arcface-gem-dataset/Loss15.2453_epoch3.bin'

    # Row limit for quick experiments; `None` keeps all samples.
    num_samples = None

    # Use the first CUDA device when one is available, else the CPU.
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

    # Competition name and W&B account identifier.
    competition = "happywhale"
    _wandb_kernel = "trungngo"

In [3]:
def seed_torch(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (plus the CUDA generators when CUDA is available). When cuDNN is
    available it is switched to deterministic mode and its autotuner
    (``benchmark``) is turned off.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)  # Python stdlib RNG
    np.random.seed(seed_value)  # NumPy global RNG
    torch.manual_seed(seed_value)  # PyTorch RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)  # current GPU
        torch.cuda.manual_seed_all(seed_value)  # every GPU
    # `is_available` is a function and has to be called; the bare function
    # object is always truthy, which made this check a no-op.
    if torch.backends.cudnn.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    print('# SEEDING DONE')
# Seed all generators once, up front, with the seed configured in CFG.
seed_torch(CFG.seed)

# SEEDING DONE


# WandB ⭐
<div align=center> <img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67" width=500></div>

Weights & Biases (W&B) is an MLOps platform for tracking our experiments. We can use it to build better models faster with experiment tracking, dataset versioning, and model management. Some of the cool features of W&B:

* Track, compare, and visualize ML experiments
* Get live metrics, terminal logs, and system stats streamed to the centralized dashboard.
* Explain how your model works, show graphs of how model versions improved, discuss bugs, and demonstrate progress towards milestones.

In [4]:
# NOTE: wandb is already imported in the first cell; this repeated import is a harmless no-op.
import wandb

# Start a W&B run under the given project and entity (account).
wandb.init(project="kaggle-happywhale-2022-test", entity="trungngo")

[34m[1mwandb[0m: Currently logged in as: [33mtrungngo[0m (use `wandb login --relogin` to force relogin)


# Meta Data 📈
* `train_images/` - a folder containing the training images
* `train.csv` - provides the species and the individual_id for each of the training images
* `test_images/` - a folder containing the test images; for each image, your task is to predict the individual_id; no species information is given for the test data; there are individuals in the test data that are not observed in the training data, which should be predicted as new_individual.
* `sample_submission.csv` - a sample submission file in the correct format

> Note: We don't have access to the `species` column for **test** data. So, we can't directly use `species` for **train**.

In [5]:
# Training metadata, extended with the path of each image file and a split tag.
df = pd.read_csv(f'{CFG.base_path}/train.csv').assign(
    image_path=lambda frame: CFG.base_path + '/train_images/' + frame['image'],
    split='Train',
)

# The sample submission lists the test images; extend it the same way.
test_df = pd.read_csv(f'{CFG.base_path}/sample_submission.csv').assign(
    image_path=lambda frame: CFG.base_path + '/test_images/' + frame['image'],
    split='Test',
)

print('Train Images: {:,} | Test Images: {:,}'.format(len(df), len(test_df)))

Train Images: 51,033 | Test Images: 27,956


In [6]:
# Show the first five rows of the training metadata, including the added columns.
df.head(5)

Unnamed: 0,image,species,individual_id,image_path,split
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,../data/happy-whale-and-dolphin/train_images/0...,Train
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,../data/happy-whale-and-dolphin/train_images/0...,Train
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,../data/happy-whale-and-dolphin/train_images/0...,Train
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,../data/happy-whale-and-dolphin/train_images/0...,Train
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,../data/happy-whale-and-dolphin/train_images/0...,Train


In [7]:
# Show the first five rows of the test metadata, including the added columns.
test_df.head(5)

Unnamed: 0,image,predictions,image_path,split
0,000110707af0ba.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test
1,0006287ec424cb.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test
2,000809ecb2ccad.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test
3,00098d1376dab2.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test
4,000b8d89c738bd.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test


## Clip Data
The dataset is huge. We can limit how many rows are used with `CFG.num_samples`.

In [8]:
# Optionally keep only the first `CFG.num_samples` rows of each frame
# (a falsy value such as `None` keeps everything).
if CFG.num_samples:
    # `.copy()` detaches each subset from the full frame, so the in-place
    # `.loc` / column assignments in later cells act on an independent frame
    # instead of on a slice of the original.
    df = df.iloc[:CFG.num_samples].copy()
    test_df = test_df.iloc[:CFG.num_samples].copy()

## Fix Meta Data
The following cell:
* Renames the `beluga` and `globis` species to their `*_whale` names so that the 2-class (`whale` / `dolphin`) label is assigned correctly.
* Fixes Duplicate Labels.

In [9]:
# Fix misspelled (duplicate) species labels first, so that everything derived
# below is computed from the cleaned names.
# https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/304633
# `regex=False`: the patterns are plain substrings, and being explicit avoids
# relying on the default, which is not the same across pandas versions.
df['species'] = df['species'].str.replace('bottlenose_dolpin', 'bottlenose_dolphin', regex=False)
df['species'] = df['species'].str.replace('kiler_whale', 'killer_whale', regex=False)

# Convert beluga and globis to whale names; every label containing
# `pilot_whale` is merged into `short_finned_pilot_whale`.
df.loc[df.species.str.contains('beluga'), 'species'] = 'beluga_whale'
df.loc[df.species.str.contains('globis'), 'species'] = 'short_finned_pilot_whale'
df.loc[df.species.str.contains('pilot_whale'), 'species'] = 'short_finned_pilot_whale'

# 2-class label: 'whale' if the species name contains 'whale', else 'dolphin'.
df['class'] = df.species.map(lambda x: 'whale' if 'whale' in x else 'dolphin')

## Find Image Size

In [10]:
def get_imgsize(row):
    """Return a copy of ``row`` with the image's dimensions attached.

    Args:
        row: Record (e.g. a DataFrame row) whose ``'image_path'`` entry is
            the path handed to ``imagesize.get``.

    Returns:
        A copy of ``row`` with ``'width'`` and ``'height'`` entries set from
        the pair returned by ``imagesize.get``.
    """
    width, height = imagesize.get(row['image_path'])
    # Work on a copy: pandas documents that functions passed to `apply`
    # must not mutate the object they are given.
    sized = row.copy()
    sized['width'] = width
    sized['height'] = height
    return sized

In [13]:
# Train: register tqdm's `progress_apply` with a 'Train ' label, run
# `get_imgsize` on every row (axis=1), then write the result to
# 'train.csv' in the current working directory.
tqdm.pandas(desc='Train ')
df = df.progress_apply(get_imgsize, axis=1)
df.to_csv('train.csv', index=False)

# Test: same steps for the test frame; `tqdm.pandas` is called again so the
# progress bar carries the 'Test ' label. Output goes to 'test.csv'.
tqdm.pandas(desc='Test ')
test_df = test_df.progress_apply(get_imgsize, axis=1)
test_df.to_csv('test.csv',index=False)

Train :   0%|          | 0/51033 [00:00<?, ?it/s]

Test :   0%|          | 0/27956 [00:00<?, ?it/s]

In [14]:
# Preview two rows of each frame to confirm the new width/height columns.
for label, frame in (('Train:', df), ('Test:', test_df)):
    print(label)
    display(frame.head(2))

Train:


Unnamed: 0,image,species,individual_id,image_path,split,class,width,height
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,../data/happy-whale-and-dolphin/train_images/0...,Train,whale,804,671
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,../data/happy-whale-and-dolphin/train_images/0...,Train,whale,3504,2336


Test:


Unnamed: 0,image,predictions,image_path,split,width,height
0,000110707af0ba.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test,3599,2399
1,0006287ec424cb.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../data/happy-whale-and-dolphin/test_images/00...,Test,3600,2400
