# Humpback Whale Identification Challenge

### Importing Libraries

In [3]:
%matplotlib inline
# Star imports first, so the explicit imports below always win over any
# same-named symbols the star imports re-export.
from fastai.vision import *
from fastai.basic_data import *

# Standard library
import random
import re

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from fastai.metrics import accuracy
from skimage.util import montage
from torch import optim


In [32]:
# Swap the progress-bar classes used by fastai's training loop:
# force_console_behavior() returns a (master_bar, progress_bar) pair, which is
# then patched onto fastai.basic_train below.
# NOTE(review): presumably this produces plain-text progress output instead of
# notebook widgets — confirm against the fastprogress documentation.
import fastai
from fastprogress import force_console_behavior
import fastprogress
fastprogress.fastprogress.NO_BAR = False  # NOTE(review): presumably keeps the bar enabled — confirm
master_bar, progress_bar = force_console_behavior()
fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar

### Reading in the training data from the CSV file

In [5]:
# Training labels; later cells read the `Image` (file name) and `Id` columns.
df = pd.read_csv('.kaggle/train.csv')


### We keep only a single image file for validation in our final training model, so that almost all of the labelled data is available for training before we submit results to Kaggle.
### Because of this approach, the validation loss reported in this notebook is not a meaningful measure of model quality.

In [1]:
# File names that split_by_valid_func (below) sends to the validation set.
# Only one image is listed, so every other labelled image is used for training.
val_fns = {'69823499d.jpg'}

### Helper Functions to import Whale Image data more Easily

In [6]:
# Map each image file name to its whale Id.
# Built column-wise with zip() rather than row-by-row with iterrows(): the
# resulting dict is the same (later duplicates still win) at far lower cost.
fn2label = dict(zip(df.Image, df.Id))
def path2fn(path):
    """Return the trailing ``<name>.jpg`` file name of ``path``.

    ``path`` may be a string or any object whose ``str()`` is the path
    (for example a ``pathlib.Path``).

    Raises ``AttributeError`` when the path does not end in ``.jpg``, exactly
    as the original lambda did (``re.search`` returns ``None``).
    """
    # Raw string: '\w' and '\.' are regex escapes, not Python string escapes.
    return re.search(r'\w*\.jpg$', str(path)).group(0)

In [7]:
# Prefix for every checkpoint saved/loaded below (f'{name}-stage-N').
# Plain string literal: the original f-string had no placeholders.
# NOTE(review): the recorded output of the Kaggle submit cell shows `{name}`
# rendered as `<function name at 0x...>`, so `name` was rebound to a function
# in that session. Re-run this cell before submitting, or consider a less
# collision-prone identifier.
name = 'res50-full-train'

### Defining parameters: SZ = image size in pixels, BS = batch size, NUM_WORKERS = number of data-loading worker processes (set to the number of cores), SEED = random seed

In [8]:
SZ = 224          # image size passed to .transform(size=SZ)
BS = 64           # batch size passed to .databunch(bs=BS)
NUM_WORKERS = 12  # data-loading workers passed to .databunch(num_workers=...)
SEED = 0

# SEED was previously defined but never applied, so runs were unseeded.
# Seed the Python, NumPy and PyTorch generators so a Restart & Run All starts
# from the same random state.
# NOTE(review): this alone does not guarantee bit-identical training;
# multi-worker data loading and GPU kernels can still be non-deterministic.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

 ### FastAI makes loading the training data quick through its ImageList and DataBunch classes, which build on PyTorch's DataLoader. We use from_df, split_by_valid_func, label_from_func and add_test to define the train, validation and test sets and to label the images from the .csv file.
 
 ### transform() applies transformations to the images (image augmentation) to obtain a higher-quality data set.
 

In [9]:
# Build the DataBunch used for the first two training stages (images at size SZ).
data = (
    ImageList
        # Image file names come from the `Image` column; rows whose Id is
        # 'new_whale' are excluded from training.
        .from_df(df[df.Id != 'new_whale'], '.kaggle/train', cols=['Image'])
        # An item goes to the validation set only if its file name is in val_fns.
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Label each item by looking its file name up in fn2label.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        # Attach the images under .kaggle/test as the test set.
        .add_test(ImageList.from_folder('.kaggle/test'))
        # Augmentations from get_transforms with flipping disabled.
        # NOTE(review): SQUISH presumably resizes to SZ without cropping — confirm.
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='.kaggle')
        # Normalize with the ImageNet statistics exposed by fastai.
        .normalize(imagenet_stats)
)

### Here we create a convolutional ResNet-50 architecture and train the model with the cyclical learning rate approach. This approach allows powerful training: it helps the model escape local minima and steers it towards better minima.

### We also use the unfreeze technique to fine-tune the earlier layers of the network, with differential learning rates for the different layer groups of the ResNet model. FastAI provides helpful abstractions and takes care of a lot of work in the background, enabling faster prototyping and reducing the need for hyperparameter tuning.

In [11]:
%%time

# Stage 1: build a ResNet-50 learner on `data` and train with one-cycle
# scheduling for 14 epochs at a maximum learning rate of 1e-2.
learn = cnn_learner(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();  # enable gradient clipping; trailing ';' hides the cell output

learn.fit_one_cycle(14, 1e-2)
learn.save(f'{name}-stage-1')

# Stage 2: unfreeze the network and continue training for 5 epochs.
learn.unfreeze()

max_lr = 1e-3
# Three learning rates, each 10x the previous one.
# NOTE(review): presumably one per layer group, earliest layers first — confirm.
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(5, lrs)
learn.save(f'{name}-stage-2')

epoch     train_loss  valid_loss  time    
0         7.478219    0.902079    02:30     
1         6.756791    1.233457    02:31     
2         6.085188    0.584572    02:31     
3         5.186265    0.740844    02:31     
4         4.292631    3.689270    02:31     
5         3.467510    0.000226    02:31     
6         2.652100    0.000036    02:31     
7         1.812225    1.108416    02:31     
8         1.201279    0.002174    02:31     
9         0.744257    0.000312    02:31     
10        0.386350    0.000525    02:31     
11        0.236899    0.000360    02:31     
12        0.139014    0.000121    02:31     
13        0.107251    0.000219    02:31     
epoch     train_loss  valid_loss  time    
0         0.223310    0.000349    03:22     
1         0.417714    0.016265    03:21     
2         0.327807    0.001612    03:21     
3         0.173747    0.000095    03:21     
4         0.110111    0.000047    03:21     
CPU times: user 37min 34s, sys: 15min 41s, total: 53min 15s

### Now we increase the size of the images. This approach is used by a lot of Kaggle kernels and allows the model to train for longer without overfitting on a particular size.

In [10]:
# Higher-resolution stages: double the image size and quarter the batch size
# relative to the first configuration (224 / 64).
SZ, BS = 224 * 2, 64 // 4
NUM_WORKERS, SEED = 12, 0

In [11]:
# Rebuild the DataBunch with the updated SZ / BS values.
# The pipeline is otherwise identical to the one used for the first stages.
data = (
    ImageList
        .from_df(df[df.Id != 'new_whale'], '.kaggle/train', cols=['Image'])  # skip 'new_whale' rows
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)  # validation = files in val_fns
        .label_from_func(lambda path: fn2label[path2fn(path)])  # label via file-name lookup
        .add_test(ImageList.from_folder('.kaggle/test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='.kaggle')
        .normalize(imagenet_stats)
)

In [12]:
# Sanity check: show the checkpoint name that the next cell loads.
print(f'{name}-stage-2')

res50-full-train-stage-2


In [13]:
%%time
# Stage 3: fresh learner on the new `data`, initialised from the stage-2
# checkpoint saved earlier.
learn = cnn_learner(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();
learn.load(f'{name}-stage-2')
# NOTE(review): freeze_to(-1) presumably leaves only the last layer group
# trainable — confirm against the fastai Learner docs.
learn.freeze_to(-1)

# Learning rate is the stage-1 value divided by 4.
learn.fit_one_cycle(7, 1e-2 / 4)
learn.save(f'{name}-stage-3')

# Stage 4: unfreeze and train for 15 epochs with three learning rates,
# each 10x the previous one.
learn.unfreeze()

max_lr = 1e-3 / 4
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(15, lrs)
learn.save(f'{name}-stage-4')

epoch     train_loss  valid_loss  time    
0         1.194606    0.000004    09:32     
1         1.772817    0.000003    09:25     
2         2.070666    0.000003    09:25     
3         1.757798    0.000006    09:25     
4         1.307763    0.000000    09:25     
5         0.942226    0.000000    09:25     
6         0.663974    0.000000    09:25     
epoch     train_loss  valid_loss  time    
0         0.659680    0.000001    12:52     
1         0.659124    0.000000    12:51     
2         0.762137    0.000019    12:51     
3         0.845313    0.000002    12:51     
4         0.862025    0.000020    12:51     
5         0.670531    0.000000    12:51     
6         0.725686    0.000000    12:51     
7         0.623230    0.000000    12:51     
8         0.570611    0.000003    12:51     
9         0.573705    0.000002    12:51     
10        0.420998    0.000001    12:51     
11        0.433227    0.000006    12:51     
12        0.396316    0.000001    12:51     
13        0.30

### Now we will use the oversampled data created by Radek; this data is available on Kaggle for anyone to use. It lets us oversample whale categories that have a low number of images.

In [14]:
# with oversampling
# NOTE(review): this rebinds `df`, but fn2label was built from the earlier
# train.csv frame and is not rebuilt here. Labelling below therefore only works
# if every Image in this CSV also appears in train.csv — confirm.
df = pd.read_csv('.kaggle/oversampled_train_and_val.csv')

In [25]:
# DataBunch over the oversampled frame. Unlike the earlier pipelines, `df` is
# passed as-is here (no `Id != 'new_whale'` filter).
data = (
    ImageList
        .from_df(df, '.kaggle/train', cols=['Image'])
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Labels still come from fn2label, keyed by file name.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageList.from_folder('.kaggle/test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='.kaggle')
        .normalize(imagenet_stats)
)

In [30]:
# Sanity check: show the checkpoint name that the next cell loads.
print(f'{name}-stage-4')

res50-full-train-stage-4


In [45]:
%%time
# Stages 5-6: continue from the stage-4 checkpoint on the oversampled data.
# NOTE(review): the recorded output of this cell ends in a KeyboardInterrupt
# during the first epoch, so the stage-6 checkpoint loaded later must come
# from a different run of this cell — confirm before relying on it.
learn = cnn_learner(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();
learn.load(f'{name}-stage-4')
learn.freeze_to(-1)

# Stage 5: 2 epochs with the frozen configuration.
learn.fit_one_cycle(2, 1e-2 / 4)
learn.save(f'{name}-stage-5')

# Stage 6: 2 more epochs after unfreezing, with three learning rates.
learn.unfreeze()

max_lr = 1e-3 / 4
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(2, lrs)
learn.save(f'{name}-stage-6')

epoch     train_loss  valid_loss  time    
Epoch 1/2 :                                                                               

Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
Tracebac

KeyboardInterrupt: 

In [49]:
# Rebuild the learner with the same architecture arguments as in training and
# restore the stage-6 checkpoint for prediction.
learn = cnn_learner(data, models.resnet50, lin_ftrs=[2048])
learn.load(f'{name}-stage-6')

                                                                         

### The total train time of the notebook is around 4 hours. We used an Nvidia Tesla P4 GPU with 16 GB memory to train the model on the Google Cloud Platform.

#### Predictions: the new_whale category is given a fixed score of 0.06, so for each image it is ranked ahead of every known whale whose predicted probability is below 0.06 when the top 5 labels for the MAP@5 metric are chosen. This threshold was taken from another publicly available kernel, whose author tried different values and found this one to work better.

In [None]:
# Predict on the test set; the second returned value is discarded.
# NOTE(review): this cell has no execution count in the saved notebook, so
# `preds` in the following cells came from an earlier, unsaved run.
preds, _ = learn.get_preds(DatasetType.Test)

In [50]:
# Append one extra column of ones (same dtype/device as `preds`) that will hold
# the 'new_whale' score. Not idempotent: each run of this cell adds a column.
extra_column = preds.new_ones((preds.shape[0], 1))
preds = torch.cat([preds, extra_column], dim=1)

In [51]:
# Give 'new_whale' a constant score of 0.06 for every test image.
# Index the appended column as -1 rather than the hard-coded 5004, so this keeps
# working if the number of training classes changes.
preds[:, -1] = 0.06

In [52]:
# Class names in prediction-column order, with 'new_whale' last to match the
# extra column appended to `preds`.
classes = [*learn.data.classes, 'new_whale']

#### This function creates the submission CSV file

In [66]:
# NOTE(review): create_submission is neither defined nor explicitly imported in
# the cells visible here; it must come from a helper module or a star import.
# Make that dependency explicit so the notebook survives Restart & Run All.
create_submission(preds, learn.data, name, classes)

In [67]:
# Preview the written submission file.
# NOTE(review): this reads '.kaggle/submission.csv.gz', while the submit cell
# uploads 'subs/{name}.csv.gz' — confirm both refer to the same file.
# The recorded output has an 'Unnamed: 0' column, which suggests the CSV was
# saved together with its index.
pd.read_csv('.kaggle/submission.csv.gz').head()

Unnamed: 0,Image,Id
0,0cd888d7e.jpg,w_8e74601 new_whale w_f314516 w_760c469 w_27be996
1,847fda2fa.jpg,new_whale w_0886321 w_71dfca3 w_fd3e556 w_22b8752
2,272146b21.jpg,w_f20333d new_whale w_16dfac7 w_c63c580 w_3bdf2c6
3,44647d086.jpg,w_b475feb new_whale w_edf43c1 w_fe03daf w_34120de
4,e979920ad.jpg,w_2365d55 new_whale w_27597ff w_dea40e2 w_c4c380f


In [69]:
# Fraction of test images whose first (top-ranked) predicted label is 'new_whale'.
pd.read_csv('.kaggle/submission.csv.gz').Id.str.split().apply(lambda x: x[0] == 'new_whale').mean()

0.3028894472361809

### Submitting results to Kaggle

In [64]:
# Upload the submission with the Kaggle CLI; {name} is interpolated from the
# Python variable `name`.
# NOTE(review): the recorded output shows `subs/<function name at 0x...>.csv.gz`,
# i.e. `name` was a function rather than the checkpoint prefix when this ran.
# Make sure `name` is the string set near the top of the notebook before submitting.
!kaggle competitions submit -c humpback-whale-identification -f subs/{name}.csv.gz -m "{name}"

subs/<function name at 0x7fcfa70a1158>.csv.gz


### Performance is evaluated using Mean Average Precision over at most 5 predicted labels per image (MAP@5). On the public leaderboard our score is 0.71991; on the private leaderboard our score is 0.74815. Our model performs better than 57% of the teams in the competition.