In [None]:
import sys
import warnings
from os import path

import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from IPython.core.interactiveshell import InteractiveShell
from PIL import Image
from timm import create_model

sys.path.append(path.abspath('..'))

from src.train.config.train_config import get_train_cfg  # noqa: E402
from src.train.data_utils.dataset_finder import find_dataset  # noqa: E402
from src.train.data_utils.transforms import PadResizeOCR  # noqa: E402
from src.train.dataset import BarcodeDataset  # noqa: E402

# Display the value of every top-level expression statement in a cell,
# not only the last one (the preview loops in this notebook rely on this).
InteractiveShell.ast_node_interactivity = 'all'
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from plotting libraries.
warnings.simplefilter('ignore')
# Reload imported modules automatically before each cell runs, so edits to
# the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Get the data

In [None]:
# Resolve the training configuration and the dataset splits it points to.
cfg = get_train_cfg()
dataset_splits = find_dataset(cfg.datamodule_cfg.data_source_cfg)
train_paths = dataset_splits.train

# Annotation table read directly from the training annotation file.
df = pd.read_csv(train_paths.ann_file)

# Training dataset built from the same annotation file and the image folder.
train_dataset = BarcodeDataset(
    data_folder=train_paths.img_folder,
    anns_path=train_paths.ann_file,
)

# Find the height for resize

In [None]:
# Shape of the first element of every training sample; that element is the
# image array that the preview cells pass to `Image.fromarray`.
train_shapes = [train_dataset[idx][0].shape for idx in range(len(train_dataset))]
train_shapes_np = np.array(train_shapes)

In [None]:
# Median of the first shape component (image height) across the training set.
np.median(train_shapes_np[:, 0])
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with a
# KDE overlay and density normalisation is its documented replacement.
# Assigning to `_` keeps the Axes repr out of the cell output.
_ = sns.histplot(train_shapes_np[:, 0], kde=True, stat='density')

## Original images

In [None]:
# Preview the first 100 training images together with their array shapes.
# The two bare expressions in the loop body are rendered only because the
# notebook sets `InteractiveShell.ast_node_interactivity = 'all'`.
# NOTE(review): assumes the dataset holds at least 100 samples — confirm.
for i in range(100):
    cv_image = train_dataset[i][0]
    Image.fromarray(cv_image)
    cv_image.shape

## Rescale images by height

Images can be scaled down without significant loss of information. Let's pick a height divisible by 32, namely 96, and look at the barcodes.

In [None]:
# Target image height in pixels (divisible by 32).
new_height = 96

# Preview the first 10 training images scaled to `new_height`.
# The bare expressions are rendered thanks to `ast_node_interactivity = 'all'`.
for i in range(10):
    image = train_dataset[i][0]
    # One factor for both axes, so the aspect ratio is preserved.
    scale = new_height / image.shape[0]
    # dsize=None lets OpenCV derive the output size from fx / fy.
    scaled_image = cv2.resize(image, None, fx=scale, fy=scale)
    Image.fromarray(scaled_image)
    scaled_image.shape

Indeed, the barcodes are still readable after resizing the height to 96.

## Find width for letterbox resizing

We will resize the images while keeping their aspect ratios and using zero padding (letterbox resize), so we need to find a new width. Let's take a look at the distribution of image widths after resizing to the new height.

In [None]:
# Width each image would have after scaling its height to `new_height`
# with the aspect ratio preserved (shape columns: 0 = height, 1 = width).
train_width = train_shapes_np[:, 1] * new_height / train_shapes_np[:, 0]
np.max(train_width)
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with a
# KDE overlay and density normalisation is its documented replacement.
_ = sns.histplot(train_width, kde=True, stat='density')

Here we see a few outliers, so let's filter them out.

In [None]:
# Tukey's rule: widths farther than 1.5 * IQR outside the quartiles are
# treated as outliers and dropped.
q1, q3 = np.percentile(train_width, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

train_width_filtered = train_width[(train_width > lower_bound) & (train_width < upper_bound)]
np.max(train_width_filtered)
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with a
# KDE overlay and density normalisation is its documented replacement.
# Assigning to `_` keeps the Axes repr out of the cell output, matching the
# other plotting cells in this notebook.
_ = sns.histplot(train_width_filtered, kde=True, stat='density')

To get the new width of the images, we take the max value and round it up to the next multiple of 32. Given the max width of about 323, the next multiple of 32 is 352. Let's apply this resize to the barcodes and see the results.

In [None]:
# Target image width in pixels (divisible by 32).
new_width = 352
# Project transform that resizes and pads an image to the target size.
# NOTE(review): presumably mode='left' keeps the image on the left and pads
# the remaining width — confirm against PadResizeOCR.
transform = PadResizeOCR(target_width=new_width, target_height=new_height, mode='left')

# Preview the first 10 training images after the transform.
# The bare expressions are rendered thanks to `ast_node_interactivity = 'all'`.
for i in range(10):
    image = train_dataset[i][0]
    # The transform takes the image as a keyword and returns a mapping
    # whose 'image' entry holds the result.
    transformed_image = transform(image=image)['image']
    Image.fromarray(transformed_image)
    transformed_image.shape

# Find backbone feature map size

To get the best out of the CNN + RNN + CTC architecture, we need to find an optimal size for the feature map returned by the CNN backbone. The feature map is cut into vertical slices before being fed to the LSTM, and the recommended number of slices per character is 3 or more.

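Since the maximum number of characters is 13 and the max width is 323, we have: 323 / (13 * 3) = 8.28, rounded up to 9 pixels per slice. But as we resized the width to 352, the feature map must have at least 352 / 9 = 39.11 -> 40 slices. (Rounding the slice width up makes this estimate slightly optimistic: with the unrounded 8.28 pixels per slice the requirement is 352 / 8.28 ≈ 42.5 -> 43 slices.)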

Let's pick a small and fast backbone like `resnet18`.

In [None]:
# ResNet-18 feature extractor: with `features_only=True` the model returns a
# list of intermediate feature maps instead of classification logits.
# `.eval()` returns the module itself and switches it to inference mode, so
# the dummy forward pass does not update the BatchNorm running statistics.
backbone = create_model(
    'resnet18',
    pretrained=True,
    features_only=True,
).eval()

# Probe with a random batch of the letterbox size chosen earlier
# (batch, channels, height, width), so the reported shapes always match the
# real network input instead of a duplicated hard-coded 96 x 352.
with torch.no_grad():
    feature_maps = backbone(torch.rand(1, 3, new_height, new_width))

# Report each feature level (1-based) with the shape of its feature map.
for layer_idx, feature_map in enumerate(feature_maps, start=1):
    print(layer_idx, feature_map.shape)

Feature maps from layers #4 and #5 are too narrow, maps from layers #1 and #2 are too wide.

Finally, the feature map from layer #3 seems to fit, since its width is 44 > the required 40.

This should work for a baseline; however, there is room for improvement by tuning the backbone and RNN parameters. For example, we can try a different backbone and set the `output_stride` parameter to get a feature map of a suitable size from a deeper layer, like this:

In [None]:
# ResNet-34 feature extractor with the overall stride capped at 8, so the
# deeper levels keep the spatial size of the stride-8 level.
# `.eval()` returns the module itself and switches it to inference mode, so
# the dummy forward pass does not update the BatchNorm running statistics.
backbone_stride = create_model(
    'resnet34',
    pretrained=True,
    features_only=True,
    output_stride=8,
).eval()

# Probe with a random batch of the letterbox size chosen earlier
# (batch, channels, height, width) instead of a duplicated hard-coded 96 x 352.
with torch.no_grad():
    feature_maps_stride = backbone_stride(torch.rand(1, 3, new_height, new_width))

# Report each feature level (1-based) with the shape of its feature map.
for layer_idx, feature_map_stride in enumerate(feature_maps_stride, start=1):
    print(layer_idx, feature_map_stride.shape)

This way we can take the feature map from the 4th or even the 5th level, since these maps will have a suitable width of 44 > 40.

**Important notes regarding RNN hyperparameters.**

Let `(BS x C_fm x H_fm x W_fm)` be the shape of feature map. In our RCNN, it's processed as follows:
1. 1x1 conv (gate) is applied to the feature map to change the number of channels to `rnn_features_num`. Output: `(BS x rnn_features_num x H_fm x W_fm)`
2. `W_fm` vertical slices are created out of this feature map, where length of each slice is `H_fm * rnn_features_num = rnn_input_size`. Output: `(W_fm x BS x rnn_input_size)`.
3. This tensor is passed to the RNN with `hidden_size`, outputting `(W_fm x BS x hidden_size)` (`hidden_size * 2` in the case of a bidirectional RNN).
4. RNN output is passed to a linear layer with softmax.

Thus:
1. `rnn_features_num` is the new number of channels of backbone's output feature map.
2. `rnn_input_size` is the length of vertical slices fed to RNN across the entire width of the feature map. Must be set equal to `H_fm * rnn_features_num`.