# Midterm Expectation

- Baseline
- Experiment (i.e., something different; e.g., a spreadsheet)
- Timeline (what we have done, and what we will be doing)

# Dependencies

In [1]:
import os

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import (
    CenterCrop,
    Compose,
    RandAugment,
    RandomPerspective,
    RandomHorizontalFlip,
    RandomRotation,
    ToTensor,
)
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import math

# Mount Google Drive

## Install gdfuse

In [2]:
# Register the PPA that ships google-drive-ocamlfuse, refresh the package
# index and install the package.
# NOTE: with the redirection order `2>&1 > /dev/null` only stdout is
# discarded; stderr is still printed in the cell output.
!sudo add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!sudo apt-get update -qq 2>&1 > /dev/null
!sudo apt -y install -qq google-drive-ocamlfuse 2>&1 > /dev/null
# First invocation without a mount point
# (presumably triggers the authorization flow - TODO confirm).
!google-drive-ocamlfuse





## Fetch API Key

In [3]:
# Install a text-mode browser and make it the default browser.
!sudo apt-get install -qq w3m # to act as web browser 
!xdg-settings set default-web-browser w3m.desktop # to set default browser
# Create the mount point /content/drive/MyDrive. `mkdir` reports an error
# when the directory already exists (see the cell output below).
%cd /content
!mkdir drive
%cd drive
!mkdir MyDrive
# Two `cd ..` steps leave the working directory at `/`.
%cd ..
%cd ..
# Mount Google Drive at the mount point created above.
!google-drive-ocamlfuse /content/drive/MyDrive

/content
mkdir: cannot create directory ‘drive’: File exists
/content/drive
mkdir: cannot create directory ‘MyDrive’: File exists
/content
/
fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


# Unzip Data

In [4]:
# ! pip install gdown
# ! cd /content/ && gdown https://drive.google.com/uc?id=1zD7fUAt12L16ywPSjRsj4IqL-Lqg13zK
# # ! cp /content/drive/MyDrive/11785/project/data.zip /content/
# ! cd /content && unzip data.zip
# ! rm /content/data.zip

# Global Variables

In [5]:
# Root directory of the extracted dataset.
DATA_PATH = "/content/data"
# Directory holding the dish metadata CSV files (labels).
METADATA_DIR = f"{DATA_PATH}/metadata"
# Directory holding the overhead imagery, one sub-directory per dish.
IMAGERY_DIR = f"{DATA_PATH}/imagery/realsense_overhead"

# Helper Functions for Data

In [6]:
def read_csv_variable_cols(filepath: str, n_cols: int = 6) -> pd.DataFrame:
    """Read a headerless CSV whose rows have differing numbers of fields.

    Approach adapted from https://stackoverflow.com/a/57824142: the widest
    row determines how many (integer-named) columns pandas is told to expect,
    so shorter rows are padded with NaN instead of raising a parse error.

    Args:
        filepath: Path of the CSV file to read.
        n_cols: Number of leading columns to keep. Defaults to 6, which is
            enough to retrieve the required labels.

    Returns:
        A DataFrame with integer column names ``0 .. n_cols - 1`` (fewer if
        the file itself is narrower).

    Raises:
        ValueError: If the file contains no lines.
    """
    # Stream the file line by line instead of materialising it with
    # ``readlines()``; only the maximum field count is needed.
    # NOTE: the count is a plain comma split, so quoted commas are
    # over-counted. That only adds surplus all-NaN columns, which are
    # dropped by the final slice.
    with open(filepath, "r") as csv_file:
        widest_row = max(
            (len(line.split(",")) for line in csv_file), default=0
        )

    if widest_row == 0:
        raise ValueError(f"no rows found in {filepath}")

    # Column names are simply 0, 1, 2, ..., widest_row - 1.
    column_names = list(range(widest_row))

    frame = pd.read_csv(
        filepath,
        header=None,
        delimiter=",",
        names=column_names,
        low_memory=False,
    )
    return frame.iloc[:, :n_cols]

# Ingredient and Dish Metadata (Ground Truths)

## Data Format

### Training-testing data

- "imagery/realsense_overhead/dish_<id>" contains the images as the input data
- "dish_ids" contains training-testing splits

### Labels (metadata)

All labels need to be preprocessed. For each dish, we need to extract the following:
- total calorie
- mass (optional according to the paper)
- the amount of the three macronutrients (fat, carb, protein)

It doesn't seem that bad - we don't need to process the ingredients because they are purely there for constructing the labels. For our multi-task learning, we only need to have the above labels. The three tasks are:

1. Calorie
2. Macronutrients (fat, carb, protein)
3. Mass (optional)

This way we can keep the labels together with the image data and return both the input and the expected output on every access.

## Dish Metadata

In [7]:
# The dish metadata files have a variable number of columns per row; both
# cafe files are read the same way and only the first 6 columns are kept:
# [dish_id, total_calories, total_mass, total_fat, total_carb, total_protein]
_dish_column_names = dict(
    enumerate(
        [
            "dish_id",
            "total_calories",
            "total_mass",
            "total_fat",
            "total_carb",
            "total_protein",
        ]
    )
)

# Positional column indices 0..5 are replaced by the label names above.
dish_metadata_1 = read_csv_variable_cols(
    f"{METADATA_DIR}/dish_metadata_cafe1.csv"
).rename(columns=_dish_column_names)
dish_metadata_2 = read_csv_variable_cols(
    f"{METADATA_DIR}/dish_metadata_cafe2.csv"
).rename(columns=_dish_column_names)

# Stack both cafes into one table with a fresh 0..n-1 index.
dish_metadata = pd.concat([dish_metadata_1, dish_metadata_2], ignore_index=True)

# Lookup table: dish_id -> {label name: value}.
labels_dict = dish_metadata.set_index("dish_id").to_dict("index")

# Hyperparameters

In [8]:
class Config:
    """Expose the entries of a mapping as attributes (``cfg.lr`` etc.)."""

    def __init__(self, config):
        # Every key of the mapping becomes an attribute of the same name.
        for name, value in config.items():
            setattr(self, name, value)


# Training hyperparameters, accessed as attributes (e.g. ``config.batch_size``).
config = Config(
    {
        'epochs': 150,
        'batch_size': 32,
        'lr': 2e-4,
    }
)

# Datasets and DataLoaders

In [9]:
# Augmented pipeline: 256x256 center crop, RandAugment(3), random horizontal
# flip, then conversion to a tensor.
transforms_augmented = Compose(
    [
        CenterCrop((256, 256)),
        RandAugment(3),
        RandomHorizontalFlip(),
        ToTensor(),
    ]
)

# Deterministic pipeline: 256x256 center crop and conversion to a tensor only.
transforms_default = Compose(
    [
        CenterCrop((256, 256)),
        ToTensor(),
    ]
)

# Lighter augmentation: 256x256 center crop, random perspective, random
# horizontal flip, then conversion to a tensor.
transforms = Compose(
    [
        CenterCrop((256, 256)),
        RandomPerspective(),
        RandomHorizontalFlip(),
        ToTensor(),
    ]
)
class RGBDataset(Dataset):
    """Dataset of per-dish RGB images, optionally paired with label vectors.

    From Section 4.2 of the paper: the input resolution to the network is a
    256x256 image, where images were downsized and center cropped in order to
    retain the most salient dish region.

    Our baseline should only need the RGB images (per Section 4.2).

    Args:
        data_dir: Directory containing one sub-directory per dish. Only
            sub-directories that contain an ``rgb.png`` file are used.
        transforms: Callable applied to every loaded PIL image.
        labels: Mapping from dish id (the sub-directory name) to a mapping
            of label values; the values are returned in mapping order.
        train: If True, ``__getitem__`` returns ``(image, label)``;
            otherwise it returns only the image.
    """

    def __init__(self, data_dir, transforms=transforms_default, labels=labels_dict, train=True):
        self.data_dir = data_dir
        self.transforms = transforms
        self.labels = labels
        self.train = train

        # Entries look like 'dish_1556572657', 'dish_1556573514', ...
        # Checking for the image file directly (instead of listing every
        # entry) skips stray non-directory entries rather than crashing on
        # them, and avoids shadowing the builtin ``dir``.
        self.dish_ids = sorted(
            entry
            for entry in os.listdir(self.data_dir)
            if os.path.isfile(os.path.join(self.data_dir, entry, "rgb.png"))
        )

        # Directory of each dish, aligned index-for-index with ``dish_ids``.
        self.img_paths = [
            os.path.join(self.data_dir, dish_id) for dish_id in self.dish_ids
        ]

    def __len__(self):
        """Return the number of dishes that have an ``rgb.png`` image."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return the transformed image at ``idx`` (and its label if ``train``).

        Raises:
            KeyError: In training mode, if the dish id has no entry in
                ``labels``.
        """
        rgb_path = f"{self.img_paths[idx]}/rgb.png"
        # ``Image.open`` is lazy and keeps the file open; ``convert`` forces
        # the pixel data to be read so the handle can be closed right away,
        # and guarantees a 3-channel image for the transforms.
        with Image.open(rgb_path) as raw_img:
            rgb_img = raw_img.convert("RGB")
        transformed_img = self.transforms(rgb_img)

        if not self.train:
            return transformed_img

        dish_id = self.dish_ids[idx]
        # Explicit dtype so the label tensor is float32 even if every label
        # value happens to be an integer.
        label = torch.tensor(
            list(self.labels[dish_id].values()), dtype=torch.float32
        )
        return transformed_img, label

In [10]:
# The "train" split is used for training; the "test" split serves as the
# validation set.
TRAIN_DIR, VALID_DIR = f"{IMAGERY_DIR}/train", f"{IMAGERY_DIR}/test"
# TEST_DIR = IMAGERY_DIR

# No ``transforms`` argument is passed, so both datasets use the
# RGBDataset default pipeline.
train_dataset, valid_dataset = (
    RGBDataset(split_dir, labels=labels_dict)
    for split_dir in (TRAIN_DIR, VALID_DIR)
)
# test_dataset = RGBDataset(TEST_DIR, train=False)

# Only the training loader shuffles its samples.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=config.batch_size,
    num_workers=2,
)
valid_loader = DataLoader(
    valid_dataset,
    shuffle=False,
    batch_size=config.batch_size,
    num_workers=2,
)
# test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=2)

# Number of mini-batches in one pass over the training data.
num_training_batches = len(train_loader)

# InceptionV2

In [11]:
def ConvBNReLU(in_channels,out_channels,kernel_size,stride=1,padding=0):
    """Build a Conv2d -> BatchNorm2d -> ReLU6 stack.

    Note that, despite the name, the activation is ``nn.ReLU6`` (in-place).

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels (also the BatchNorm width).
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size,
        stride=stride,
        padding=padding,
    )
    norm = nn.BatchNorm2d(out_channels)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, act)

def ConvBNReLUFactorization(in_channels,out_channels,kernel_sizes,paddings):
    """Build a stride-1 Conv2d -> BatchNorm2d -> ReLU6 stack with a 2-D kernel spec.

    Used for the factorized (1xN / Nx1) convolutions of the Inception
    modules.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels (also the BatchNorm width).
        kernel_sizes: Kernel size per spatial dimension, e.g. ``[1, 3]``.
        paddings: Padding per spatial dimension, e.g. ``[0, 1]``.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_sizes,
        stride=1,
        padding=paddings,
    )
    norm = nn.BatchNorm2d(out_channels)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, act)

class InceptionV2ModuleA(nn.Module):
    """Inception module with four parallel, stride-1 branches.

    Branches (all applied to the same input):
      1. 1x1 conv
      2. 1x1 conv -> 3x3 conv
      3. 1x1 conv -> 3x3 conv -> 3x3 conv
      4. 3x3 max-pool -> 1x1 conv

    The branch outputs are concatenated along the channel dimension, giving
    ``out_channels1 + out_channels2 + out_channels3 + out_channels4``
    channels.
    """

    def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4):
        super().__init__()

        # Branch 1: plain 1x1 projection.
        self.branch1 = ConvBNReLU(in_channels, out_channels1, kernel_size=1)

        # Branch 2: 1x1 reduction followed by one padded 3x3 conv.
        self.branch2 = nn.Sequential(
            ConvBNReLU(in_channels, out_channels2reduce, kernel_size=1),
            ConvBNReLU(out_channels2reduce, out_channels2, kernel_size=3, padding=1),
        )

        # Branch 3: 1x1 reduction followed by two stacked padded 3x3 convs.
        self.branch3 = nn.Sequential(
            ConvBNReLU(in_channels, out_channels3reduce, kernel_size=1),
            ConvBNReLU(out_channels3reduce, out_channels3, kernel_size=3, padding=1),
            ConvBNReLU(out_channels3, out_channels3, kernel_size=3, padding=1),
        )

        # Branch 4: size-preserving max-pool followed by a 1x1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            ConvBNReLU(in_channels, out_channels4, kernel_size=1),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them channel-wise."""
        branch_outputs = [
            branch(x)
            for branch in (self.branch1, self.branch2, self.branch3, self.branch4)
        ]
        return torch.cat(branch_outputs, dim=1)

class InceptionV2ModuleB(nn.Module):
    """Inception module whose 3x3 convolutions are factorized into 1x3 and 3x1.

    Branches (all applied to the same input, all stride 1):
      1. 1x1 conv
      2. 1x1 conv -> 1x3 conv -> 3x1 conv
      3. 1x1 conv -> 1x3 conv -> 3x1 conv -> 1x3 conv -> 3x1 conv
      4. 3x3 max-pool -> 1x1 conv

    The branch outputs are concatenated along the channel dimension, giving
    ``out_channels1 + out_channels2 + out_channels3 + out_channels4``
    channels.
    """

    def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4):
        super().__init__()

        # Branch 1: plain 1x1 projection.
        self.branch1 = ConvBNReLU(in_channels, out_channels1, kernel_size=1)

        # Branch 2: 1x1 reduction, then one factorized 3x3 (1x3 then 3x1);
        # the channel count only changes in the last conv.
        self.branch2 = nn.Sequential(
            ConvBNReLU(in_channels, out_channels2reduce, kernel_size=1),
            ConvBNReLUFactorization(out_channels2reduce, out_channels2reduce, kernel_sizes=(1, 3), paddings=(0, 1)),
            ConvBNReLUFactorization(out_channels2reduce, out_channels2, kernel_sizes=(3, 1), paddings=(1, 0)),
        )

        # Branch 3: 1x1 reduction, then two factorized 3x3 convs back to
        # back; again only the last conv changes the channel count.
        self.branch3 = nn.Sequential(
            ConvBNReLU(in_channels, out_channels3reduce, kernel_size=1),
            ConvBNReLUFactorization(out_channels3reduce, out_channels3reduce, kernel_sizes=(1, 3), paddings=(0, 1)),
            ConvBNReLUFactorization(out_channels3reduce, out_channels3reduce, kernel_sizes=(3, 1), paddings=(1, 0)),
            ConvBNReLUFactorization(out_channels3reduce, out_channels3reduce, kernel_sizes=(1, 3), paddings=(0, 1)),
            ConvBNReLUFactorization(out_channels3reduce, out_channels3, kernel_sizes=(3, 1), paddings=(1, 0)),
        )

        # Branch 4: size-preserving max-pool followed by a 1x1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            ConvBNReLU(in_channels, out_channels4, kernel_size=1),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them channel-wise."""
        branch_outputs = [
            branch(x)
            for branch in (self.branch1, self.branch2, self.branch3, self.branch4)
        ]
        return torch.cat(branch_outputs, dim=1)

class InceptionV2ModuleC(nn.Module):
    """Inception module whose branches 2 and 3 end in a parallel 1x3 / 3x1 split.

    Branches (all applied to the same input, all stride 1):
      1. 1x1 conv
      2. 1x1 conv -> {1x3 conv, 3x1 conv} concatenated
      3. 1x1 conv -> 3x3 conv -> {3x1 conv, 1x3 conv} concatenated
      4. 3x3 max-pool -> 1x1 conv

    Because branches 2 and 3 each concatenate two outputs, the module emits
    ``out_channels1 + 2 * out_channels2 + 2 * out_channels3 + out_channels4``
    channels.
    """

    def __init__(self, in_channels,out_channels1,out_channels2reduce, out_channels2, out_channels3reduce, out_channels3, out_channels4):
        super().__init__()

        # Branch 1: plain 1x1 projection.
        self.branch1 = ConvBNReLU(in_channels, out_channels1, kernel_size=1)

        # Branch 2: shared 1x1 reduction feeding two sibling convs.
        self.branch2_conv1 = ConvBNReLU(in_channels, out_channels2reduce, kernel_size=1)
        self.branch2_conv2a = ConvBNReLUFactorization(out_channels2reduce, out_channels2, kernel_sizes=(1, 3), paddings=(0, 1))
        self.branch2_conv2b = ConvBNReLUFactorization(out_channels2reduce, out_channels2, kernel_sizes=(3, 1), paddings=(1, 0))

        # Branch 3: 1x1 reduction and a 3x3 conv feeding two sibling convs.
        self.branch3_conv1 = ConvBNReLU(in_channels, out_channels3reduce, kernel_size=1)
        self.branch3_conv2 = ConvBNReLU(out_channels3reduce, out_channels3, kernel_size=3, stride=1, padding=1)
        self.branch3_conv3a = ConvBNReLUFactorization(out_channels3, out_channels3, kernel_sizes=(3, 1), paddings=(1, 0))
        self.branch3_conv3b = ConvBNReLUFactorization(out_channels3, out_channels3, kernel_sizes=(1, 3), paddings=(0, 1))

        # Branch 4: size-preserving max-pool followed by a 1x1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            ConvBNReLU(in_channels, out_channels4, kernel_size=1),
        )

    def forward(self, x):
        """Run all branches on ``x`` and concatenate them channel-wise."""
        first = self.branch1(x)

        reduced2 = self.branch2_conv1(x)
        second = torch.cat(
            [self.branch2_conv2a(reduced2), self.branch2_conv2b(reduced2)],
            dim=1,
        )

        reduced3 = self.branch3_conv1(x)
        widened3 = self.branch3_conv2(reduced3)
        third = torch.cat(
            [self.branch3_conv3a(widened3), self.branch3_conv3b(widened3)],
            dim=1,
        )

        fourth = self.branch4(x)
        return torch.cat([first, second, third, fourth], dim=1)

class InceptionV3ModuleD(nn.Module):
    """Downsampling Inception module with three parallel stride-2 branches.

    Branches (all applied to the same input):
      1. 1x1 conv -> 3x3 conv (stride 2)
      2. 1x1 conv -> 3x3 conv (stride 1) -> 3x3 conv (stride 2)
      3. 3x3 max-pool (stride 2)

    The branch outputs are concatenated along the channel dimension, giving
    ``out_channels1 + out_channels2 + in_channels`` channels (the pooling
    branch keeps the input channel count).
    """

    def __init__(self, in_channels,out_channels1reduce,out_channels1,out_channels2reduce, out_channels2):
        super().__init__()

        # Branch 1: 1x1 reduction, then a strided 3x3 conv.
        self.branch1 = nn.Sequential(
            ConvBNReLU(in_channels, out_channels1reduce, kernel_size=1),
            ConvBNReLU(out_channels1reduce, out_channels1, kernel_size=3, stride=2, padding=1),
        )

        # Branch 2: 1x1 reduction, a stride-1 3x3 conv, then a strided
        # 3x3 conv.
        self.branch2 = nn.Sequential(
            ConvBNReLU(in_channels, out_channels2reduce, kernel_size=1),
            ConvBNReLU(out_channels2reduce, out_channels2, kernel_size=3, stride=1, padding=1),
            ConvBNReLU(out_channels2, out_channels2, kernel_size=3, stride=2, padding=1),
        )

        # Branch 3: parameter-free strided max-pool.
        self.branch3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        """Run the three branches on ``x`` and concatenate them channel-wise."""
        branch_outputs = [
            branch(x) for branch in (self.branch1, self.branch2, self.branch3)
        ]
        return torch.cat(branch_outputs, dim=1)

class InceptionAux(nn.Module):
    """Auxiliary head: avg-pool -> 1x1 conv -> 5x5 conv -> dropout -> linear.

    The linear layer expects exactly 768 flattened features, so the 5x5
    convolution must reduce the feature map to 1x1 for the given input size.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of outputs of the final linear layer.
    """

    def __init__(self, in_channels,out_channels):
        super().__init__()

        self.auxiliary_avgpool = nn.AvgPool2d(kernel_size=5, stride=3)
        self.auxiliary_conv1 = ConvBNReLU(in_channels, 128, kernel_size=1)
        # Plain convolution (no BatchNorm / activation) up to 768 channels.
        self.auxiliary_conv2 = nn.Conv2d(128, 768, kernel_size=5, stride=1)
        self.auxiliary_dropout = nn.Dropout(p=0.7)
        self.auxiliary_linear1 = nn.Linear(768, out_channels)

    def forward(self, x):
        """Return the auxiliary prediction for feature map ``x``."""
        pooled = self.auxiliary_avgpool(x)
        feats = self.auxiliary_conv2(self.auxiliary_conv1(pooled))
        # Flatten everything except the batch dimension.
        flat = feats.view(feats.size(0), -1)
        return self.auxiliary_linear1(self.auxiliary_dropout(flat))

class InceptionV2(nn.Module):
    """Inception-V2 style feature extractor (no classification layer).

    ``forward`` returns the flattened, 2x2 average-pooled output of the last
    block, i.e. ``1024 * 2 * 2`` features per sample.

    Args:
        num_classes: Accepted for interface compatibility; not used by this
            class.
        stage: Stored on the instance as ``self.stage``; not otherwise used
            by this class.
    """

    def __init__(self, num_classes=1000, stage='train'):
        super().__init__()
        self.stage = stage

        # Stem: 7x7 stride-2 conv followed by a stride-2 max-pool.
        self.block1 = nn.Sequential(
            ConvBNReLU(3, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        self.block2 = nn.Sequential(
            ConvBNReLU(64, 192, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # Positional argument order for modules A / B / C:
        #   (in_channels, out_channels1, out_channels2reduce, out_channels2,
        #    out_channels3reduce, out_channels3, out_channels4)
        # and for module D:
        #   (in_channels, out_channels1reduce, out_channels1,
        #    out_channels2reduce, out_channels2)
        self.block3 = nn.Sequential(
            InceptionV2ModuleA(192, 64, 64, 64, 64, 96, 32),   # -> 256 channels
            InceptionV2ModuleA(256, 64, 64, 96, 64, 96, 64),   # -> 320 channels
            InceptionV3ModuleD(320, 128, 160, 64, 96),         # -> 576 channels
        )

        self.block4 = nn.Sequential(
            InceptionV2ModuleB(576, 224, 64, 96, 96, 128, 128),
            InceptionV2ModuleB(576, 192, 96, 128, 96, 128, 128),
            InceptionV2ModuleB(576, 160, 128, 160, 128, 128, 128),
            InceptionV2ModuleB(576, 96, 128, 192, 160, 160, 128),
            InceptionV3ModuleD(576, 128, 192, 192, 256),       # -> 1024 channels
        )

        self.block5 = nn.Sequential(
            InceptionV2ModuleC(1024, 352, 192, 160, 160, 112, 128),
            InceptionV2ModuleC(1024, 352, 192, 160, 192, 112, 128),
        )

        self.avg_pool = nn.AdaptiveAvgPool2d((2,2))
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return the flattened feature vector for image batch ``x``."""
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            x = block(x)
        pooled = self.dropout(self.avg_pool(x))
        # Flatten everything except the batch dimension.
        return pooled.view(pooled.size(0), -1)

# Resnets

In [12]:
import torch.utils.model_zoo as model_zoo
# Download locations of the pre-trained ResNet checkpoints, keyed by
# architecture name; looked up by the resnet* constructor functions.
resnet_model_urls = {
    'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
    'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
    'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
}

class Bottleneck(nn.Module):
  """ResNet bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions plus a skip path.

  In this variant the block's stride is applied by the first 1x1
  convolution; the 3x3 convolution always uses stride 1.

  Args:
    inplanes: Number of input channels.
    planes: Width of the two inner convolutions; the block outputs
      ``planes * 4`` channels.
    stride: Stride of the first 1x1 convolution.
    downsample: Optional module applied to the input to produce the
      residual when its shape differs from the block output.
  """
  expansion = 4

  def __init__(self, inplanes, planes, stride=1, downsample=None):
    super().__init__()
    widened = planes * 4

    # 1x1 reduction; carries the stride in this variant.
    self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
    self.bn1 = nn.BatchNorm2d(planes)

    # 3x3 convolution, always stride 1 here.
    self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(planes)

    # 1x1 expansion to four times the inner width.
    self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
    self.bn3 = nn.BatchNorm2d(widened)

    self.relu = nn.ReLU(inplace=True)
    self.downsample = downsample
    self.stride = stride

  def forward(self, x):
    """Return ``relu(main_path(x) + residual)``."""
    out = self.relu(self.bn1(self.conv1(x)))
    out = self.relu(self.bn2(self.conv2(out)))
    out = self.bn3(self.conv3(out))

    # The skip path is the input itself unless a projection was supplied.
    residual = x if self.downsample is None else self.downsample(x)

    out += residual
    return self.relu(out)
    
    
class ResNet(nn.Module):
  """ResNet backbone with a linear classification layer.

  Args:
    block: Residual block class; must expose an ``expansion`` attribute and
      accept ``(inplanes, planes, stride, downsample)``.
    layers: Number of blocks in each of the four stages.
    num_classes: Output size of the final linear layer.
  """

  def __init__(self, block, layers, num_classes=1000):
    super().__init__()
    # Channel count entering the next stage; updated by ``_make_layer``.
    self.inplanes = 64

    # Stem: 7x7 stride-2 convolution and a ceil-mode, unpadded max-pool.
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)

    # Four residual stages; every stage after the first uses stride 2.
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

    self.avgpool = nn.AvgPool2d(7)
    self.fc = nn.Linear(512 * block.expansion, num_classes)

    # Weight init: convolutions ~ N(0, sqrt(2 / (kh * kw * out_channels)));
    # BatchNorm scale 1 and shift 0.
    for module in self.modules():
      if isinstance(module, nn.Conv2d):
        kernel_h, kernel_w = module.kernel_size
        fan_out = kernel_h * kernel_w * module.out_channels
        module.weight.data.normal_(0, math.sqrt(2. / fan_out))
      elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()

  def _make_layer(self, block, planes, blocks, stride=1):
    """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

    Only the first block receives ``stride`` and, when the shape changes,
    a 1x1-conv + BatchNorm projection for the skip path.
    """
    out_planes = planes * block.expansion

    needs_projection = stride != 1 or self.inplanes != out_planes
    if needs_projection:
      downsample = nn.Sequential(
        nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, bias=False),
        nn.BatchNorm2d(out_planes),
      )
    else:
      downsample = None

    stage = [block(self.inplanes, planes, stride, downsample)]
    # All following blocks (and the next stage) see the expanded width.
    self.inplanes = out_planes
    for _ in range(1, blocks):
      stage.append(block(self.inplanes, planes))

    return nn.Sequential(*stage)

  def forward(self, x):
    """Return the ``num_classes`` outputs for image batch ``x``."""
    x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

    for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
      x = stage(x)

    pooled = self.avgpool(x)
    # Flatten everything except the batch dimension.
    flat = pooled.view(pooled.size(0), -1)
    return self.fc(flat)

def resnet50(pretrained=True):
  """Build a ResNet-50 (Bottleneck blocks, stage depths 3-4-6-3).

  Args:
    pretrained (bool): If True, download the checkpoint registered under
      'resnet50' in ``resnet_model_urls`` (described by the original
      author as pre-trained on ImageNet) and load it into the model.
  """
  net = ResNet(Bottleneck, [3, 4, 6, 3])
  if not pretrained:
    return net
  state = model_zoo.load_url(resnet_model_urls['resnet50'])
  net.load_state_dict(state)
  return net

def resnet101(pretrained=True):
  """Build a ResNet-101 (Bottleneck blocks, stage depths 3-4-23-3).

  Args:
    pretrained (bool): If True, download the checkpoint registered under
      'resnet101' in ``resnet_model_urls`` (described by the original
      author as pre-trained on ImageNet) and load it into the model.
  """
  net = ResNet(Bottleneck, [3, 4, 23, 3])
  if not pretrained:
    return net
  state = model_zoo.load_url(resnet_model_urls['resnet101'])
  net.load_state_dict(state)
  return net


def resnet152(pretrained=True):
  """Build a ResNet-152 (Bottleneck blocks, stage depths 3-8-36-3).

  Args:
    pretrained (bool): If True, download the checkpoint registered under
      'resnet152' in ``resnet_model_urls`` (described by the original
      author as pre-trained on ImageNet) and load it into the model.
  """
  net = ResNet(Bottleneck, [3, 8, 36, 3])
  if not pretrained:
    return net
  state = model_zoo.load_url(resnet_model_urls['resnet152'])
  net.load_state_dict(state)
  return net

# ConvNext

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def truncated_normal_(tensor, mean=0, std=0.09):
    """Fill ``tensor`` in place from a normal distribution truncated at 2 std.

    Every value is drawn from N(mean, std^2) restricted to
    ``[mean - 2 * std, mean + 2 * std]``.

    This delegates to ``torch.nn.init.trunc_normal_`` instead of picking the
    first of four candidate samples per element that falls inside the
    bounds: that scheme silently falls back to an out-of-range candidate
    whenever all four are rejected.

    Args:
        tensor: Tensor (or parameter) to overwrite.
        mean: Centre of the distribution.
        std: Standard deviation before truncation; its sign is ignored and
            0 fills the tensor with ``mean``.

    Returns:
        The same ``tensor`` object, modified in place.
    """
    spread = abs(std)
    with torch.no_grad():
        if spread == 0:
            # Degenerate distribution: every sample equals the mean.
            tensor.fill_(mean)
        else:
            nn.init.trunc_normal_(
                tensor,
                mean=mean,
                std=spread,
                a=mean - 2 * spread,
                b=mean + 2 * spread,
            )
    return tensor


class _DropPath(nn.Module):
    """Stochastic depth: drop the whole residual branch of random samples.

    During training each sample's branch output is zeroed with probability
    ``drop_prob`` and the surviving ones are rescaled by
    ``1 / (1 - drop_prob)``. In eval mode the input is returned unchanged.
    """

    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if not self.training or self.drop_prob == 0.0:
            return x
        keep_prob = 1.0 - self.drop_prob
        # One Bernoulli draw per sample, broadcast over all other dims.
        mask_shape = (x.shape[0],) + (1,) * (x.dim() - 1)
        mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
        if keep_prob > 0.0:
            # Rescale so the expected value of the branch is unchanged.
            mask.div_(keep_prob)
        return x * mask


class ConvNeXtBlock(nn.Module):
    """ConvNeXt residual block.

    The architecture of this block is as follows:

    depth-wise conv -> permute to (N, H, W, C) [channel last] -> layer norm
    -> linear -> GELU -> linear -> optional layer scale -> permute back
    -> stochastic depth -> residual add

    The channel-last layout lets the two point-wise (1x1) convolutions be
    expressed as ``nn.Linear`` layers.

    Args:
        in_channel: Number of input (and output) channels.
        depth_rate: Stochastic-depth drop probability for this block. With
            the default 0.0 the residual branch is never dropped. May be a
            Python number or a 0-dim tensor.
        layer_scale_init_value: Initial value of the learnable per-channel
            scale ``gamma``; a value <= 0 disables layer scale.
    """

    def __init__(
        self, in_channel, depth_rate=0.0, layer_scale_init_value=1e-6
    ):
        super(ConvNeXtBlock, self).__init__()

        # Grouped convolution with groups == channels behaves as a
        # depth-wise convolution.
        self.depthWiseConv = nn.Conv2d(
            in_channel, in_channel, kernel_size=7, padding=3, groups=in_channel
        )

        self.norm = Layer_norm(in_channel, eps=1e-6)

        # A point-wise (1x1) convolution is equivalent to a linear layer on
        # channel-last data.
        self.pointWiseConv1 = nn.Linear(in_channel, 4 * in_channel)

        self.activation = nn.GELU()

        self.pointWiseConv2 = nn.Linear(4 * in_channel, in_channel)

        self.gamma = (
            nn.Parameter(
                layer_scale_init_value * torch.ones((in_channel)),
                requires_grad=True,
            )
            if layer_scale_init_value > 0
            else None
        )

        # Stochastic depth shrinks the effective depth of the network during
        # training while keeping it unchanged at test time, by randomly
        # dropping entire residual branches and falling back to the skip
        # connection. Previously ``depth_rate`` was accepted but ignored;
        # it is now honoured, and a rate of 0 keeps the identity behaviour.
        drop_prob = float(depth_rate)
        self.dropPath = (
            _DropPath(drop_prob) if drop_prob > 0.0 else nn.Identity()
        )

    def forward(self, x):
        """Apply the block to ``x`` of layout (N, C, H, W); shape is preserved."""
        in_tensor = x
        x = self.depthWiseConv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pointWiseConv1(x)
        x = self.activation(x)
        x = self.pointWiseConv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        x = in_tensor + self.dropPath(x)

        return x


class Layer_norm(nn.Module):
    """Layer normalization over the channel dimension for either layout.

    Args:
        normShape: Number of channels being normalized.
        eps: Value added to the variance for numerical stability.
        input_format: ``"Channel_Last"`` normalizes the last dimension via
            ``F.layer_norm``; ``"Channel_First"`` normalizes dimension 1
            manually and expects the scale/shift to broadcast over two
            trailing dimensions.

    Raises:
        NotImplementedError: If ``input_format`` is not one of the two
            supported values.
    """

    def __init__(self, normShape, eps=1e-6, input_format="Channel_Last"):
        super().__init__()
        if input_format not in ("Channel_Last", "Channel_First"):
            raise NotImplementedError
        # Learnable per-channel scale and shift.
        self.weight = nn.Parameter(torch.ones(normShape))
        self.bias = nn.Parameter(torch.zeros(normShape))
        self.eps = eps
        self.dataFormat = input_format
        self.normShape = (normShape,)

    def forward(self, x):
        """Return ``x`` normalized over its channel dimension."""
        if self.dataFormat == "Channel_Last":
            return F.layer_norm(
                x, self.normShape, self.weight, self.bias, self.eps
            )
        if self.dataFormat == "Channel_First":
            # Mean and (biased) variance over the channel dimension.
            mean = x.mean(1, keepdim=True)
            variance = (x - mean).pow(2).mean(1, keepdim=True)
            normalized = (x - mean) / torch.sqrt(variance + self.eps)
            scale = self.weight[:, None, None]
            shift = self.bias[:, None, None]
            return scale * normalized + shift


class ConvNeXt(nn.Module):
    """ConvNeXt backbone with a linear classification head.

    Args:
        in_channels (int): Number of input image channels. Default: 3
        num_classes (int): Number of outputs of the classification head. Default: 1000
        depths (list(int)): Number of blocks at each of the four stages. Default: [3, 3, 9, 3]
        dims (list(int)): Feature dimension at each of the four stages. Default: [96, 192, 384, 768]
        drop_path_rate (float): Maximum stochastic depth rate; per-block rates are
            spaced linearly from 0 to this value and passed to each block. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
    """

    def __init__(
        self,
        in_channels=3,
        num_classes=1000,
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        drop_path_rate=0.0,
        layer_scale_init_value=1e-6,
        head_init_scale=1.0,
    ):
        super().__init__()

        # Resolution reducers: a 4x4 stride-4 stem, then three norm +
        # 2x2 stride-2 convolutions that also widen the channels.
        reducers = [
            nn.Sequential(
                nn.Conv2d(in_channels, dims[0], kernel_size=4, stride=4),
                Layer_norm(dims[0], eps=1e-6, input_format="Channel_First"),
            )
        ]
        for stage_idx in range(1, 4):
            reducers.append(
                nn.Sequential(
                    Layer_norm(dims[stage_idx - 1], eps=1e-6, input_format="Channel_First"),
                    nn.Conv2d(dims[stage_idx - 1], dims[stage_idx], kernel_size=2, stride=2),
                )
            )
        self.downsample_layers = nn.ModuleList(reducers)

        # One rate per block, increasing linearly over the whole network.
        dp_rates = list(torch.linspace(0, drop_path_rate, sum(depths)))

        stages = []
        offset = 0  # index of the current stage's first block in dp_rates
        for stage_idx in range(4):
            blocks = []
            for block_idx in range(depths[stage_idx]):
                blocks.append(
                    ConvNeXtBlock(
                        in_channel=dims[stage_idx],
                        depth_rate=dp_rates[offset + block_idx],
                        layer_scale_init_value=layer_scale_init_value,
                    )
                )
            stages.append(nn.Sequential(*blocks))
            offset += depths[stage_idx]
        self.stages = nn.ModuleList(stages)

        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)
        self.head = nn.Linear(dims[-1], num_classes)

        # Re-initialize every conv / linear layer, then scale the head.
        self.apply(self.init_weights)
        self.head.weight.data.mul_(head_init_scale)
        self.head.bias.data.mul_(head_init_scale)

    def init_weights(self, m):
        """Truncated-normal weights (std 0.02) and zero bias for conv / linear layers."""
        if not isinstance(m, (nn.Conv2d, nn.Linear)):
            return
        truncated_normal_(m.weight, std=0.02)
        nn.init.constant_(m.bias, 0)

    def forward_stages(self, x):
        """Return the normalized, globally average-pooled features of ``x``."""
        for stage_idx in range(4):
            x = self.downsample_layers[stage_idx](x)
            x = self.stages[stage_idx](x)
        # Global average pooling, (N, C, H, W) -> (N, C).
        pooled = x.mean([-2, -1])
        return self.norm(pooled)

    def forward(self, x, return_feats=False):
        """Return the head outputs, or the pooled features if ``return_feats`` is True."""
        feats = self.forward_stages(x)
        if return_feats is True:
            return feats
        return self.head(feats)


def convnext_tiny(**kwargs):
    """Build a ConvNeXt with depths [3, 3, 9, 3] and dims [96, 192, 384, 768].

    Any extra keyword arguments are forwarded to ``ConvNeXt``.
    """
    return ConvNeXt(
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        **kwargs,
    )


def convnext_small(**kwargs):
    """Build a ConvNeXt with depths [3, 3, 27, 3] and dims [96, 192, 384, 768].

    Any extra keyword arguments are forwarded to ``ConvNeXt``.
    """
    return ConvNeXt(
        depths=[3, 3, 27, 3],
        dims=[96, 192, 384, 768],
        **kwargs,
    )


def convnext_base(**kwargs):
    """Build a ConvNeXt with depths [3, 3, 27, 3] and dims [128, 256, 512, 1024].

    Any extra keyword arguments are forwarded to ``ConvNeXt``.
    """
    stage_depths = [3, 3, 27, 3]
    stage_dims = [128, 256, 512, 1024]
    return ConvNeXt(depths=stage_depths, dims=stage_dims, **kwargs)

# MobileNetV3

In [14]:
import warnings
from functools import partial
from typing import Any, Callable, List, Optional, Sequence

import torch
from torch import Tensor, nn
from torchvision.ops.misc import ConvNormActivation
from torchvision.ops.misc import SqueezeExcitation as SElayer
from torch.hub import load_state_dict_from_url

# Names intended as the public API of the MobileNetV3 code.
__all__ = ["MobileNetV3", "mobilenet_v3_large", "mobilenet_v3_small"]


# Download locations of the MobileNetV3 checkpoints, keyed by architecture
# name.
mobilnet_model_urls = {
    "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth",
    "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth",
}


def _make_divisible(
    v: float, divisor: int, min_value: Optional[int] = None
) -> int:
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class SqueezeExcitation(SElayer):
    """DEPRECATED wrapper around ``torchvision.ops.misc.SqueezeExcitation``.

    Derives the squeeze width from ``input_channels // squeeze_factor``
    (made divisible by 8), uses ``nn.Hardsigmoid`` as the scale activation,
    and exposes the parent's ``activation`` sub-module under the name
    ``relu`` instead. Emits a ``FutureWarning`` on construction.
    """

    def __init__(self, input_channels: int, squeeze_factor: int = 4):
        reduced = input_channels // squeeze_factor
        squeeze_channels = _make_divisible(reduced, 8)
        super().__init__(
            input_channels,
            squeeze_channels,
            scale_activation=nn.Hardsigmoid,
        )

        # Re-register the activation as ``relu`` and drop the original name.
        self.relu = self.activation
        delattr(self, "activation")

        message = (
            "This SqueezeExcitation class is deprecated and will be removed in future versions. "
            "Use torchvision.ops.misc.SqueezeExcitation instead."
        )
        warnings.warn(message, FutureWarning)


class InvertedResidualConfig:
    """Settings of one inverted-residual block.

    Stores information listed at Tables 1 and 2 of the MobileNetV3 paper.
    The three channel counts are scaled by ``width_mult`` and made divisible
    by 8; ``activation == "HS"`` is recorded as ``use_hs``.
    """

    def __init__(
        self,
        input_channels: int,
        kernel: int,
        expanded_channels: int,
        out_channels: int,
        use_se: bool,
        activation: str,
        stride: int,
        dilation: int,
        width_mult: float,
    ):
        scale = self.adjust_channels

        # Width-scaled channel counts.
        self.input_channels = scale(input_channels, width_mult)
        self.kernel = kernel
        self.expanded_channels = scale(expanded_channels, width_mult)
        self.out_channels = scale(out_channels, width_mult)

        # Block options, stored as given.
        self.use_se = use_se
        self.use_hs = activation == "HS"
        self.stride = stride
        self.dilation = dilation

    @staticmethod
    def adjust_channels(channels: int, width_mult: float):
        """Scale ``channels`` by ``width_mult`` and round to a multiple of 8."""
        scaled = channels * width_mult
        return _make_divisible(scaled, 8)


class InvertedResidual(nn.Module):
    """Inverted residual block.

    Implemented as described at section 5 of MobileNetV3 paper. The block is
    a sequence of: an optional 1x1 expansion (skipped when the expanded width
    equals the input width), a depthwise convolution, an optional
    squeeze-excitation layer, and a 1x1 projection without activation. The
    input is added to the output when the configured stride is 1 and the
    input and output channel counts match.
    """

    def __init__(
        self,
        cnf: InvertedResidualConfig,
        norm_layer: Callable[..., nn.Module],
        se_layer: Callable[..., nn.Module] = partial(
            SElayer, scale_activation=nn.Hardsigmoid
        ),
    ):
        """
        Args:
            cnf: Block configuration (channels, kernel, stride, dilation, ...).
            norm_layer: Factory for the normalization layer used by each conv.
            se_layer: Factory called as ``se_layer(channels, squeeze_channels)``.

        Raises:
            ValueError: If ``cnf.stride`` is not 1 or 2.
        """
        super().__init__()
        if cnf.stride < 1 or cnf.stride > 2:
            raise ValueError("illegal stride value")

        in_ch = cnf.input_channels
        mid_ch = cnf.expanded_channels
        out_ch = cnf.out_channels

        self.use_res_connect = cnf.stride == 1 and in_ch == out_ch

        if cnf.use_hs:
            activation_layer = nn.Hardswish
        else:
            activation_layer = nn.ReLU

        stages: List[nn.Module] = []

        # expand
        if mid_ch != in_ch:
            expand = ConvNormActivation(
                in_ch,
                mid_ch,
                kernel_size=1,
                norm_layer=norm_layer,
                activation_layer=activation_layer,
            )
            stages.append(expand)

        # depthwise: a dilated block always uses stride 1
        depthwise_stride = cnf.stride if cnf.dilation <= 1 else 1
        depthwise = ConvNormActivation(
            mid_ch,
            mid_ch,
            kernel_size=cnf.kernel,
            stride=depthwise_stride,
            dilation=cnf.dilation,
            groups=mid_ch,
            norm_layer=norm_layer,
            activation_layer=activation_layer,
        )
        stages.append(depthwise)

        if cnf.use_se:
            stages.append(se_layer(mid_ch, _make_divisible(mid_ch // 4, 8)))

        # project
        project = ConvNormActivation(
            mid_ch,
            out_ch,
            kernel_size=1,
            norm_layer=norm_layer,
            activation_layer=None,
        )
        stages.append(project)

        self.block = nn.Sequential(*stages)
        self.out_channels = out_ch
        self._is_cn = cnf.stride > 1

    def forward(self, input: Tensor) -> Tensor:
        """Run the block and add the input back when the residual connection is enabled."""
        result = self.block(input)
        if not self.use_res_connect:
            return result
        result += input
        return result


class MobileNetV3(nn.Module):
    """MobileNet V3 main class: conv feature extractor, global average pool, classifier."""

    def __init__(
        self,
        inverted_residual_setting: List[InvertedResidualConfig],
        last_channel: int,
        num_classes: int = 1000,
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        **kwargs: Any
    ) -> None:
        """
        Build the network from a list of block configurations.

        Args:
            inverted_residual_setting (List[InvertedResidualConfig]): Network structure
            last_channel (int): The number of channels on the penultimate layer
            num_classes (int): Number of classes
            block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for mobilenet
            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
            **kwargs: Accepted but not used by this constructor.

        Raises:
            ValueError: If ``inverted_residual_setting`` is empty.
            TypeError: If it is not a sequence of ``InvertedResidualConfig``.
        """
        super().__init__()

        if not inverted_residual_setting:
            raise ValueError(
                "The inverted_residual_setting should not be empty"
            )
        settings_are_valid = isinstance(
            inverted_residual_setting, Sequence
        ) and all(
            isinstance(s, InvertedResidualConfig)
            for s in inverted_residual_setting
        )
        if not settings_are_valid:
            raise TypeError(
                "The inverted_residual_setting should be List[InvertedResidualConfig]"
            )

        block = InvertedResidual if block is None else block
        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        stem_channels = inverted_residual_setting[0].input_channels
        trunk_channels = inverted_residual_setting[-1].out_channels
        head_channels = 6 * trunk_channels

        # building first layer
        stem = ConvNormActivation(
            3,
            stem_channels,
            kernel_size=3,
            stride=2,
            norm_layer=norm_layer,
            activation_layer=nn.Hardswish,
        )

        # building inverted residual blocks
        trunk = [block(cnf, norm_layer) for cnf in inverted_residual_setting]

        # building last several layers
        head = ConvNormActivation(
            trunk_channels,
            head_channels,
            kernel_size=1,
            norm_layer=norm_layer,
            activation_layer=nn.Hardswish,
        )

        self.features = nn.Sequential(stem, *trunk, head)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(head_channels, last_channel),
            nn.Hardswish(inplace=True),
            nn.Dropout(p=0.2, inplace=True),
            nn.Linear(last_channel, num_classes),
        )

        # Visit modules in self.modules() order so the sequence of random
        # draws during initialization stays the same.
        for module in self.modules():
            self._init_module(module)

    @staticmethod
    def _init_module(m: nn.Module) -> None:
        """Initialize one module's parameters according to its layer type."""
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        """Features -> adaptive average pool -> flatten from dim 1 -> classifier."""
        pooled = self.avgpool(self.features(x))
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)

    def forward(self, x: Tensor) -> Tensor:
        """Delegate to ``_forward_impl``."""
        return self._forward_impl(x)


def _mobilenet_v3_conf(
    arch: str,
    width_mult: float = 1.0,
    reduced_tail: bool = False,
    dilated: bool = False,
    **kwargs: Any
):
    """Build the block configuration list and penultimate width for ``arch``.

    Args:
        arch: ``"mobilenet_v3_large"`` or ``"mobilenet_v3_small"``.
        width_mult: Channel multiplier applied to every configured width.
        reduced_tail: If True, the widths of the last (C4) stage and the
            penultimate layer are floor-divided by 2.
        dilated: If True, the last (C4) stage uses dilation 2 instead of 1.
        **kwargs: Accepted but not used here.

    Returns:
        Tuple ``(inverted_residual_setting, last_channel)``.

    Raises:
        ValueError: If ``arch`` is not one of the two supported names.
    """
    reduce_divider = 2 if reduced_tail else 1
    dilation = 2 if dilated else 1

    # Row layout: (input_channels, kernel, expanded_channels, out_channels,
    #              use_se, activation, stride, dilation)
    if arch == "mobilenet_v3_large":
        tail_out = 160 // reduce_divider
        tail_exp = 960 // reduce_divider
        rows = [
            (16, 3, 16, 16, False, "RE", 1, 1),
            (16, 3, 64, 24, False, "RE", 2, 1),  # C1
            (24, 3, 72, 24, False, "RE", 1, 1),
            (24, 5, 72, 40, True, "RE", 2, 1),  # C2
            (40, 5, 120, 40, True, "RE", 1, 1),
            (40, 5, 120, 40, True, "RE", 1, 1),
            (40, 3, 240, 80, False, "HS", 2, 1),  # C3
            (80, 3, 200, 80, False, "HS", 1, 1),
            (80, 3, 184, 80, False, "HS", 1, 1),
            (80, 3, 184, 80, False, "HS", 1, 1),
            (80, 3, 480, 112, True, "HS", 1, 1),
            (112, 3, 672, 112, True, "HS", 1, 1),
            (112, 5, 672, tail_out, True, "HS", 2, dilation),  # C4
            (tail_out, 5, tail_exp, tail_out, True, "HS", 1, dilation),
            (tail_out, 5, tail_exp, tail_out, True, "HS", 1, dilation),
        ]
        penultimate = 1280 // reduce_divider  # C5
    elif arch == "mobilenet_v3_small":
        tail_out = 96 // reduce_divider
        tail_exp = 576 // reduce_divider
        rows = [
            (16, 3, 16, 16, True, "RE", 2, 1),  # C1
            (16, 3, 72, 24, False, "RE", 2, 1),  # C2
            (24, 3, 88, 24, False, "RE", 1, 1),
            (24, 5, 96, 40, True, "HS", 2, 1),  # C3
            (40, 5, 240, 40, True, "HS", 1, 1),
            (40, 5, 240, 40, True, "HS", 1, 1),
            (40, 5, 120, 48, True, "HS", 1, 1),
            (48, 5, 144, 48, True, "HS", 1, 1),
            (48, 5, 288, tail_out, True, "HS", 2, dilation),  # C4
            (tail_out, 5, tail_exp, tail_out, True, "HS", 1, dilation),
            (tail_out, 5, tail_exp, tail_out, True, "HS", 1, dilation),
        ]
        penultimate = 1024 // reduce_divider  # C5
    else:
        raise ValueError("Unsupported model type {}".format(arch))

    inverted_residual_setting = [
        InvertedResidualConfig(*row, width_mult=width_mult) for row in rows
    ]
    last_channel = InvertedResidualConfig.adjust_channels(
        penultimate, width_mult=width_mult
    )
    return inverted_residual_setting, last_channel


def _mobilenet_v3_model(
    arch: str,
    inverted_residual_setting: List[InvertedResidualConfig],
    last_channel: int,
    pretrained: bool,
    progress: bool,
    **kwargs: Any
):
    """Instantiate ``MobileNetV3`` and optionally load the checkpoint registered for ``arch``.

    Args:
        arch: Key looked up in ``mobilnet_model_urls`` when ``pretrained`` is set.
        inverted_residual_setting: Block configurations forwarded to the model.
        last_channel: Penultimate layer width forwarded to the model.
        pretrained: If True, download and load the state dict for ``arch``.
        progress: Forwarded to ``load_state_dict_from_url``.
        **kwargs: Forwarded to the ``MobileNetV3`` constructor.

    Raises:
        ValueError: If ``pretrained`` is set and no URL is registered for ``arch``.
    """
    model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs)
    if not pretrained:
        return model

    checkpoint_url = mobilnet_model_urls.get(arch, None)
    if checkpoint_url is None:
        raise ValueError("No checkpoint is available for model type {}".format(arch))
    weights = load_state_dict_from_url(checkpoint_url, progress=progress)
    model.load_state_dict(weights)
    return model


def mobilenet_v3_large(
    pretrained: bool = False, progress: bool = True, **kwargs: Any
) -> MobileNetV3:
    """
    Constructs a large MobileNetV3 architecture from
    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.

    The keyword arguments are passed both to the configuration builder and
    to the model constructor.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    arch = "mobilenet_v3_large"
    setting, penultimate = _mobilenet_v3_conf(arch, **kwargs)
    return _mobilenet_v3_model(
        arch=arch,
        inverted_residual_setting=setting,
        last_channel=penultimate,
        pretrained=pretrained,
        progress=progress,
        **kwargs
    )


def mobilenet_v3_small(
    pretrained: bool = True, progress: bool = True, **kwargs: Any
) -> MobileNetV3:
    """
    Constructs a small MobileNetV3 architecture from
    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.

    Note that ``pretrained`` defaults to True here, so calling this function
    with no arguments requests the pre-trained checkpoint. The keyword
    arguments are passed both to the configuration builder and to the model
    constructor.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    arch = "mobilenet_v3_small"
    setting, penultimate = _mobilenet_v3_conf(arch, **kwargs)
    return _mobilenet_v3_model(
        arch=arch,
        inverted_residual_setting=setting,
        last_channel=penultimate,
        pretrained=pretrained,
        progress=progress,
        **kwargs
    )


# Available Backbones

In [15]:
# Registry of selectable backbones. Each entry maps a backbone name to the
# callable that builds it ("class") and an optional single positional
# argument for that callable ("config"; None means "call with no arguments").
BACKBONES_MAP = {
    "inceptionv2": {"config": [1, 1, 3], "class": InceptionV2},
    **{
        backbone_name: {"config": None, "class": factory}
        for backbone_name, factory in (
            ("resnet50", resnet50),
            ("resnet101", resnet101),
            ("resnet152", resnet152),
            ("convnext_base", convnext_base),
            ("convnext_small", convnext_small),
            ("convnext_tiny", convnext_tiny),
            ("mobilenetv3_small", mobilenet_v3_small),
        )
    },
}

# Baseline Model

In [16]:
class BaseNet(nn.Module):
    """Backbone followed by shared linear layers and three regression heads.

    The heads produce, in order, a 1-wide calorie output, a 1-wide mass
    output and a 3-wide output (``fc_mc``).
    """

    def __init__(self, backbone="resnet101"):
        """Build the network.

        Args:
            backbone: Any key of ``BACKBONES_MAP``. The entry's "class" is
                called with its "config" as the only argument, or with no
                arguments when the config is None.
        """
        super().__init__()
        self.backbone_name = backbone

        entry = BACKBONES_MAP[backbone]
        factory, backbone_config = entry["class"], entry["config"]
        if backbone_config is None:
            self.backbone = factory()
        else:
            self.backbone = factory(backbone_config)

        # Maps a 1000-wide backbone output to the 4096-wide shared space.
        self.fc = nn.Linear(1000, 4096)
        self.fc1 = nn.Linear(4096, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc_calories = self._make_head(1)
        self.fc_mass = self._make_head(1)
        self.fc_mc = self._make_head(3)

    @staticmethod
    def _make_head(out_features):
        """Two stacked linear layers: 4096 -> 4096 -> ``out_features``."""
        return nn.Sequential(
            nn.Linear(4096, 4096),
            nn.Linear(4096, out_features)
        )

    def forward(self, x):
        """Return the tuple ``(calories, mass, fc_mc output)`` for input ``x``."""
        features = self.backbone(x)
        # Every backbone except 'inceptionv2' goes through the extra
        # 1000 -> 4096 projection; inceptionv2's output is fed to fc1 as is
        # (presumably it is already 4096 wide -- confirm against InceptionV2).
        if self.backbone_name != 'inceptionv2':
            features = self.fc(features)
        shared = self.fc2(self.fc1(features))

        x_cal = self.fc_calories(shared)
        x_mass = self.fc_mass(shared)
        x_mn = self.fc_mc(shared)
        return x_cal, x_mass, x_mn

# Multi-task Loss

In [17]:
class MultiTaskLearner(nn.Module):
    """Wrap a three-output model and return the sum of its per-task losses."""

    def __init__(self, model: nn.Module, criterion=None):
        """
        Args:
            model: Module whose forward returns three tensors
                ``(out_cal, out_mass, out_mn)``.
            criterion: Loss applied to each task. Defaults to a fresh
                ``nn.L1Loss()`` per learner; a default created in the
                signature would be one module instance shared by every
                learner.
        """
        super().__init__()
        self.model = model
        self.criterion = nn.L1Loss() if criterion is None else criterion

    def forward(self, x, y):
        """Compute the total loss for inputs ``x`` against targets ``y``.

        ``y`` is indexed by column: column 0 is compared with the first model
        output, column 1 with the second, and columns 2 onward with the third.
        Per the original note, a row is
        [total_calories, total_mass, total_fat, total_carb, total_protein].
        """
        out_cal, out_mass, out_mn = self.model(x)

        # Slices (0:1, 1:2) keep the column dimension so targets match the
        # (batch, 1) shaped outputs instead of broadcasting.
        loss_calorie = self.criterion(out_cal, y[:, 0:1])
        loss_mass = self.criterion(out_mass, y[:, 1:2])
        loss_mn = self.criterion(out_mn, y[:, 2:])

        return loss_calorie + loss_mass + loss_mn

# Delete Model

In [26]:
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()
# NOTE(review): the deletion below is commented out, so this cell only clears
# the cache and leaves `model` alive.
# del model

# Utility Functions

In [19]:
CHECKPOINT_PATH = "/content/drive/MyDrive/checkpoints"


def create_dir_if_not_exists(dirpath: str) -> None:
    """Create the specified directory with all intermediate directories if necessary.

    Uses ``exist_ok=True`` instead of a separate existence check, so the call
    stays safe when the directory appears between the check and the creation.

    Args:
        dirpath: Directory to create. An already existing directory is left
            untouched.
    """
    os.makedirs(dirpath, exist_ok=True)

def save_checkpoint(
    epoch: int,
    loss: float,
    model: nn.Module,
    model_name: str,
    checkpoint_path: str = CHECKPOINT_PATH,
):
    """Write a checkpoint for ``model`` to ``checkpoint_path/model_name``.

    The saved object is a dict with the keys "epoch", "loss" and
    "model_state_dict". The target directory is created when missing.

    Args:
        epoch: Epoch index stored alongside the weights.
        loss: Loss value stored alongside the weights.
        model: Module whose ``state_dict()`` is saved.
        model_name: File name of the checkpoint inside ``checkpoint_path``.
        checkpoint_path: Directory that receives the checkpoint file.
    """
    create_dir_if_not_exists(checkpoint_path)
    payload = {
        "epoch": epoch,
        "loss": loss,
        "model_state_dict": model.state_dict(),
    }
    destination = os.path.join(checkpoint_path, model_name)
    torch.save(payload, destination)


def load_checkpoint(filepath: str, map_location=None):
    """Load a checkpoint dict and return its three stored fields.

    Args:
        filepath: Path of a checkpoint holding the keys "epoch", "loss" and
            "model_state_dict".
        map_location: Forwarded to ``torch.load``; pass e.g. ``"cpu"`` to load
            a checkpoint saved from GPU tensors on a machine without that
            device. The default (None) keeps ``torch.load``'s own behaviour.

    Returns:
        Tuple ``(epoch, loss, model_state_dict)``.

    Raises:
        KeyError: If one of the three keys is missing from the checkpoint.
    """
    checkpoint = torch.load(filepath, map_location=map_location)
    return (
        checkpoint["epoch"],
        checkpoint["loss"],
        checkpoint["model_state_dict"],
    )

# Training

In [20]:
print(f"Cuda is available: {torch.cuda.is_available()}")
# nn.Module.cuda() returns the module itself, so build and transfer are chained.
model = BaseNet(backbone="resnet101").cuda()
learner = MultiTaskLearner(model)

# NOTE(review): weight_decay=0.9 is unusually large for Adam -- confirm it is intentional.
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=0.9)
# Cosine period expressed in batches: epochs * batches per epoch.
total_steps = config.epochs * len(train_loader)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)
scaler = torch.cuda.amp.GradScaler()

Cuda is available: True


Downloading: "https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth" to /root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

In [21]:
# Train for config.epochs epochs; each epoch runs one pass over train_loader
# followed by one evaluation pass over valid_loader.
for epoch in range(config.epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()

        x = x.cuda()
        y = y.cuda()

        # NOTE(review): autocast is disabled here while GradScaler is still
        # used below, so the forward pass presumably runs in full precision
        # -- confirm whether mixed precision is wanted.
        # with torch.cuda.amp.autocast():
        loss = learner(x, y)

        # Running mean of the batch losses seen so far in this epoch.
        total_loss += float(loss)
        train_loss = float(total_loss / (i + 1))
        cur_lr = float(optimizer.param_groups[0]['lr'])

        # Scientific notation keeps small learning rates readable; a fixed
        # four-decimal format shows anything below 5e-5 as "0.0000".
        batch_bar.set_postfix(
            loss="{:.04f}".format(train_loss),
            lr="{:.2e}".format(cur_lr))

        # GradScaler wraps the backward pass and the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Stepped once per batch: T_max was given as len(train_loader) * epochs.
        scheduler.step()

        batch_bar.update()

    batch_bar.close()

    train_loss = total_loss / len(train_loader)

    # Save the model every 3 epochs
    if epoch % 3 == 0:
        save_checkpoint(epoch, train_loss, model, "resnet101")

    # ---- Validation pass ----
    model.eval()
    batch_bar = tqdm(total=len(valid_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
    total_loss = 0
    for i, (x, y) in enumerate(valid_loader):

        x = x.cuda()
        y = y.cuda()

        # No gradients are needed for evaluation.
        with torch.no_grad():
            loss = learner(x, y)

        total_loss += float(loss)

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))))

        batch_bar.update()

    batch_bar.close()

    valid_loss = total_loss / len(valid_loader)

    # cur_lr is the rate that was in effect for the last training batch.
    print("Epoch {}/{}: Train Loss {:.04f}, Learning Rate {:.2e}, Valid Loss {:.04f}".format(
        epoch + 1, config.epochs, train_loss, cur_lr, valid_loss))




Epoch 1/150: Train Loss 221.9507, Learning Rate 0.0002, Valid Loss 377.7618




Epoch 2/150: Train Loss 171.3890, Learning Rate 0.0002, Valid Loss 263.3342




Epoch 3/150: Train Loss 181.2853, Learning Rate 0.0002, Valid Loss 219.8308




Epoch 4/150: Train Loss 159.2909, Learning Rate 0.0002, Valid Loss 205.6444




Epoch 5/150: Train Loss 152.0633, Learning Rate 0.0002, Valid Loss 290.2989




Epoch 6/150: Train Loss 152.1111, Learning Rate 0.0002, Valid Loss 262.8633




Epoch 7/150: Train Loss 148.0023, Learning Rate 0.0002, Valid Loss 235.0944




Epoch 8/150: Train Loss 153.1441, Learning Rate 0.0002, Valid Loss 263.6127




Epoch 9/150: Train Loss 153.6985, Learning Rate 0.0002, Valid Loss 269.6106




Epoch 10/150: Train Loss 143.3414, Learning Rate 0.0002, Valid Loss 237.4458




Epoch 11/150: Train Loss 140.8678, Learning Rate 0.0002, Valid Loss 219.3873




Epoch 12/150: Train Loss 140.6857, Learning Rate 0.0002, Valid Loss 226.1018




Epoch 13/150: Train Loss 141.3959, Learning Rate 0.0002, Valid Loss 245.6431




Epoch 14/150: Train Loss 143.2691, Learning Rate 0.0002, Valid Loss 238.8810




Epoch 15/150: Train Loss 139.1616, Learning Rate 0.0002, Valid Loss 219.6173




Epoch 16/150: Train Loss 134.8575, Learning Rate 0.0002, Valid Loss 223.0808




Epoch 17/150: Train Loss 140.1564, Learning Rate 0.0002, Valid Loss 224.5688




Epoch 18/150: Train Loss 134.3986, Learning Rate 0.0002, Valid Loss 274.0413




Epoch 19/150: Train Loss 131.9855, Learning Rate 0.0002, Valid Loss 238.9126




Epoch 20/150: Train Loss 128.6663, Learning Rate 0.0002, Valid Loss 228.3836




Epoch 21/150: Train Loss 133.2028, Learning Rate 0.0002, Valid Loss 234.1721




Epoch 22/150: Train Loss 135.9772, Learning Rate 0.0002, Valid Loss 320.1841




Epoch 23/150: Train Loss 136.7075, Learning Rate 0.0002, Valid Loss 221.7200




Epoch 24/150: Train Loss 139.7373, Learning Rate 0.0002, Valid Loss 213.3049




Epoch 25/150: Train Loss 131.7008, Learning Rate 0.0002, Valid Loss 239.0313




Epoch 26/150: Train Loss 126.6175, Learning Rate 0.0002, Valid Loss 232.1561




Epoch 27/150: Train Loss 124.2950, Learning Rate 0.0002, Valid Loss 232.2314




Epoch 28/150: Train Loss 124.9368, Learning Rate 0.0002, Valid Loss 224.3838




Epoch 29/150: Train Loss 125.5551, Learning Rate 0.0002, Valid Loss 235.5479




Epoch 30/150: Train Loss 127.9826, Learning Rate 0.0002, Valid Loss 262.1676




Epoch 31/150: Train Loss 133.2563, Learning Rate 0.0002, Valid Loss 223.9337




Epoch 32/150: Train Loss 125.3101, Learning Rate 0.0002, Valid Loss 224.3510




Epoch 33/150: Train Loss 124.7634, Learning Rate 0.0002, Valid Loss 220.2817




Epoch 34/150: Train Loss 119.9069, Learning Rate 0.0002, Valid Loss 235.6920




Epoch 35/150: Train Loss 121.3189, Learning Rate 0.0002, Valid Loss 291.8418




Epoch 36/150: Train Loss 124.4669, Learning Rate 0.0002, Valid Loss 237.8519




Epoch 37/150: Train Loss 122.6428, Learning Rate 0.0002, Valid Loss 221.6067




Epoch 38/150: Train Loss 118.6560, Learning Rate 0.0002, Valid Loss 235.7846




Epoch 39/150: Train Loss 119.5945, Learning Rate 0.0002, Valid Loss 231.0265




Epoch 40/150: Train Loss 119.3437, Learning Rate 0.0002, Valid Loss 231.3567




Epoch 41/150: Train Loss 118.4664, Learning Rate 0.0002, Valid Loss 220.7179




Epoch 42/150: Train Loss 116.7884, Learning Rate 0.0002, Valid Loss 230.3419




Epoch 43/150: Train Loss 117.9699, Learning Rate 0.0002, Valid Loss 229.0288




Epoch 44/150: Train Loss 115.1124, Learning Rate 0.0002, Valid Loss 227.2486




Epoch 45/150: Train Loss 121.2406, Learning Rate 0.0002, Valid Loss 217.1616




Epoch 46/150: Train Loss 118.3016, Learning Rate 0.0002, Valid Loss 217.9488




Epoch 47/150: Train Loss 115.4689, Learning Rate 0.0002, Valid Loss 231.3751




Epoch 48/150: Train Loss 115.9520, Learning Rate 0.0002, Valid Loss 222.7532




Epoch 49/150: Train Loss 117.0138, Learning Rate 0.0002, Valid Loss 234.8081




Epoch 50/150: Train Loss 113.6670, Learning Rate 0.0002, Valid Loss 229.9703




Epoch 51/150: Train Loss 114.2997, Learning Rate 0.0001, Valid Loss 231.0137




Epoch 52/150: Train Loss 112.0457, Learning Rate 0.0001, Valid Loss 226.5978




Epoch 53/150: Train Loss 115.4547, Learning Rate 0.0001, Valid Loss 220.3488




Epoch 54/150: Train Loss 112.0613, Learning Rate 0.0001, Valid Loss 223.2859




Epoch 55/150: Train Loss 114.0909, Learning Rate 0.0001, Valid Loss 230.8421




Epoch 56/150: Train Loss 112.3962, Learning Rate 0.0001, Valid Loss 229.1677




Epoch 57/150: Train Loss 111.6785, Learning Rate 0.0001, Valid Loss 233.4399




Epoch 58/150: Train Loss 111.7352, Learning Rate 0.0001, Valid Loss 230.4537




Epoch 59/150: Train Loss 115.6739, Learning Rate 0.0001, Valid Loss 226.6488




Epoch 60/150: Train Loss 108.9366, Learning Rate 0.0001, Valid Loss 221.6654




Epoch 61/150: Train Loss 110.9883, Learning Rate 0.0001, Valid Loss 223.4752




Epoch 62/150: Train Loss 109.5424, Learning Rate 0.0001, Valid Loss 236.1560




Epoch 63/150: Train Loss 108.9422, Learning Rate 0.0001, Valid Loss 230.4475




Epoch 64/150: Train Loss 110.4916, Learning Rate 0.0001, Valid Loss 224.0586




Epoch 65/150: Train Loss 108.4572, Learning Rate 0.0001, Valid Loss 221.4724




Epoch 66/150: Train Loss 107.1817, Learning Rate 0.0001, Valid Loss 238.2341




Epoch 67/150: Train Loss 106.6565, Learning Rate 0.0001, Valid Loss 223.2905




Epoch 68/150: Train Loss 106.1881, Learning Rate 0.0001, Valid Loss 224.9994




Epoch 69/150: Train Loss 105.8443, Learning Rate 0.0001, Valid Loss 223.6492




Epoch 70/150: Train Loss 106.2793, Learning Rate 0.0001, Valid Loss 220.8647




Epoch 71/150: Train Loss 105.0684, Learning Rate 0.0001, Valid Loss 215.1338




Epoch 72/150: Train Loss 106.4769, Learning Rate 0.0001, Valid Loss 225.0379




Epoch 73/150: Train Loss 105.3642, Learning Rate 0.0001, Valid Loss 231.3692




Epoch 74/150: Train Loss 106.3323, Learning Rate 0.0001, Valid Loss 225.1525




Epoch 75/150: Train Loss 107.3248, Learning Rate 0.0001, Valid Loss 230.4318




Epoch 76/150: Train Loss 103.8907, Learning Rate 0.0001, Valid Loss 224.1721




Epoch 77/150: Train Loss 103.3434, Learning Rate 0.0001, Valid Loss 239.7738




Epoch 78/150: Train Loss 103.3665, Learning Rate 0.0001, Valid Loss 224.5966




Epoch 79/150: Train Loss 103.5556, Learning Rate 0.0001, Valid Loss 227.9807




Epoch 80/150: Train Loss 102.6148, Learning Rate 0.0001, Valid Loss 226.5151




Epoch 81/150: Train Loss 103.0638, Learning Rate 0.0001, Valid Loss 227.9959




Epoch 82/150: Train Loss 102.6347, Learning Rate 0.0001, Valid Loss 225.7281




Epoch 83/150: Train Loss 101.1530, Learning Rate 0.0001, Valid Loss 223.4055




Epoch 84/150: Train Loss 102.3259, Learning Rate 0.0001, Valid Loss 223.4158




Epoch 85/150: Train Loss 100.9831, Learning Rate 0.0001, Valid Loss 232.7119




Epoch 86/150: Train Loss 101.0678, Learning Rate 0.0001, Valid Loss 224.0012




Epoch 87/150: Train Loss 100.7666, Learning Rate 0.0001, Valid Loss 224.5551




Epoch 88/150: Train Loss 101.9120, Learning Rate 0.0001, Valid Loss 230.1535




Epoch 89/150: Train Loss 99.1335, Learning Rate 0.0001, Valid Loss 225.8305




Epoch 90/150: Train Loss 99.4145, Learning Rate 0.0001, Valid Loss 223.6492




Epoch 91/150: Train Loss 99.4559, Learning Rate 0.0001, Valid Loss 224.2702




Epoch 92/150: Train Loss 100.0297, Learning Rate 0.0001, Valid Loss 226.5089




Epoch 93/150: Train Loss 99.1058, Learning Rate 0.0001, Valid Loss 226.6150




Epoch 94/150: Train Loss 98.5129, Learning Rate 0.0001, Valid Loss 226.9831




Epoch 95/150: Train Loss 98.8553, Learning Rate 0.0001, Valid Loss 230.8495




Epoch 96/150: Train Loss 97.8052, Learning Rate 0.0001, Valid Loss 227.6001




Epoch 97/150: Train Loss 97.4401, Learning Rate 0.0001, Valid Loss 226.6004




Epoch 98/150: Train Loss 99.2944, Learning Rate 0.0001, Valid Loss 225.8464




Epoch 99/150: Train Loss 97.3106, Learning Rate 0.0001, Valid Loss 226.1276




Epoch 100/150: Train Loss 96.7975, Learning Rate 0.0001, Valid Loss 234.4702




Epoch 101/150: Train Loss 97.0310, Learning Rate 0.0000, Valid Loss 233.8487




Epoch 102/150: Train Loss 96.7113, Learning Rate 0.0000, Valid Loss 230.8539




Epoch 103/150: Train Loss 97.2864, Learning Rate 0.0000, Valid Loss 231.0946




Epoch 104/150: Train Loss 96.0981, Learning Rate 0.0000, Valid Loss 230.7626




Epoch 105/150: Train Loss 97.2773, Learning Rate 0.0000, Valid Loss 235.8502




Epoch 106/150: Train Loss 96.2506, Learning Rate 0.0000, Valid Loss 230.1167




Epoch 107/150: Train Loss 95.3325, Learning Rate 0.0000, Valid Loss 230.6268




Epoch 108/150: Train Loss 95.4126, Learning Rate 0.0000, Valid Loss 229.8191




Epoch 109/150: Train Loss 95.8544, Learning Rate 0.0000, Valid Loss 228.0986




Epoch 110/150: Train Loss 95.2426, Learning Rate 0.0000, Valid Loss 227.7809




Epoch 111/150: Train Loss 94.8903, Learning Rate 0.0000, Valid Loss 229.6695




Epoch 112/150: Train Loss 95.4880, Learning Rate 0.0000, Valid Loss 230.4352




Epoch 113/150: Train Loss 94.6279, Learning Rate 0.0000, Valid Loss 230.2465




Epoch 114/150: Train Loss 94.3091, Learning Rate 0.0000, Valid Loss 228.7391




Epoch 115/150: Train Loss 93.9853, Learning Rate 0.0000, Valid Loss 229.0617




Epoch 116/150: Train Loss 94.2679, Learning Rate 0.0000, Valid Loss 229.3495




Epoch 117/150: Train Loss 94.4220, Learning Rate 0.0000, Valid Loss 230.2129




Epoch 118/150: Train Loss 93.3248, Learning Rate 0.0000, Valid Loss 232.9382




Epoch 119/150: Train Loss 94.2552, Learning Rate 0.0000, Valid Loss 229.8367




Epoch 120/150: Train Loss 93.4414, Learning Rate 0.0000, Valid Loss 232.5071




Epoch 121/150: Train Loss 93.6756, Learning Rate 0.0000, Valid Loss 230.9434


Train:   7%|▋         | 6/81 [00:04<00:57,  1.30it/s, loss=88.0148, lr=0.0000]

KeyboardInterrupt: ignored

# Evaluate Stats

## Copy Ground Truths to Colab 

In [22]:
!cp /content/drive/MyDrive/11785/project/ground_truth.csv /content/data/metadata/

## Dataset for Evaluation

In [23]:
class EvalDataset(Dataset):
    """Dataset yielding ``(transformed_image, dish_id)`` pairs for evaluation.

    A dish is every entry of ``data_dir`` that contains an ``rgb.png`` file;
    dishes are served in sorted dish-id order.
    """

    def __init__(self, data_dir, transforms=Compose([CenterCrop((256, 256)), ToTensor()]), labels=labels_dict):
        """
        Args:
            data_dir: Directory holding one sub-directory per dish
                (e.g. ``dish_1556572657``).
            transforms: Callable applied to the opened PIL image.
            labels: Stored on the instance; not read by this class itself.
        """
        self.data_dir = data_dir
        self.transforms = transforms
        self.labels = labels

        # Test for the image file directly instead of listing every entry:
        # this skips stray non-directory files in data_dir rather than raising
        # on them, and avoids shadowing the builtin `dir`.
        self.dish_ids = sorted(
            entry
            for entry in os.listdir(self.data_dir)
            if os.path.isfile(os.path.join(self.data_dir, entry, "rgb.png"))
        )

        self.img_paths = [
            os.path.join(self.data_dir, dish_id) for dish_id in self.dish_ids
        ]

    def __len__(self):
        """Number of dishes that have an ``rgb.png``."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return the transformed image of dish ``idx`` together with its dish id."""
        rgb_path = f"{self.img_paths[idx]}/rgb.png"
        dish_id = self.dish_ids[idx]
        transformed_img = self.transforms(Image.open(rgb_path))
        # Return the dish id for writing the csv file
        return transformed_img, dish_id

In [24]:
# Release cached, unused GPU memory before setting up evaluation.
torch.cuda.empty_cache()
# Evaluation datasets over the training and validation directories; each item
# is an (image, dish_id) pair.
eval_train_data = EvalDataset(TRAIN_DIR, labels=labels_dict)
eval_val_data = EvalDataset(VALID_DIR, labels=labels_dict)

# shuffle=False: batches follow the datasets' own (sorted dish id) order.
eval_train_loader = DataLoader(eval_train_data, batch_size=16, shuffle=False, num_workers=2)
eval_val_loader = DataLoader(eval_val_data, batch_size=16, shuffle=False, num_workers=2)

## (Optional) Load Checkpoint

In [27]:
# Rebuild the network with BaseNet's default backbone and restore the weights
# stored under the "resnet101" checkpoint name; the saved epoch and loss are
# discarded.
model = BaseNet()
_, _, model_state_dict = load_checkpoint(f"{CHECKPOINT_PATH}/resnet101")
model.load_state_dict(model_state_dict)
# Move the restored model to the GPU. As the cell's last expression, the
# returned module is also what the notebook displays.
model.cuda()

BaseNet(
  (backbone): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          

## Inference on training and validation data

In [28]:
prediction_filepath = "/content/data/metadata/outputs.csv"
# if os.path.exists(prediction_filepath):
#     os.remove(prediction_filepath)


def _predict_rows(net, loader, desc):
    """Run ``net`` over ``loader`` and return one array of result rows per batch.

    Each row is the dish id followed by the concatenated calorie, mass and
    third-head outputs of ``net``. Because the ids are strings, the returned
    arrays are string arrays.
    """
    batch_bar = tqdm(total=len(loader), dynamic_ncols=True, position=0, leave=False, desc=desc)
    batches = []
    # Inference needs no gradients; no_grad avoids building autograd graphs,
    # which makes per-batch cache clearing unnecessary.
    with torch.no_grad():
        for x, y in loader:
            cal, mass, mn = net(x.cuda())
            outputs = torch.cat((cal, mass, mn), 1).cpu().numpy()
            dish_ids = np.array(list(y)).reshape(-1, 1)
            batches.append(np.concatenate((dish_ids, outputs), 1))
            batch_bar.update()
    batch_bar.close()
    return batches


model.eval()
# Inference on training data
result_batches = _predict_rows(model, eval_train_loader, 'Eval_Train')
torch.cuda.empty_cache()
# Inference on validation data
result_batches += _predict_rows(model, eval_val_loader, 'Val')

# Concatenate once at the end instead of re-copying the growing array after
# every batch. Stays None when both loaders are empty.
results_all = np.concatenate(result_batches, 0) if result_batches else None



In [29]:
# Write to csv
# The fmt string spells out a whole row: six %s fields with the commas
# embedded in the format itself.
np.savetxt(prediction_filepath, results_all, delimiter=",", fmt='%s,%s,%s,%s,%s,%s')

In [30]:
r"""Script to compute statistics on nutrition predictions.

This script takes in a csv of nutrition predictions and computes absolute and
percentage mean absolute error values comparable to the metrics used to eval
models in the Nutrition5k paper. The input csv file of nutrition predictions
should be in the form of:
dish_id, calories, mass, fat, carb, protein
And the groundtruth values will be pulled from the metadata csv file provided
in the Nutrition5k dataset release where the first 6 fields are also:
dish_id, calories, mass, fat, carb, protein

In this notebook the groundtruth, prediction and output paths are set as
variables in the cell rather than read from the command line.

Example Usage:
python compute_statistics.py path/to/groundtruth.csv path/to/predictions.csv \
path/to/output_statistics.json
"""

import json
from os import path
import statistics
import sys

# Column holding the dish id in both csv files.
DISH_ID_INDEX = 0
# Column order shared by the prediction and groundtruth csv files.
DATA_FIELDNAMES = ["dish_id", "calories", "mass", "fat", "carb", "protein"]


def ReadCsvData(filepath):
  """Read a comma-separated file into a dict keyed by dish id.

  Args:
    filepath: Path of the csv file to read.

  Returns:
    Dict mapping each row's dish id (column DISH_ID_INDEX) to the full list
    of that row's string fields. Blank lines are ignored; when a dish id
    appears more than once, the last row wins.

  Raises:
    FileNotFoundError: If filepath does not exist.
  """
  if not path.exists(filepath):
    # Report the requested file, not the os.path module object.
    raise FileNotFoundError("File %s not found" % filepath)
  parsed_data = {}
  with open(filepath, "r") as f_in:
    for line in f_in:
      stripped = line.strip()
      if not stripped:
        # A blank line would otherwise register a bogus '' dish id.
        continue
      data_values = stripped.split(",")
      parsed_data[data_values[DISH_ID_INDEX]] = data_values
  return parsed_data

# groundtruth_csv_path = "/content/data/metadata/ground_truth.csv"
groundtruth_csv_path = "/content/drive/MyDrive/11785/project/ground_truth.csv"
predictions_csv_path = prediction_filepath
output_path = "/content/data/metadata/eval_results.json"

groundtruth_data = ReadCsvData(groundtruth_csv_path)
prediction_data = ReadCsvData(predictions_csv_path)

# Every field except the leading dish id is a numeric metric.
metric_fields = DATA_FIELDNAMES[1:]
groundtruth_values = {field: [] for field in metric_fields}
err_values = {field: [] for field in metric_fields}
output_stats = {}

# Collect, per field, the groundtruth value and the absolute prediction error
# of every predicted dish.
for dish_id, predicted_row in prediction_data.items():
  truth_row = groundtruth_data[dish_id]
  for column, field in enumerate(metric_fields, start=1):
    truth = float(truth_row[column])
    groundtruth_values[field].append(truth)
    err_values[field].append(abs(float(predicted_row[column]) - truth))

# Mean absolute error, both absolute and as a percentage of the mean
# groundtruth value.
for field in metric_fields:
  mean_abs_err = statistics.mean(err_values[field])
  output_stats[field + "_MAE"] = mean_abs_err
  output_stats[field + "_MAE_%"] = (
      100 * mean_abs_err / statistics.mean(groundtruth_values[field]))

with open(output_path, "w") as f_out:
  json.dump(output_stats, f_out)
