# README: How to run this notebook

The notebook is organized into three parts. The first part covers data preparation and model building, including package imports and basic setup. The second part covers image classification and consists of three major cells: training, resuming training, and validating the image classification model. The final part covers the image verification task. Each part is explained in detail below.

In the first part, the first three cells handle basic setup: they check the GPU type, mount Google Drive, and import packages, respectively. The next three cells implement the models: a simple convolutional neural network, MobileNet, and ConvNeXt. The following two cells prepare the dataloaders and configure training components such as the optimizer and scheduler. Here we also count the number of parameters in the model to make sure it does not exceed the 35M limit.

The second part trains the model from epoch 1 to the final epoch. Because Colab may disconnect occasionally, a separate cell resumes training from a specific epoch. The following cells validate the trained model and produce the submission for Kaggle.

The third part focuses on the verification task. We implement the triplet loss, load the best model trained on the classification task, and fine-tune it with the triplet loss. We train the model, report the AUC score after each epoch, and finally select the best model to submit to Kaggle.


# Import the package and necessary setup

In [None]:
!nvidia-smi

Tue Mar 15 12:30:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    47W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive through google-drive-ocamlfuse.
# The cell is interactive: it prints an authorisation URL and then waits for
# the verification code to be typed into the getpass prompt.

# Install the FUSE client and its prerequisites.
# NOTE(review): in `2>&1 > /dev/null` the redirections are applied left to
# right, so only stdout is discarded and stderr still reaches the cell output.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the default application credentials;
# their client id/secret are passed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless with no stdin and keep only the line containing the
# authorisation URL.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Second pass: feed the verification code (read without echo) to the client.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# Create the mount point and mount the drive on it.
# NOTE(review): `mkdir drive` reports an error when the cell is re-run and the
# directory already exists; the `%cd drive` / `%cd ..` pair leaves the working
# directory at /content.
%cd /content
!mkdir drive
%cd drive
%cd ..
!google-drive-ocamlfuse /content/drive

import json

TOKEN = {"username":"yuxuanwucmu","key":"81ded4babfd327efdc7655be369299b5"}

!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
! mkdir -p .kaggle
! mkdir -p /content & mkdir -p /content/.kaggle & mkdir -p /root/.kaggle/

with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(TOKEN, file)

! pip install --upgrade --force-reinstall --no-deps kaggle
! ls "/content/.kaggle"
! chmod 600 /content/.kaggle/kaggle.json
! cp /content/.kaggle/kaggle.json /root/.kaggle/

! kaggle config set -n path -v /content/drive

!kaggle competitions download -c 11-785-s22-hw2p2-classification
!kaggle competitions download -c 11-785-s22-hw2p2-verification

!unzip -q /content/drive/competitions/11-785-s22-hw2p2-classification/11-785-s22-hw2p2-classification.zip
!unzip -q /content/drive/competitions/11-785-s22-hw2p2-verification/11-785-s22-hw2p2-verification.zip


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import torchvision
import torchvision.transforms as ttf
import os
import os.path as osp

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
import numpy as np
import time

def create_folder(folder_path):
    """Create ``folder_path`` if it does not exist yet.

    Missing parent directories are created as well, so a nested checkpoint
    path can be created in a single call. Calling the function on a path that
    already exists is a no-op.

    Args:
        folder_path: Path of the directory to create.
    """
    if not os.path.exists(folder_path):
        # exist_ok=True tolerates another process creating the directory
        # between the check above and this call.
        os.makedirs(folder_path, exist_ok=True)


# Hyperparameters

In [None]:
from torch.nn.modules import normalization  # NOTE: this name is rebound to a string tag below

# The well-accepted SGD batch_size & lr combination for CNN classification is
# a batch size of 256 with a learning rate of 0.1. When changing the batch
# size for SGD, follow the linear scaling rule (halve the batch size -> halve
# the learning rate, etc.). This is less theoretically supported for Adam, but
# it is a decent ballpark estimate in practice.

# --- optimisation settings ---
batch_size = 256
lr = 0.1
epochs = 150

# --- experiment tags (only used to name the checkpoint folder) ---
dataset = "full"
model = "ConvNextSelfImple_dropout"  # alternative: "MobileNetMorePara"
transformation = "RandAug_plus_0.1_label_smoothing"
normalization = "BatchNorm2d_lr0.1"
scheduler = "CosineAnnealing"  # alternatives: "CosineAnnealing", "ReduceOnPlatue"
optimizerPara = "SGD"  # alternative: "AdamW"

# --- checkpoint location: <store path><model id> ---
checkpoints_store_path = "/content/drive/hw2/store_checkpoints/"
model_id = "_".join(
    [model, dataset, transformation, scheduler, optimizerPara, normalization]
)
data_path = f"{checkpoints_store_path}{model_id}"
create_folder(data_path)

# Very Simple Network

In [None]:
class Conv2dGroup(nn.Module):
    """Convolution followed by batch normalisation and ReLU.

    Args:
        in_size: Number of input channels.
        out_size: Number of output channels.
        kernel_size: Kernel size of the convolution.
        stride: Stride of the convolution (no padding is applied).
    """

    def __init__(self, in_size, out_size, kernel_size, stride):
        super().__init__()

        # Kept under the attribute name ``layer`` so parameter names in the
        # state dict (layer.0.*, layer.1.*) stay the same.
        self.layer = nn.Sequential(
            nn.Conv2d(
                in_channels=in_size,
                out_channels=out_size,
                kernel_size=kernel_size,
                stride=stride,
            ),
            nn.BatchNorm2d(out_size),
            nn.ReLU(),
        )

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU to ``x``."""
        activated = self.layer(x)
        return activated

class Network(nn.Module):
    """Very simple 4-layer CNN baseline for face classification.

    Architecture: four ``Conv2dGroup`` layers (conv + BatchNorm + ReLU) with
    64, 128, 256 and 512 channels. The first uses kernel size 7 and stride 4,
    the remaining three use kernel size 3 and stride 2. The result is average
    pooled to 1 x 1, flattened, and fed to a linear classifier.

    Why four convolutions? Every convolution here downsamples. Downsampling by
    2x effectively doubles the receptive field, i.e. the spatial region each
    output pixel extracts features from.

    Why such high channel counts? Every 2x downsample means 4x less
    computation at the same channel size; doubling the channels increases
    computation by 4x, so the cost per layer stays balanced. Intuitively,
    spatial information lost by downsampling is preserved in the channel
    dimension instead.

    Args:
        num_classes: Number of output classes of the classification layer.
    """

    def __init__(self, num_classes=7000):
        super().__init__()

        self.backbone = nn.Sequential(
            Conv2dGroup(3, 64, 7, 4),
            Conv2dGroup(64, 128, 3, 2),
            Conv2dGroup(128, 256, 3, 2),
            Conv2dGroup(256, 512, 3, 2),
            # Collapse the spatial dimensions to 1 x 1 and drop them.
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

        self.cls_layer = nn.Linear(512, num_classes)

    def forward(self, x, return_feats=False):
        """Run the network on a batch of images.

        Args:
            x: Input batch passed to the backbone.
            return_feats: When True, return the backbone output (the
                second-to-last-layer "feature encoding", useful for the
                verification task) instead of the classification logits.

        Returns:
            The flattened backbone features if ``return_feats`` is True,
            otherwise the output of the classification layer.
        """
        feats = self.backbone(x)
        if return_feats:
            # Skip the classifier entirely: its output would be discarded.
            return feats
        return self.cls_layer(feats)

# MobileNetV2

In [None]:
import torch
import torch.nn as nn
import math


class InvertedResidualBlock(nn.Module):
    """MobileNetV2 block: 1x1 expand -> 3x3 depthwise -> 1x1 project.

    Layers in MobileNet can be split into "feature mixing" and "spatial
    mixing" layers. Alternating the two builds up the CNN:

    - Feature mixing (1x1 convolution): each pixel "thinks on its own" about
      its features. It is simply a linear layer applied at every location that
      turns simpler features into more advanced ones.
    - Spatial mixing (3x3 convolution): pixels "talk with each other". A face
      cannot be recognised from isolated pixels, so neighbouring locations
      have to be combined into spatially larger features.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the produced feature map.
        stride: Stride of the depthwise convolution (2 downsamples).
        expand_ratio: Factor by which ``in_channels`` is widened inside the
            block.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__()

        # The identity shortcut is only possible when the block changes
        # neither the channel count nor the spatial resolution.
        self.do_identity = stride == 1 and in_channels == out_channels

        # With an expand ratio such as 6 the hidden width is much larger than
        # the input width.
        hidden_dim = in_channels * expand_ratio

        # 1x1 expansion: lifts every pixel into a higher-dimensional space
        # where some patterns are more obvious/separable. bias=False because
        # the following BatchNorm2d already has a bias term. For odd kernel
        # sizes padding = kernel_size // 2 keeps the spatial resolution.
        expand = nn.Conv2d(in_channels, hidden_dim, kernel_size=1, stride=1, padding=0, bias=False)
        self.feature_mixing = nn.Sequential(
            expand,
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(),
        )

        # 3x3 depthwise convolution:
        # - kernel_size=3 lets neighbouring pixels interact (unlike the 1x1
        #   feature mixing).
        # - stride: downsampling reduces the number of pixels and enlarges the
        #   receptive field; it lives here because this layer is already the
        #   one operating spatially. Usually only the first block of a stage
        #   has stride 2.
        # - groups=hidden_dim removes all cross-channel connections: every
        #   channel is filtered independently, so only spatial neighbourhoods
        #   are mixed. See
        #   https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728
        depthwise = nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, stride=stride, groups=hidden_dim, bias=False)
        self.spatial_mixing = nn.Sequential(
            depthwise,
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(),
        )

        # 1x1 projection back to a narrow representation:
        # - keeps the cost down: wide inputs AND wide outputs would multiply
        #   the weight count (6x in and out means 36x larger weights);
        # - brings the channel count back so that the residual connection can
        #   be applied without an extra convolution (out_channels usually
        #   equals in_channels).
        # There is no activation after this projection.
        project = nn.Conv2d(hidden_dim, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bottleneck_channels = nn.Sequential(
            project,
            nn.BatchNorm2d(out_channels),
        )

    def forward(self, x):
        """Apply the block; add the input back when the shapes allow it."""
        transformed = self.bottleneck_channels(
            self.spatial_mixing(self.feature_mixing(x))
        )
        return x + transformed if self.do_identity else transformed


class MobileNetV2(nn.Module):
    """MobileNetV2-style classifier assembled from InvertedResidualBlocks.

    The heavy lifting is done in ``InvertedResidualBlock``; this class wires
    the stem, the stages, a final 1x1 feature-mixing block and the classifier.
    V2 is used rather than V3 because V3 is V2 plus configurations found by
    neural architecture search.

    Args:
        num_classes: Number of output classes of the classification layer.
    """

    def __init__(self, num_classes=7000):
        super().__init__()

        self.num_classes = num_classes

        # Stem: the first, special layers. A strided 3x3 convolution followed
        # by a depthwise 3x3 and a 1x1 projection down to 16 channels.
        self.stem = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, groups=32, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            nn.Conv2d(32, 16, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(16),
        )

        # One row per stage:
        #   expand ratio  - widening factor inside each block
        #   channels      - block width before expansion
        #   # blocks      - number of blocks in the stage
        #   first stride  - stride of the first block (the rest use 1)
        # Every stage below downsamples, on top of the stride-2 stem; keep the
        # resulting feature-map resolution in mind when changing the input
        # size or these rows.
        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [6, 24, 10, 2],
            [6, 32, 12, 2],
            [6, 64, 16, 2],
            [6, 96, 18, 2],
            [6, 128, 16, 2],
            [6, 320, 10, 2],
            [6, 640, 2, 2],
        ]

        # The stem ends at 16 channels; track the running width while the
        # blocks are stacked.
        prev_channels = 16
        blocks = []
        for expand_ratio, width, repeats, first_stride in self.stage_cfgs:
            for position in range(repeats):
                blocks.append(InvertedResidualBlock(
                    in_channels=prev_channels,
                    out_channels=width,
                    # only the first block of a stage may downsample
                    stride=first_stride if position == 0 else 1,
                    expand_ratio=expand_ratio,
                ))
                prev_channels = width

        self.layers = nn.Sequential(*blocks)

        # Final feature mixing up to 1280 channels.
        self.final_block = nn.Sequential(
            nn.Conv2d(prev_channels, 1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.ReLU6(),
        )

        # Classification head: pool to (1, 1), flatten, project to classes.
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(1280, num_classes),
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Apply the custom MobileNetV2 initialisation to every submodule.

        Convolutions: normal with std sqrt(2 / (kh * kw * out_channels)) and
        zero bias. BatchNorm: weight 1, bias 0. Linear: normal with std 0.01
        and zero bias.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
                fan = kernel_h * kernel_w * module.out_channels
                nn.init.normal_(module.weight, 0, math.sqrt(2. / fan))
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def forward(self, x, return_feats=False):
        """Run the network.

        Args:
            x: Input batch passed to the stem.
            return_feats: When True, return the output of ``final_block``
                (the un-pooled 1280-channel feature map) instead of the
                classification logits.

        Returns:
            The ``final_block`` output if ``return_feats`` is True, otherwise
            the output of ``cls_layer``.
        """
        stem_out = self.stem(x)               # down-sample
        stage_out = self.layers(stem_out)     # InvertedResidualBlocks
        feats = self.final_block(stage_out)
        logits = self.cls_layer(feats)
        return feats if return_feats else logits


# ConvNext

In [None]:
!pip install timm
import torch
import torch.nn as nn
import math
from torchsummary import summary
import numpy as np
from timm.models.layers import trunc_normal_, DropPath
from timm.models.registry import register_model
import torch.nn.functional as F


class Block(nn.Module):
    """ConvNeXt-style residual block built on BatchNorm.

    Pipeline: 7x7 depthwise convolution -> BatchNorm -> 1x1 expansion ->
    GELU -> 1x1 projection -> optional DropPath -> residual addition.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the final 1x1 projection. The
            result is added to the block input in ``forward``, and every use
            in this file passes ``out_channels == in_channels``.
        expand_ratio: Factor by which the channels are widened between the
            two 1x1 convolutions.
        drop_path: DropPath probability; values <= 0 disable it.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 expand_ratio, drop_path=0.):
        super().__init__()

        hidden_dim = in_channels * expand_ratio

        # Spatial mixing: depthwise 7x7 convolution; padding 3 with stride 1
        # keeps the spatial resolution.
        self.spatial_mixing = nn.Conv2d(
            in_channels, in_channels,
            kernel_size=7, padding=3, stride=1, groups=in_channels,
        )
        self.norm = nn.BatchNorm2d(in_channels)

        # Feature mixing: pointwise expansion to the hidden width.
        self.feature_mixing = nn.Conv2d(in_channels, hidden_dim, kernel_size=1, stride=1, padding=0)
        self.activation = nn.GELU()

        # Pointwise projection back down to out_channels.
        self.bottleneck_channels = nn.Conv2d(hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Return ``x`` plus the (optionally drop-pathed) transformed ``x``."""
        residual = x
        out = self.norm(self.spatial_mixing(x))
        out = self.activation(self.feature_mixing(out))
        out = self.drop_path(self.bottleneck_channels(out))
        return residual + out


class ConvNeXt(nn.Module):
    """ConvNeXt-style classifier using BatchNorm-based ``Block`` modules.

    Layout:
    1. A stem plus three intermediate downsampling layers. The stem is a
       4x4 / stride-4 convolution followed by BatchNorm; each later
       downsampling layer is BatchNorm followed by a 2x2 / stride-2
       convolution.
    2. After every downsampling layer, a stage of residual ``Block`` modules
       at that width. The DropPath probability grows linearly from 0 to
       ``drop_path_rate`` over all blocks.
    3. BatchNorm, global average pooling, dropout and a linear classifier.

    Args:
        num_classes: Number of output classes.
        drop_path_rate: DropPath probability of the last block.
        depths: Number of blocks in each stage.
        dims: Channel width of each stage (same length as ``depths``).

    Raises:
        ValueError: If ``depths`` and ``dims`` differ in length.
    """

    def __init__(self, num_classes=7000, drop_path_rate=0.1,
                 depths=(3, 3, 9, 3), dims=(96, 192, 384, 768)):
        super().__init__()

        if len(depths) != len(dims):
            raise ValueError("depths and dims must have the same length")

        self.num_classes = num_classes

        # Downsampling layers: the stem first, then one per stage transition.
        self.downsample_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv2d(3, dims[0], kernel_size=4, stride=4),
            nn.BatchNorm2d(dims[0]),
        )
        self.downsample_layers.append(stem)
        for narrow, wide in zip(dims[:-1], dims[1:]):
            self.downsample_layers.append(nn.Sequential(
                nn.BatchNorm2d(narrow),
                nn.Conv2d(narrow, wide, kernel_size=2, stride=2),
            ))

        # Feature resolution stages. One drop-path rate per block, increasing
        # linearly with depth across the whole network.
        dp_rates = [r.item() for r in torch.linspace(0, drop_path_rate, sum(depths))]
        self.stages = nn.ModuleList()
        first_block = 0  # index into dp_rates of the first block of the stage
        for dim, depth in zip(dims, depths):
            blocks = [
                Block(in_channels=dim, out_channels=dim, expand_ratio=4,
                      drop_path=dp_rates[first_block + j])
                for j in range(depth)
            ]
            self.stages.append(nn.Sequential(*blocks))
            first_block += depth

        # Head: final norm, pooling (done in forward_features), dropout and
        # the linear classifier.
        self.norm = nn.BatchNorm2d(dims[-1])
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p=0.1)

        self.cls_layer = nn.Sequential(
            nn.Linear(dims[-1], num_classes)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Initialise all submodules.

        Convolutions: normal with std sqrt(2 / (kh * kw * out_channels)) and
        zero bias. BatchNorm: weight 1, bias 0. Linear: normal with std 0.01
        and zero bias.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward_features(self, x):
        """Return the pooled (N, C) feature vector for a batch ``x``."""
        # e.g. the stem maps (2, 3, 224, 224) -> (2, 96, 56, 56)
        for downsample, stage in zip(self.downsample_layers, self.stages):
            x = stage(downsample(x))

        # global average pooling, (N, C, H, W) -> (N, C)
        return self.flatten(self.norm(x).mean([-2, -1]))

    def forward(self, x, return_feats=False):
        """Run the network.

        Args:
            x: Input batch of images.
            return_feats: When True, return the pooled features instead of
                the classification logits.

        Returns:
            The (N, C) features if ``return_feats`` is True, otherwise the
            class logits.
        """
        feats = self.forward_features(x)
        if return_feats:
            # Features are returned without dropout and without running the
            # classifier, whose output would be discarded anyway.
            return feats

        # Dropout regularises the features feeding the classifier. Applying
        # it to the logits instead would zero random class scores during
        # training. In eval mode dropout is the identity either way.
        return self.cls_layer(self.dropout(feats))




# Dataset & DataLoader

In [None]:
# Build the ImageFolder datasets and the train/validation DataLoaders.
DATA_DIR = "/content"
# TRAIN_DIR = osp.join(DATA_DIR, "train_subset/train_subset") # This is a smaller subset of the data. Should change this to classification/classification/train
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train")

VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")

# Data augmentation is applied to the training set only.
transforms = ttf.Compose([
    ttf.RandAugment(),
    # ttf.AutoAugment(),
    ttf.ToTensor()
])

train_dataset = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=transforms)
val_dataset = torchvision.datasets.ImageFolder(VAL_DIR,
                                               transform=ttf.Compose([ttf.ToTensor()]))


# drop_last=True for training keeps every batch at exactly batch_size, which
# the running-accuracy computation in the training loop relies on.
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=8)
# drop_last=False for validation: every sample must be evaluated, otherwise
# the trailing partial batch is silently ignored and the reported accuracy
# does not cover the whole dev set. Divide by len(val_dataset) when computing
# validation accuracy.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        drop_last=False, num_workers=1)


# Setup optimizer/scheduler for training 

In [None]:
# Instantiate the model to train and move it to the GPU.
# Alternatives: ResNet50(), MobileNetV2()
model = ConvNeXt()
model.cuda()

# For this homework, we're limited to 35 million trainable parameters, as
# reported by the count below. This helps constrain the search space and keep
# training times & expectations reasonable.
num_trainable_parameters = sum(p.numel() for p in model.parameters())
print("Number of Params: {}".format(num_trainable_parameters))

# Loss with label smoothing.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Optimizer. Alternative:
#   optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-4,
)

# Scheduler. T_max answers "how many times will scheduler.step() be called
# until the lr reaches 0?" - here once per batch for every epoch. Alternative:
#   optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2,
#                                        mode='max', threshold=0.01, verbose=True)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=(len(train_loader) * epochs),
)

# Gradient scaler for mixed-precision (FP16) training.
scaler = torch.cuda.amp.GradScaler()

Number of Params: 33196504


# Classification Task: Train the network

In [None]:
# Train the classifier for `epochs` epochs with mixed precision, saving a
# checkpoint and appending a line to the log file after every epoch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

txt_file = data_path +"/dev_acc.txt"
with open(txt_file, 'w') as f:
    for epoch in range(1, epochs + 1):
        start = time.time()
        batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

        num_correct = 0
        total_loss = 0

        # Always (re-)enter training mode: any evaluation that ran before
        # (e.g. the validation pass below, once enabled) calls model.eval(),
        # which would otherwise freeze BatchNorm statistics and disable
        # dropout for the rest of training.
        model.train()
        for i, (x, y) in enumerate(train_loader):
            optimizer.zero_grad()

            x = x.cuda()
            y = y.cuda()

            # Forward pass and loss under autocast so they run in FP16.
            with torch.cuda.amp.autocast():
                outputs = model(x)
                loss = criterion(outputs, y)

            # Update # correct & loss as we go
            num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
            total_loss += float(loss)

            # Running statistics shown on the progress bar. The accuracy
            # denominator assumes every batch has exactly batch_size samples.
            batch_bar.set_postfix(
                acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)),
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                num_correct=num_correct,
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

            # FP16 replacements for loss.backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Stepped once per batch: T_max was set to len(train_loader) * epochs.
            scheduler.step()

            batch_bar.update()
        batch_bar.close()

        path = data_path +"/model_epoch_{}.txt".format(epoch)

        torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              # The pickled scheduler object is kept because code that loads
              # these checkpoints (not shown here) may read this key. The
              # state dicts below are the portable way to restore it.
              'scheduler': scheduler,
              'scheduler_state_dict': scheduler.state_dict(),
              # Needed to resume mixed-precision training with the same loss scale.
              'scaler_state_dict': scaler.state_dict(),
          }, path)

        print_content ="Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f} \n".format(
            epoch,
            epochs,
            100 * num_correct / (len(train_loader) * batch_size),
            float(total_loss / len(train_loader)),
            float(optimizer.param_groups[0]['lr']))
        print(print_content)
        f.write(print_content)
        # Flush every epoch so the log survives a Colab disconnect.
        f.flush()

        # Disabled validation pass, kept for reference.
        # NOTE(review): before re-enabling, fix the accuracy denominators
        # (use len(val_dataset), not len(val_dataset)*batch_size) and use a
        # separate counter so the train num_correct is not overwritten.
        # # model validation process
        # model.eval()
        # batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
        # num_correct = 0
        # for i, (x, y) in enumerate(val_loader):

        #     x = x.cuda()
        #     y = y.cuda()

        #     with torch.no_grad():
        #         outputs = model(x)

        #     num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
        #     batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)))

        #     batch_bar.update()
            
        # batch_bar.close()

        # dev_acc = 100 * num_correct / (len(val_dataset)*batch_size)
        # # scheduler.step(dev_acc) # used for the ReduceOnPlatue 

        # print_content = "Epoch {}/{}: Train Acc {:.04f}%, Validation Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f} \n".format(
        #     epoch,
        #     epochs,
        #     100 * num_correct / (len(train_loader) * batch_size),
        #     100 * num_correct / (len(val_dataset)*batch_size),
        #     float(total_loss / len(train_loader)),
        #     float(optimizer.param_groups[0]['lr']))
        # print(print_content)
        # f.write(print_content)

        end = time.time()
        print("", (end - start)/60,"minutes")



Epoch 1/150: Train Acc 0.0658%, Train Loss 8.6562, Learning Rate 0.1000 

 3.8363085786501565 minutes




Epoch 2/150: Train Acc 0.8893%, Train Loss 7.9146, Learning Rate 0.1000 

 3.839955035845439 minutes




Epoch 3/150: Train Acc 5.1461%, Train Loss 7.0125, Learning Rate 0.0999 

 3.8620699365933735 minutes




Epoch 4/150: Train Acc 14.9632%, Train Loss 6.1530, Learning Rate 0.0998 

 3.852550220489502 minutes




Epoch 5/150: Train Acc 28.6759%, Train Loss 5.3757, Learning Rate 0.0997 

 3.8578346172968545 minutes




Epoch 6/150: Train Acc 41.4706%, Train Loss 4.7342, Learning Rate 0.0996 

 3.886575663089752 minutes




Epoch 7/150: Train Acc 52.2085%, Train Loss 4.2154, Learning Rate 0.0995 

 3.8687500834465025 minutes




Epoch 8/150: Train Acc 60.3923%, Train Loss 3.8295, Learning Rate 0.0993 

 3.873291965325673 minutes




Epoch 9/150: Train Acc 66.8026%, Train Loss 3.5179, Learning Rate 0.0991 

 3.862355395158132 minutes




Epoch 10/150: Train Acc 71.8972%, Train Loss 3.2181, Learning Rate 0.0989 

 3.8440557797749837 minutes




Epoch 11/150: Train Acc 75.1688%, Train Loss 2.9859, Learning Rate 0.0987 

 3.8997453848520913 minutes




Epoch 12/150: Train Acc 77.8488%, Train Loss 2.7932, Learning Rate 0.0984 

 3.8572275559107463 minutes




Epoch 13/150: Train Acc 80.1003%, Train Loss 2.6705, Learning Rate 0.0982 

 3.9173010349273683 minutes




Epoch 14/150: Train Acc 82.4154%, Train Loss 2.5583, Learning Rate 0.0979 

 3.9285218636194865 minutes




Epoch 15/150: Train Acc 84.1246%, Train Loss 2.4773, Learning Rate 0.0976 

 3.872614896297455 minutes




Epoch 16/150: Train Acc 85.4138%, Train Loss 2.4127, Learning Rate 0.0972 

 3.8635461727778115 minutes




Epoch 17/150: Train Acc 85.9518%, Train Loss 2.3831, Learning Rate 0.0969 

 3.8548874219258624 minutes




Epoch 18/150: Train Acc 86.6880%, Train Loss 2.3386, Learning Rate 0.0965 

 3.888357961177826 minutes




Epoch 19/150: Train Acc 87.0049%, Train Loss 2.3225, Learning Rate 0.0961 

 3.8514486153920493 minutes




Epoch 20/150: Train Acc 87.3676%, Train Loss 2.2989, Learning Rate 0.0957 

 3.8620041410128274 minutes




Epoch 21/150: Train Acc 87.6767%, Train Loss 2.2792, Learning Rate 0.0952 

 3.871598947048187 minutes




Epoch 22/150: Train Acc 87.7146%, Train Loss 2.2732, Learning Rate 0.0948 

 3.8575014233589173 minutes




Epoch 23/150: Train Acc 87.7669%, Train Loss 2.2674, Learning Rate 0.0943 

 3.86794669230779 minutes




Epoch 24/150: Train Acc 87.8906%, Train Loss 2.2550, Learning Rate 0.0938 

 3.8731027086575827 minutes




Epoch 25/150: Train Acc 88.0230%, Train Loss 2.2445, Learning Rate 0.0933 

 3.8665797750155133 minutes




Epoch 26/150: Train Acc 88.1668%, Train Loss 2.2343, Learning Rate 0.0928 

 3.8703240036964415 minutes




Epoch 27/150: Train Acc 88.2226%, Train Loss 2.2285, Learning Rate 0.0922 

 3.8926372647285463 minutes




Epoch 28/150: Train Acc 88.1539%, Train Loss 2.2294, Learning Rate 0.0916 

 3.9076795895894367 minutes




Epoch 29/150: Train Acc 88.2991%, Train Loss 2.2181, Learning Rate 0.0911 

 3.911422296365102 minutes




Epoch 30/150: Train Acc 88.3850%, Train Loss 2.2117, Learning Rate 0.0905 

 3.8445153991381327 minutes




Epoch 31/150: Train Acc 88.4973%, Train Loss 2.2034, Learning Rate 0.0898 

 3.856520676612854 minutes




Epoch 32/150: Train Acc 88.5946%, Train Loss 2.1934, Learning Rate 0.0892 

 3.8652861595153807 minutes




Epoch 33/150: Train Acc 88.5631%, Train Loss 2.1974, Learning Rate 0.0885 

 3.896113467216492 minutes




Epoch 34/150: Train Acc 88.6075%, Train Loss 2.1936, Learning Rate 0.0878 

 3.870450969537099 minutes




Epoch 35/150: Train Acc 88.6440%, Train Loss 2.1858, Learning Rate 0.0872 

 3.8613805055618284 minutes




Epoch 36/150: Train Acc 88.7985%, Train Loss 2.1780, Learning Rate 0.0864 

 3.85339271624883 minutes




Epoch 37/150: Train Acc 88.9323%, Train Loss 2.1694, Learning Rate 0.0857 

 3.855262454350789 minutes




Epoch 38/150: Train Acc 88.7534%, Train Loss 2.1726, Learning Rate 0.0850 

 3.8704043070475262 minutes




Epoch 39/150: Train Acc 88.9244%, Train Loss 2.1614, Learning Rate 0.0842 

 3.8528399070103965 minutes




Epoch 40/150: Train Acc 88.9022%, Train Loss 2.1613, Learning Rate 0.0835 

 3.8829032222429913 minutes




Epoch 41/150: Train Acc 88.8321%, Train Loss 2.1641, Learning Rate 0.0827 

 3.854296290874481 minutes




Epoch 42/150: Train Acc 89.0017%, Train Loss 2.1510, Learning Rate 0.0819 

 3.878300166130066 minutes




Epoch 43/150: Train Acc 88.9394%, Train Loss 2.1551, Learning Rate 0.0811 

 3.857539149125417 minutes




Epoch 44/150: Train Acc 89.0911%, Train Loss 2.1395, Learning Rate 0.0802 

 3.856618344783783 minutes




Epoch 45/150: Train Acc 89.1856%, Train Loss 2.1337, Learning Rate 0.0794 

 3.8670746803283693 minutes




Epoch 46/150: Train Acc 89.1226%, Train Loss 2.1362, Learning Rate 0.0785 

 3.8476856191953024 minutes




Epoch 47/150: Train Acc 89.1061%, Train Loss 2.1356, Learning Rate 0.0777 

 3.8789977351824443 minutes




Epoch 48/150: Train Acc 89.1455%, Train Loss 2.1317, Learning Rate 0.0768 

 3.88303679227829 minutes




Epoch 49/150: Train Acc 89.1283%, Train Loss 2.1304, Learning Rate 0.0759 

 3.8713205377260844 minutes




Epoch 50/150: Train Acc 89.3129%, Train Loss 2.1167, Learning Rate 0.0750 

 3.870379829406738 minutes




Epoch 51/150: Train Acc 89.2864%, Train Loss 2.1163, Learning Rate 0.0741 

 3.9042181889216105 minutes




Epoch 52/150: Train Acc 89.2321%, Train Loss 2.1170, Learning Rate 0.0732 

 3.868407201766968 minutes




Epoch 53/150: Train Acc 89.3258%, Train Loss 2.1084, Learning Rate 0.0722 

 3.871434728304545 minutes




Epoch 54/150: Train Acc 89.4073%, Train Loss 2.1015, Learning Rate 0.0713 

 3.8525020400683085 minutes




Epoch 55/150: Train Acc 89.4073%, Train Loss 2.1018, Learning Rate 0.0703 

 3.9019830147425334 minutes




Epoch 56/150: Train Acc 89.4309%, Train Loss 2.0987, Learning Rate 0.0694 

 3.858998143672943 minutes




Epoch 57/150: Train Acc 89.3480%, Train Loss 2.0998, Learning Rate 0.0684 

 3.858205743630727 minutes




Epoch 58/150: Train Acc 89.3329%, Train Loss 2.0976, Learning Rate 0.0674 

 3.848817034562429 minutes




Epoch 59/150: Train Acc 89.4810%, Train Loss 2.0870, Learning Rate 0.0664 

 3.846176552772522 minutes




Epoch 60/150: Train Acc 89.5225%, Train Loss 2.0818, Learning Rate 0.0655 

 3.8626959760983786 minutes




Epoch 61/150: Train Acc 89.4603%, Train Loss 2.0844, Learning Rate 0.0645 

 3.877136433124542 minutes




Epoch 62/150: Train Acc 89.4302%, Train Loss 2.0831, Learning Rate 0.0634 

 3.851140197118123 minutes




Epoch 63/150: Train Acc 89.5590%, Train Loss 2.0751, Learning Rate 0.0624 

 3.877454936504364 minutes




Epoch 64/150: Train Acc 89.6026%, Train Loss 2.0687, Learning Rate 0.0614 

 3.8688155810038247 minutes




Epoch 65/150: Train Acc 89.6313%, Train Loss 2.0651, Learning Rate 0.0604 

 3.8823956648508706 minutes




Epoch 66/150: Train Acc 89.6005%, Train Loss 2.0639, Learning Rate 0.0594 

 3.8606945276260376 minutes




Epoch 67/150: Train Acc 89.5404%, Train Loss 2.0670, Learning Rate 0.0583 

 3.8582712610562644 minutes




Epoch 68/150: Train Acc 89.6663%, Train Loss 2.0564, Learning Rate 0.0573 

 3.8515575369199118 minutes




Epoch 69/150: Train Acc 89.6999%, Train Loss 2.0526, Learning Rate 0.0563 

 3.8815813024838763 minutes




Epoch 70/150: Train Acc 89.6492%, Train Loss 2.0555, Learning Rate 0.0552 

 3.849536685148875 minutes




Epoch 71/150: Train Acc 89.7078%, Train Loss 2.0487, Learning Rate 0.0542 

 3.9628841439882914 minutes




Epoch 72/150: Train Acc 89.5654%, Train Loss 2.0545, Learning Rate 0.0531 

 3.875212093194326 minutes




Epoch 73/150: Train Acc 89.7672%, Train Loss 2.0398, Learning Rate 0.0521 

 3.8495607376098633 minutes




Epoch 74/150: Train Acc 89.6255%, Train Loss 2.0456, Learning Rate 0.0510 

 3.8549267450968423 minutes




Epoch 75/150: Train Acc 89.7944%, Train Loss 2.0339, Learning Rate 0.0500 

 3.88581413825353 minutes




Epoch 76/150: Train Acc 89.8216%, Train Loss 2.0314, Learning Rate 0.0490 

 3.855894454320272 minutes




Epoch 77/150: Train Acc 89.8974%, Train Loss 2.0242, Learning Rate 0.0479 

 3.849430863062541 minutes




Epoch 78/150: Train Acc 89.6935%, Train Loss 2.0350, Learning Rate 0.0469 

 3.8656876484553018 minutes




Epoch 79/150: Train Acc 89.6971%, Train Loss 2.0322, Learning Rate 0.0458 

 3.8605401198069256 minutes




Epoch 80/150: Train Acc 89.7872%, Train Loss 2.0252, Learning Rate 0.0448 

 3.8555984258651734 minutes




Epoch 81/150: Train Acc 89.6177%, Train Loss 2.0346, Learning Rate 0.0437 

 3.854997007052104 minutes




Epoch 82/150: Train Acc 89.8316%, Train Loss 2.0186, Learning Rate 0.0427 

 3.878119242191315 minutes




Epoch 83/150: Train Acc 89.8695%, Train Loss 2.0137, Learning Rate 0.0417 

 3.8606109380722047 minutes




Epoch 84/150: Train Acc 89.6964%, Train Loss 2.0244, Learning Rate 0.0406 

 3.8404507954915363 minutes




Epoch 85/150: Train Acc 89.7915%, Train Loss 2.0171, Learning Rate 0.0396 

 3.8593149224917096 minutes




Epoch 86/150: Train Acc 89.7457%, Train Loss 2.0172, Learning Rate 0.0386 

 3.8658796389897665 minutes




Epoch 87/150: Train Acc 89.6785%, Train Loss 2.0186, Learning Rate 0.0376 

 3.8670148730278013 minutes




Epoch 88/150: Train Acc 89.8609%, Train Loss 2.0049, Learning Rate 0.0366 

 3.8457866152127584 minutes




Epoch 89/150: Train Acc 89.7915%, Train Loss 2.0071, Learning Rate 0.0355 

 3.8844675183296205 minutes




Epoch 90/150: Train Acc 89.8280%, Train Loss 2.0030, Learning Rate 0.0345 

 3.861160063743591 minutes




Epoch 91/150: Train Acc 89.8187%, Train Loss 2.0007, Learning Rate 0.0336 

 3.879338812828064 minutes




Epoch 92/150: Train Acc 89.8523%, Train Loss 1.9993, Learning Rate 0.0326 

 3.855731117725372 minutes




Epoch 93/150: Train Acc 89.7550%, Train Loss 2.0025, Learning Rate 0.0316 

 3.85532488822937 minutes




Epoch 94/150: Train Acc 89.9346%, Train Loss 1.9900, Learning Rate 0.0306 

 3.865298541386922 minutes




Epoch 95/150: Train Acc 90.0004%, Train Loss 1.9847, Learning Rate 0.0297 

 3.859853919347127 minutes




Epoch 96/150: Train Acc 89.8581%, Train Loss 1.9911, Learning Rate 0.0287 

 3.86423902908961 minutes




Epoch 97/150: Train Acc 89.8938%, Train Loss 1.9868, Learning Rate 0.0278 

 3.8491111477216085 minutes




Epoch 98/150: Train Acc 89.9682%, Train Loss 1.9812, Learning Rate 0.0268 

 3.8748903314272565 minutes




Epoch 99/150: Train Acc 89.6413%, Train Loss 2.0007, Learning Rate 0.0259 

 3.920171594619751 minutes




Epoch 100/150: Train Acc 90.0133%, Train Loss 1.9743, Learning Rate 0.0250 

 3.9433886766433717 minutes




Epoch 101/150: Train Acc 89.8731%, Train Loss 1.9823, Learning Rate 0.0241 

 3.899858272075653 minutes




Epoch 102/150: Train Acc 89.9897%, Train Loss 1.9718, Learning Rate 0.0232 

 3.872350815931956 minutes




Epoch 103/150: Train Acc 90.0698%, Train Loss 1.9666, Learning Rate 0.0223 

 3.854131368796031 minutes




Epoch 104/150: Train Acc 89.8838%, Train Loss 1.9761, Learning Rate 0.0215 

 3.8428102771441144 minutes




Epoch 105/150: Train Acc 89.9074%, Train Loss 1.9734, Learning Rate 0.0206 

 3.8534873763720197 minutes




Epoch 106/150: Train Acc 90.0519%, Train Loss 1.9624, Learning Rate 0.0198 

 3.8759323875109355 minutes




Epoch 107/150: Train Acc 89.8366%, Train Loss 1.9750, Learning Rate 0.0189 

 3.8599671284357706 minutes




Epoch 108/150: Train Acc 89.8266%, Train Loss 1.9735, Learning Rate 0.0181 

 3.848123296101888 minutes




Epoch 109/150: Train Acc 89.9697%, Train Loss 1.9636, Learning Rate 0.0173 

 3.91672598918279 minutes




Epoch 110/150: Train Acc 89.8967%, Train Loss 1.9671, Learning Rate 0.0165 

 3.914385922749837 minutes




Epoch 111/150: Train Acc 90.1471%, Train Loss 1.9489, Learning Rate 0.0158 

 3.9090270161628724 minutes




Epoch 112/150: Train Acc 89.9010%, Train Loss 1.9633, Learning Rate 0.0150 

 3.849647255738576 minutes




Epoch 113/150: Train Acc 89.8831%, Train Loss 1.9630, Learning Rate 0.0143 

 3.868568738301595 minutes




Epoch 114/150: Train Acc 89.9511%, Train Loss 1.9571, Learning Rate 0.0136 

 3.85196879307429 minutes




Epoch 115/150: Train Acc 89.8874%, Train Loss 1.9593, Learning Rate 0.0128 

 3.8506410757700604 minutes




Epoch 116/150: Train Acc 89.9174%, Train Loss 1.9561, Learning Rate 0.0122 

 3.862179704507192 minutes




Epoch 117/150: Train Acc 90.0147%, Train Loss 1.9493, Learning Rate 0.0115 

 3.8836012721061706 minutes




Epoch 118/150: Train Acc 89.8967%, Train Loss 1.9558, Learning Rate 0.0108 

 3.8691479563713074 minutes




Epoch 119/150: Train Acc 90.0405%, Train Loss 1.9456, Learning Rate 0.0102 

 3.892748538653056 minutes




Epoch 120/150: Train Acc 89.9196%, Train Loss 1.9518, Learning Rate 0.0095 

 3.8801867008209228 minutes




Epoch 121/150: Train Acc 89.9711%, Train Loss 1.9478, Learning Rate 0.0089 

 3.872723909219106 minutes




Epoch 122/150: Train Acc 90.0011%, Train Loss 1.9448, Learning Rate 0.0084 

 3.8679453253746034 minutes




Epoch 123/150: Train Acc 89.7901%, Train Loss 1.9571, Learning Rate 0.0078 

 3.8830745299657186 minutes




Epoch 124/150: Train Acc 89.9632%, Train Loss 1.9445, Learning Rate 0.0072 

 3.8473328312238055 minutes




Epoch 125/150: Train Acc 89.8545%, Train Loss 1.9506, Learning Rate 0.0067 

 3.8441006342569985 minutes




Epoch 126/150: Train Acc 89.8523%, Train Loss 1.9497, Learning Rate 0.0062 

 3.8697543660799663 minutes




Epoch 127/150: Train Acc 90.0512%, Train Loss 1.9365, Learning Rate 0.0057 

 3.8676225264867146 minutes




Epoch 128/150: Train Acc 89.9768%, Train Loss 1.9401, Learning Rate 0.0052 

 3.8649959405263266 minutes




Epoch 129/150: Train Acc 90.0090%, Train Loss 1.9373, Learning Rate 0.0048 

 3.865364933013916 minutes




Epoch 130/150: Train Acc 89.9554%, Train Loss 1.9404, Learning Rate 0.0043 

 3.885703992843628 minutes




Epoch 131/150: Train Acc 89.9625%, Train Loss 1.9387, Learning Rate 0.0039 

 3.9602760752042134 minutes




Epoch 132/150: Train Acc 89.9818%, Train Loss 1.9362, Learning Rate 0.0035 

 3.849486434459686 minutes




Epoch 133/150: Train Acc 89.9446%, Train Loss 1.9383, Learning Rate 0.0031 

 3.8489320119222006 minutes




Epoch 134/150: Train Acc 89.9453%, Train Loss 1.9379, Learning Rate 0.0028 

 3.8863594492276508 minutes




Epoch 135/150: Train Acc 90.0641%, Train Loss 1.9302, Learning Rate 0.0024 

 3.8751012841860453 minutes




Epoch 136/150: Train Acc 90.1221%, Train Loss 1.9251, Learning Rate 0.0021 

 3.8616149624188743 minutes




Epoch 137/150: Train Acc 89.9804%, Train Loss 1.9339, Learning Rate 0.0018 

 3.9445790529251097 minutes




Epoch 138/150: Train Acc 90.0290%, Train Loss 1.9296, Learning Rate 0.0016 

 3.8424040913581847 minutes




Epoch 139/150: Train Acc 90.0491%, Train Loss 1.9285, Learning Rate 0.0013 

 3.877110409736633 minutes




Epoch 140/150: Train Acc 90.0462%, Train Loss 1.9282, Learning Rate 0.0011 

 3.9101724068323773 minutes




Epoch 141/150: Train Acc 89.9976%, Train Loss 1.9309, Learning Rate 0.0009 

 3.865354331334432 minutes




Epoch 142/150: Train Acc 89.8867%, Train Loss 1.9374, Learning Rate 0.0007 

 3.8843679070472716 minutes




Epoch 143/150: Train Acc 90.0662%, Train Loss 1.9262, Learning Rate 0.0005 

 3.8635205149650576 minutes




Epoch 144/150: Train Acc 89.9368%, Train Loss 1.9339, Learning Rate 0.0004 

 3.852676026026408 minutes




Epoch 145/150: Train Acc 89.9618%, Train Loss 1.9321, Learning Rate 0.0003 

 3.853631826241811 minutes




Epoch 146/150: Train Acc 89.9489%, Train Loss 1.9326, Learning Rate 0.0002 

 3.8637343049049377 minutes




Epoch 147/150: Train Acc 89.9575%, Train Loss 1.9317, Learning Rate 0.0001 

 3.86182625691096 minutes




Epoch 148/150: Train Acc 89.9625%, Train Loss 1.9319, Learning Rate 0.0000 

 3.8533109148343403 minutes




Epoch 149/150: Train Acc 90.0527%, Train Loss 1.9256, Learning Rate 0.0000 

 3.865573743979136 minutes




Epoch 150/150: Train Acc 90.0434%, Train Loss 1.9262, Learning Rate 0.0000 

 3.84755463997523 minutes


In [None]:
# Drop the notebook's reference to the trained model, then ask PyTorch's CUDA
# caching allocator to release its unused cached blocks before the next cell
# builds a fresh model.
del model
torch.cuda.empty_cache()

# Classification Task: Resume the network

In [None]:
# Resume classification training from a saved checkpoint.
#
# Relies on names defined by earlier cells: ConvNeXt, train_loader, lr,
# batch_size, model_id, data_path, tqdm.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Epoch of the checkpoint to resume from, and the total number of epochs to reach.
# Training continues with epochs resume_epoch+1 .. epochs (nothing runs if they are equal).
resume_epoch = 150
epochs = 150

# model = MobileNetV2()
model = ConvNeXt().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
# The scheduler is stepped once per batch, so T_max is the total number of batches.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))

scaler = torch.cuda.amp.GradScaler()


# Reload the checkpoint
checkpoint_path = "/content/drive/hw2/store_checkpoints/{}/model_epoch_{}.txt".format(model_id, resume_epoch)
# NOTE(review): the checkpoint contains a pickled scheduler object, so it must
# only be loaded from a trusted location.
checkpoint = torch.load(checkpoint_path, map_location=device)

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# The checkpoint stores the whole scheduler object, which after unpickling is
# bound to its own private optimizer copy. Using that object directly would
# step the learning rate of the stale copy instead of `optimizer` above, so
# only its state is copied into the scheduler attached to the live optimizer.
scheduler.load_state_dict(checkpoint['scheduler'].state_dict())
epoch = checkpoint['epoch']
txt_file = data_path +"/dev_acc.txt"

print("We are working on the directory: => " + checkpoint_path)

# Append rather than truncate: the log already holds the epochs trained before
# the interruption, and must survive a resume (even one that runs zero epochs).
with open(txt_file, 'a') as f:
    for epoch in range(resume_epoch+1, epochs+1):
        start = time.time()
        batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

        num_correct = 0
        total_loss = 0

        for i, (x, y) in enumerate(train_loader):
            optimizer.zero_grad()

            # Keep the batch on the same device as the model.
            x = x.to(device)
            y = y.to(device)

            # Forward pass and loss run under autocast for FP16 mixed precision.
            with torch.cuda.amp.autocast():
                outputs = model(x)
                loss = criterion(outputs, y)

            # Update # correct & loss as we go
            num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
            total_loss += float(loss)

            # Running statistics shown on the progress bar.
            batch_bar.set_postfix(
                acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)),
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                num_correct=num_correct,
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

            # Mixed-precision replacements for loss.backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # One scheduler step per batch, matching T_max = len(train_loader) * epochs.
            scheduler.step()

            batch_bar.update()
        batch_bar.close()

        path = data_path +"/model_epoch_{}.txt".format(epoch)

        # The checkpoint layout (including the pickled scheduler object) is kept
        # unchanged because other cells read checkpoint['scheduler'] directly.
        torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'scheduler': scheduler,
          }, path)

        # NOTE(review): the accuracy denominator assumes every batch holds exactly
        # batch_size samples (i.e. train_loader uses drop_last=True) - confirm.
        print_content ="Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f} \n".format(
        epoch,
        epochs,
        100 * num_correct / (len(train_loader) * batch_size),
        float(total_loss / len(train_loader)),
        float(optimizer.param_groups[0]['lr']))
        print(print_content)
        f.write(print_content)
        end = time.time()
        print("", (end - start)/60,"minutes")

We are working on the directory: => /content/drive/hw2/store_checkpoints/ConvNextSelfImple_dropout_full_RandAug_plus_0.1_label_smoothing_CosineAnnealing_SGD_BatchNorm2d_lr0.1/model_epoch_150.txt


# Classification Task: Validation

In [None]:
# Measure classification accuracy on the validation set.
model.eval()
num_correct = 0
batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

for i, (x, y) in enumerate(val_loader):
    x, y = x.cuda(), y.cuda()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(x)

    predictions = torch.argmax(outputs, axis=1)
    num_correct += int((predictions == y).sum())

    # Running accuracy over the batches seen so far (assumes full batches).
    samples_seen = (i + 1) * batch_size
    batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / samples_seen))
    batch_bar.update()

batch_bar.close()
print("Validation: {:.04f}%".format(100 * num_correct / len(val_dataset)))

                                                                    

Validation: 88.7486%




# Classification Task: Submit to Kaggle

In [None]:
class ClassificationTestSet(Dataset):
    """Unlabeled image dataset over a flat directory of test images.

    Items are the transformed images only (no labels). Index order follows
    the alphabetically sorted file names in ``data_dir``.

    (The same data could also be loaded with ImageFolder instead of a custom
    class.)
    """

    def __init__(self, data_dir, transforms):
        """Record the directory and transform, and list the image paths.

        Args:
            data_dir: directory whose entries are the test images.
            transforms: callable applied to each opened image.
        """
        self.data_dir = data_dir
        self.transforms = transforms

        # Sorted so that index i always maps to the same file.
        file_names = sorted(os.listdir(self.data_dir))
        self.img_paths = [osp.join(self.data_dir, file_name) for file_name in file_names]

    def __len__(self):
        """Return the number of images found in ``data_dir``."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Open the image at position ``idx`` and return it transformed."""
        image_path = self.img_paths[idx]
        image = Image.open(image_path)
        return self.transforms(image)

In [None]:
# The training/validation loaders are no longer needed once we only run inference.
del train_loader, val_loader

DATA_DIR = "/content"
TEST_DIR = osp.join(DATA_DIR, "classification/classification/test")
batch_size = 32

# Test images only get converted to tensors (no augmentation).
test_dataset = ClassificationTestSet(TEST_DIR, ttf.Compose([ttf.ToTensor()]))

# shuffle=False and drop_last=False: every image is predicted exactly once,
# in dataset order, so predictions line up with the dataset indices.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=1,
)

In [None]:
# Build a fresh network and restore the weights saved at the final epoch.
# Alternative architectures tried earlier:
# model = ConvNeXtRef()
# model = ResNet50()
model = ConvNeXt()
torch.cuda.empty_cache()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-4,
)

# Checkpoint written after the last training epoch.
epochs = 150
path = data_path +"/model_epoch_{}.txt".format(epochs)
print("You are loading files from ",path)

checkpoint = torch.load(path)
# Restore model weights and optimizer state; the scheduler is taken from the
# checkpoint as the stored object.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler = checkpoint['scheduler']

# epoch = checkpoint['epoch']

You are loading files from  /content/drive/hw2/store_checkpoints/ConvNextSelfImple_dropout_full_RandAug_plus_0.1_label_smoothing_CosineAnnealing_SGD_BatchNorm2d_lr0.1/model_epoch_150.txt


In [None]:
# Predict a class index for every test image, in test_loader order.
torch.cuda.empty_cache()

# Move the model to the GPU once, instead of on every batch.
model = model.cuda()
model.eval()
batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, position=0, leave=False, desc='Test')

# Predicted class index per test image; position i matches test_dataset[i]
# because the loader is not shuffled.
res = []

# Inference only: disabling autograd avoids building graphs for every batch.
with torch.no_grad():
    for i, (x) in enumerate(test_loader):
        x = x.cuda()

        output = model(x)
        y_pred = torch.argmax(output, axis=1)
        res.extend(y_pred.tolist())

        batch_bar.update()

batch_bar.close()



In [None]:
# Write the Kaggle submission: one "<zero-padded index>.jpg,<predicted label>" row
# per test image, after the header line.
with open("classification_early_submission.csv", "w+") as f:
    f.write("id,label\n")
    f.writelines(
        "{},{}\n".format(str(idx).zfill(6) + ".jpg", res[idx])
        for idx in range(len(test_dataset))
    )

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-classification -f classification_early_submission.csv -m "Message"

  0% 0.00/541k [00:00<?, ?B/s]100% 541k/541k [00:00<00:00, 2.67MB/s]
Successfully submitted to Face Recognition

# Verification Task: Triplet loss implementation


There are 6K verification dev images, but 166K "pairs" for you to compare. So, it's much more efficient to compute the features for the 6K verification images, and just compare afterwards.

This will be done by creating a dictionary mapping the image file names to the features. Then, you'll use this dictionary to compute the similarities for each pair.

In [None]:
!ls verification/verification/dev | wc -l
!cat verification/verification/verification_dev.csv | wc -l

6000
166801


In [None]:
class TripletDataset(torchvision.datasets.VisionDataset):
    """Dataset of random (anchor, positive, negative) image triplets.

    Built on top of the regular classification ImageFolder layout: indexing
    with a class index treats that class as the anchor class and draws a
    random triplet for it.

    Random draws use the torch RNG rather than ``np.random``. DataLoader
    workers each receive their own torch seed, whereas NumPy's global RNG
    state is not reseeded per worker, so NumPy-based sampling can make
    different workers return identical triplets.
    """

    def __init__(self, root, transform):
        """Index the images under ``root`` by class.

        Args:
            root: directory in ImageFolder layout (one sub-directory per class).
            transform: transform applied to every loaded image.
        """
        # For "root", note that you're making this dataset on top of the regular classification dataset.
        self.dataset = torchvision.datasets.ImageFolder(root=root, transform=transform)

        # map class indices to dataset image indices
        self.classes_to_img_indices = [[] for _ in range(len(self.dataset.classes))]
        for img_idx, (_, class_id) in enumerate(self.dataset.samples):
            self.classes_to_img_indices[class_id].append(img_idx)

        # VisionDataset attributes for display
        self.root = root
        # Pseudo length: the number of classes, *not* the number of images.
        # One "epoch" therefore yields one triplet per class; train for more
        # epochs to see more triplets.
        self.length = len(self.dataset.classes)
        self.transforms = self.dataset.transforms

    def __len__(self):
        """Return the number of classes (the pseudo length of the dataset)."""
        return self.length

    def __getitem__(self, anchor_class_idx):
        """Treat the given index as the anchor class and pick a triplet randomly.

        Returns:
            Tuple ``(anchor, positive, negative)`` of transformed images, where
            anchor and positive are two distinct images of the anchor class and
            negative is an image of a different, uniformly chosen class.

        Raises:
            IndexError: if ``anchor_class_idx`` is out of range.
            ValueError: if the anchor class has fewer than 2 images, or the
                dataset has no other class to draw a negative from.
        """
        anchor_class = self.classes_to_img_indices[anchor_class_idx]
        if anchor_class_idx < 0:
            # Normalise negative indices so the class-exclusion shift below is valid.
            anchor_class_idx += self.length

        if len(anchor_class) < 2:
            raise ValueError(
                "class {} needs at least 2 images to form an anchor/positive pair".format(anchor_class_idx))
        if self.length < 2:
            raise ValueError("at least 2 classes are required to choose a negative sample")

        # Two distinct images of the anchor class (sampling without replacement).
        first, second = torch.randperm(len(anchor_class))[:2].tolist()
        anchor, positive = anchor_class[first], anchor_class[second]

        # Uniform draw over every class except the anchor class: sample from
        # length - 1 slots and skip over the anchor's slot, instead of
        # materialising a list of all other class indices on every call.
        # hint for further exploration: you can choose 2 negative images to make it a Quadruplet Loss
        negative_class_idx = int(torch.randint(self.length - 1, (1,)))
        if negative_class_idx >= anchor_class_idx:
            negative_class_idx += 1
        negative_class = self.classes_to_img_indices[negative_class_idx]
        negative = negative_class[int(torch.randint(len(negative_class), (1,)))]

        # self.dataset[idx] will return a tuple (image tensor, class label). You can use its outputs to train for classification alongside verification
        # If you do not want to train for classification, you can use self.dataset[idx][0] to get the image tensor
        return self.dataset[anchor][0], self.dataset[positive][0], self.dataset[negative][0]

class TripletWrapper(nn.Module):
    """Wrap a network so one forward call can embed a whole triplet."""

    def __init__(self, network):
        """Store the wrapped network.

        Args:
            network: callable module applied to each input batch.
        """
        super().__init__()
        self.network = network

    def forward(self, anchor, posi, nega, imgs=None, return_feats=False):
        """Run the wrapped network on a triplet, or on ``imgs`` alone.

        Args:
            anchor, posi, nega: the three inputs of a triplet; ignored when
                ``return_feats`` is true.
            imgs: single input used only when ``return_feats`` is true.
            return_feats: when true, return ``network(imgs)`` instead of the
                triplet outputs.

        Returns:
            ``network(imgs)`` if ``return_feats`` is true, otherwise the tuple
            ``(network(anchor), network(posi), network(nega))``.
        """
        if not return_feats:
            return tuple(self.network(batch) for batch in (anchor, posi, nega))
        return self.network(imgs)


# Prepare the verification dataset
class VerificationDataset(Dataset):
    """Flat directory of images served together with their relative paths."""

    def __init__(self, data_dir, transforms):
        """Index every entry of ``data_dir`` in sorted filename order.

        Args:
            data_dir: Directory whose entries are listed with ``os.listdir``.
            transforms: Callable applied to each opened image in ``__getitem__``.
        """
        self.data_dir = data_dir
        self.transforms = transforms

        # Full path of every directory entry, ordered by filename.
        file_names = sorted(os.listdir(self.data_dir))
        self.img_paths = [osp.join(self.data_dir, file_name) for file_name in file_names]

    def __len__(self):
        """Return the number of indexed paths."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return ``(transformed image, path relative to data_dir)`` for ``idx``."""
        img_path = self.img_paths[idx]
        image = self.transforms(Image.open(img_path))
        relative_path = osp.relpath(img_path, self.data_dir)
        return image, relative_path

In [None]:
# Set-up for triplet-loss fine-tuning: build the triplet loaders, restore the
# classification checkpoint into a fresh ConvNeXt, wrap it in TripletWrapper and
# build the loader for the verification dev images.
DATA_DIR = "/content"
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train") 
VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")
lr = 1e-4
batch_size = 32
# In this cell `epochs` only selects which "model_epoch_{}.txt" checkpoint file is loaded below.
epochs =150

# data augmentation
transforms = ttf.Compose([
    ttf.RandomHorizontalFlip(),
    # ttf.AutoAugment(),
    ttf.ToTensor()
])

train_dataset_triplet = TripletDataset(TRAIN_DIR,transforms)

# NOTE(review): the transform is passed positionally above but as keyword `transform=` here;
# confirm the keyword matches the parameter name in TripletDataset.__init__.
val_dataset_triplet = TripletDataset(VAL_DIR,transform=ttf.Compose([ttf.ToTensor()]))


# Both triplet loaders drop the last incomplete batch; only the training loader shuffles.
train_loader = DataLoader(train_dataset_triplet, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=2)
val_loader = DataLoader(val_dataset_triplet, batch_size=batch_size, shuffle=False,
                        drop_last=True, num_workers=1)

model = ConvNeXt().cuda()

criterion = nn.CrossEntropyLoss()
# The optimizer is built on the bare network's parameters, before the network is wrapped below.
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
# Gradient scaler for mixed-precision (FP16) training.
scaler = torch.cuda.amp.GradScaler()


# Reload the checkpoint
data_path = "/content/drive/hw2/store_checkpoints/ConvNextSelfImple_dropout_full_RandAug_plus_0.1_label_smoothing_CosineAnnealing_SGD_BatchNorm2d_lr0.1"
path = data_path +"/model_epoch_{}.txt".format(epochs)
print("You are loading files from ",path)
checkpoint = torch.load(path)

model.load_state_dict(checkpoint['model_state_dict'])
# NOTE(review): this scheduler object is taken from the checkpoint; it was not constructed
# around `optimizer` defined above. Presumably it still refers to the optimizer it was saved
# with -- confirm whether calling scheduler.step() can change `optimizer`'s learning rate.
scheduler=checkpoint['scheduler']


# New criterion
# Default arguments are used (PyTorch documents the default margin as 1.0).
criterion2 = nn.TripletMarginLoss()
# New model
# TripletWrapper keeps the same network object as `self.network`, so `optimizer`
# (created above from that network's parameters) still updates the wrapped weights.
model = TripletWrapper(model).cuda()


DATA_DIR = "/content"
# Loader over the verification dev images; each item is (image tensor, relative path).
val_veri_dataset = VerificationDataset(osp.join(DATA_DIR, "verification/verification/dev"),
                                       ttf.Compose([ttf.ToTensor()]))
val_ver_loader = torch.utils.data.DataLoader(val_veri_dataset, batch_size=batch_size, 
                                             shuffle=False, num_workers=1)


You are loading files from  /content/drive/hw2/store_checkpoints/ConvNextSelfImple_dropout_full_RandAug_plus_0.1_label_smoothing_CosineAnnealing_SGD_BatchNorm2d_lr0.1/model_epoch_150.txt


In [None]:
# Fine-tune the wrapped network with the triplet loss. After every epoch, embed
# the verification dev images, score the dev pairs by cosine similarity, print
# the ROC AUC and save a checkpoint whose file name embeds that AUC.
epochs = 20
# Loop-invariant: CSV listing the dev pairs and their ground-truth labels.
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_dev.csv")

for epoch in range(epochs):
    # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    total_loss = 0

    ##########################################################################
    # model training
    ##########################################################################
    model.train()
    for i, (x, y, z) in enumerate(train_loader):

        optimizer.zero_grad()

        # x / y / z are the anchor / positive / negative batches of the triplet.
        x = x.cuda()
        y = y.cuda()
        z = z.cuda()

        # Forward pass and loss run under autocast so they work with FP16.
        with torch.cuda.amp.autocast():
            anchor, pos, neg = model(x, y, z)
            # Only the triplet loss is optimised here.
            loss = criterion2(anchor, pos, neg)

        # Update the running loss as we go
        total_loss += float(loss)

        # tqdm lets you add some details so you can monitor training as you train.
        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

        # Another couple things you need for FP16.
        scaler.scale(loss).backward()  # This is a replacement for loss.backward()
        scaler.step(optimizer)  # This is a replacement for optimizer.step()
        scaler.update()  # This is something added just for FP16

        # NOTE(review): `scheduler` was taken from the classification checkpoint rather
        # than built around `optimizer`, and the logged learning rate stays at 1e-4 for
        # every epoch, so this call apparently does not drive `optimizer` -- confirm, then
        # either rebuild the scheduler on `optimizer` or drop this step.
        scheduler.step()

        batch_bar.update()  # Update tqdm bar
    batch_bar.close()  # You need this to close the tqdm bar

    print("Epoch {}/{}: Train Loss {:.10f}, Learning Rate {:.10f}".format(
        epoch + 1,
        epochs,
        float(total_loss / len(train_loader)),
        float(optimizer.param_groups[0]['lr'])))

    ##########################################################################
    # model evaluation
    ##########################################################################

    model.eval()
    feats_dict = dict()
    for batch_idx, (imgs, path_names) in tqdm(enumerate(val_ver_loader), total=len(val_ver_loader), position=0, leave=False):
        imgs = imgs.cuda()

        with torch.no_grad():
            # The triplet arguments are ignored when return_feats=True, so pass
            # explicit None placeholders instead of the IPython-only `_` variable.
            feats = model(None, None, None, imgs, return_feats=True)

        # Key each embedding by the path used for lookup from the CSV rows below.
        for path_name, feat in zip(path_names, feats):
            feats_dict['dev/' + path_name] = feat

    # Read the pair list up front so the file handle is closed before scoring.
    with open(val_veri_csv) as csv_file:
        pair_lines = csv_file.read().splitlines()[1:]  # skip header

    # Now, loop through the pairs and compare each one, getting the similarity between them
    pred_similarities = []
    gt_similarities = []
    for line in tqdm(pair_lines, position=0, leave=False):
        img_path1, img_path2, gt = line.split(",")

        img1 = feats_dict[img_path1]
        img2 = feats_dict[img_path2]
        similarity = F.cosine_similarity(img1,img2,dim=0).to("cpu").numpy()

        pred_similarities.append(similarity)
        gt_similarities.append(int(gt))

    pred_similarities = np.array(pred_similarities).flatten()
    gt_similarities = np.array(gt_similarities)
    auc = roc_auc_score(gt_similarities, pred_similarities)
    print(auc)

    # The file name uses the 0-based `epoch`, one less than the epoch number printed above.
    path = data_path +f"/verification_{lr}_margin1_model_epoch_{epoch}_auc{auc}.txt"
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler': scheduler,
    }, path)


                                                                                

Epoch 1/20: Train Loss 0.2430992990, Learning Rate 0.0001000000




0.914358905767369


                                                                                

Epoch 2/20: Train Loss 0.1386141033, Learning Rate 0.0001000000




0.9369342838043428


                                                                                

Epoch 3/20: Train Loss 0.0937704865, Learning Rate 0.0001000000




0.9416851006753952


                                                                                

Epoch 4/20: Train Loss 0.1086166364, Learning Rate 0.0001000000




0.9500457032870776


                                                                                

Epoch 5/20: Train Loss 0.0768240964, Learning Rate 0.0001000000




0.9494912846918075


                                                                                

Epoch 6/20: Train Loss 0.0909866040, Learning Rate 0.0001000000




0.9509309137430271


                                                                                

Epoch 7/20: Train Loss 0.0906642161, Learning Rate 0.0001000000




0.9542327033677562


                                                                                

Epoch 8/20: Train Loss 0.1026388286, Learning Rate 0.0001000000




0.9476438197293808


                                                                                

Epoch 9/20: Train Loss 0.0766102349, Learning Rate 0.0001000000




0.9534510917546448


                                                                                

Epoch 10/20: Train Loss 0.0647040945, Learning Rate 0.0001000000




0.9556232504264442


                                                                                

Epoch 11/20: Train Loss 0.0654990279, Learning Rate 0.0001000000




0.9576836605619843


                                                                                

Epoch 12/20: Train Loss 0.0670144656, Learning Rate 0.0001000000




0.9574625884583468


                                                                                

Epoch 13/20: Train Loss 0.0532646278, Learning Rate 0.0001000000




0.956819850398783


                                                                                

Epoch 14/20: Train Loss 0.0680741750, Learning Rate 0.0001000000




0.9597733135286525


                                                                                

Epoch 15/20: Train Loss 0.0670101457, Learning Rate 0.0001000000




0.9569697887372642


                                                                                

Epoch 16/20: Train Loss 0.0689347086, Learning Rate 0.0001000000




0.9582438534415195


                                                                                

Epoch 17/20: Train Loss 0.0634206763, Learning Rate 0.0001000000




0.9603718299064128


                                                                                

Epoch 18/20: Train Loss 0.0560014587, Learning Rate 0.0001000000




0.9619034988589739


                                                                                

Epoch 19/20: Train Loss 0.0675913609, Learning Rate 0.0001000000




0.9566596725024203


                                                                                

Epoch 20/20: Train Loss 0.0558926490, Learning Rate 0.0001000000




0.959605720690609


# Verification Task: Submit to Kaggle

In [None]:
# Restore the fine-tuned verification checkpoint chosen for the submission.
# The path is hard-coded; its file name records lr 0.0001, saved epoch index 17
# and a dev AUC of 0.9619034988589739.
# (An earlier variant built the path from `data_path`, `lr` and `epoch` for a
# "margin0.05" run of a different checkpoint directory.)
path = "/content/drive/hw2/store_checkpoints/ConvNextSelfImple_dropout_full_RandAug_plus_0.1_label_smoothing_CosineAnnealing_SGD_BatchNorm2d_lr0.1/verification_0.0001_margin1_model_epoch_17_auc0.9619034988589739.txt"
checkpoint = torch.load(path)

# Push the saved weights and optimizer state back into the live objects.
# `model` is presumably the TripletWrapper whose state_dict was saved -- confirm before running.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# The scheduler object and the epoch counter are taken from the checkpoint as-is.
scheduler, epoch = checkpoint['scheduler'], checkpoint['epoch']


In [None]:
# Loader over the verification test images: tensor conversion only, with
# shuffling disabled.
val_transforms = [ttf.ToTensor()]
test_veri_dataset = VerificationDataset(
    data_dir=osp.join(DATA_DIR, "verification/verification/test"),
    transforms=ttf.Compose(val_transforms),
)
test_ver_loader = torch.utils.data.DataLoader(
    dataset=test_veri_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
)

In [None]:
# Embed every verification test image and index the embeddings by path.
model.eval()

feats_dict = dict()
for batch_idx, (imgs, path_names) in tqdm(enumerate(test_ver_loader), total=len(test_ver_loader), position=0, leave=False):
    imgs = imgs.cuda()

    with torch.no_grad():
        # return_feats=True makes the wrapper return network(imgs) for a plain image
        # batch. The triplet arguments are ignored in that mode, so pass explicit None
        # placeholders instead of the IPython-only `_` variable.
        feats = model(None, None, None, imgs, return_feats=True)

    # Store one embedding per image under 'test/<relative path>' (presumably the
    # form the paths take in verification_test.csv -- confirm).
    for path_name, feat in zip(path_names, feats):
        feats_dict['test/' + path_name] = feat



In [None]:
# Score every pair listed in the test CSV with the cosine similarity between
# the two stored feature embeddings.
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_test.csv")

# Read the pair list up front so the file handle is closed before scoring.
with open(val_veri_csv) as csv_file:
    pair_lines = csv_file.read().splitlines()[1:]  # skip header

# Now, loop through the pairs and compare each one, getting the similarity between them
pred_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    img_path1, img_path2 = line.split(",")

    img1 = feats_dict[img_path1]
    img2 = feats_dict[img_path2]
    # flatten() yields a 1-D array per pair; the submission cell reads element [0] of it.
    similarity = F.cosine_similarity(img1,img2,dim=0).to("cpu").numpy().flatten()

    pred_similarities.append(similarity)



In [None]:
# Write the submission file: a header row, then one "row index, similarity" line per pair.
with open("verification_early_submission.csv", "w+") as f:
    f.write("id,match\n")
    # Each entry is indexed with [0] to get the score written for that pair.
    for i, similarity in enumerate(pred_similarities):
        f.write("{},{}\n".format(i, similarity[0]))

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-verification -f verification_early_submission.csv -m "Message"

100% 16.4M/16.4M [00:00<00:00, 39.7MB/s]
Successfully submitted to Face Verification