<a href="https://colab.research.google.com/github/yling01/15799-project1/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as ttf

import os
import os.path as osp
import math

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
import numpy as np

!pip install timm
from timm.models.layers import DropPath



In [2]:
import os 

VERSION = '12'
PLATFORM = 'Collab'
CONTINUE = False 

if PLATFORM == 'AWS':
    PATH = '/home/ubuntu/efs/11785/hw2p2/Models'
else:
    PATH = '/content/drive/MyDrive/CMU/11785/hw2p2/Models/'
    from google.colab import drive
    drive.mount('/content/drive')
    
PATH = PATH + VERSION + '/'
try:
    os.makedirs(PATH)
except OSError as error:
    print(error)   


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 17] File exists: '/content/drive/MyDrive/CMU/11785/hw2p2/Models/12/'


In [3]:
# !pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
# !mkdir /root/.kaggle

# with open("/root/.kaggle/kaggle.json", "w+") as f:
#     f.write('{"username":"yingjieling","key":"5a27d5620222819ea5c3b71d2251e19c"}') # Put your kaggle username & key here

# !chmod 600 /root/.kaggle/kaggle.json

# !kaggle competitions download -c 11-785-s22-hw2p2-classification
# !kaggle competitions download -c 11-785-s22-hw2p2-verification

# !unzip -q 11-785-s22-hw2p2-classification.zip
# !unzip -q 11-785-s22-hw2p2-verification.zip

In [4]:
!pip3 install --no-cache-dir --upgrade comet_ml
from comet_ml import Experiment
experiment = Experiment(
    api_key="fs5JWzC05BHp2mS1s7w1OUTz5",
    project_name="11785-hw2p2",
    workspace="yling01",
)

experiment.set_name("{} {}".format(VERSION, PLATFORM))




COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET ERROR: Error logging git-related information
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/yling01/11785-hw2p2/a5030335cc2947468d7b47104deed48e



In [5]:
"""
The well-accepted SGD batch_size & lr combination for CNN classification is 256 batch size for 0.1 learning rate.
When changing batch size for SGD, follow the linear scaling rule - halving batch size -> halve learning rate, etc.
This is less theoretically supported for Adam, but in my experience, it's a decent ballpark estimate.
"""
batch_size = 64
lr = 0.01
epochs = 100 # Just for the early submission. We'd want you to train like 50 epochs for your main submissions.

In [6]:
class Block_ConvNext(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        
        super().__init__() 

        if stride == 1 and in_channels == out_channels:
            self.do_identity = True
        else:
            self.do_identity = False

        hidden_dim = in_channels * expand_ratio

        #depth wise
        self.spatial_mixing = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=7, padding=3, stride=stride, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
        )

        #point wise 
        self.feature_mixing = nn.Sequential(
            nn.Conv2d(in_channels, hidden_dim, kernel_size=1, padding=0, stride=stride, bias=True),
            nn.GELU(),
        )

        #point wise 
        self.bottleneck_channels = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, kernel_size=1, padding=0, stride=stride, bias=True),
        )

        self.drop_path = DropPath(0.2)

    def forward(self, x):
        out = self.spatial_mixing(x)
        out = self.feature_mixing(out)
        out = self.bottleneck_channels(out)

        if self.do_identity:
            return x + self.drop_path(out)
        else:
            return out


class ConvNext(nn.Module):

    def __init__(self, num_classes=7000):
        super().__init__()

        self.num_classes = num_classes

        self.stem = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=4, stride=4, padding=1, bias=False),
            nn.BatchNorm2d(96),
        )

        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [4, 96, 3, 1],
            [4, 192, 3, 1],
            [4, 384, 9, 1],
            [4, 768, 3, 1],
        ]

        in_channels = 96
        downsample_dims = [192, 384, 768]
        layers = []
        for index, curr_stage in enumerate(self.stage_cfgs):
            expand_ratio, num_channels, num_blocks, stride = curr_stage

            for block_idx in range(num_blocks):
                out_channels = num_channels
                layers.append(Block_ConvNext(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    stride=stride,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels
            if index < 3:
                layers.append(
                    nn.Sequential(
                        nn.BatchNorm2d(out_channels),
                        nn.Conv2d(out_channels, downsample_dims[index], kernel_size=2, stride=2),
                    )
                )
                in_channels = downsample_dims[index]


        self.layers = nn.Sequential(*layers)  # Done, save them to the class

        # Now, we need to build the final classification layer.
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # Pool over & collapse the spatial dimensions to (1, 1)
            nn.BatchNorm2d(in_channels),
            nn.Flatten(),  # Collapse the trivial (1, 1) dimensions
            nn.Dropout(p=0.2),
            nn.Linear(in_channels, num_classes)  # Project to our # of classes
        )

        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x, return_feats=False):
        out = self.stem(x)
        feats = self.layers(out)
        out = self.cls_layer(feats)
        if return_feats:
            return feats
        else:
            return out

In [7]:
"""
Transforms (data augmentation) is quite important for this task.
Go explore https://pytorch.org/vision/stable/transforms.html for more details
"""
torch.manual_seed(0)
DATA_DIR = ""
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train") # This is a smaller subset of the data. Should change this to classification/classification/train
VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")
TEST_DIR = osp.join(DATA_DIR, "classification/classification/test")

train_transforms = [ttf.RandomHorizontalFlip(), 
                    ttf.RandomResizedCrop((224, 224), scale=(0.25, 1)),
                    ttf.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5),
                    ttf.RandAugment(),
                    ttf.ToTensor()]
val_transforms = [ttf.ToTensor()]

train_dataset = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_transforms))
val_dataset = torchvision.datasets.ImageFolder(VAL_DIR,
                                               transform=ttf.Compose(val_transforms))


train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        drop_last=True, num_workers=1)

In [8]:
model = ConvNext()
model.cuda()

# For this homework, we're limiting you to 35 million trainable parameters, as
# outputted by this. This is to help constrain your search space and maintain
# reasonable training times & expectations
num_trainable_parameters = 0
for p in model.parameters():
    num_trainable_parameters += p.numel()
print("Number of Params: {}".format(num_trainable_parameters))

# TODO: What criterion do we use for this task?
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))

# For this homework, we strongly strongly recommend using FP16 to speed up training.
# It helps more for larger models.
# Go to https://effectivemachinelearning.com/PyTorch/8._Faster_training_with_mixed_precision
# and compare "Single precision training" section with "Mixed precision training" section
scaler = torch.cuda.amp.GradScaler()

Number of Params: 33189784


In [9]:
current_epoch = 0
if CONTINUE:
    models = os.listdir(PATH)
    models.sort(key=lambda x: int(x.split("_")[-1]))
    model_path = PATH + models[-1]
    true_epoch = int(model_path.split("_")[-1])
    print("!!!!!!Warning!!!!!!\n continuing from \n\t{}\n\n".format(model_path))
    temp = torch.load(model_path)
    model.load_state_dict(temp['model_state_dict'])
    optimizer.load_state_dict(temp['optimizer_state_dict'])
    scheduler.load_state_dict(temp['scheduler_state_dict'])

for epoch in range(current_epoch + 1, epochs + 1):
        
    # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    num_correct = 0
    total_loss = 0

    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()

        x = x.cuda()
        y = y.cuda()

        # Don't be surprised - we just wrap these two lines to make it work for FP16
        with torch.cuda.amp.autocast():     
            outputs = model(x)
            loss = criterion(outputs, y)

        # Update # correct & loss as we go
        num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
        total_loss += float(loss)

        # tqdm lets you add some details so you can monitor training as you train.
        batch_bar.set_postfix(
            acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)),
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            num_correct=num_correct,
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
        
        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16
        scheduler.step()
        batch_bar.update() # Update tqdm bar
    batch_bar.close() # You need this to close the tqdm bar

    # training metrics
    train_acc = 100 * num_correct / (len(train_loader) * batch_size)
    train_loss = float(total_loss / len(train_loader))

    # validation metrics
    model.eval()
    num_correct = 0
    for i, (x, y) in enumerate(val_loader):
        x = x.cuda()
        y = y.cuda()

        with torch.no_grad():
            outputs = model(x)

        num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
    validation_acc = 100 * num_correct / len(val_dataset)
    

    # print results
    print("Epoch {}/{}: Train Acc {:.04f}%, Validation Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
        epoch,
        epochs,
        train_acc,
        validation_acc,
        train_loss,
        float(optimizer.param_groups[0]['lr'])))
    
    # metric export
    experiment.log_metric("train acc", train_acc, epoch=epoch)
    experiment.log_metric("validation acc", validation_acc, epoch=epoch)

    # model saving
    torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict' : scheduler.state_dict(),
        }, PATH+"Model_"+str(epoch))



Epoch 1/100: Train Acc 0.0193%, Validation Acc 0.0286%, Train Loss 8.9338, Learning Rate 0.0100




Epoch 2/100: Train Acc 0.0307%, Validation Acc 0.1114%, Train Loss 8.7281, Learning Rate 0.0100




Epoch 3/100: Train Acc 0.2079%, Validation Acc 0.5629%, Train Loss 8.2585, Learning Rate 0.0100




Epoch 4/100: Train Acc 1.1653%, Validation Acc 2.2857%, Train Loss 7.5508, Learning Rate 0.0100




Epoch 5/100: Train Acc 3.6951%, Validation Acc 7.6600%, Train Loss 6.8648, Learning Rate 0.0099




Epoch 6/100: Train Acc 8.9742%, Validation Acc 15.9914%, Train Loss 6.1839, Learning Rate 0.0099




Epoch 7/100: Train Acc 18.0327%, Validation Acc 26.1200%, Train Loss 5.4735, Learning Rate 0.0099




Epoch 8/100: Train Acc 28.5865%, Validation Acc 35.1114%, Train Loss 4.8210, Learning Rate 0.0098




Epoch 9/100: Train Acc 38.8932%, Validation Acc 42.9657%, Train Loss 4.2626, Learning Rate 0.0098




Epoch 10/100: Train Acc 47.9660%, Validation Acc 50.8714%, Train Loss 3.8061, Learning Rate 0.0098




Epoch 11/100: Train Acc 55.7192%, Validation Acc 55.0314%, Train Loss 3.4377, Learning Rate 0.0097




Epoch 12/100: Train Acc 61.8191%, Validation Acc 58.4314%, Train Loss 3.1453, Learning Rate 0.0096




Epoch 13/100: Train Acc 67.3733%, Validation Acc 60.4400%, Train Loss 2.8922, Learning Rate 0.0096




Epoch 14/100: Train Acc 71.9393%, Validation Acc 61.1286%, Train Loss 2.6879, Learning Rate 0.0095




Epoch 15/100: Train Acc 75.7223%, Validation Acc 63.4543%, Train Loss 2.5210, Learning Rate 0.0095




Epoch 16/100: Train Acc 79.3088%, Validation Acc 64.3857%, Train Loss 2.3654, Learning Rate 0.0094




Epoch 17/100: Train Acc 82.2852%, Validation Acc 65.5686%, Train Loss 2.2427, Learning Rate 0.0093




Epoch 18/100: Train Acc 85.0573%, Validation Acc 65.9143%, Train Loss 2.1306, Learning Rate 0.0092




Epoch 19/100: Train Acc 86.9349%, Validation Acc 66.7543%, Train Loss 2.0524, Learning Rate 0.0091




Epoch 20/100: Train Acc 88.8032%, Validation Acc 67.8171%, Train Loss 1.9732, Learning Rate 0.0090




Epoch 21/100: Train Acc 89.8062%, Validation Acc 68.6114%, Train Loss 1.9287, Learning Rate 0.0090




Epoch 22/100: Train Acc 91.2587%, Validation Acc 69.1743%, Train Loss 1.8674, Learning Rate 0.0089




Epoch 23/100: Train Acc 91.8796%, Validation Acc 68.8400%, Train Loss 1.8371, Learning Rate 0.0088




Epoch 24/100: Train Acc 92.5326%, Validation Acc 70.4914%, Train Loss 1.8041, Learning Rate 0.0086




Epoch 25/100: Train Acc 93.1249%, Validation Acc 70.5057%, Train Loss 1.7781, Learning Rate 0.0085


Train:   5%|▌         | 119/2187 [00:27<07:58,  4.32it/s, acc=93.8542%, loss=1.7392, lr=0.0085, num_correct=7208]

KeyboardInterrupt: ignored

In [None]:
!nvidia-smi