In [2]:
import torch as torch
import torch.optim as optim
import torch.nn as nn
import os
import torchvision
import torch.nn as nn
from torch.autograd import Variable as var
import logging as log
import gc
import numpy as np
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset
import torch.multiprocessing as mp
import torch.backends.cudnn as cudnn
from PIL import Image
from glob import glob
import sys
import os
import time



KeyboardInterrupt: 

In [2]:
# Rendezvous address/port for a single-node run; these environment variables
# are set before the default process group is created below.
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'

# Creating the default process group twice raises an error, which made this
# cell fail whenever it was re-run in a live kernel. Only initialise once.
# NOTE(review): world_size=1 means a single process/GPU is used even when more
# GPUs are visible — confirm this is intended.
if not torch.distributed.is_initialized():
    torch.distributed.init_process_group("nccl", rank=0, world_size=1)



In [3]:
# Directory that is expected to provide the `loader` module imported below.
_MOCO_SRC_DIR = '/kaggle/input/moco-v2'

# Avoid stacking duplicate entries on sys.path when this cell is re-run.
if _MOCO_SRC_DIR not in sys.path:
    sys.path.append(_MOCO_SRC_DIR)

from loader import TwoCropsTransform, GaussianBlur

In [4]:
# Report whether CUDA is usable; the device name is printed for index 0 only.
if not torch.cuda.is_available():
    print("GPU is not available")
else:
    print("GPU is available")
    print("Number of GPUs:", torch.cuda.device_count())
    print("GPU name:", torch.cuda.get_device_name(0))

GPU is available
Number of GPUs: 2
GPU name: Tesla T4


In [5]:
# Central hyper-parameter table for the notebook.
PARAMETERS = {
    'model_name': 'resnet18',
    'model_saved': "/kaggle/working/",
    # Optimizer settings (learning rate and weight decay are passed to SGD).
    'learning_rate': 0.03,
    # Read by Moco.update_key_params as the key-encoder averaging coefficient;
    # the SGD optimizer cell hard-codes its own momentum value.
    'momentum': 0.9,
    'epochs': 100,
    'weight_decay': 0.0001,
    # Also reused by Moco as the encoder output width and queue row count.
    'batch_size': 128,
    # Divisor applied to the contrastive logits.
    'temperature': 0.07,
    'num_channels': 3,
    # Number of columns (stored keys) in the negative queue.
    'dictionary_size': 8192,
    'num_workers': 1,
    'num_cores': 1,
    'log_steps': 20,
    'load_from_saved': False,
    # First epoch number used by the training driver loop.
    'start_epoch': 1,
}

In [6]:
import torchvision.datasets as datasets

# Per-channel normalisation applied after ToTensor(); these values match the
# commonly used ImageNet mean/std.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Blur transform from the local `loader` module, applied with p=0.5 below.
TranfGauss = GaussianBlur([.1, 2.])

# Augmentation recipe applied independently to each of the two crops.
# ColorJitter and RandomGrayscale are currently disabled:
#   transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8)
#   transforms.RandomGrayscale(p=0.2)
augmentation = [
    transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
    transforms.RandomApply([TranfGauss], p=0.5),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
]

# Every sample goes through TwoCropsTransform wrapping the composed pipeline.
train_dataset = datasets.ImageFolder(
    root="/kaggle/input/chest-xray-pneumonia/chest_xray/train/",
    transform=TwoCropsTransform(transforms.Compose(augmentation)),
)

# The sampler relies on the default process group created earlier.
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

# drop_last=True keeps every batch at exactly PARAMETERS['batch_size'] samples.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=PARAMETERS['batch_size'],
    sampler=train_sampler,
    num_workers=PARAMETERS['num_workers'],
    pin_memory=True,
    drop_last=True,
)


In [7]:
@torch.no_grad()
def concat_all_gather(tensor):
    """
    Gather `tensor` from every process in the default group and return the
    gathered copies concatenated along dim 0.

    *** Warning ***: torch.distributed.all_gather has no gradient.
    """
    world_size = torch.distributed.get_world_size()

    # One receive buffer per process, each shaped like the local tensor.
    gathered = []
    for _ in range(world_size):
        gathered.append(torch.ones_like(tensor))

    torch.distributed.all_gather(gathered, tensor, async_op=False)
    return torch.cat(gathered, dim=0)


In [8]:
class BaseModel(nn.Module):
    """ResNet backbone followed by ReLU and a linear projection head.

    The torchvision model's children, minus the last one (its `fc` layer), are
    reused as the encoder; a new linear layer maps the encoder output to
    `channels_out` values.
    """

    def __init__(self, base_model_name, channels_out):
        """
        Args:
            base_model_name: 'resnet50' or 'resnet18'.
            channels_out: output width of the projection layer.

        Raises:
            ValueError: if `base_model_name` is not one of the supported names.
        """
        super(BaseModel, self).__init__()

        # NOTE(review): `pretrained=True` is deprecated in newer torchvision
        # releases in favour of `weights=...`; kept for compatibility with the
        # version this notebook was written against.
        if base_model_name == 'resnet50':
            model = models.resnet50(pretrained=True)
        elif base_model_name == 'resnet18':
            model = models.resnet18(pretrained=True)
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on `model`.
            raise ValueError(
                "Unsupported base_model_name {!r}; expected 'resnet50' or 'resnet18'".format(
                    base_model_name))

        # Input width of the original classifier = width of the encoder output.
        penultimate = model.fc.in_features

        # Keep everything except the final child (the classifier).
        modules = list(model.children())[:-1]
        self.encoder = nn.Sequential(*modules)

        self.relu = nn.ReLU()
        self.fc = nn.Linear(penultimate, channels_out)

    def forward(self, x):
        """Encode `x`, flatten all non-batch dims, apply ReLU, then project."""
        x = self.encoder(x)
        x = torch.flatten(x, 1)
        x = self.relu(x)
        x = self.fc(x)

        return x

In [9]:
class Moco(nn.Module):
    """Momentum-contrast model: a trainable query encoder, a key encoder that
    is updated as a running average of the query encoder, and a queue of past
    keys used as negatives.
    """

    def __init__(self, feature_dim=None):
        """
        Args:
            feature_dim: width of the embedding produced by both encoders and
                number of rows of the queue. Defaults to
                PARAMETERS['feature_dim'] when that key exists, otherwise to
                PARAMETERS['batch_size'] (the original behaviour, which tied
                the embedding width to the batch size).
        """
        super(Moco, self).__init__()

        if feature_dim is None:
            feature_dim = PARAMETERS.get('feature_dim', PARAMETERS['batch_size'])
        self.feature_dim = feature_dim

        self.query_enc = BaseModel('resnet18', feature_dim).cuda()
        self.key_enc = BaseModel('resnet18', feature_dim).cuda()

        # Start the key encoder as an exact copy of the query encoder and
        # exclude it from gradient-based updates.
        for param_q, param_k in zip(self.query_enc.parameters(), self.key_enc.parameters()):
            param_k.data.copy_(param_q.data)
            param_k.requires_grad = False  # not update k encoder by gradient

        # Queue of keys: one column per stored key, columns L2-normalised.
        self.register_buffer("queue", torch.randn(feature_dim, PARAMETERS['dictionary_size']))
        self.queue = nn.functional.normalize(self.queue, dim=0)

        # Column at which the next batch of keys will be written.
        self.register_buffer("queue_index", torch.zeros(1, dtype=torch.long))

    @torch.no_grad()
    def update_key_params(self):
        """Move each key-encoder parameter towards its query-encoder twin:
        p_k <- (1 - m) * p_q + m * p_k with m = PARAMETERS['momentum'].
        """
        # NOTE(review): m is 0.9 here; MoCo implementations commonly use a
        # value much closer to 1 (e.g. 0.999) — confirm this is intended.
        m = PARAMETERS['momentum']
        for p_k, p_q in zip(self.key_enc.parameters(), self.query_enc.parameters()):
            p_k.data.copy_((1 - m) * p_q.data + m * p_k.data)

    @torch.no_grad()
    def update_queue(self, keys):
        """Write the keys gathered from all processes into the queue, starting
        at the current write position, and advance that position.
        """
        keys = concat_all_gather(keys)

        # Size the write from the gathered keys rather than the configured
        # batch size, so it stays correct when world_size > 1.
        num_keys = keys.shape[0]
        queue_len = self.queue.shape[1]
        start = int(self.queue_index)

        # Modular column indices let a write wrap around the end of the queue.
        # Assumes num_keys <= queue_len.
        columns = (start + torch.arange(num_keys, device=self.queue.device)) % queue_len
        self.queue[:, columns] = keys.T

        self.queue_index[0] = (start + num_keys) % queue_len

    @torch.no_grad()
    def shuffle(self, x):
        """Gather `x` from all processes, draw a permutation shared by all
        ranks (broadcast from rank 0) and return this rank's slice of the
        permuted batch together with the inverse permutation.
        """
        local_batch = x.shape[0]
        x_gather = concat_all_gather(x)
        gathered_batch = x_gather.shape[0]

        num_procs = gathered_batch // local_batch

        shuffle_index = torch.randperm(gathered_batch).cuda()

        # Every rank must use the same permutation.
        torch.distributed.broadcast(shuffle_index, src=0)

        unshuffle_index = torch.argsort(shuffle_index)

        rank = torch.distributed.get_rank()
        current = shuffle_index.view(num_procs, -1)[rank]

        return x_gather[current], unshuffle_index

    @torch.no_grad()
    def unshuffle(self, x, unshuffle_index):
        """Gather `x` from all processes and return this rank's rows selected
        by `unshuffle_index` (the inverse permutation returned by `shuffle`).
        """
        local_batch = x.shape[0]
        x_gather = concat_all_gather(x)
        gathered_batch = x_gather.shape[0]

        num_procs = gathered_batch // local_batch

        rank = torch.distributed.get_rank()
        current = unshuffle_index.view(num_procs, -1)[rank]

        return x_gather[current]

    def forward(self, images):
        """Compute contrastive logits and targets for a pair of views.

        Args:
            images: indexable pair; images[0] feeds the query encoder and
                images[1] the key encoder. Both are moved to CUDA.

        Returns:
            (logits, labels): logits has the positive similarity in column 0
            followed by similarities against the queue, all divided by
            PARAMETERS['temperature']; labels is an all-zero long tensor with
            one entry per row of logits.
        """
        images_q, images_k = images[0].cuda(), images[1].cuda()
        q = self.query_enc(images_q)
        q = nn.functional.normalize(q, dim=1)

        # Keys carry no gradient.
        with torch.no_grad():
            images_k, unshuffle_index = self.shuffle(images_k)

            self.update_key_params()

            # Call the module (not .forward) so registered hooks still run.
            k = self.key_enc(images_k)
            k = nn.functional.normalize(k, dim=1)
            k = self.unshuffle(k, unshuffle_index)

        l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)
        l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()])

        logits = torch.cat([l_pos, l_neg], dim=1)

        # The positive is always column 0; size the targets from the logits
        # instead of the configured batch size.
        labels = torch.zeros(logits.shape[0], dtype=torch.long, device=logits.device)

        logits = logits / PARAMETERS['temperature']

        self.update_queue(k)

        return logits, labels

In [10]:
def saveModel(epoch, model, optimizer, loss, path):
    """Write a training checkpoint to `path` using torch.save.

    The stored dict holds the epoch number, the model and optimizer state
    dicts, and the supplied loss value.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
    )
    torch.save(checkpoint, path)

In [None]:
def loadModel(model, path):
    """Restore model weights from a checkpoint dict containing
    'model_state_dict', 'epoch' and 'loss'.

    Checkpoints saved from a DistributedDataParallel-wrapped model have every
    key prefixed with 'module.'. The prefix is stripped or added as needed so
    a checkpoint saved from a wrapped model loads into a bare one and vice
    versa.

    Args:
        model: module to load the weights into (modified in place).
        path: checkpoint file path.

    Returns:
        (model, epoch, loss) where epoch and loss are read from the checkpoint.
    """
    # SECURITY NOTE: torch.load unpickles the file; only load trusted files.
    # map_location='cpu' lets a GPU-saved checkpoint load on any machine;
    # load_state_dict then copies the values onto the model's own device.
    checkpoint = torch.load(path, map_location='cpu')
    state_dict = checkpoint['model_state_dict']

    prefix = 'module.'
    model_wrapped = all(key.startswith(prefix) for key in model.state_dict())
    ckpt_wrapped = all(key.startswith(prefix) for key in state_dict)
    if ckpt_wrapped and not model_wrapped:
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    elif model_wrapped and not ckpt_wrapped:
        state_dict = {prefix + key: value for key, value in state_dict.items()}

    model.load_state_dict(state_dict)
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    print('Epoch: ',epoch,'Loss: ',loss)
    return model, epoch, loss

In [11]:
# Build the model and move all of its parameters and buffers to the GPU.
model = Moco().cuda()

# Wrap for distributed training; the training loop drives this wrapper.
moco_model = torch.nn.parallel.DistributedDataParallel(model)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [12]:
# Cross-entropy over the contrastive logits, placed on GPU 0.
loss_function = nn.CrossEntropyLoss().cuda(0)

# SGD over the wrapped model's parameters. The momentum here is a literal and
# is independent of PARAMETERS['momentum'].
optimizer = torch.optim.SGD(
    moco_model.parameters(),
    lr=PARAMETERS['learning_rate'],
    momentum=0.9,
    weight_decay=PARAMETERS['weight_decay'],
)

In [13]:
def training_loop(data, epoch):
    """Run one training epoch over `data` using the module-level
    `moco_model`, `optimizer` and `loss_function`.

    Args:
        data: iterable of (images, target) batches; the target is ignored.
            If it exposes a `sampler` with `set_epoch`, that is called first.
        epoch: current epoch number, forwarded to the sampler.

    Returns:
        (epoch_loss, running_loss): the sum of per-batch losses over the
        epoch, and the sum accumulated since the last progress print.
    """
    epoch_loss = 0.0
    running_loss = 0.0
    moco_model.train()

    # DistributedSampler reuses the same shuffle order every epoch unless
    # set_epoch() is called before iterating.
    sampler = getattr(data, 'sampler', None)
    if hasattr(sampler, 'set_epoch'):
        sampler.set_epoch(epoch)

    for i, (images, _) in enumerate(data):
        optimizer.zero_grad()

        # Call the wrapper itself rather than .forward so module hooks run.
        logits, labels = moco_model(images)

        loss = loss_function(logits, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        running_loss += batch_loss

        # Progress line every 5 batches with the mean loss over those batches.
        if (i + 1) % 5 == 0:
            print('Batch =({}) Loss={:.5f} Time={}'.format(
                   i, running_loss/5, time.asctime()), flush=True)
            running_loss = 0.0

    return epoch_loss, running_loss

In [14]:
# Train from start_epoch through epochs (inclusive), printing the mean batch
# loss for each epoch.
for epoch in range(PARAMETERS['start_epoch'], PARAMETERS['epochs'] + 1):
    epoch_loss, running_loss = training_loop(train_loader,epoch)
    train_loss = epoch_loss/len(train_loader)
    # Checkpoint every 10th epoch and always after the final one. The previous
    # `(epoch+1) % 10` test saved at 9, 19, ..., 99 and never stored the
    # weights of the last epoch.
    if epoch % 10 == 0 or epoch == PARAMETERS['epochs']:
        saveModel(epoch,moco_model,optimizer,epoch_loss,"/kaggle/working/mocco_model_"+str(epoch)+".pth")
    print("["+str(epoch)+" , "+str(train_loss))


Batch =(4) Loss=3.95430 Time=Wed Dec 28 18:27:07 2022
Batch =(9) Loss=4.80729 Time=Wed Dec 28 18:27:28 2022
Batch =(14) Loss=3.35438 Time=Wed Dec 28 18:27:48 2022
Batch =(19) Loss=2.68751 Time=Wed Dec 28 18:28:09 2022
Batch =(24) Loss=3.31849 Time=Wed Dec 28 18:28:30 2022
Batch =(29) Loss=3.87615 Time=Wed Dec 28 18:28:52 2022
Batch =(34) Loss=4.21320 Time=Wed Dec 28 18:29:13 2022
Batch =(39) Loss=4.34688 Time=Wed Dec 28 18:29:34 2022
[1 , 3.819775804132223
Batch =(4) Loss=4.40217 Time=Wed Dec 28 18:29:52 2022
Batch =(9) Loss=4.41866 Time=Wed Dec 28 18:30:10 2022
Batch =(14) Loss=4.41046 Time=Wed Dec 28 18:30:27 2022
Batch =(19) Loss=4.38769 Time=Wed Dec 28 18:30:45 2022
Batch =(24) Loss=4.38439 Time=Wed Dec 28 18:31:03 2022
Batch =(29) Loss=4.32999 Time=Wed Dec 28 18:31:20 2022
Batch =(34) Loss=4.33265 Time=Wed Dec 28 18:31:38 2022
Batch =(39) Loss=4.30427 Time=Wed Dec 28 18:31:56 2022
[2 , 4.371285486221313
Batch =(4) Loss=4.29590 Time=Wed Dec 28 18:32:14 2022
Batch =(9) Loss=4.29655 