In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
from __future__ import print_function
import argparse
import os
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import torchvision.datasets.vision
import cv2
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

In [None]:
class Net(nn.Module):
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)
    self.conv1 = nn.Conv2d(1, 32, 3, 1)
    self.conv2 = nn.Conv2d(32, 64, 3, 1)
    self.dropout1 = nn.Dropout2d(0.25)
    self.dropout2 = nn.Dropout2d(0.5)
    self.fc1 = nn.Linear(9216, 128)
    self.fc2 = nn.Linear(128, 10)

  def forward(self, x):
    x = self.conv1(x)
    x = F.relu(x)
    x = self.conv2(x)
    x = F.max_pool2d(x)
    x = self.dropout1(x)
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    x = F.relu(x)
    x = self.dropout2(x)
    x = self.fc2(x)
    output = F.log_softmax(x, dim=1)
    return output

In [None]:
def default_image_loader(path):
     print(path)
     image = cv2.imread(path)
     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     return image


class MNISTCustom_Loader(torchvision.datasets.vision.VisionDataset):
    def __init__(self, root, filenames_filename, class_filename, transform=None, target_transform=None):
        super(MNISTCustom_Loader, self).__init__(root, transform=transform,target_transform=target_transform)
        """ filenames_filename: A text file with each line containing the path to an image e.g. images/class/sample.jpg
            class_filename: A text file with each line containing the class of the image """

        self.base_path = root
        self.loader = default_image_loader
        self.filenamelist = []
        for line in open(filenames_filename):
            self.filenamelist.append(line.rstrip('\n'))

        self.targets = []
        for line in open(class_filename):
            self.targets.append(int(line.rstrip('\n')))

        self.transform = transform
        self.target_transform = target_transform
        print("Loader Intialized Successfully")

    def __getitem__(self, index):
        class_type = int(self.targets[index])
        path = self.filenamelist[index]
        img_path = os.path.join(self.base_path,str(path).replace("\/","/"))
        img = self.loader(img_path)
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            class_type = self.target_transform(int(class_type))

        return img, class_type

    def __len__(self):
        return len(self.targets)

In [None]:
parser = argparse.ArgumentParser(description="DeepSpeed_PytorchCustomLoader")
parser.add_argument('--epochs', type=int, default=12, metavar="N", help="Number of epochs to train(default:10) ")
parser.add_argument('--no-cuda', action='store_true', default=False, help='enabled CUDA training')

_StoreTrueAction(option_strings=['--no-cuda'], dest='no_cuda', nargs=0, const=True, default=False, type=None, choices=None, required=False, help='enabled CUDA training', metavar=None)

In [None]:
def train(args, model, device, train_loader, optimizer, epoch, sum_loss):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        sum_loss += loss
        loss.backward()
        optimizer.step()
        return sum_loss

In [None]:
args = parser.parse_args([])
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device is - ", device)
kwargs = {'num_workers': 1, "pin_memory": True} if use_cuda else {}
train_dataset = MNISTCustom_Loader("/content/drive/My Drive/Colab Notebooks/Digit Images/",
                                  "/content/drive/My Drive/Colab Notebooks/Digit Images/image_files.txt",
                                  "/content/drive/My Drive/Colab Notebooks/Digit Images/class.txt",
                                  transform=transforms.Compose([
                                      transforms.ToTensor(),
                                      transforms.Normalize((0.1307,), (0.3081,))
                                  ]))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, **kwargs)
model = Net().to(device)
optimizer = optim.Adam(model.parameters(),lr=1.0)
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

Device is -  cpu
Loader Intialized Successfully


In [None]:
print("Start time : ",datetime.now())
for epoch in range(1, args.epochs + 1):
    sum_loss = 0
    sum_loss = train(args, model, device, train_loader, optimizer, epoch, sum_loss)
    scheduler.step()
    print("Epoch : ", epoch , " Current_epoch_train_sum_loss : " , sum_loss)
print("End time : ",datetime.now())

Start time :  2024-04-26 14:14:18.742200
/content/drive/My Drive/Colab Notebooks/Digit Images/7/img_35625.jpg


error: OpenCV(4.8.0) /io/opencv/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


In [None]:
!pip install mpi4py
!pip install deepspeed

Collecting mpi4py
  Downloading mpi4py-3.1.6.tar.gz (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: mpi4py
  Building wheel for mpi4py (pyproject.toml) ... [?25l[?25hdone
  Created wheel for mpi4py: filename=mpi4py-3.1.6-cp310-cp310-linux_x86_64.whl size=2746326 sha256=ea384e4a78327db98bcb36562d186fe80917b19aa837c250375d07aa9893d9c8
  Stored in directory: /root/.cache/pip/wheels/4c/ca/89/8fc1fb1c620afca13bb41c630b1f948bbf446e0aaa4b762e10
Successfully built mpi4py
Installing collected packages: mpi4py
Successfully installed mpi4py-3.1.6


In [None]:
import deepspeed
def train_with_deepsped(args, model_engine, device, train_loader,  epoch, sum_loss):
    for batch_idx, (data, target) in enumerate(train_loader):
        #- deepspeed
        data, target = data[0].to(model_engine.local_rank) , target[0].to(model_engine.local_rank)
        #- - deepspeed
        data , target = Variable(data.unsqueeze(0)), Variable(target.unsqueeze(0))

        #- deepspeed
        output = model_engine(data)
        loss = F.nll_loss(output, target)
        sum_loss += loss
        #- deepspeed , runs backpropagation
        model_engine.backward(loss)
        #- deepspeed , weight update - deepspeed
        model_engine.step()
        return sum_loss

In [None]:
parser.add_argument("--local-rank", type=int, default=0, help="Local rank passed from distributed launcher")
parser.add_argument("--deepspeed-config", default="/content/drive/My Drive/Colab Notebooks/Casme/ds_confg.json", type=str, help="Deepspeed config file")

ArgumentError: argument --deepspeed-config: conflicting option string: --deepspeed-config

In [None]:
args = parser.parse_args([])
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device is - ", device)
kwargs = {'num_workers': 1, "pin_memory": True} if use_cuda else {}
train_dataset = MNISTCustom_Loader("/content/drive/My Drive/Colab Notebooks/Digit Images/",
                                  "/content/drive/My Drive/Colab Notebooks/Digit Images/image_files.txt",
                                  "/content/drive/My Drive/Colab Notebooks/Digit Images/class.txt",
                                  transform=transforms.Compose([
                                      transforms.ToTensor(),
                                      transforms.Normalize((0.1307,), (0.3081,))
                                  ]))
model= Net().to(device)
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "6000"

model_engine, optimizer, train_loader, __ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters(), training_data=train_dataset)


Device is -  cpu
Loader Intialized Successfully
[2024-04-26 14:34:53,487] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.3, git-hash=[none], git-branch=[none]
[2024-04-26 14:34:53,489] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-04-26 14:34:53,495] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2024-04-26 14:34:54,250] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.28.0.12, master_port=29500
[2024-04-26 14:34:54,252] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend gloo


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py310_cu121/deepspeed_shm_comm...
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/deepspeed_shm_comm/build.ninja...
Building extension module deepspeed_shm_comm...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


Time to load deepspeed_shm_comm op: 55.52771759033203 seconds
DeepSpeed deepspeed.ops.comm.deepspeed_shm_comm_op built successfully


Loading extension module deepspeed_shm_comm...


ValueError: Expected a string path to an existing deepspeed config, or a dictionary or a valid base64. Received: /content/drive/My Drive/Colab Notebooks/Casme/ds_confg.json

In [None]:
print("Start time - ", datetime.now())
for epoch in range(1, args.epochs+1):
  sum_loss = 0
  sum_loss = train_with_deepspeed(args, model_enginer, device, trainL_loader, epoch, sum_loss)
  print("Epoch - ", epoch, "Current_epoch_train_sum_loss - ", sum_loss)
print("End time - ", datetime.now())