In [1]:
rank = 0
world_size = 3

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallelCPU as DDP

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = '10.0.1.121'
    os.environ['MASTER_PORT'] = '8890'
    os.environ['GLOO_SOCKET_IFNAME'] = 'ens3'

    # initialize the process group
    dist.init_process_group(backend='gloo', 
                            init_method='env://', rank=rank, world_size=world_size)

    # Explicitly setting seed to make sure that models created in two processes
    # start from same random weights and biases.
    torch.manual_seed(42)


def cleanup():
    dist.destroy_process_group()

setup(rank = rank, world_size = world_size)

In [2]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from collections import OrderedDict

In [3]:
import torch.nn.functional as F

sequence_length = 28
input_size = 28

# Hyper-parameters
batch_size = 100
num_epochs = 3
learning_rate = 0.01

num_classes = 10
num_cells = 128
dense_size = 32
drop_pr = 0.2

# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    def __init__(self, input_size):
        super(RNN, self).__init__()
        self.rnn = nn.LSTM(input_size, num_cells, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(128, dense_size)
        self.fc2 = nn.Linear(dense_size, num_classes)
        self.dropout = nn.Dropout(drop_pr)
    
    def forward(self, x):
        # Set initial hidden and cell states 
        h0 = torch.zeros(1, x.size(0), num_cells)
        c0 = torch.zeros(1, x.size(0), num_cells)
        
        # Forward propagate LSTM
        out, _ = self.rnn(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, num_cells)
        
        out = self.dropout(out[:, -1, :])
        
        out = F.relu(self.fc1(out))
        
        out = self.dropout(out)
        
        out = self.fc2(out) # no softmax needed - nn.CrossEntropy does it
        
        return out



In [4]:
# MNIST dataset
train_dataset = torchvision.datasets.MNIST(root='../data/',
                                           train=True, 
                                           transform=transforms.ToTensor(),
                                           download=True)

test_dataset = torchvision.datasets.MNIST(root='../data/',
                                          train=False, 
                                          transform=transforms.ToTensor())

train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=world_size,
        rank=rank
    )

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           sampler=train_sampler)
#                                            shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, 
                                          shuffle=False)

model = RNN(input_size)

model = DDP(model)



In [5]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train():
    # Train the model
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.reshape(-1, sequence_length, input_size)
            labels = labels

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i+1) % 100 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                       .format(epoch+1, num_epochs, i+1, total_step, loss.item()))


def test():
    # Test the model
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.reshape(-1, sequence_length, input_size)
            labels = labels
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) 

In [6]:
def main():
    with torch.autograd.profiler.profile(use_cuda=False) as prof:
        train()
#     tbl = prof.key_averages().table(sort_by="self_cpu_time_total")
#     tbl = prof.key_averages().table()
    tbl = prof.table()
    print(tbl)
    
    test()


if __name__ == '__main__':
    main()

Epoch [1/3], Step [100/200], Loss: 0.7752
Epoch [1/3], Step [200/200], Loss: 0.3188
Epoch [2/3], Step [100/200], Loss: 0.2291
Epoch [2/3], Step [200/200], Loss: 0.1822
Epoch [3/3], Step [100/200], Loss: 0.1680
Epoch [3/3], Step [200/200], Loss: 0.2278
-----------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  
Name                                 Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls  
-----------------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  
random_                              0.00%            82.447us         0.00%            82.447us         27.482us         3                
is_floating_point                    0.00%            13.754us         0.00%            13.754us         1.528us          9                
is_complex                           0.00%     

Test Accuracy of the model on the 10000 test images: 97.47 %


In [7]:
# profs = []
# # Train the model
# total_step = len(train_loader)
# for epoch in range(num_epochs):
#     for i, (images, labels) in enumerate(train_loader):
#         images = images.reshape(-1, sequence_length, input_size)
#         labels = labels

#         # Forward pass
#         with torch.autograd.profiler.profile(use_cuda=False) as prof:
#             outputs = model(images)
#         tbl = prof.key_averages().table(sort_by="self_cpu_time_total")
#         profs.append(tbl)
        
#         loss = criterion(outputs, labels)

#         # Backward and optimize
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if (i+1) % 100 == 0:
#             print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
#                    .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

