# Parallelize the training of a PyTorch model across processors and machines 



## Import libraries

In [15]:
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [16]:
# Number of CUDA devices visible to this process (shown as the cell output).
torch.cuda.device_count()

2

In [17]:
# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [18]:
# Load the wine dataset: the 'Class' column is the target, every other column is a feature.
wine_data = pd.read_csv('wine_data.csv')
wine_target = wine_data[['Class']]
wine_features = wine_data.drop(columns='Class')

In [19]:
# Hold out 40% of the rows for testing; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, x_test, Y_train, y_test = train_test_split(wine_features,
                                                    wine_target,
                                                    test_size=0.4,
                                                    random_state=0)

In [20]:
# Features are converted to float32 tensors.
Xtrain_ = torch.from_numpy(X_train.values).to(torch.float32)
Xtest_ = torch.from_numpy(x_test.values).to(torch.float32)

# Targets come from single-column frames; flatten them to 1-D label tensors.
Ytrain_ = torch.from_numpy(Y_train.values).reshape(-1)
Ytest_ = torch.from_numpy(y_test.values).reshape(-1)

In [21]:
# Move all four train/test tensors onto the selected device in one pass.
Xtrain_, Ytrain_, Xtest_, Ytest_ = (
    tensor.to(device) for tensor in (Xtrain_, Ytrain_, Xtest_, Ytest_)
)

In [22]:
# Layer widths used by Net: input width, hidden width, output width.
input_size, hidden_size, output_size = 13, 100, 3

In [23]:
class Net(nn.Module):
    """Three-layer fully connected classifier that reports the device it runs on.

    Parameters
    ----------
    in_features, hidden_features, out_features : int, optional
        Widths of the input, hidden and output layers. When omitted, each
        falls back to the notebook-level ``input_size``, ``hidden_size`` and
        ``output_size`` respectively, so ``Net()`` builds the same network
        as before.
    """

    def __init__(self, in_features=None, hidden_features=None, out_features=None):
        super().__init__()

        # Resolve the defaults at call time so the notebook-level constants
        # are only read when no explicit width was supplied.
        in_features = input_size if in_features is None else in_features
        hidden_features = hidden_size if hidden_features is None else hidden_features
        out_features = output_size if out_features is None else out_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, out_features)

    def forward(self, X):
        """Return log-probabilities over the output classes for the batch ``X``.

        As a side effect, prints the device holding this call's data so the
        per-replica execution under ``nn.DataParallel`` is visible.
        """
        X = torch.sigmoid(self.fc1(X))
        X = torch.sigmoid(self.fc2(X))
        X = self.fc3(X)

        # Read the device from the tensor itself: torch.cuda.current_device()
        # raises when CUDA is unavailable, which broke the CPU fallback that
        # the notebook selects when no GPU is present.
        device_id = X.device.index if X.is_cuda else "cpu"
        print("\nInside  the model: Device id - ", device_id)

        return F.log_softmax(X, dim=-1)

## nn.DataParallel
    

    
* First, we need to make a model instance and check if we have multiple GPUs. If we have multiple GPUs, we can wrap our model using `nn.DataParallel`. Then we can put our model on the GPUs with `model.to(device)`.

* We’ve placed a print statement inside the model to show which device each replica's forward pass runs on.

* Note that calling `my_tensor.to(device)` returns a new copy of `my_tensor` on the GPU instead of moving `my_tensor` in place. You need to assign the result to a variable and use that tensor on the GPU.

* It’s natural to execute your forward and backward propagations on multiple GPUs. However, PyTorch will only use one GPU by default. You can easily run your operations on multiple GPUs by making your model run in parallel using `DataParallel`.

* `DataParallel` parallelizes the application of the given `module` by
    splitting the input across the specified devices by chunking in the batch
    dimension (other objects will be copied once per device). In the forward
    pass, the module is replicated on each device, and each replica handles a
    portion of the input. During the backward pass, gradients from each replica
    are summed into the original module.

In [24]:
# Wrap a fresh Net in DataParallel.
model = nn.DataParallel(Net())

# Move the wrapped module to the target device; as the last expression in the
# cell, the returned module is also displayed.
model.to(device)

DataParallel(
  (module): Net(
    (fc1): Linear(in_features=13, out_features=100, bias=True)
    (fc2): Linear(in_features=100, out_features=100, bias=True)
    (fc3): Linear(in_features=100, out_features=3, bias=True)
  )
)

In [25]:
# Negative log-likelihood loss, matching the log-softmax output of the network.
loss_fn = nn.NLLLoss()

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [29]:
epochs = 100

# Full-batch training: each epoch is a single forward/backward pass over the
# entire training set followed by one optimizer step.
for epoch in range(epochs):

    optimizer.zero_grad()

    Ypred = model(Xtrain_)
    loss = loss_fn(Ypred, Ytrain_)

    loss.backward()
    optimizer.step()

    # Report the device holding the loss rather than calling
    # torch.cuda.current_device(), which raises when CUDA is unavailable and
    # so broke the CPU fallback. %s renders an integer device index exactly
    # like the previous %d did.
    device_id = loss.device.index if loss.is_cuda else "cpu"
    print("\nOutside the model: Device id - %s, Loss - %f" % (device_id, loss.item()))
    


Inside  the model: Device id -  0

Inside  the model: Device id -  1

Outside the model: Device id - 0, Loss - 0.646588

Inside  the model: Device id -  1

Inside  the model: Device id -  0

Outside the model: Device id - 0, Loss - 0.643560

Inside  the model: Device id -  0

Inside  the model: Device id -  1

Outside the model: Device id - 0, Loss - 0.640521

Inside  the model: Device id -  0

Inside  the model: Device id -  1

Outside the model: Device id - 0, Loss - 0.637470

Inside  the model: Device id -  0

Inside  the model: Device id -  1

Outside the model: Device id - 0, Loss - 0.634518

Inside  the model: Device id -  1

Inside  the model: Device id -  0

Outside the model: Device id - 0, Loss - 0.631801

Inside  the model: Device id -  0

Inside  the model: Device id -  1

Outside the model: Device id - 0, Loss - 0.629254

Inside  the model: Device id -  0

Inside  the model: Device id -  1

Outside the model: Device id - 0, Loss - 0.626631

Inside  the model: Device id - 

In [30]:
# Inference only: disable autograd so the test-set forward pass does not
# build a computation graph.
with torch.no_grad():
    predict_out = model(Xtest_)

# Predicted class = index of the largest log-probability in each row.
# argmax avoids assigning to `_`, which IPython uses for the last cell output.
predict_y = predict_out.argmax(dim=1)


Inside  the model: Device id -  0

Inside  the model: Device id -  1


In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Copy labels and predictions to host memory once and hand scikit-learn plain
# NumPy arrays, instead of repeating the device-to-host transfer (and the
# discouraged `.data` access) for every metric.
y_true = Ytest_.detach().cpu().numpy()
y_pred = predict_y.detach().cpu().numpy()

print('prediction accuracy', accuracy_score(y_true, y_pred))
print('micro precision', precision_score(y_true, y_pred, average='micro'))
print('micro recall', recall_score(y_true, y_pred, average='micro'))

prediction accuracy 0.9583333333333334
micro precision 0.9583333333333334
micro recall 0.9583333333333334
