In [17]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from utils import elapsed

# Parameters and DataLoaders
input_size = 10
output_size = 1
mid_size = 256*75

batch_size = 32
data_size = 1024

In [58]:
x = torch.tensor([1], requires_grad=True, dtype=torch.float)
y = torch.tensor([3], requires_grad=False, dtype=torch.float)
y.requires_grad=True
z = x*y

In [10]:
import numpy as np
a = [1,3,]
b = [1, 2]
np.stack([a, b], axis=1)

array([[1, 1],
       [3, 2]])

In [59]:
y.requires_grad=False
z2 = 2*z
y.requires_grad = True
z0 = z2+z
z0.backward()
print(y.grad)
print(x.grad)

tensor([3.])
tensor([9.])


In [1]:
from collections import defaultdict

In [5]:
a = defaultdict(lambda:0)

In [32]:
print(z, z.requires_grad)
print(x.grad, x.requires_grad)
print(y.grad, y.requires_grad)

tensor([3.]) False
None True
None False


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_CUDA = torch.cuda.is_available()


In [19]:
class RandomDataset(Dataset):

    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)

In [20]:
class BigModel(nn.Module):
    def __init__(self, input_size, middle_size, output_size, num_middle=10):
        super(BigModel, self).__init__()
        self.fc_start = nn.Linear(input_size, middle_size)
        self.fc_end = nn.Linear(middle_size, output_size)
        self.fcs = [nn.Linear(middle_size, middle_size) for i in range(num_middle)]
        self.fcs = nn.ModuleList(self.fcs)
    
    def forward(self, x_in):
        x = self.fc_start(x_in)
        for fc in self.fcs:
            x = fc(x)
        x_out = self.fc_end(x)
        return x_out

In [25]:
model = BigModel(input_size, mid_size, output_size)
model_par = model.to(DEVICE)
model_par = nn.DataParallel(model_par)
print(elapsed())
for data in rand_loader:
    data.to(DEVICE)
    x_out = model_par(data)
    print(elapsed())

164.05830335617065
0.0053861141204833984
0.4052426815032959
0.4063069820404053
0.4051856994628906
0.40535998344421387
0.4057154655456543
0.4060182571411133
0.4053957462310791
0.406146764755249
0.4050769805908203
0.404926061630249
0.4053800106048584
0.4066636562347412
0.4049513339996338
0.40500497817993164
0.406219482421875
0.4051966667175293
0.40551066398620605
0.40559887886047363
0.4051215648651123
0.40535926818847656
0.4054572582244873
0.4053053855895996
0.40537190437316895
0.40584850311279297
0.405353307723999
0.4062035083770752
0.4052464962005615
0.40541982650756836
0.4055655002593994
0.40627193450927734
0.4055156707763672


In [33]:
model.requires_grad

ModuleAttributeError: 'BigModel' object has no attribute 'requires_grad'

In [24]:
model = BigModel(input_size, mid_size, output_size)
model_par = model.to(DEVICE)
model_par = nn.DataParallel(model_par)
print(elapsed())
for data in rand_loader:
    data.to(DEVICE)
    with torch.no_grad():
        x_out = model_par(data)
    print(elapsed())

82.37031841278076
0.007927417755126953
0.40496182441711426
0.40456485748291016
0.4055190086364746
0.40537595748901367
0.40589046478271484
0.40517401695251465
0.40515756607055664
0.40506720542907715
0.40549278259277344
0.40499114990234375
0.4064645767211914
0.405442476272583
0.40474605560302734
0.40587282180786133
0.40639567375183105
0.4050109386444092
0.40500903129577637
0.40532445907592773
0.4060356616973877
0.4056215286254883
0.4058549404144287
0.4057583808898926
0.40545058250427246
0.4052088260650635
0.4052591323852539
0.40569472312927246
0.40560317039489746
0.4056422710418701
0.40529441833496094
0.40564727783203125
0.4048326015472412


In [22]:
class Model(nn.Module):
    # Our model

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.in_size = input_size
        self.out_size = output_size
        self.fc1 = nn.Linear(input_size, input_size)
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(self.fc1(input))
        print("\tIn Model: input size", input.size(),
              "output size", output.size())

        return output
    
    def save(self):
        print("hell year")

In [23]:
model = Model(input_size, output_size)
optim = torch.optim.Adam(model.parameters(), lr=3e-1, weight_decay=2e-6, amsgrad=True)

In [11]:
for i in model.parameters():
    print(i)

Parameter containing:
tensor([[ 0.2712, -0.1001,  0.2892,  0.3707,  0.2015],
        [ 0.0549, -0.2165,  0.2439,  0.3563,  0.0879],
        [-0.2075,  0.2502, -0.1503,  0.2665,  0.3101],
        [ 0.3449, -0.0920, -0.0448,  0.4290,  0.2583],
        [ 0.2845,  0.0279, -0.1769,  0.2498,  0.0594]], requires_grad=True)
Parameter containing:
tensor([ 0.0017,  0.0636,  0.1402,  0.3010, -0.0706], requires_grad=True)
Parameter containing:
tensor([[ 0.0358, -0.0594,  0.1209, -0.1628, -0.0356]], requires_grad=True)
Parameter containing:
tensor([0.0473], requires_grad=True)


In [12]:
data_it = iter(rand_loader)
data = next(data_it)

In [13]:
res = model(data)
res = torch.sum(res)

	In Model: input size torch.Size([30, 5]) output size torch.Size([30, 1])


In [14]:
for i in model.fc1.parameters():
    i.requires_grad = True
    print(i)

Parameter containing:
tensor([[ 0.2712, -0.1001,  0.2892,  0.3707,  0.2015],
        [ 0.0549, -0.2165,  0.2439,  0.3563,  0.0879],
        [-0.2075,  0.2502, -0.1503,  0.2665,  0.3101],
        [ 0.3449, -0.0920, -0.0448,  0.4290,  0.2583],
        [ 0.2845,  0.0279, -0.1769,  0.2498,  0.0594]], requires_grad=True)
Parameter containing:
tensor([ 0.0017,  0.0636,  0.1402,  0.3010, -0.0706], requires_grad=True)


In [15]:
res.backward(retain_graph=True)

In [17]:
for i in model.fc1.parameters():
    i.requires_grad = False
    i.grad.data
    print(i)

Parameter containing:
tensor([[ 0.2712, -0.1001,  0.2892,  0.3707,  0.2015],
        [ 0.0549, -0.2165,  0.2439,  0.3563,  0.0879],
        [-0.2075,  0.2502, -0.1503,  0.2665,  0.3101],
        [ 0.3449, -0.0920, -0.0448,  0.4290,  0.2583],
        [ 0.2845,  0.0279, -0.1769,  0.2498,  0.0594]])
Parameter containing:
tensor([ 0.0017,  0.0636,  0.1402,  0.3010, -0.0706])


In [41]:
res.backward(retain_graph=True)
optim.step()

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [5, 1]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [40]:
for i in model.fc1.parameters():
    print(i)

Parameter containing:
tensor([[-0.0459,  0.7090,  0.4245,  0.1407, -0.3375],
        [-0.6099,  0.4525, -0.0609,  0.2979,  0.0492],
        [-0.0445, -0.6919, -0.0793,  0.0395, -0.0084],
        [-0.0079, -0.0691, -0.0807, -0.5454,  0.4948],
        [ 0.1155, -0.2570,  0.0363, -0.5015,  0.7034]], requires_grad=True)
Parameter containing:
tensor([-0.4320,  0.0135,  0.5419,  0.2562,  0.5430], requires_grad=True)


In [15]:
model = Model(input_size=3, output_size=1)

In [16]:
model = nn.DataParallel(model)

In [5]:
class ComplexModel(nn.Module):
    
    def __init__(self, block=[]):
        super().__init__()
        self.blocks = nn.ModuleList(block)
    
    def forward(self, x):
        for model in self.blocks:
            x = model(x)
        return x

In [6]:
class Transfer(nn.Module):
    
    def __init__(self, model):
        super().__init__()
        self.model = model
    
    def to_parallel(self):
        self.model = self.model.to(DEVICE)
        if USE_CUDA:
            self.model = nn.DataParallel(self.model)
    
    def __call__(self, x):
        self.to_parallel()
        x = self.model(x)
        return x

In [7]:
class Graph(nn.Module):
    
    def __init__(self, in_size, mid_size, out_size):
        super().__init__()
        model1 = Model(in_size, mid_size)
        model2 = Model(mid_size, out_size)
        model3 = Model(mid_size, out_size)
        self.edge_out = {str((in_size, mid_size)): model1}
        self.edge_in = {str((mid_size, out_size))+"1": model2, str((mid_size, out_size))+"2": model3}
        complex_model1 = ComplexModel(block=[self.edge_out[str((in_size, mid_size))], self.edge_in[str((mid_size, out_size))+"1"]]) 
        complex_model2 = ComplexModel(block=[self.edge_out[str((in_size, mid_size))], self.edge_in[str((mid_size, out_size))+"2"]])
        
#         self.params = {}
#         self.params.update(self.edge_in)
#         self.params.update(self.edge_out)
        self.params = {"1":complex_model1, "2":complex_model2}
        self.params = nn.ModuleDict(self.params)
        print(self.params.keys())
        #self.params = nn.ModuleDict(self.params)
        #print(id(self.params[str((in_size, mid_size))]))
        transfer1 = Transfer(complex_model1)
        transfer2 = Transfer(complex_model2)
        self.models = {
            1: transfer1,
            2: transfer2,
        }

    
    def compute_path(self, data, paths=[1]):
        for path in paths:
            pred = self.models[path](data)
        return pred
        

In [124]:
new_model = Graph(5, 3, 1)
i = 0
for param in new_model.parameters():
    i+=1
print(i)

odict_keys(['1', '2'])
6


In [118]:
new_model = Graph(5, 3, 1)
i = 0
for param in new_model.parameters():
    i+=1
print(i)

odict_keys(['1', '2'])
Parameter containing:
tensor([[-0.0025,  0.4084, -0.0397,  0.1268, -0.1185],
        [ 0.2268,  0.3659,  0.1232,  0.1255, -0.0987],
        [-0.2927, -0.1250, -0.0806,  0.3706, -0.3717]], requires_grad=True)
Parameter containing:
tensor([-0.0472,  0.2329,  0.3608], requires_grad=True)
Parameter containing:
tensor([[-0.4971,  0.0874,  0.0073]], requires_grad=True)
Parameter containing:
tensor([-0.0401], requires_grad=True)
Parameter containing:
tensor([[ 0.2731, -0.1283, -0.2387]], requires_grad=True)
Parameter containing:
tensor([-0.1481], requires_grad=True)


In [72]:
new_model = Graph(5, 3, 1)

print(new_model.state_dict())

139955367158544
139955367158544
OrderedDict([('params.(3, 1)1.fc.weight', tensor([[ 0.2868,  0.4138, -0.3961]])), ('params.(3, 1)1.fc.bias', tensor([-0.2932])), ('params.(3, 1)2.fc.weight', tensor([[0.0177, 0.4977, 0.1249]])), ('params.(3, 1)2.fc.bias', tensor([-0.2039])), ('params.(5, 3).fc.weight', tensor([[ 0.4354,  0.4455, -0.3568, -0.1309,  0.1331],
        [-0.2123,  0.0450,  0.2694,  0.1094, -0.0351],
        [-0.3255, -0.4168,  0.2886,  0.0686,  0.4170]])), ('params.(5, 3).fc.bias', tensor([ 0.2080, -0.0991, -0.0649]))])


In [91]:
for data in rand_loader:
    input = data.to(DEVICE)
    output = new_model.compute_path(input, paths=[1, 2])
    loss = output.mean().backward()
    target_weights = list(new_model.parameters())
    print(target_weights[0].grad)
    print("Outside: input size", input.size(),
          "output_size", output.size())

	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 3])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 3])
	In Model: input size	In Model: input size torch.Size([15, 3]) output size torch.Size([15, 1])
 torch.Size([15, 3]) output size torch.Size([15, 1])
	In Model: input size 	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 3])
torch.Size([15, 5]) output size torch.Size([15, 3])
	In Model: input size torch.Size([15, 3]) output size torch.Size([15, 1])
	In Model: input size torch.Size([15, 3]) output size torch.Size([15, 1])
tensor([[ 0.0142, -0.0033, -0.0102,  0.0031,  0.0053],
        [ 0.0134, -0.0031, -0.0097,  0.0030,  0.0050],
        [ 0.1034, -0.0237, -0.0744,  0.0228,  0.0383]], device='cuda:0')
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 1])
	In Model: input size torch.Size([8, 5]) output size torch.Size([8, 3])
	In Model: input size torch.Size([8, 5]) output size torch.Size([8, 3])
	In Model

In [48]:
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    new_model = nn.DataParallel(complex_model1)
    

new_model.to(device)

Let's use 2 GPUs!


DataParallel(
  (module): ComplexModel(
    (blocks): ModuleList(
      (0): Model(
        (fc): Linear(in_features=5, out_features=4, bias=True)
      )
      (1): Model(
        (fc): Linear(in_features=4, out_features=2, bias=True)
      )
    )
  )
)

In [49]:
for data in rand_loader:
    input = data.to(device)
    output = new_model(input)
    print("Outside: input size", input.size(),
          "output_size", output.size())


	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
 torch.Size([15, 4]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
 torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
 torch.Size([15, 4]) 

In [25]:
model = Model(mid_size, output_size)
model_common1 = Model(input_size, mid_size)
model_common2 = Model(input_size, mid_size)
complex_model1 = ComplexModel(block=[model_common1, model])
complex_model2 = ComplexModel(block=[model_common2, model])
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model1 = nn.DataParallel(complex_model1)
    model2 = nn.DataParallel(complex_model2)

model1.to(device)
model2.to(device)

Let's use 2 GPUs!


DataParallel(
  (module): ComplexModel(
    (blocks): ModuleList(
      (0): Model(
        (fc): Linear(in_features=5, out_features=4, bias=True)
      )
      (1): Model(
        (fc): Linear(in_features=4, out_features=2, bias=True)
      )
    )
  )
)

In [26]:
for data in rand_loader:
    input = data.to(device)
    output = model1(input)
    print("Outside: input size", input.size(),
          "output_size", output.size())

	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 4])	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
 output size torch.Size([15, 2])
	In Model: input size torch.Size([15, 4]) 

In [27]:
for data in rand_loader:
    input = data.to(device)
    output = model2(input)
    print("Outside: input size", input.size(),
          "output_size", output.size())

	In Model: input size torch.Size([15, 5]) 	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size torch.Size([15, 5]) output size 	In Model: input sizetorch.Size([15, 4])
 torch.Size([15, 4]) output size torch.Size([15, 2])
	In Model: input size torch.Size([15, 4]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4])
	In Model: input size	In Model: input size torch.Size([15, 5]) output size torch.Size([15, 4]) torch.Size([15, 4]) output size torch.Size([15, 2])

	In Model: input size torch.Size([15, 4]) 

In [28]:
device

device(type='cuda')

In [80]:
a = [1, 2, 3, 4, 5, 6]
b = 1
print(b in a)

True
