In [2]:
import torch
import torch.nn as nn
import transformers

print(torch.__version__)

1.13.0+cu117


torch.save(arg, PATH)

torch.load(PATH)

model.load_state_dict(arg)

# Save model

torch.save(model, PATH)

# Load

model = torch.load(PATH)

[set model to eval mode]

model.eval()

# Save para directly

[Save all the parameters]

torch.save(model.state_dict(), PATH)

#Load

[need to define a model first]

model = Model(*args, **kwargs)

model.load_state_dict(torch.load(PATH))

model.eval()

In [3]:
class Model(nn.Module):
    def __init__(self, input_size):
        super(Model, self).__init__()
        self.l1 = nn.Linear(input_size, 1)

    def forward(self, x):
        x = torch.sigmoid(self.l1(x))
        return x

In [19]:
model = Model(input_size=10)
FILE = 'm/model.pth'
FILE2 = 'm/model_state.pth'

In [61]:
learninig_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learninig_rate)

In [62]:
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.01, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'params': [0, 1]}]}


# Lazy method (larger size)

In [20]:
torch.save(model, FILE)

In [27]:
m1 = torch.load(FILE)
m1.eval()

Model(
  (l1): Linear(in_features=10, out_features=1, bias=True)
)

In [28]:
for param in m1.parameters():
    print(param)

Parameter containing:
tensor([[-0.0258,  0.2678,  0.2451,  0.2652,  0.0178, -0.1142,  0.3051, -0.2506,
         -0.0194,  0.1107]], requires_grad=True)
Parameter containing:
tensor([0.2239], requires_grad=True)


# Preferable method (smaller size)

In [23]:
torch.save(model.state_dict(),FILE2)

In [24]:
m2 = Model(input_size=10)
m2.load_state_dict(torch.load(FILE2))
m2.eval()

Model(
  (l1): Linear(in_features=10, out_features=1, bias=True)
)

In [26]:
for param in m2.parameters():
    print(param)

Parameter containing:
tensor([[-0.0258,  0.2678,  0.2451,  0.2652,  0.0178, -0.1142,  0.3051, -0.2506,
         -0.0194,  0.1107]], requires_grad=True)
Parameter containing:
tensor([0.2239], requires_grad=True)


In [30]:
print(model.state_dict())

OrderedDict([('l1.weight', tensor([[-0.0258,  0.2678,  0.2451,  0.2652,  0.0178, -0.1142,  0.3051, -0.2506,
         -0.0194,  0.1107]])), ('l1.bias', tensor([0.2239]))])


# Checkpoint

In [63]:
checkpoint = {
    "epoch": 90,
    "model_state": model.state_dict(),
    "optim_state": optimizer.state_dict(),
}

num_epoch = 8
torch.save(checkpoint, f"m/checkpoint-{num_epoch}.pth")

In [64]:
m_checkpoint = torch.load(f"m/checkpoint-{num_epoch}.pth")

In [65]:
epoch = m_checkpoint['epoch']
model_checkpoint = Model(input_size=10)
optimizer_c = torch.optim.SGD(model_checkpoint.parameters(), lr=0)

model_checkpoint.load_state_dict(m_checkpoint['model_state'])
optimizer_c.load_state_dict(checkpoint['optim_state'])

In [66]:
print(optimizer_c.state_dict() == optimizer.state_dict())

True


# Model on GPU and saved, but load on CPU

In [67]:
device = torch.device('cuda')

In [68]:
model_gpu = Model(input_size=10).to(device)
torch.save(model_gpu.state_dict(), "m/ModelGPU.pth")

In [79]:
cpu = torch.device('cpu')
m_cpu = Model(input_size=10)

In [80]:
m_cpu.load_state_dict(torch.load("m/ModelGPU.pth", map_location=cpu))

<All keys matched successfully>

In [81]:
print(model_gpu.state_dict())
print(m_cpu.state_dict())

OrderedDict([('l1.weight', tensor([[ 0.2037, -0.0217,  0.0297,  0.0653,  0.1367, -0.2099,  0.3075, -0.0286,
          0.1865,  0.1716]], device='cuda:0')), ('l1.bias', tensor([-0.0463], device='cuda:0'))])
OrderedDict([('l1.weight', tensor([[ 0.2037, -0.0217,  0.0297,  0.0653,  0.1367, -0.2099,  0.3075, -0.0286,
          0.1865,  0.1716]])), ('l1.bias', tensor([-0.0463]))])


# Save on GPU, load on GPU

In [82]:
m_gpu = Model(input_size=10)
m_gpu.load_state_dict(torch.load("m/ModelGPU.pth"))

<All keys matched successfully>

In [86]:
m_gpu.state_dict()

OrderedDict([('l1.weight',
              tensor([[ 0.2037, -0.0217,  0.0297,  0.0653,  0.1367, -0.2099,  0.3075, -0.0286,
                        0.1865,  0.1716]])),
             ('l1.bias', tensor([-0.0463]))])

# get para of a model

In [110]:
a = iter(m_gpu.named_parameters())
b = next(a)

In [114]:
type(b[1]), b[1]

(torch.nn.parameter.Parameter,
 Parameter containing:
 tensor([[ 0.2037, -0.0217,  0.0297,  0.0653,  0.1367, -0.2099,  0.3075, -0.0286,
           0.1865,  0.1716]], requires_grad=True))

In [136]:
type(b[1].__getitem__(0)), b[1].__getitem__(0)

(torch.Tensor,
 tensor([ 0.2037, -0.0217,  0.0297,  0.0653,  0.1367, -0.2099,  0.3075, -0.0286,
          0.1865,  0.1716], grad_fn=<SelectBackward0>))

In [142]:
m_gpu.l1.weight.__getitem__(0)

tensor([ 0.2037, -0.0217,  0.0297,  0.0653,  0.1367, -0.2099,  0.3075, -0.0286,
         0.1865,  0.1716], grad_fn=<SelectBackward0>)