# 初始化模型与配置

In [7]:
import torch, mindspore
import mindspore.ops as ops
import numpy as np
from transformers.models.bart import modeling_bart as pt
import mindnlp.models.bart as m

# Default BART configurations: MindNLP first, then Hugging Face.
ms_config, pt_config = m.BartConfig(), pt.BartConfig()
# pt_config.activation_function = "gelu_new"

# Causal-LM models built from those configurations (same construction order).
ms_model, pt_model = m.BartForCausalLM(ms_config), pt.BartForCausalLM(pt_config)

# Shape constants for the synthetic fixtures.
seq_len, batch = 64, 4
embed_dim = ms_config.d_model
tgt_len = src_len = 32
encoder_attention_heads = ms_config.encoder_attention_heads

# Synthetic NumPy fixtures; they are converted to framework tensors further down.
hidden_states = np.random.randn(batch,seq_len,embed_dim)
# input_ids = np.random.randint(128,None,(batch,seq_len))
# Fixed token ids, shape (24, 7): 12 distinct rows followed by 12 copies of one row.
input_ids = np.array(
    [
        [71, 82, 18, 33, 46, 91, 2],
        [68, 34, 26, 58, 30, 82, 2],
        [5, 97, 17, 39, 94, 40, 2],
        [76, 83, 94, 25, 70, 78, 2],
        [87, 59, 41, 35, 48, 66, 2],
        [55, 13, 16, 58, 5, 2, 1],  # note padding
        [64, 27, 31, 51, 12, 75, 2],
        [52, 64, 86, 17, 83, 39, 2],
        [48, 61, 9, 24, 71, 82, 2],
        [26, 1, 60, 48, 22, 13, 2],
        [21, 5, 62, 28, 14, 76, 2],
        [45, 98, 37, 86, 59, 48, 2],
    ]
    + [[70, 70, 50, 9, 28, 0, 2]] * 12
)
# attention_mask = np.random.randint(0,2,(batch,seq_len))
attention_mask = np.random.randn(batch,1,seq_len,seq_len)
# np.random.randint excludes its upper bound, so randint(0, 1, ...) can only
# produce zeros (an all-zero mask). Use 2 as the bound to draw a real 0/1 mask.
head_mask = np.random.randint(0,2,(ms_config.encoder_layers,ms_config.encoder_attention_heads))
layer_head_mask = np.random.randn(encoder_attention_heads)

# Wrap every NumPy fixture twice: once as a MindSpore tensor, once as a PyTorch
# tensor, using matching dtypes so both frameworks receive identical values.
ms_hidden_states = mindspore.Tensor(hidden_states, dtype=mindspore.float32)
pt_hidden_states = torch.tensor(hidden_states, dtype=torch.float32)

ms_input_ids = mindspore.Tensor(input_ids, dtype=mindspore.int64)
pt_input_ids = torch.tensor(input_ids, dtype=torch.int64)

ms_attention_mask = mindspore.Tensor(attention_mask, dtype=mindspore.float32)
pt_attention_mask = torch.tensor(attention_mask, dtype=torch.float32)

ms_head_mask = mindspore.Tensor(head_mask, dtype=mindspore.bool_)
pt_head_mask = torch.tensor(head_mask, dtype=torch.bool)

ms_layer_head_mask = mindspore.Tensor(layer_head_mask, dtype=mindspore.float32)
pt_layer_head_mask = torch.tensor(layer_head_mask, dtype=torch.float32)

# Snapshot the PyTorch state dict once: the original rebuilt it for every
# parameter inside the copy loop, which is needless repeated work.
pt_state_dict = pt_model.state_dict()

# All PyTorch parameter keys; the optional diagnostic below removes every key
# that also exists on the MindSpore side.
pt_param_keys = set(pt_state_dict.keys())

ms_notfound_keys = set()
# Optional diagnostic: collect MindSpore parameters that have no PyTorch counterpart.
# for key, param in ms_model.parameters_and_names():
#     if key in pt_param_keys:
#         pt_param_keys.remove(key)
#     else:
#         ms_notfound_keys.add(key)

# print("unused PyTorch parameters", pt_param_keys)
# print("MindSpore parameters not found", ms_notfound_keys)

# Copy every PyTorch weight into the MindSpore parameter of the matching name.
for ms_key, param in ms_model.parameters_and_names():
    # Map the MindSpore-side names (gamma / beta / embedding_table) onto the
    # names used in the PyTorch state dict (weight / bias).
    # NOTE(review): these are plain substring replacements, so any parameter
    # path that merely contains "gamma" or "beta" is rewritten as well.
    key = ms_key.replace('gamma','weight')
    key = key.replace('beta','bias')
    key = key.replace('embedding_table','weight')
    pt_tensor = pt_state_dict.get(key)
    if pt_tensor is None:
        # Fail with the offending names instead of an AttributeError on None.
        raise KeyError(
            f"MindSpore parameter {ms_key!r} maps to {key!r}, "
            "which is missing from the PyTorch state dict"
        )
    param.set_data(mindspore.Tensor(pt_tensor.detach().numpy()))

# Put both models in inference mode before comparing their outputs.
ms_model.set_train(False)
pt_model.eval()

ms_out = ms_model(ms_input_ids)
pt_out = pt_model(pt_input_ids)

def judge(o1, o2, loss = 1e-5, prefix = '-'):
    """Recursively compare a MindSpore result ``o1`` with its PyTorch counterpart ``o2``.

    Tuples are walked element by element (their lengths must match). A
    MindSpore tensor is compared against ``o2`` with ``np.allclose``, using
    ``loss`` as both the relative and the absolute tolerance, and the boolean
    verdict is printed after a dash prefix that grows by one dash per
    recursion level. Any other value is printed as a plain ``==`` comparison
    together with both types.
    """
    indent = prefix + '-'
    if isinstance(o1, tuple):
        assert len(o1) == len(o2)
        for idx, left in enumerate(o1):
            judge(left, o2[idx], loss=loss, prefix=indent)
        return
    if not isinstance(o1, mindspore.Tensor):
        print(f"{type(o1)}-{type(o2)}:{o1==o2}")
        return
    assert o1.shape == o2.shape
    is_close = np.allclose(o1.asnumpy(), o2.detach().numpy(), loss, loss)
    print(f"{indent}{is_close}")

# Walk both model outputs in parallel and print, per tensor, whether they agree within tolerance.
judge(ms_out,pt_out)


---True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True
-----True


In [4]:
import mindspore
from mindspore import nn
import torch
import numpy as np

def test_me():
    """Apply MindSpore's ``nn.GELU(approximate=False)`` to a fixed 2x3 float32 sample."""
    sample = np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]])
    exact_gelu = nn.GELU(approximate=False)
    return exact_gelu(mindspore.Tensor(sample, mindspore.float32))

def test_torch():
    """Apply PyTorch's default ``torch.nn.GELU`` to the same fixed 2x3 float32 sample."""
    sample = torch.tensor([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]], dtype=torch.float32)
    return torch.nn.GELU()(sample)

def judge(o1, o2, loss = 1e-5, prefix = '-'):
    """Recursively compare a MindSpore result ``o1`` with its PyTorch counterpart ``o2``.

    Tuples are walked element by element (their lengths must match). A
    MindSpore tensor is compared against ``o2`` with ``np.allclose``, using
    ``loss`` as both the relative and the absolute tolerance, and the boolean
    verdict is printed after a dash prefix that grows by one dash per
    recursion level. Any other value is printed as a plain ``==`` comparison
    together with both types.
    """
    indent = prefix + '-'
    if isinstance(o1, tuple):
        assert len(o1) == len(o2)
        for idx, left in enumerate(o1):
            judge(left, o2[idx], loss=loss, prefix=indent)
        return
    if not isinstance(o1, mindspore.Tensor):
        print(f"{type(o1)}-{type(o2)}:{o1==o2}")
        return
    assert o1.shape == o2.shape
    is_close = np.allclose(o1.asnumpy(), o2.detach().numpy(), loss, loss)
    print(f"{indent}{is_close}")

# Evaluate the exact (non-approximate) GELU in both frameworks on the same input.
ms_out = test_me()
pt_out = test_torch()

# Print whether the two results agree within judge's default tolerance.
judge(ms_out,pt_out)

--True


In [3]:
import mindspore
from mindspore import nn
from transformers.activations import NewGELUActivation
import torch
import numpy as np
import math


def test_me():
    """Apply MindSpore's ``nn.GELU(approximate=True)`` to a fixed 2x3 float32 sample."""
    sample = np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]])
    approx_gelu = nn.GELU(approximate=True)
    return approx_gelu(mindspore.Tensor(sample, mindspore.float32))

def test_torch():
    """Apply transformers' ``NewGELUActivation`` to the same fixed 2x3 float32 sample."""
    sample = torch.tensor([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]], dtype=torch.float32)
    return NewGELUActivation()(sample)

def judge(o1, o2, loss = 1e-5, prefix = '-'):
    """Recursively compare a MindSpore result ``o1`` with its PyTorch counterpart ``o2``.

    Tuples are walked element by element (their lengths must match). A
    MindSpore tensor is compared against ``o2`` with ``np.allclose``, using
    ``loss`` as both the relative and the absolute tolerance, and the boolean
    verdict is printed after a dash prefix that grows by one dash per
    recursion level. Any other value is printed as a plain ``==`` comparison
    together with both types.
    """
    indent = prefix + '-'
    if isinstance(o1, tuple):
        assert len(o1) == len(o2)
        for idx, left in enumerate(o1):
            judge(left, o2[idx], loss=loss, prefix=indent)
        return
    if not isinstance(o1, mindspore.Tensor):
        print(f"{type(o1)}-{type(o2)}:{o1==o2}")
        return
    assert o1.shape == o2.shape
    is_close = np.allclose(o1.asnumpy(), o2.detach().numpy(), loss, loss)
    print(f"{indent}{is_close}")

# Evaluate MindSpore's approximate GELU and transformers' NewGELUActivation on the same input.
ms_out = test_me()
pt_out = test_torch()

# Print whether the two results agree within judge's default tolerance.
judge(ms_out,pt_out)

--True


In [14]:
from huggingface_hub import hf_hub_url

# Root directory passed to wget's -P option in the printed download commands.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = "/home/wangxingran/bart_migration/Bartckpt"

def download_script(size:str):
    """Print the ``wget`` commands that fetch the files of a pretrained model.

    ``size`` is used as the Hugging Face repo id. One command is printed for
    each of ``config.json``, ``tokenizer.json`` and ``pytorch_model.bin``, all
    targeting ``{path}/{size}`` where ``path`` is the module-level download
    root. Nothing is downloaded by this function itself.
    """
    target_dir = f"{path}/{size}"
    for filename in ('config.json', 'tokenizer.json', 'pytorch_model.bin'):
        url = hf_hub_url(repo_id=size, filename=filename)
        print(f"wget {url} -P {target_dir}")

# Hugging Face repo ids of the BART checkpoints that can be passed to download_script.
sizes = ["facebook/bart-base", "facebook/bart-large", "facebook/bart-large-mnli", "facebook/bart-large-cnn", "facebook/bart-large-xsum"]

# Print the wget commands for the last entry (facebook/bart-large-xsum).
download_script(sizes[4])

wget https://huggingface.co/facebook/bart-large-xsum/resolve/main/config.json -P /home/wangxingran/bart_migration/Bartckpt/facebook/bart-large-xsum
wget https://huggingface.co/facebook/bart-large-xsum/resolve/main/tokenizer.json -P /home/wangxingran/bart_migration/Bartckpt/facebook/bart-large-xsum
wget https://huggingface.co/facebook/bart-large-xsum/resolve/main/pytorch_model.bin -P /home/wangxingran/bart_migration/Bartckpt/facebook/bart-large-xsum
