In [None]:
import os, time, re, random, glob, json, jieba, copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator
)
# Prefer the first CUDA device when one is visible, otherwise fall back to CPU.
device="cuda:0" if torch.cuda.is_available() else "cpu"
from sys import platform
# Per-OS location of the local data directory; every later cell joins paths onto `root`.
if platform.startswith("linux"):
    # linux (the prefix check also covers the legacy "linux2" value)
    root = "/mnt/sfevol775196/sunzeye273/Data"
#     root = "/mnt/share-pa002-vol682688-prd/sunzeye273/Data"
#     root = "/mnt/pa002-28359-vol543625-private/Data"
#     root = "/root/autodl-tmp/Data"
elif platform == "darwin":
    # OS X
    root = "/Users/zeyesun/Documents/Data"
elif platform == "win32":
    # Windows...
    root = "D:\\Data"
else:
    # Fail here with a clear message instead of a NameError on `root` in a later cell.
    raise RuntimeError(
        f"Unsupported platform {platform!r}: set `root` to the data directory manually"
    )

In [None]:
# Pick the local checkpoint directory to inspect; the commented lines are alternative choices.
model_name = "llama-7b"
# model_name = "glm-350M-chinese"
# model_name = "chatglm-6B"
model_name_or_path = os.path.join(root, "models", model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)
print(tokenizer.special_tokens_map)
print(tokenizer.all_special_ids)

# One "<name>: <id>" entry per special token, in this order.
# ("eop" / "sop" were probed here too but are left out, as in the disabled lines of the original cell.)
special_token_names = ("unk", "pad", "bos", "eos", "sep", "mask", "cls")
# Joining with "\n " reproduces the layout of printing the entries as separate
# arguments that each end in "\n" (print inserts one space between arguments).
print("\n ".join(f"{name}: {getattr(tokenizer, name + '_token_id')}" for name in special_token_names))

# LLaMA

In [None]:
# Switch the notebook to the "llama-small" model directory.
model_name = "llama-small"
model_name_or_path = os.path.join(root, "models", model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)
# The model is instantiated from the configuration alone (`from_config`, not `from_pretrained`).
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

In [None]:
# Write the current model weights next to its config as "pytorch_model.bin".
weights_path = os.path.join(model_name_or_path, "pytorch_model.bin")
torch.save(model.state_dict(), weights_path)

In [None]:
# Experiment: build one (tgt_len, tgt_len) additive attention mask in which each
# segment listed in `eos_ids` only attends causally within itself.
# NOTE: `_prepare_decoder_attention_mask` is defined in the cell that follows this
# one in the notebook, so that cell has to be executed before this one.
prompt = "你好 </s> 你是谁 </s> 你好 </s> 你是谁"
eos_ids = [0, 5, 12, 16, 22]  # hand-written segment boundaries (start of each segment, then the end)
max_length = 512
input_embeds = torch.rand(1, dtype=torch.float16)  # only its dtype and device are consulted by the helper
inputs = tokenizer(
    prompt,
    max_length=max_length,
    truncation="longest_first",
    return_tensors="pt",
    return_attention_mask=True,
    return_token_type_ids=False,
)
tgt_len = len(inputs['input_ids'][0])
print(f"tgt_len: {tgt_len}")

# Start fully masked, then overwrite one diagonal block per segment.
masked_value = torch.tensor(torch.finfo(input_embeds.dtype).min)
combined_attention_mask = torch.full((tgt_len, tgt_len), masked_value)
for seg_start, seg_end in zip(eos_ids[:-1], eos_ids[1:]):
    attention_mask = torch.ones((1, seg_end - seg_start), dtype=torch.long)
    attention_mask = _prepare_decoder_attention_mask(
        attention_mask, attention_mask.shape, input_embeds, 0
    )
    combined_attention_mask[seg_start:seg_end, seg_start:seg_end] = attention_mask

In [None]:
def _make_causal_mask(
    input_ids_shape, dtype, device, past_key_values_length = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask, dtype, tgt_len):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length):
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
                inputs_embeds.device
            )
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

# GLM

In [None]:
# Encode the pair (prompt, prefix + mask token) and hand it to the tokenizer's
# `build_inputs_for_generation`.
prompt = "你好"
prefix = "答:"
max_length = 128

# First pass without tensors: only the length of the encoded pair is needed.
encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)
prompt_length = len(encoded_prompt['input_ids'])

# Second pass: same pair, capped at `max_length`, returned as tensors.
encoded_dict = tokenizer(
    prompt,
    prefix + tokenizer.mask_token,
    max_length=min(prompt_length, max_length),
    truncation="only_first",
    return_tensors="pt",
    return_token_type_ids=False,
)

# Whatever is left of the length budget is reserved for generated tokens.
max_gen_length = max_length - encoded_dict['input_ids'].size(1)
inputs = tokenizer.build_inputs_for_generation(encoded_dict, max_gen_length=max_gen_length, padding=True)

In [None]:
# Display the object returned by `tokenizer.build_inputs_for_generation` in the previous cell.
inputs

In [None]:
# Rebuild by hand a square attention mask over prompt + generation positions.
# Shape comments assume `encoded_dict['attention_mask']` is (batch, seq), like `input_ids`.
batch_size, seq_length = encoded_dict['input_ids'].shape[:2]
total_length = seq_length + max_gen_length

attention_mask1 = encoded_dict['attention_mask']
# (batch, total, seq): the prompt columns, repeated for every row.
attention_mask2 = attention_mask1.unsqueeze(1).expand(-1, total_length, -1)

# Columns for the generation slots: zeros for the prompt rows, lower-triangular
# ones for the generation rows.
prompt_rows = attention_mask2.new_zeros((seq_length, max_gen_length))
generation_rows = torch.tril(attention_mask2.new_ones((max_gen_length, max_gen_length)))
generation_attention_mask = (
    torch.cat([prompt_rows, generation_rows], dim=0)
    .unsqueeze(0)
    .expand(batch_size, -1, -1)
)

# (batch, total, total), then an extra dimension inserted at index 1.
attention_mask3 = torch.cat((attention_mask2, generation_attention_mask), dim=2)
attention_mask4 = attention_mask3.unsqueeze(1)

In [None]:
# Shape first, then the tensor itself, one per line.
print(attention_mask1.shape, attention_mask1, sep="\n")

In [None]:
# Only the shape of the expanded prompt mask is printed; the full dump stays disabled.
print(attention_mask2.shape)
# print(attention_mask2)

In [None]:
# Shape first, then the concatenated prompt + generation mask, one per line.
print(attention_mask3.shape, attention_mask3, sep="\n")

In [None]:
# Shape first, then the mask with the extra dimension inserted at index 1, one per line.
print(attention_mask4.shape, attention_mask4, sep="\n")

# ChatGLM

In [None]:
model_name_or_path = os.path.join(root, "models", "chatglm-6B")
# Load the tokenizer from the same checkpoint as the model. The following cells call
# `model.chat(tokenizer, ...)` and read `tokenizer.gmask_token_id` / `tokenizer.eop_token_id`,
# whereas the tokenizer created at the top of the notebook belongs to whichever
# `model_name` was selected there ("llama-7b" by default).
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)
# model = model.half().to(device)

In [None]:
# Multi-turn chat: each call receives the history returned by the previous one.
texts = [
    "你好",
    "晚上睡不着应该怎么办"
]
history = []
for query in texts:
    reply, history = model.chat(tokenizer, query, history=history)
    print(f"问: {query}\n答:{reply}\n")

In [None]:
# Encode a (prompt, label) pair, padded to `max_length`, and peek at the result.
max_length = 512
prompt = "晚上睡不着应该怎么办"
label = "吃褪黑素"
encoded_dict = tokenizer(
    prompt,
    label,
    max_length=max_length,
    padding="max_length",
    truncation="longest_first",
    return_tensors="pt",
    return_token_type_ids=False,
)
print(encoded_dict.keys())
encoded_ids = encoded_dict['input_ids']
print(encoded_ids.shape)
# Only the first 50 ids of the first sequence are shown.
print(encoded_ids[0, :50])

In [None]:
# Hand-copied token ids to map back to their token strings.
ids = [ 94747, 103400,  84213,  86846, 150001, 150004,  20005,  84020, 110857,
         84248,  84886,  20003]
# Use the public tokenizer API instead of the private per-id `_convert_id_to_token` hook.
print(tokenizer.convert_ids_to_tokens(ids))

In [None]:
# Locate the gMASK token in every encoded sequence.
input_ids = encoded_dict['input_ids']
print(input_ids[0, :20])
seqs = input_ids.tolist()
mask_positions = []
for token_ids in seqs:
    # list.index raises ValueError when a sequence holds no gMASK token.
    mask_positions.append(token_ids.index(tokenizer.gmask_token_id))
gmask = True

In [None]:
# Re-create by hand the attention mask and the two-channel position ids for `input_ids`.
batch_size, seq_length = input_ids.shape
# Context length of each sequence = index of its first BOS token.
context_lengths = [seq.tolist().index(tokenizer.bos_token_id) for seq in input_ids]

# --- attention mask: lower-triangular, with every row allowed to see the whole context ---
attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
attention_mask.tril_()
for i, context_length in enumerate(context_lengths):
    attention_mask[i, :, :context_length] = 1
attention_mask.unsqueeze_(1)
# attention_mask = (attention_mask < 0.5).bool()
print(attention_mask.shape)
print(attention_mask[0, :20, :20])

# --- position ids (2D variant: absolute positions stacked with block positions) ---
# `repeat` gives every row its own storage. The previous `.expand(...)` view shares one
# row across the batch, so the in-place write in the `not gmask` branch below is rejected
# by PyTorch for batch sizes above 1.
position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
if not gmask:
    # Every position after the context is pinned to the mask-token position.
    for i, context_length in enumerate(context_lengths):
        position_ids[i, context_length:] = mask_positions[i]
# Block positions: 0 over the context, then 1, 2, ... over the remaining positions.
block_position_ids = [torch.cat((
    torch.zeros(context_length, dtype=torch.long, device=device),
    torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
)) for context_length in context_lengths]
block_position_ids = torch.stack(block_position_ids, dim=0)
position_ids = torch.stack((position_ids, block_position_ids), dim=1)
# (A 1D variant would keep only the first channel; it is not exercised in this cell.)
print(position_ids.shape)
print(position_ids[0, :, :20])

In [None]:
# Show the prompt as token strings, then as the tokenizer's full encoding.
prompt_tokens = tokenizer.tokenize(prompt)
print(prompt_tokens)
prompt_encoding = tokenizer(prompt)
print(prompt_encoding)

In [None]:
# Encode the prompt once to measure its length, then again as tensors capped at `max_length`.
encoded_prompt = tokenizer(prompt)
prompt_length = len(encoded_prompt['input_ids'])
inputs = tokenizer(prompt, max_length=min(prompt_length, max_length), truncation="only_first",
                   return_tensors="pt")
# Follow the model's actual device: the model is never moved to `device` in this notebook
# (that line is commented out), so sending the inputs to `device` would put them on CUDA
# while the weights stay on CPU whenever a GPU is present.
inputs = inputs.to(model.device)

In [None]:
# Generation settings.
max_length_generation = 50
num_return_sequences = 1
top_p = 0.8
temperature = 1.0

# NOTE(review): `do_sample=False` is passed together with `top_p` / `temperature`;
# those two are presumably ignored without sampling. Confirm the intent.
generation_kwargs = dict(
    max_new_tokens=max_length_generation,
    eos_token_id=tokenizer.eop_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
    num_return_sequences=num_return_sequences,
    top_p=top_p,
    temperature=temperature,
)
outputs = model.generate(**inputs, **generation_kwargs)

# DeepSpeed

### Estimate the memory needed for parameters, optimizer states, and gradients

In [None]:
# stage 1 and 2
# Runs DeepSpeed's ZeRO stage 1/2 memory estimator on the currently loaded `model`,
# for a single node with 3 GPUs.
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=3, num_nodes=1)

In [None]:
# stage 3
# Runs DeepSpeed's ZeRO stage 3 memory estimator on the currently loaded `model`,
# for a single node with 3 GPUs.
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=3, num_nodes=1)

# Split torch checkpoint into multiple checkpoints

In [None]:
# Fine-tuned checkpoint whose weights will be split into several files.
checkpoint = os.path.join(root, "chatgpt", "output", "sft", "pangu-2.6B", "checkpoint-42782")
# Pick the auto class from the checkpoint actually being loaded. `model_name_or_path`
# still points at whatever model an earlier cell used (".../chatglm-6B" after the ChatGLM
# section), which would send this pangu checkpoint down the Seq2Seq branch.
if "glm" in checkpoint:
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, trust_remote_code=True)
else:
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True, use_cache=False)
st = model.state_dict()

In [None]:
def _partition_evenly(items, num_parts):
    """Split `items` into at most `num_parts` contiguous chunks that together cover every item.

    Chunk sizes differ by at most one, with the larger chunks first. An empty `items`
    yields a single empty chunk.
    """
    num_parts = max(1, min(num_parts, len(items)))
    base, extra = divmod(len(items), num_parts)
    chunks, start = [], 0
    for part in range(num_parts):
        stop = start + base + (1 if part < extra else 0)
        chunks.append(items[start:stop])
        start = stop
    return chunks


keys = list(st.keys())
n = 10
# Size of the tensors in bytes; `sys.getsizeof(st)` only measured the dict object itself.
total_size = sum(t.numel() * t.element_size() for t in st.values())
print(total_size)
m = {"metadata": {"total_size": total_size}, "weight_map": dict()}
# Even partitioning keeps the trailing `len(keys) % n` entries, which a fixed
# `len(keys) // n` span would drop.
shards = _partition_evenly(keys, n)
for i, shard_keys in enumerate(shards):
    fn = f"pytorch_model-{i+1}-of-{len(shards)}.bin"
    f = os.path.join(checkpoint, fn)
    stt = dict()
    for key in shard_keys:
        stt[key] = st[key]
        m["weight_map"][key] = fn
    torch.save(stt, f)
# The index maps every parameter name to the shard file that stores it.
f = os.path.join(checkpoint, "pytorch_model.bin.index.json")
with open(f, "w", encoding="utf-8") as index_file:
    json.dump(m, index_file, ensure_ascii=False)