In [1]:
import torch
import numpy as np
from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM,
                          GPTNeoForCausalLM, GPT2TokenizerFast,LlamaConfig)
import os
import random
from typing import *
import matplotlib.pyplot as plt
import os
import seaborn as sns
import torch.nn as nn
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, BloomForCausalLM
from torch.utils.data import DataLoader

2024-01-15 04:30:02.647502: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-15 04:30:02.679483: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class MyGPTNeoSelfAttention(nn.Module):
    def __init__(self, config, device, is_linear):
        super().__init__()

        max_positions = config.max_position_embeddings
        # bias = torch.tril(torch.ones((max_positions, max_positions), dtype=bool)).view(
        #     1, 1, max_positions, max_positions
        # )

        # local causal self attention is a sliding window where each token can only attend to the previous
        # window_size tokens. This is implemented by updating the causal mask such that for each token
        # all other tokens are masked except the previous window_size tokens.
        # if attention_type == "local":
        #     bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))

        # self.attn_dropout = nn.Dropout(float(config.attention_dropout))
        self.resid_dropout = nn.Dropout(float(config.resid_dropout))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.linear = nn.Sequential(nn.Linear(in_features=self.embed_dim, out_features=self.embed_dim*2, bias=False, dtype=torch.float16),
                                    nn.Linear(in_features=self.embed_dim*2, out_features=self.embed_dim, bias=False, dtype=torch.float16))
        from torch.nn.init import xavier_uniform_
        xavier_uniform_(self.linear[0].weight.data)
        xavier_uniform_(self.linear[1].weight.data)
        self.device = device
        self.is_linear = is_linear

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_past=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):

        bsz, q_len, _ = hidden_states.size()
        input_dtype = hidden_states.dtype
        if input_dtype != torch.float16:
            self.linear = self.linear.to(input_dtype)

        key = torch.ones((bsz, q_len, self.num_heads, self.head_dim), dtype=input_dtype).to(
            self.device).transpose(1, 2)
        value = torch.ones((bsz, q_len, self.num_heads, self.head_dim), dtype=input_dtype).to(
            self.device).transpose(1, 2)

        attn_weights = torch.zeros((bsz, self.num_heads, q_len, q_len), dtype=input_dtype).to(self.device)

        attn_output = self.linear(hidden_states)
        p = self.linear.parameters()
        if self.is_linear:
            # attn_output = self.resid_dropout(attn_output)
            pass
        else:
            attn_output = torch.ones_like(attn_output, dtype=input_dtype).to(self.device)

        if layer_past is not None:
            past_key = layer_past[0]
            past_value = layer_past[1]
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)

class MyLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamaConfig, device, is_linear):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        # self.resid_dropout = nn.Dropout(float(config.resid_dropout))
        self.linear = nn.Sequential(nn.Linear(in_features=2048, out_features=5632, bias=False, dtype=torch.float16),
                                                      nn.Linear(in_features=5632, out_features=2048, bias=False, dtype=torch.float16))
        self.device = device
        self.is_linear =is_linear
        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )


    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        bsz, q_len, _ = hidden_states.size()
        input_dtype = hidden_states.dtype
        if input_dtype != torch.float16:
            self.linear = self.linear.to(input_dtype)


        key_states = torch.zeros((bsz, q_len, self.num_key_value_heads, self.head_dim), dtype=input_dtype).to(self.device).transpose(1, 2)
        value_states = torch.zeros((bsz, q_len, self.num_key_value_heads, self.head_dim), dtype=input_dtype).to(self.device).transpose(1, 2)

        past_key_value = (key_states, value_states) if use_cache else None

        attn_weights = torch.zeros((bsz, self.num_heads, q_len, q_len), dtype=input_dtype).to(self.device)

        attn_output = self.linear(hidden_states)
        if not self.is_linear:
            attn_output = torch.zeros_like(attn_output, dtype=input_dtype).to(self.device)

        return attn_output, attn_weights, past_key_value


In [3]:
def plt_heatMap_sns(scores, save_path=None, title=None, cmap=None, y_ticks=None, x_ticks=None, show=None):
    plt.subplots(figsize=(20, 20), dpi=200)
    plt.rcParams['font.size'] = '10'
    if cmap is None:
        cmap = sns.color_palette("Reds", as_cmap=True)
    if x_ticks and y_ticks:
        sns.heatmap(scores, cmap=cmap,  xticklabels=x_ticks, yticklabels=y_ticks)
    else:
        sns.heatmap(scores, cmap=cmap)
    if title is not None:
        plt.title(title)
    if save_path:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        plt.savefig(os.path.join(save_path, f'{title}.png'), bbox_inches="tight")
    if show:
        plt.show()
    plt.close()


def load_model(model_name: str, device='cuda', low_cpu_mem_usage=False, layers=[], train_layers=[], is_linear=False):
    tokenizer: GPT2TokenizerFast = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    if model_name.find('gpt-neo') != -1:
        model: GPTNeoForCausalLM = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)# .to(device)

        for layer_id in layers:
            model.transformer.h[layer_id].attn.attention = MyGPTNeoSelfAttention(model.config, device=device,
                                                                                 is_linear=is_linear).to(device)
            # model.model.layers[layer_id].self_attn = LlamaAttentionLinear(model.config , device).to(device)

        MODEL_CONFIG = {"n_heads": model.config.num_attention_heads,
                        "n_layers": model.config.num_hidden_layers,
                        "resid_dim": model.config.hidden_size,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'transformer.h.{layer}.attn.attention.attn_dropout' for layer in
                                            range(model.config.num_hidden_layers)]
                        }
        # p = model.named_parameters()
        if len(train_layers) > 0:
            for k, params in model.named_parameters():
                for layer_id in train_layers:
                    if k.find(f'transformer.h.{layer_id}.attn.attention') == -1:
                        params.requires_grad = False

        return model, tokenizer, MODEL_CONFIG
    if model_name.find('TinyLlama') != -1:
        model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)#.to(device)

        for layer_id in layers:
            # model.transformer.h[layer_id].attn.attention = MyGPTNeoSelfAttention(model.config, device=device,is_linear=is_linear).to(device)
            model.model.layers[layer_id].self_attn = MyLlamaAttention(model.config, device, is_linear=is_linear).to(
                device)

        MODEL_CONFIG = {"n_heads": model.config.num_attention_heads,
                        "n_layers": model.config.num_hidden_layers,
                        "resid_dim": model.config.hidden_size,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'model.layers.{layer}.self_attn.attn_dropout' for layer in
                                            range(model.config.num_hidden_layers)]
                        }
        if len(train_layers) > 0:
            for k, params in model.named_parameters():
                for layer_id in train_layers:
                    if k.find(str(layer_id)) == -1:
                        params.requires_grad = False

        return model, tokenizer, MODEL_CONFIG


def draw_attention(model, tokenizer, MODEL_CONFIG, device, prompt, check_token_id, save_path, title):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_and_cache = model(**inputs, output_hidden_states=True, output_attentions=True)
    ground_attentions = torch.cat(output_and_cache.attentions, dim=0).detach().cpu().numpy()
    ground_attentions = ground_attentions[:, :, check_token_id, :]

    x_ticks = [f"layer{i + 1}" for i in range(MODEL_CONFIG['n_layers'])]
    encoded_line = tokenizer.encode(prompt)
    codes = tokenizer.convert_ids_to_tokens(encoded_line)
    y_ticks = [f"head{i_head}-{c}" for i_head in range(MODEL_CONFIG['n_heads']) for i, c in enumerate(codes)]
    plt_heatMap_sns(ground_attentions.reshape(ground_attentions.shape[0], -1).T,
                    title=title, x_ticks=x_ticks, y_ticks=y_ticks
                    , show=True, save_path=save_path)


In [4]:
layers = [22]  # [23]
train_layers = [22]
is_linear = False
save_model_path = f'./results/gpt_neo_{layers}_{is_linear}_{train_layers}'

device_str = 'cuda:1'

In [5]:
torch.cuda.empty_cache()
model_name = 'EleutherAI/gpt-neo-1.3B'  # 'EleutherAI/gpt-neo-1.3B' # 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T' # 'EleutherAI/gpt-neo-125m'
model, tokenizer, MODEL_CONFIG = load_model(model_name, device=device_str, layers=layers, train_layers=train_layers,
                                            is_linear=is_linear)

In [6]:
tokenizer
# MODEL_CONFIG
print(tokenizer.eos_token)

<|endoftext|>


In [7]:
 # dataset = load_dataset('cerebras/SlimPajama-627B',split='train[:100]') # TinyLlama
dataset = load_dataset('monology/pile-uncopyrighted', split='train[:1%]')  # gpt-neo

def process_func(examples):
    contents = [e + tokenizer.eos_token for e in examples["text"]]
    return tokenizer(contents, max_length=128, truncation=True)
tokenized_ds = dataset.map(process_func, batched=True, remove_columns=dataset.column_names)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Map:   0%|          | 0/1770097 [00:00<?, ? examples/s]

In [8]:
# print(len(tokenized_ds['input_ids'][2]))
# print(len(tokenized_ds['input_ids'][0]))
dl = DataLoader(tokenized_ds, batch_size=1, collate_fn=DataCollatorForLanguageModeling(tokenizer, mlm=False), shuffle=True)
inp = next(enumerate(dl))[1]['input_ids'][0]
print(inp)
print(tokenizer.decode(inp))
tokenizer(tokenizer.decode(inp))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([39112,   533,  1430,    26, 13771,  5197,   286, 15077,  5010,   973,
          287, 16217,   418,  7211,  3314,  9102,   438, 16045,  7708,    13,
         8125,  3896,    13,   198,  1212,  2457,  3896,   716,  2412,   262,
         6647,   284,  2148, 13771,  5197,   329, 15077,  5010,   973,   287,
        16217,   418,  7211,  3314,  9102, 30760,   284,   281,  1981,   508,
        11583,   281,  1618, 23319,   329,   543, 13771,  6074,   318,   925,
           13,   770,  3896, 12497,   262, 28547,   286,  2665, 45278,     7,
           82,  5769,    17,  5769,    41,     8,   286,   262,  5483,  4765,
         2191,   326,  3769, 13771,  5197,   329, 15077,  5010,   973,   287,
        16217,   418,  7211,  3314,  9102,   329,   257,  2278,   286,   510,
          284,   352,   614,   422,   262,  3128,   286, 17655,   422,   281,
          287, 26029,  4436,  2652,  1141,   543,   262, 13771,    12, 32111,
         1618,   393, 10712, 23319,   373,  6157,    13,   770])

{'input_ids': [39112, 533, 1430, 26, 13771, 5197, 286, 15077, 5010, 973, 287, 16217, 418, 7211, 3314, 9102, 438, 16045, 7708, 13, 8125, 3896, 13, 198, 1212, 2457, 3896, 716, 2412, 262, 6647, 284, 2148, 13771, 5197, 329, 15077, 5010, 973, 287, 16217, 418, 7211, 3314, 9102, 30760, 284, 281, 1981, 508, 11583, 281, 1618, 23319, 329, 543, 13771, 6074, 318, 925, 13, 770, 3896, 12497, 262, 28547, 286, 2665, 45278, 7, 82, 5769, 17, 5769, 41, 8, 286, 262, 5483, 4765, 2191, 326, 3769, 13771, 5197, 329, 15077, 5010, 973, 287, 16217, 418, 7211, 3314, 9102, 329, 257, 2278, 286, 510, 284, 352, 614, 422, 262, 3128, 286, 17655, 422, 281, 287, 26029, 4436, 2652, 1141, 543, 262, 13771, 12, 32111, 1618, 393, 10712, 23319, 373, 6157, 13, 770], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [9]:
from torch.optim import AdamW
from tensorboardX import SummaryWriter
writer = SummaryWriter(save_model_path)
optimizer = AdamW(model.parameters(), lr=2e-8, eps=1e-5, weight_decay=0.01)
model = model.to(torch.device(device_str))
def train(trainloader, epoch=1, log_step=100):
    global_step = 0
    lowest_loss = 10
    accum_steps = 8
    for ep in range(epoch):
        model.train()
        for idx, batch in enumerate(trainloader):
            # inp = batch['input_ids'][0]
            # print(inp)
            # print(tokenizer.decode(inp))
            # tokenizer(tokenizer.decode(inp))
            if torch.cuda.is_available():
                batch = {k: v.to(torch.device(device_str)) for k, v in batch.items()}
            # with torch.autograd.detect_anomaly():
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            (loss/ accum_steps).backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 20)
            if (idx+1) % accum_steps == 0 or (idx+1) == len(trainloader):
                optimizer.step()
                optimizer.zero_grad()
                # for name, param in model.named_parameters():
                #     if param.requires_grad:
                #         print(torch.isnan(param.grad).any())
                #         print('name:{} param grad:{} param requires_grad:{},params:{}'.format(name, param.grad,
                #                                                                    param.requires_grad,param))
                if global_step % log_step == 0:
                    print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
                    if lowest_loss > output.loss.item():
                        lowest_loss = output.loss.item()
                        torch.save(model.state_dict(), save_model_path+'pth')
                    writer.add_scalar('loss', output.loss.item(), global_step=global_step)
                global_step += 1
train(dl)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn