In [1]:
import torch
import numpy as np
from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM,
                          GPTNeoForCausalLM, GPT2TokenizerFast,LlamaConfig)
import os
import random
from typing import *
import matplotlib.pyplot as plt
import os
import seaborn as sns
import torch.nn as nn
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, BloomForCausalLM
from torch.utils.data import DataLoader

2024-01-15 13:23:55.044163: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-15 13:23:55.078065: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from transformers.pipelines import SUPPORTED_TASKS

# Print the name of every pipeline task registered in SUPPORTED_TASKS,
# one per line, in the mapping's own order.
for task_name in SUPPORTED_TASKS:
    print('----------', task_name)

---------- audio-classification
---------- automatic-speech-recognition
---------- text-to-audio
---------- feature-extraction
---------- text-classification
---------- token-classification
---------- question-answering
---------- table-question-answering
---------- visual-question-answering
---------- document-question-answering
---------- fill-mask
---------- summarization
---------- translation
---------- text2text-generation
---------- text-generation
---------- zero-shot-classification
---------- zero-shot-image-classification
---------- zero-shot-audio-classification
---------- conversational
---------- image-classification
---------- image-segmentation
---------- image-to-text
---------- object-detection
---------- zero-shot-object-detection
---------- depth-estimation
---------- video-classification
---------- mask-generation
---------- image-to-image


In [2]:
class MyGPTNeoSelfAttention(nn.Module):
    """Attention-free stand-in with the call signature of a GPT-Neo self-attention module.

    No query/key/value attention is computed. Depending on ``is_linear`` the
    module returns either

    * ``is_linear=True``: the output of a two-layer, bias-free linear map
      (``embed_dim -> 2 * embed_dim -> embed_dim``) applied to the hidden states, or
    * ``is_linear=False``: a constant tensor of ones shaped like the hidden states.

    The cached key/value tensors (all ones) and the attention weights (all
    zeros) are placeholders that only keep the surrounding model code working.

    Args:
        config: object exposing ``hidden_size``, ``num_heads`` and ``resid_dropout``.
        device: stored on ``self.device`` for backward compatibility; tensors
            created in ``forward`` follow the device of the incoming hidden states.
        is_linear: selects the linear map (True) or the constant output (False).

    Raises:
        ValueError: if ``hidden_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, config, device, is_linear):
        super().__init__()

        # Kept as an attribute although forward() does not apply it.
        self.resid_dropout = nn.Dropout(float(config.resid_dropout))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.linear = nn.Sequential(
            nn.Linear(in_features=self.embed_dim, out_features=self.embed_dim * 2, bias=False,
                      dtype=torch.float16),
            nn.Linear(in_features=self.embed_dim * 2, out_features=self.embed_dim, bias=False,
                      dtype=torch.float16),
        )
        nn.init.xavier_uniform_(self.linear[0].weight)
        nn.init.xavier_uniform_(self.linear[1].weight)
        self.device = device
        self.is_linear = is_linear

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_past=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        """Return ``(attn_output, present)`` plus ``attn_weights`` when requested.

        ``attention_mask`` and ``head_mask`` are accepted for interface
        compatibility and ignored.

        Args:
            hidden_states: tensor of shape ``(batch, seq_len, embed_dim)``.
            layer_past: optional ``(past_key, past_value)`` pair; when caching is
                enabled the placeholder key/value are appended to it along dim -2.
            use_cache: ``present`` is a ``(key, value)`` pair only when this is
                exactly ``True``; otherwise it is ``None``.
            output_attentions: when truthy, an all-zero tensor of shape
                ``(batch, num_heads, seq_len, seq_len)`` is appended to the output.
        """
        bsz, q_len, _ = hidden_states.size()
        input_dtype = hidden_states.dtype
        # Follow the input's device rather than the one recorded at construction,
        # which goes stale if the parent model is moved afterwards.
        device = hidden_states.device

        # Keep the linear map in the same dtype as the activations. Comparing
        # against the actual weight dtype also handles going back to fp16 after
        # an fp32 call. Module.to() converts the parameters in place.
        if self.linear[0].weight.dtype != input_dtype:
            self.linear.to(input_dtype)

        if self.is_linear:
            attn_output = self.linear(hidden_states)
        else:
            # The constant output does not depend on the linear map, so it is not evaluated.
            attn_output = torch.ones_like(hidden_states)

        present = None
        if use_cache is True:
            kv_shape = (bsz, self.num_heads, q_len, self.head_dim)
            key = torch.ones(kv_shape, dtype=input_dtype, device=device)
            value = torch.ones(kv_shape, dtype=input_dtype, device=device)
            if layer_past is not None:
                past_key, past_value = layer_past[0], layer_past[1]
                key = torch.cat((past_key, key), dim=-2)
                value = torch.cat((past_value, value), dim=-2)
            present = (key, value)

        outputs = (attn_output, present)
        if output_attentions:
            attn_weights = torch.zeros((bsz, self.num_heads, q_len, q_len), dtype=input_dtype, device=device)
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)

class MyLlamaAttention(nn.Module):
    """Attention-free stand-in with the call signature of a Llama attention module.

    No attention is computed. Depending on ``is_linear`` the module returns
    either the output of a two-layer, bias-free linear map
    (``hidden_size -> intermediate_size -> hidden_size``) applied to the hidden
    states, or an all-zero tensor shaped like the hidden states. Attention
    weights and cached key/value states are all-zero placeholders.

    Args:
        config: object exposing ``hidden_size``, ``intermediate_size``,
            ``num_attention_heads``, ``num_key_value_heads``,
            ``max_position_embeddings`` and ``rope_theta``.
        device: stored on ``self.device`` for backward compatibility; tensors
            created in ``forward`` follow the device of the incoming hidden states.
        is_linear: selects the linear map (True) or the zero output (False).

    Raises:
        ValueError: if ``hidden_size`` is not divisible by ``num_attention_heads``.
    """

    def __init__(self, config: "LlamaConfig", device, is_linear):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        # Sizes come from the config instead of fixed 2048 / 5632 literals, so
        # the replacement fits any checkpoint whose config provides them.
        self.linear = nn.Sequential(
            nn.Linear(in_features=config.hidden_size, out_features=config.intermediate_size, bias=False,
                      dtype=torch.float16),
            nn.Linear(in_features=config.intermediate_size, out_features=config.hidden_size, bias=False,
                      dtype=torch.float16),
        )
        self.device = device
        self.is_linear = is_linear

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View ``tensor`` as ``(bsz, num_heads, seq_len, head_dim)`` and make it contiguous."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Return ``(attn_output, attn_weights, past_key_value)``.

        ``attention_mask``, ``position_ids``, ``output_attentions``, any incoming
        ``past_key_value`` and extra keyword arguments are accepted for
        interface compatibility and ignored.

        Returns:
            attn_output: linear-map output when ``is_linear`` is set, otherwise zeros
                shaped like ``hidden_states``.
            attn_weights: all-zero tensor of shape ``(batch, num_heads, seq_len, seq_len)``;
                returned even when ``output_attentions`` is False.
            past_key_value: pair of all-zero tensors of shape
                ``(batch, num_key_value_heads, seq_len, head_dim)`` when ``use_cache``
                is truthy, else ``None``.
        """
        bsz, q_len, _ = hidden_states.size()
        input_dtype = hidden_states.dtype
        # Follow the input's device rather than the one recorded at construction.
        device = hidden_states.device

        # Keep the linear map in the activations' dtype. Comparing against the
        # actual weight dtype also handles returning to fp16 after an fp32 call.
        if self.linear[0].weight.dtype != input_dtype:
            self.linear.to(input_dtype)

        if use_cache:
            kv_shape = (bsz, self.num_key_value_heads, q_len, self.head_dim)
            past_key_value = (
                torch.zeros(kv_shape, dtype=input_dtype, device=device),
                torch.zeros(kv_shape, dtype=input_dtype, device=device),
            )
        else:
            past_key_value = None

        attn_weights = torch.zeros((bsz, self.num_heads, q_len, q_len), dtype=input_dtype, device=device)

        if self.is_linear:
            attn_output = self.linear(hidden_states)
        else:
            # The zero output does not depend on the linear map, so it is not evaluated.
            attn_output = torch.zeros_like(hidden_states)

        return attn_output, attn_weights, past_key_value


In [3]:
def plt_heatMap_sns(scores, save_path=None, title=None, cmap=None, y_ticks=None, x_ticks=None, show=None):
    """Draw ``scores`` as a seaborn heatmap, optionally saving and/or showing it.

    Args:
        scores: 2-D data passed straight to ``sns.heatmap``.
        save_path: directory the image is written to as ``<title>.png``. It is
            created if missing. Nothing is saved when falsy.
        title: figure title; also used as the file stem when saving.
        cmap: colormap; defaults to seaborn's "Reds" palette.
        y_ticks, x_ticks: tick labels. They are applied only when BOTH are
            provided and non-empty; otherwise seaborn's defaults are used.
        show: when truthy, display the figure before closing it.
    """
    fig, ax = plt.subplots(figsize=(20, 20), dpi=200)
    plt.rcParams['font.size'] = '10'
    if cmap is None:
        cmap = sns.color_palette("Reds", as_cmap=True)
    if x_ticks and y_ticks:
        sns.heatmap(scores, cmap=cmap, xticklabels=x_ticks, yticklabels=y_ticks, ax=ax)
    else:
        sns.heatmap(scores, cmap=cmap, ax=ax)
    if title is not None:
        ax.set_title(title)
    if save_path:
        # The image is written INSIDE save_path, so that directory itself must
        # exist. Creating only its parent left the save failing on a fresh path,
        # and raised when save_path had no parent component at all.
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(os.path.join(save_path, f'{title}.png'), bbox_inches="tight")
    if show:
        plt.show()
    plt.close(fig)


def load_model(model_name: str, device='cuda', low_cpu_mem_usage=False, layers=(), train_layers=(), is_linear=False):
    """Load a causal LM in float16, swap selected attention modules, and pick trainable weights.

    Supported checkpoints are those whose name contains ``'gpt-neo'`` or
    ``'TinyLlama'``. The model itself is NOT moved to ``device``; only the
    replacement attention modules are.

    Args:
        model_name: Hugging Face model id or path.
        device: device for the replacement attention modules.
        low_cpu_mem_usage: accepted for interface compatibility; currently unused.
        layers: indices of transformer layers whose attention is replaced by
            ``MyGPTNeoSelfAttention`` / ``MyLlamaAttention``.
        train_layers: indices of layers that stay trainable. When empty, every
            parameter keeps its default ``requires_grad``.
            GPT-Neo: only the replacement attention (``is_linear=True``) or the
            MLP (``is_linear=False``) of these layers is trainable.
            TinyLlama: every parameter inside these layers is trainable.
        is_linear: forwarded to the replacement attention modules.

    Returns:
        ``(model, tokenizer, MODEL_CONFIG)`` where ``MODEL_CONFIG`` holds
        ``n_heads``, ``n_layers``, ``resid_dim``, ``name_or_path`` and
        ``attn_hook_names``.

    Raises:
        ValueError: if ``model_name`` matches neither supported family.
    """
    tokenizer: GPT2TokenizerFast = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    if 'gpt-neo' in model_name:
        model: GPTNeoForCausalLM = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

        for layer_id in layers:
            model.transformer.h[layer_id].attn.attention = MyGPTNeoSelfAttention(
                model.config, device=device, is_linear=is_linear).to(device)

        # NOTE(review): MyGPTNeoSelfAttention defines no `attn_dropout` submodule, so
        # the hook names of replaced layers will not resolve - confirm with hook users.
        MODEL_CONFIG = {"n_heads": model.config.num_attention_heads,
                        "n_layers": model.config.num_hidden_layers,
                        "resid_dim": model.config.hidden_size,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'transformer.h.{layer}.attn.attention.attn_dropout' for layer in
                                            range(model.config.num_hidden_layers)]
                        }

        if len(train_layers) > 0:
            submodule = 'attn.attention' if is_linear else 'mlp'
            trainable = [f'transformer.h.{layer_id}.{submodule}' for layer_id in train_layers]
            for param_name, param in model.named_parameters():
                param.requires_grad = any(fragment in param_name for fragment in trainable)

        return model, tokenizer, MODEL_CONFIG

    if 'TinyLlama' in model_name:
        model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

        for layer_id in layers:
            model.model.layers[layer_id].self_attn = MyLlamaAttention(
                model.config, device, is_linear=is_linear).to(device)

        # NOTE(review): MyLlamaAttention defines no `attn_dropout` submodule; whether
        # the stock attention has one is not visible here - confirm with hook users.
        MODEL_CONFIG = {"n_heads": model.config.num_attention_heads,
                        "n_layers": model.config.num_hidden_layers,
                        "resid_dim": model.config.hidden_size,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'model.layers.{layer}.self_attn.attn_dropout' for layer in
                                            range(model.config.num_hidden_layers)]
                        }

        if len(train_layers) > 0:
            # Match the full "model.layers.<id>." prefix. Matching the bare layer
            # number as a substring also selected layers such as 12 or 21 for
            # layer 2, and with several train layers froze every parameter.
            trainable = tuple(f'model.layers.{layer_id}.' for layer_id in train_layers)
            for param_name, param in model.named_parameters():
                param.requires_grad = param_name.startswith(trainable)

        return model, tokenizer, MODEL_CONFIG

    raise ValueError(
        f"Unsupported model '{model_name}': expected a name containing 'gpt-neo' or 'TinyLlama'."
    )


def draw_attention(model, tokenizer, MODEL_CONFIG, device, prompt, check_token_id, save_path, title):
    """Plot, for one query position, the attention every head in every layer assigns to each token.

    The model is switched to eval mode and run once on ``prompt`` with attention
    outputs enabled. The resulting heatmap has one column per layer and one row
    per (head, token) pair, and is shown and saved through ``plt_heatMap_sns``.

    Args:
        model: causal LM returning ``.attentions`` when called with ``output_attentions=True``.
        tokenizer: tokenizer matching ``model``.
        MODEL_CONFIG: dict providing ``n_layers`` and ``n_heads``.
        device: device the tokenized prompt is moved to.
        prompt: a single text prompt (the per-layer attentions are concatenated
            along dim 0, which lines up with layers only for batch size 1).
        check_token_id: index of the query position whose attention row is plotted.
        save_path: directory handed to ``plt_heatMap_sns``.
        title: figure title and file stem.
    """
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        output_and_cache = model(**inputs, output_hidden_states=True, output_attentions=True)

    # Stack the per-layer attention tensors, then keep the row of the chosen query position.
    ground_attentions = torch.cat(output_and_cache.attentions, dim=0).detach().cpu().numpy()
    ground_attentions = ground_attentions[:, :, check_token_id, :]

    x_ticks = [f"layer{i + 1}" for i in range(MODEL_CONFIG['n_layers'])]
    # Label rows with the exact ids fed to the model so labels and rows cannot drift apart.
    codes = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    # Head-major ordering matches the row order produced by the reshape below.
    y_ticks = [f"head{i_head}-{c}" for i_head in range(MODEL_CONFIG['n_heads']) for c in codes]
    plt_heatMap_sns(ground_attentions.reshape(ground_attentions.shape[0], -1).T,
                    title=title, x_ticks=x_ticks, y_ticks=y_ticks,
                    show=True, save_path=save_path)


In [4]:
# ---- Experiment configuration ----
# GPU that the model and the training batches are moved to.
device_str = 'cuda:1'

# Passed to load_model: chooses the behaviour of the replacement attention
# modules and, for GPT-Neo, which parameter group stays trainable.
is_linear = False

# Transformer layers whose attention module is replaced.
layers = [22]  # [23]
# The trainable layers are the replaced ones (same list object).
train_layers = layers

# Base path for TensorBoard logs and checkpoints.
save_model_path = f'./results/gpt_neo_{layers}_{is_linear}_{train_layers}'

In [5]:
# Checkpoint to load. Alternatives listed by the author:
#   'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', 'EleutherAI/gpt-neo-125m'
model_name = 'EleutherAI/gpt-neo-1.3B'

# Release cached GPU memory before building the model.
torch.cuda.empty_cache()

model, tokenizer, MODEL_CONFIG = load_model(
    model_name,
    device=device_str,
    layers=layers,
    train_layers=train_layers,
    is_linear=is_linear,
)

In [6]:
# Show the tokenizer's configuration (see the printed repr below).
print(tokenizer)
# MODEL_CONFIG
# The EOS token; load_model also assigns it as the pad token.
print(tokenizer.eos_token)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-1.3B', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
<|endoftext|>


In [7]:
# Display the module tree to check which attention modules were replaced.
model

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-21): 22 x GPTNeoBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          )
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj):

In [8]:
# Sanity check: move the model to the target device and sample a continuation.
import sys
# print(sys.path)
import os
# Make the parent directory importable so the project-local modules below resolve.
sys.path.append(os.path.join(sys.path[0], '../'))
from utils.evaluation_lm_eval import run_eval_harness
# NOTE(review): this rebinds `load_model`, shadowing the function defined earlier
# in this notebook; later calls use the gpt_neo module's version - confirm intended.
from gpt_neo import load_model


# model.load_state_dict(torch.load(save_model_path+'pth'))
model = model.to(torch.device(device_str))
# result2 = run_eval_harness(model, tokenizer, "test_gpt_j",None, torch.device(device), 4, sink_token=None)

from transformers import pipeline
# Text generation
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device_str, pad_token_id=tokenizer.eos_token_id)
# do_sample=True makes the output stochastic, so the text differs between runs.
results= text_generator("As far as I am concerned, I will",
			   max_length=128,
			   do_sample=True)
print(results[0]['generated_text'])

As far as I am concerned, I will be back in the future. I just had to wait until the other team members can give me their opinion before I decided for myself.

I have to say that I'm really looking forward to it especially since this series will be my first-ever anime. It will definitely be a fresh start on both our side :) I think I'll keep this blog for a long time :)

For my birthday, I bought a new laptop for my computer lab...
and it was so convenient as it has dual monitor support. We are currently using a Macbook Pro and I was wondering if


In [9]:
 # dataset = load_dataset('cerebras/SlimPajama-627B',split='train[:100]') # TinyLlama
# First 1% of the training split of the Pile (uncopyrighted) - used for gpt-neo.
dataset = load_dataset('monology/pile-uncopyrighted', split='train[:1%]')  # gpt-neo


def process_func(examples):
    """Tokenize a batch of examples.

    The EOS token is appended to each entry of ``examples["text"]`` before
    tokenization; sequences are truncated to at most 128 tokens.
    """
    texts_with_eos = []
    for text in examples["text"]:
        texts_with_eos.append(text + tokenizer.eos_token)
    return tokenizer(texts_with_eos, max_length=128, truncation=True)


# Tokenize in batches and drop the original columns so only tokenizer outputs remain.
tokenized_ds = dataset.map(
    process_func,
    batched=True,
    remove_columns=dataset.column_names,
)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

In [10]:
# Number of tokenized examples available for training.
len(tokenized_ds)

1770097

In [11]:
# print(len(tokenized_ds['input_ids'][2]))
# print(len(tokenized_ds['input_ids'][0]))

# Shuffled training loader; the collator is configured with mlm=False.
dl = DataLoader(
    tokenized_ds,
    batch_size=16,
    shuffle=True,
    collate_fn=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Pull one batch and inspect its first sequence: raw ids, decoded text, and the
# result of tokenizing that decoded text again.
inp = next(iter(dl))['input_ids'][0]
print(inp)
print(tokenizer.decode(inp))
tokenizer(tokenizer.decode(inp))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([   32,  4950, 38504,   279, 23048,  1657,    11, 10691,   422,   262,
        16236,   447,   247,    82,    13,   383,  1115,    12,  2971,   279,
        23048,  3033,   257, 21682,   276,    11, 37204, 29500, 17979, 41860,
          351,   257,  7932,   300,   813, 15061,  1486,    13,   383, 17979,
          468,   257,  1598,  3641,   351,   281, 12531,  1486,    11,   351,
          257,   307,  5286,  4865,  1088,   262,  5743,   286,   262, 17979,
           13, 23302,   262, 17979,   318,   257,  1598,    11,  3652,  5819,
           12, 16760,  5405,  2665,   351,   257,  5405,  9396,   284,   543,
          262,  6333,   318,  7223,    13,   770,  1657, 29220,   318,   287,
         6275,  4006,    26,   340,   468,   587, 28049,   302, 44236,   290,
         2058,   351,   477,   262,  3306, 32161,   329,  3660,  9988,    13,
          383,  5405,   318,   635,   287,  6275,  4006,    11,   351,   645,
        23217,   393,  9457,    11,   655,   262,  2938,  5104])

{'input_ids': [32, 4950, 38504, 279, 23048, 1657, 11, 10691, 422, 262, 16236, 447, 247, 82, 13, 383, 1115, 12, 2971, 279, 23048, 3033, 257, 21682, 276, 11, 37204, 29500, 17979, 41860, 351, 257, 7932, 300, 813, 15061, 1486, 13, 383, 17979, 468, 257, 1598, 3641, 351, 281, 12531, 1486, 11, 351, 257, 307, 5286, 4865, 1088, 262, 5743, 286, 262, 17979, 13, 23302, 262, 17979, 318, 257, 1598, 11, 3652, 5819, 12, 16760, 5405, 2665, 351, 257, 5405, 9396, 284, 543, 262, 6333, 318, 7223, 13, 770, 1657, 29220, 318, 287, 6275, 4006, 26, 340, 468, 587, 28049, 302, 44236, 290, 2058, 351, 477, 262, 3306, 32161, 329, 3660, 9988, 13, 383, 5405, 318, 635, 287, 6275, 4006, 11, 351, 645, 23217, 393, 9457, 11, 655, 262, 2938, 5104], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [12]:
# Training setup: TensorBoard writer, optimizer, and model placement.
from torch.optim import AdamW
from tensorboardX import SummaryWriter
# Logs are written under save_model_path.
writer = SummaryWriter(save_model_path)
# NOTE(review): load_model loads the weights in float16; with lr=2e-8 the updates
# may be too small to change fp16 weights at all - confirm against the logged
# loss, which shows no clear downward trend.
optimizer = AdamW(model.parameters(), lr=2e-8, eps=1e-5, weight_decay=0.01)
model = model.to(torch.device(device_str))
def train(trainloader, epoch=1, log_step=100):
    """Run a plain training loop over ``trainloader`` using the notebook-level globals.

    Relies on ``model``, ``optimizer``, ``writer``, ``device_str`` and
    ``save_model_path`` defined in earlier cells.

    Every ``log_step`` steps the current batch loss is printed and written to
    TensorBoard. At those logging steps a checkpoint is written to
    ``save_model_path + 'pth'`` when the logged loss is the lowest seen so far,
    or when the step is a multiple of 100000 (which includes step 0).

    Args:
        trainloader: iterable yielding dict batches accepted by ``model(**batch)``.
        epoch: number of passes over ``trainloader``.
        log_step: logging / checkpoint-check interval in optimizer steps.
    """
    global_step = 0
    # Start at +inf so the first logged loss counts as the best so far. Starting
    # at 0 meant a positive loss could never be "lower", so the best-checkpoint
    # branch never ran.
    lowest_loss = float('inf')
    save_step = 100000
    target_device = torch.device(device_str)
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.to(target_device) for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 20)
            optimizer.step()
            optimizer.zero_grad()

            if global_step % log_step == 0:
                # Read the scalar once and reuse it for printing, comparison and logging.
                loss_value = loss.item()
                print(f"ep: {ep}, global_step: {global_step}, loss: {loss_value}")
                is_best = loss_value < lowest_loss
                if is_best:
                    lowest_loss = loss_value
                if is_best or global_step % save_step == 0:
                    torch.save(model.state_dict(), save_model_path+'pth')
                writer.add_scalar('loss', loss_value, global_step=global_step)
            global_step += 1
# Run one epoch (the default) over the shuffled DataLoader built above.
train(dl)

ep: 0, global_step: 0, loss: 2.298828125
ep: 0, global_step: 100, loss: 2.626953125
ep: 0, global_step: 200, loss: 2.2421875
ep: 0, global_step: 300, loss: 2.150390625
ep: 0, global_step: 400, loss: 2.244140625
ep: 0, global_step: 500, loss: 2.103515625
ep: 0, global_step: 600, loss: 2.73828125
ep: 0, global_step: 700, loss: 2.197265625
ep: 0, global_step: 800, loss: 2.59765625
ep: 0, global_step: 900, loss: 2.103515625
ep: 0, global_step: 1000, loss: 2.08984375
ep: 0, global_step: 1100, loss: 2.21875
ep: 0, global_step: 1200, loss: 2.27734375
ep: 0, global_step: 1300, loss: 2.40234375
ep: 0, global_step: 1400, loss: 2.267578125
ep: 0, global_step: 1500, loss: 2.36328125
ep: 0, global_step: 1600, loss: 2.416015625
ep: 0, global_step: 1700, loss: 2.84375
ep: 0, global_step: 1800, loss: 2.447265625
ep: 0, global_step: 1900, loss: 2.583984375
ep: 0, global_step: 2000, loss: 2.09375
ep: 0, global_step: 2100, loss: 2.138671875
ep: 0, global_step: 2200, loss: 2.556640625
ep: 0, global_step: 

ep: 0, global_step: 19300, loss: 1.978515625
ep: 0, global_step: 19400, loss: 2.37890625
ep: 0, global_step: 19500, loss: 2.412109375
ep: 0, global_step: 19600, loss: 2.439453125
ep: 0, global_step: 19700, loss: 2.1171875
ep: 0, global_step: 19800, loss: 2.357421875
ep: 0, global_step: 19900, loss: 2.322265625
ep: 0, global_step: 20000, loss: 2.1875
ep: 0, global_step: 20100, loss: 2.482421875
ep: 0, global_step: 20200, loss: 1.974609375
ep: 0, global_step: 20300, loss: 2.59765625
ep: 0, global_step: 20400, loss: 1.654296875
ep: 0, global_step: 20500, loss: 2.291015625
ep: 0, global_step: 20600, loss: 2.216796875
ep: 0, global_step: 20700, loss: 1.86328125
ep: 0, global_step: 20800, loss: 2.580078125
ep: 0, global_step: 20900, loss: 2.10546875
ep: 0, global_step: 21000, loss: 2.4765625
ep: 0, global_step: 21100, loss: 2.271484375
ep: 0, global_step: 21200, loss: 2.294921875
ep: 0, global_step: 21300, loss: 2.10546875
ep: 0, global_step: 21400, loss: 2.125
ep: 0, global_step: 21500, los

ep: 0, global_step: 38000, loss: 2.11328125
ep: 0, global_step: 38100, loss: 2.029296875
ep: 0, global_step: 38200, loss: 2.421875
ep: 0, global_step: 38300, loss: 2.5703125
ep: 0, global_step: 38400, loss: 2.203125
ep: 0, global_step: 38500, loss: 2.41015625
ep: 0, global_step: 38600, loss: 2.115234375
ep: 0, global_step: 38700, loss: 2.541015625
ep: 0, global_step: 38800, loss: 2.31640625
ep: 0, global_step: 38900, loss: 2.6796875
ep: 0, global_step: 39000, loss: 2.46484375
ep: 0, global_step: 39100, loss: 2.09765625
ep: 0, global_step: 39200, loss: 2.326171875
ep: 0, global_step: 39300, loss: 2.46875
ep: 0, global_step: 39400, loss: 2.505859375
ep: 0, global_step: 39500, loss: 2.369140625
ep: 0, global_step: 39600, loss: 2.373046875
ep: 0, global_step: 39700, loss: 1.7421875
ep: 0, global_step: 39800, loss: 2.0703125
ep: 0, global_step: 39900, loss: 1.83203125
ep: 0, global_step: 40000, loss: 1.8076171875
ep: 0, global_step: 40100, loss: 2.294921875
ep: 0, global_step: 40200, loss: 

ep: 0, global_step: 56600, loss: 2.224609375
ep: 0, global_step: 56700, loss: 2.359375
ep: 0, global_step: 56800, loss: 2.455078125
ep: 0, global_step: 56900, loss: 2.509765625
ep: 0, global_step: 57000, loss: 2.404296875
ep: 0, global_step: 57100, loss: 2.458984375
ep: 0, global_step: 57200, loss: 2.060546875
ep: 0, global_step: 57300, loss: 2.2734375
ep: 0, global_step: 57400, loss: 2.392578125
ep: 0, global_step: 57500, loss: 2.134765625
ep: 0, global_step: 57600, loss: 2.357421875
ep: 0, global_step: 57700, loss: 2.298828125
ep: 0, global_step: 57800, loss: 2.048828125
ep: 0, global_step: 57900, loss: 2.53125
ep: 0, global_step: 58000, loss: 2.23828125
ep: 0, global_step: 58100, loss: 2.375
ep: 0, global_step: 58200, loss: 1.916015625
ep: 0, global_step: 58300, loss: 2.00390625
ep: 0, global_step: 58400, loss: 2.548828125
ep: 0, global_step: 58500, loss: 2.302734375
ep: 0, global_step: 58600, loss: 2.107421875
ep: 0, global_step: 58700, loss: 2.552734375
ep: 0, global_step: 58800, 

ep: 0, global_step: 75200, loss: 1.85546875
ep: 0, global_step: 75300, loss: 2.140625
ep: 0, global_step: 75400, loss: 2.40625
ep: 0, global_step: 75500, loss: 2.126953125
ep: 0, global_step: 75600, loss: 2.587890625
ep: 0, global_step: 75700, loss: 2.044921875
ep: 0, global_step: 75800, loss: 2.20703125
ep: 0, global_step: 75900, loss: 1.990234375
ep: 0, global_step: 76000, loss: 1.93359375
ep: 0, global_step: 76100, loss: 2.23046875
ep: 0, global_step: 76200, loss: 2.19921875
ep: 0, global_step: 76300, loss: 2.1484375
ep: 0, global_step: 76400, loss: 2.38671875
ep: 0, global_step: 76500, loss: 2.39453125
ep: 0, global_step: 76600, loss: 2.525390625
ep: 0, global_step: 76700, loss: 2.359375
ep: 0, global_step: 76800, loss: 2.564453125
ep: 0, global_step: 76900, loss: 2.494140625
ep: 0, global_step: 77000, loss: 2.740234375
ep: 0, global_step: 77100, loss: 2.287109375
ep: 0, global_step: 77200, loss: 2.275390625
ep: 0, global_step: 77300, loss: 1.9267578125
ep: 0, global_step: 77400, l

ep: 0, global_step: 93800, loss: 2.2578125
ep: 0, global_step: 93900, loss: 2.484375
ep: 0, global_step: 94000, loss: 2.12890625
ep: 0, global_step: 94100, loss: 2.23046875
ep: 0, global_step: 94200, loss: 2.521484375
ep: 0, global_step: 94300, loss: 2.07421875
ep: 0, global_step: 94400, loss: 2.416015625
ep: 0, global_step: 94500, loss: 2.34375
ep: 0, global_step: 94600, loss: 2.421875
ep: 0, global_step: 94700, loss: 2.234375
ep: 0, global_step: 94800, loss: 1.8310546875
ep: 0, global_step: 94900, loss: 2.1640625
ep: 0, global_step: 95000, loss: 2.330078125
ep: 0, global_step: 95100, loss: 2.2578125
ep: 0, global_step: 95200, loss: 2.294921875
ep: 0, global_step: 95300, loss: 1.9833984375
ep: 0, global_step: 95400, loss: 2.33984375
ep: 0, global_step: 95500, loss: 2.1640625
ep: 0, global_step: 95600, loss: 2.029296875
ep: 0, global_step: 95700, loss: 2.35546875
ep: 0, global_step: 95800, loss: 2.177734375
ep: 0, global_step: 95900, loss: 2.25390625
ep: 0, global_step: 96000, loss: 2.