In [1]:
import pandas as pd
import re
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, GPTNeoForCausalLM, GPT2Tokenizer
from transformers import pipeline
from transformers import Trainer, TrainingArguments
import torch
import gc
import wandb

# Show which accelerator the kernel sees. Guarded so that the import cell still
# runs (and says so explicitly) on a machine without a usable CUDA device,
# instead of raising from get_device_name().
device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else 'CPU (no CUDA device available)'
)
device_name

'NVIDIA GeForce RTX 3070 Laptop GPU'

In [2]:
# CSV files holding the train / test splits (read with load_dataset('csv', ...)).
train_path, test_path = (
    'datasets/train_dataset.csv',
    'datasets/test_dataset.csv',
)

# Root directory under which the Trainer writes its checkpoints.
model_path = "./models/gpt-neo-dialogs"

In [4]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides the raw strings under the key ``'text'``.

    Returns:
        Whatever the global ``tokenizer`` returns for those strings when called
        with ``return_special_tokens_mask=True``.
    """
    raw_texts = examples['text']
    encoded = tokenizer(raw_texts, return_special_tokens_mask=True)
    return encoded

# Load the tokenizer in this cell: tokenize_function and the data collator both
# need it, and previously it was only created further down inside the training
# loop, so this cell raised NameError on a fresh kernel (Restart & Run All).
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
# No pad token is assigned by default here; reuse EOS, matching the training cell.
tokenizer.pad_token = tokenizer.eos_token

# Read both CSV splits into one dataset dict keyed by 'train' / 'test'.
datasets = load_dataset('csv', data_files={'train': train_path, 'test': test_path})

# Tokenize in batches and drop the raw 'text' column so only tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)

# mlm=False selects the causal (next-token) language-modeling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Using custom data configuration default-d27f0e2a7b0ecb18
Reusing dataset csv (C:\Users\zhuzi\.cache\huggingface\datasets\csv\default-d27f0e2a7b0ecb18\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2085 > 2048). Running this sequence through the model will result in indexing errors


  0%|          | 0/9 [00:00<?, ?ba/s]

In [5]:
# Length (in tokens) of every training chunk produced by group_texts.
max_seq_length = 128

def group_texts(examples):
    """Concatenate a batch of token sequences and re-split it into fixed-size chunks.

    Every column of the batch is flattened into one long list and then cut into
    consecutive pieces of exactly ``max_seq_length`` items. The trailing
    remainder that does not fill a whole chunk is dropped. The number of chunks
    is derived from the length of the first column and applied to all columns.

    Args:
        examples: Mapping from column name to a list of lists (one inner list
            per example in the batch).

    Returns:
        dict: Same keys as ``examples``; each value is a list of chunks of
        length ``max_seq_length``. An empty batch yields an empty dict.
    """
    column_names = list(examples.keys())
    if not column_names:
        # Nothing to group; the original indexed keys()[0] here and raised IndexError.
        return {}

    # Flatten each column in linear time (sum(list_of_lists, []) is quadratic
    # because every addition copies the accumulated list).
    concatenated_examples = {
        name: [item for sequence in examples[name] for item in sequence]
        for name in column_names
    }

    total_length = len(concatenated_examples[column_names[0]])
    # Round down to a multiple of max_seq_length, discarding the small remainder.
    total_length = (total_length // max_seq_length) * max_seq_length

    # Slice every column at the same chunk boundaries.
    return {
        name: [
            tokens[start : start + max_seq_length]
            for start in range(0, total_length, max_seq_length)
        ]
        for name, tokens in concatenated_examples.items()
    }

# Re-chunk every split into blocks of max_seq_length tokens; batched=True hands
# group_texts whole batches so it can concatenate across examples.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True)

  0%|          | 0/75 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [10]:
# Hyperparameter sweep. Each entry is (learning_rate, lr_scheduler_type, adafactor).
hyperparams = [
    (5e-5, 'linear', True),
    (1e-5, 'cosine', False),
    (5e-6, 'constant', False),
    (5e-5, 'cosine', True),
    (5e-6, 'linear', False),
    (5e-5, 'linear', False),
]

base_model_name = "EleutherAI/gpt-neo-125M"

# The tokenizer does not depend on the swept hyperparameters, so load it once
# instead of once per run.
tokenizer = GPT2Tokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

for (learning_rate, lr_scheduler_type, adafactor) in hyperparams:
    # Drop the references to the previous run's model/trainer BEFORE building the
    # next one, collect them, and only then empty the CUDA cache; otherwise the
    # old objects are still alive while the new model is being set up.
    model = trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    # One readable identifier per configuration, used both as the W&B run name
    # and as the checkpoint sub-directory.
    optimizer_tag = 'adafactor' if adafactor else 'adamw'
    run_name = f"lr{learning_rate:g}-{lr_scheduler_type}-{optimizer_tag}"

    # Start every run from the same pretrained weights.
    model = GPTNeoForCausalLM.from_pretrained(base_model_name)

    # Record the swept values when the run is created so they are attached to
    # the run even if training fails part-way.
    wandb.init(
        project='gpt-neo-dialogs',
        entity='ziyizhu',
        name=run_name,
        config={
            'learning_rate': learning_rate,
            'lr_scheduler_type': lr_scheduler_type,
            'adafactor': adafactor,
            'test_size': 0.1,
        },
    )

    try:
        training_args = TrainingArguments(
            # Per-run directory: with a single shared output_dir every run
            # overwrote the checkpoints of the previous configuration.
            # NOTE(review): code that loads checkpoints directly from
            # model_path must now point at model_path/<run_name>.
            output_dir=f"{model_path}/{run_name}",
            overwrite_output_dir=True,     # overwrite the content of the output directory
            num_train_epochs=3,            # number of training epochs
            learning_rate=learning_rate,
            lr_scheduler_type=lr_scheduler_type,
            adafactor=adafactor,
            per_device_train_batch_size=8, # batch size for training
            per_device_eval_batch_size=16, # batch size for evaluation
            evaluation_strategy="steps",
            eval_steps=500,                # number of update steps between two evaluations
            save_steps=1000,               # a checkpoint is saved every this many steps
            warmup_steps=500,              # number of warmup steps for learning rate scheduler
            report_to="wandb",             # enable logging to W&B
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=tokenized_datasets['train'],
            eval_dataset=tokenized_datasets['test'],
        )

        trainer.train()
    except BaseException:
        # Close the W&B run as failed so the next configuration (or a re-run of
        # this cell) does not log into a dangling run, then surface the error.
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()

loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at C:\Users\zhuzi/.cache\huggingface\transformers\29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",


PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,2.8268,2.844819
1000,2.7585,2.845232
1500,2.7611,2.840645
2000,2.7494,2.840839
2500,2.735,2.833065
3000,2.7134,2.833573
3500,2.7095,2.827738
4000,2.7016,2.825691
4500,2.6867,2.821231
5000,2.6848,2.818866


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,2.82982
eval/runtime,22.5599
eval/samples_per_second,220.612
eval/steps_per_second,13.83
train/epoch,3.0
train/global_step,16413.0
train/learning_rate,0.0
train/loss,2.418
train/total_flos,8574384177414144.0
train/train_loss,2.56298


0,1
eval/loss,██▇▇▅▅▃▃▂▁▁▃▃▃▃▂▂▂▂▁▁▄▆▅▆▅▅▅▄▄▄▄
eval/runtime,▅▃█▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▄▆▁██▇██████████████████████████
eval/steps_per_second,▄▆▁██▇██████████████████████████
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▇▇▇▆▆▆▆▆▆▅▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁


loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at C:\Users\zhuzi/.cache\huggingface\transformers\29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",


PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,2.8825,2.819076
1000,2.7448,2.79779
1500,2.7399,2.790134
2000,2.7301,2.78462
2500,2.7175,2.780637
3000,2.7003,2.7784
3500,2.6984,2.777615
4000,2.6928,2.775136
4500,2.6804,2.77297
5000,2.6807,2.771894


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,2.76787
eval/runtime,22.5731
eval/samples_per_second,220.484
eval/steps_per_second,13.822
train/epoch,3.0
train/global_step,16413.0
train/learning_rate,0.0
train/loss,2.5899
train/total_flos,8574384177414144.0
train/train_loss,2.64739


0,1
eval/loss,█▅▄▃▃▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▃▃▃▂▂▁▁▃▃▃▃▁▂▂▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁▆▆▆▇▇██▆▆▆▆█▇▇▆▆▆▆█████████████
eval/steps_per_second,▁▆▆▆▇▇██▆▆▆▆█▇▇▆▆▆▆█████████████
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,██████▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁
train/loss,█▅▅▅▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▂▁▁▁
train/total_flos,▁
train/train_loss,▁


loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at C:\Users\zhuzi/.cache\huggingface\transformers\29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",


PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,2.8259,2.818169
1000,2.7507,2.801069
1500,2.7499,2.793846
2000,2.7425,2.787749
2500,2.73,2.783694
3000,2.7142,2.780505
3500,2.7123,2.778916
4000,2.707,2.776764
4500,2.6945,2.774407
5000,2.6949,2.773157


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,2.76933
eval/runtime,22.5579
eval/samples_per_second,220.632
eval/steps_per_second,13.831
train/epoch,3.0
train/global_step,16413.0
train/learning_rate,1e-05
train/loss,2.613
train/total_flos,8574384177414144.0
train/train_loss,2.66716


0,1
eval/loss,█▆▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▃▂▃▁▃▄▅▅▅▄▆▅▄▆▇▅▄▅▅▅▅▆▅▇▅▄▅▆▅▆▆
eval/samples_per_second,▁▆▇▆█▆▅▄▄▄▅▃▄▅▃▂▄▅▄▄▄▄▃▄▂▄▅▄▃▄▃▃
eval/steps_per_second,▁▆▇▆█▆▅▄▄▄▅▃▄▅▃▂▅▅▄▄▄▄▃▄▂▄▅▄▃▄▃▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁▂▁▂▂▂▂▁
train/total_flos,▁
train/train_loss,▁


loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at C:\Users\zhuzi/.cache\huggingface\transformers\29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",


PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,2.8268,2.844819
1000,2.759,2.846451
1500,2.7627,2.843015
2000,2.7519,2.844218
2500,2.7381,2.837416
3000,2.717,2.839216
3500,2.7133,2.833132
4000,2.7055,2.831294
4500,2.6903,2.827171
5000,2.688,2.82477


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,2.83107
eval/runtime,22.5915
eval/samples_per_second,220.304
eval/steps_per_second,13.81
train/epoch,3.0
train/global_step,16413.0
train/learning_rate,0.0
train/loss,2.4017
train/total_flos,8574384177414144.0
train/train_loss,2.55555


0,1
eval/loss,██▇▇▆▆▅▄▃▂▃▅▄▄▄▃▃▃▂▂▁▃▆▄▅▄▄▄▄▄▄▄
eval/runtime,▇█▃▂▁▂▁▁▂▁▁▂▁▂▂▃▄▄▅▄▆▆▆▅▅▅▆▅▅▄▄▅
eval/samples_per_second,▂▁▆▇█▇██▇██▇█▇▇▆▅▅▄▅▃▃▃▄▄▄▃▄▄▅▅▄
eval/steps_per_second,▂▁▆▇▇▇▇▇▇██▇█▇▇▆▅▅▄▅▃▃▃▄▄▄▃▄▄▅▅▄
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,██████▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁
train/loss,█▇▇▇▇▆▆▆▆▆▆▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁


loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at C:\Users\zhuzi/.cache\huggingface\transformers\29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",


PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,2.922,2.837582
1000,2.7633,2.805496
1500,2.7553,2.795511
2000,2.7463,2.788532
2500,2.7331,2.784108
3000,2.7172,2.780734
3500,2.7153,2.778864
4000,2.7101,2.776872
4500,2.698,2.77455
5000,2.6986,2.773267


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,2.7664
eval/runtime,22.5419
eval/samples_per_second,220.789
eval/steps_per_second,13.841
train/epoch,3.0
train/global_step,16413.0
train/learning_rate,0.0
train/loss,2.6378
train/total_flos,8574384177414144.0
train/train_loss,2.68266


0,1
eval/loss,█▅▄▃▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁
eval/samples_per_second,▇▇██████████████████████▁███████
eval/steps_per_second,▇▇██████████████████████▁███████
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▂▁▁▁
train/total_flos,▁
train/train_loss,▁


loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at C:\Users\zhuzi/.cache\huggingface\transformers\29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",


PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,2.8222,2.828465
1000,2.7425,2.824556
1500,2.7426,2.819348
2000,2.7309,2.820631
2500,2.7173,2.811087
3000,2.6964,2.810147
3500,2.6928,2.81199
4000,2.6853,2.805316
4500,2.6716,2.80337
5000,2.6704,2.801223


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/loss,2.81839
eval/runtime,22.5675
eval/samples_per_second,220.538
eval/steps_per_second,13.825
train/epoch,3.0
train/global_step,16413.0
train/learning_rate,0.0
train/loss,2.4165
train/total_flos,8574384177414144.0
train/train_loss,2.55383


0,1
eval/loss,█▇▆▆▄▃▄▂▂▁▃▄▃▄▃▃▃▃▃▂▂▅▇▆▇▆▆▆▆▆▅▅
eval/runtime,▄█▄▁▃▄▄▄▃▆▄▅▄▄▄▆▄▅▅▆▅▄▅▅▅▅▄▄▅▅▅▅
eval/samples_per_second,▅▁▅█▆▅▅▅▅▃▅▄▅▅▅▃▅▄▄▃▄▅▄▄▄▄▅▅▄▄▄▄
eval/steps_per_second,▅▁▅█▆▅▅▅▆▃▆▄▆▅▅▃▅▄▄▃▄▅▄▄▄▄▅▅▄▄▄▄
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▇▇▆▆▆▆▆▅▅▅▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
