In [1]:
import ast
import gc
import re
from itertools import chain

import pandas as pd
import torch
import wandb
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import DataCollatorForLanguageModeling, GPTNeoForCausalLM, GPT2Tokenizer
from transformers import pipeline
from transformers import Trainer, TrainingArguments

# wandb.init(project='gpt-neo-dialogs', entity='ziyizhu')

# Show which device is available. get_device_name(0) raises when no CUDA device is
# present, so fall back to a plain label instead of failing the very first cell.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

'NVIDIA GeForce RTX 3070 Laptop GPU'

In [2]:
# CSV files written after the train/test split and read back by `load_dataset`.
train_path = 'datasets/train_dataset.csv'
test_path = 'datasets/test_dataset.csv'

# Used as the Trainer's output_dir and later as the model location for the pipeline.
model_path = "./models/gpt-neo-dialogs"
# Passed to train_test_split as the held-out proportion of conversations.
test_size = 0.1

In [3]:
# Both corpus files use the same field delimiter, parser engine and encoding, so the
# shared options are written once. The delimiter is escaped because the python engine
# treats a multi-character `sep` as a regular expression.
corpus_read_options = {
    'sep': re.escape(' +++$+++ '),
    'engine': 'python',
    'encoding': "latin1",
}

# One row per utterance; the first field (lineID) becomes the index.
lines = pd.read_csv(
    './datasets/raw/cornell-movie-dialogs-corpus/movie_lines.txt',
    names=['lineID', 'characterID', 'movieID', 'characterName', 'utterance'],
    index_col=0,
    **corpus_read_options,
)

# One row per conversation; `utteranceList` is kept as the raw string from the file.
conversations = pd.read_csv(
    './datasets/raw/cornell-movie-dialogs-corpus/movie_conversations.txt',
    names=['characterID_0', 'characterID_1', 'movieID', 'utteranceList'],
    **corpus_read_options,
)

In [4]:
def generate_datasets(conversations, lines):
    """Render each conversation as a transcript and split the result into train/test.

    Every conversation becomes a single string with one line per utterance, formatted
    as ``<Character Name> said: "<utterance>"`` and joined with newlines.

    Args:
        conversations: DataFrame with a ``utteranceList`` column whose values are
            strings containing a Python literal sequence of line IDs.
        lines: DataFrame indexed by line ID, with ``characterName`` and ``utterance``
            columns.

    Returns:
        The ``(train, test)`` pair produced by ``train_test_split``: two DataFrames
        with a single ``text`` column. The split is not shuffled and its size comes
        from the module-level ``test_size``.

    Raises:
        ValueError / SyntaxError: if a ``utteranceList`` value is not a valid literal.
        KeyError: if a referenced line ID is missing from ``lines``.
    """
    dataset = []
    for _, row in tqdm(conversations.iterrows(), total=conversations.shape[0]):
        data = []
        # literal_eval only parses literals, so a malformed or tampered corpus file
        # cannot run arbitrary code the way eval() would.
        for line_id in ast.literal_eval(row['utteranceList']):
            line = lines.loc[line_id]
            # A missing utterance is NaN; str(NaN) would put the literal text "nan"
            # into the training data, so render it as an empty utterance instead.
            utterance = '' if pd.isna(line.utterance) else str(line.utterance)
            # Collapse runs of spaces into a single space.
            utterance = re.sub(" +", " ", utterance)
            speaker = str(line.characterName).title()
            data.append(f'{speaker} said: "{utterance}"')
        dataset.append('\n'.join(data))

    dataset = pd.DataFrame(dataset, columns=['text'])
    return train_test_split(dataset, test_size=test_size, shuffle=False)

# Build the transcripts and split them, then report the size of each split.
train, test = generate_datasets(conversations, lines)

print(f"Train dataset length: {len(train)}")
print(f"Test dataset length: {len(test)}")

# Persist both splits without the DataFrame index so the CSVs hold only the text column.
train.to_csv(train_path, index=False)
test.to_csv(test_path, index=False)

  0%|          | 0/83097 [00:00<?, ?it/s]

Train dataset length: 74787
Test dataset length: 8310


In [5]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,text
0,"Bianca said: ""Can we make this quick? Roxanne..."
1,"Bianca said: ""You're asking me out. That's so..."
2,"Bianca said: ""No, no, it's my fault -- we didn..."
3,"Cameron said: ""Why?""\nBianca said: ""Unsolved m..."
4,"Bianca said: ""Gosh, if only we could find Kat ..."


In [6]:
# Model and tokenizer are loaded from the same pretrained checkpoint.
base_checkpoint = "EleutherAI/gpt-neo-125M"
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
model = GPTNeoForCausalLM.from_pretrained(base_checkpoint)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
prompt = '''Kabuo said: "What makes you think I play?"
Nels said: "'''

# Keep the full encoding (not just the ids) so the attention mask can be passed along.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids

# Passing attention_mask and pad_token_id explicitly avoids generate() having to guess
# them (it otherwise warns and falls back to the EOS id, which is the value used here).
generated_tokens = model.generate(
    input_ids,
    attention_mask=encoded_prompt.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.9,
    max_length=100,
    num_return_sequences=3,
)
result = tokenizer.batch_decode(generated_tokens)

for text in result:
    print(text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Kabuo said: "What makes you think I play?"
Nels said: "If I want to play, I'll play."
Nigerian man: "That's the reason to play. You have been there before. If you want you, I'll play."
Nigerian man: "That's what I'll do."
Nigerian man: "You're probably right, Nels. You're probably right."
Nigerian man: "That
Kabuo said: "What makes you think I play?"
Nels said: "The world is over, the world is over. What makes you think I play?"
"That's funny," Maki said. "I didn't see those two players playing."
They shook hands as the pair of the girls headed for the door, and they both stood up. Jiro and Kaba were the only players remaining after their final tournament, and they were already engaged in a game
Kabuo said: "What makes you think I play?"
Nels said: "That's a good point. So you know what it's like to play." I should have said that. You don't have to think about it. I am sure you have a great deal of respect for the players who we had in your first game. I think the players who played in

In [8]:
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch.

    Args:
        examples: mapping with a ``text`` key holding the strings to tokenize.

    Returns:
        The tokenizer output for those strings, requested with the special-tokens mask.
    """
    texts = examples['text']
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

# Load the two CSV files under the 'train' and 'test' keys.
csv_files = {'train': train_path, 'test': test_path}
datasets = load_dataset('csv', data_files=csv_files)

# Tokenize batch by batch and drop the raw text column afterwards.
tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=['text'])

# mlm=False selects causal (not masked) language-modeling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Using custom data configuration default-8ec15e1b83c807f7


Downloading and preparing dataset csv/default to C:\Users\zhuzi\.cache\huggingface\datasets\csv\default-8ec15e1b83c807f7\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:\Users\zhuzi\.cache\huggingface\datasets\csv\default-8ec15e1b83c807f7\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2085 > 2048). Running this sequence through the model will result in indexing errors


  0%|          | 0/9 [00:00<?, ?ba/s]

In [9]:
max_seq_length = 128

def group_texts(examples):
    """Concatenate a batch of token sequences and re-cut it into fixed-size chunks.

    For every key in ``examples`` the per-row lists are joined end to end and then
    sliced into consecutive chunks of ``max_seq_length`` items. The trailing
    remainder that does not fill a whole chunk is dropped.

    Args:
        examples: mapping from column name to a list of per-row lists. All columns
            are cut to the length computed from the first column.

    Returns:
        A dict with the same keys, each mapped to a list of chunks of exactly
        ``max_seq_length`` items. An empty mapping yields an empty dict.
    """
    keys = list(examples.keys())
    if not keys:
        # Nothing to group; the original indexing of the first key would raise here.
        return {}

    # chain.from_iterable flattens in linear time, whereas sum(lists, []) re-copies
    # the growing list on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}

    # Largest multiple of max_seq_length that fits; the remainder is dropped.
    total_length = len(concatenated_examples[keys[0]])
    total_length = (total_length // max_seq_length) * max_seq_length

    return {
        k: [tokens[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, tokens in concatenated_examples.items()
    }

# batched=True hands group_texts whole batches of rows, which it concatenates and
# re-cuts into fixed-length chunks.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True)

  0%|          | 0/75 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [10]:
training_args = TrainingArguments(
    # --- output location ---
    output_dir=model_path,
    overwrite_output_dir=True,
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=5e-6,
    lr_scheduler_type='linear',
    warmup_steps=500,
    adafactor=False,
    # --- per-device batch sizes ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # --- evaluation and checkpoint cadence, counted in update steps ---
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    # --- logging ---
    report_to="tensorboard",
)

# Train on the chunked 'train' split and evaluate on the chunked 'test' split.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
)

In [11]:
# Run the garbage collector first so tensors held by unreachable objects are handed
# back to PyTorch's allocator, then release the cached blocks. In the opposite order
# the memory freed by the collector would stay cached.
gc.collect()
torch.cuda.empty_cache()

trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 43768
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16413


Step,Training Loss,Validation Loss
500,2.922,2.837582
1000,2.7633,2.805496
1500,2.7553,2.795511
2000,2.7463,2.788532
2500,2.7331,2.784108
3000,2.7172,2.780734
3500,2.7153,2.778864
4000,2.7101,2.776872
4500,2.698,2.77455
5000,2.6986,2.773267


The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-1000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-1000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-1000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoFo

***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkpoint to ./models/gpt-neo-dialogs\checkpoint-13000
Configuration saved in ./models/gpt-neo-dialogs\checkpoint-13000\config.json
Model weights saved in ./models/gpt-neo-dialogs\checkpoint-13000\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 4977
  Batch size = 16
Saving model checkp

TrainOutput(global_step=16413, training_loss=2.682661901246344, metrics={'train_runtime': 3111.3796, 'train_samples_per_second': 42.201, 'train_steps_per_second': 5.275, 'total_flos': 8574384177414144.0, 'train_loss': 2.682661901246344, 'epoch': 3.0})

In [12]:
trainer.save_model()
# The Trainer was created without a tokenizer, so save_model() stores only the model
# files. Save the tokenizer into the same directory so model_path can be loaded on its
# own. The trailing semicolon hides the returned file list from the cell output.
tokenizer.save_pretrained(model_path);

Saving model checkpoint to ./models/gpt-neo-dialogs
Configuration saved in ./models/gpt-neo-dialogs\config.json
Model weights saved in ./models/gpt-neo-dialogs\pytorch_model.bin


In [13]:
# wandb.config.test_size = test_size
# wandb.finish()

In [14]:
# Same sampling settings as the pre-fine-tuning generation cell, for a like-for-like comparison.
sampling_options = dict(do_sample=True, temperature=0.9, max_length=100, num_return_sequences=3)

# Load the fine-tuned weights from disk; the tokenizer still comes from the base checkpoint.
generator = pipeline('text-generation', model=model_path, tokenizer='EleutherAI/gpt-neo-125M')
result = generator(prompt, **sampling_options)

for sample in result:
    print(sample['generated_text'])

loading configuration file ./models/gpt-neo-dialogs\config.json
Model config GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "resid_dropout": 0,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_

Kabuo said: "What makes you think I play?"
Nels said: "You have no idea."Nels said: "They're not allowed in school."
Kabuo said: "You can stay in the house till you die."
Nels said: "Is there a school in Tofino?"Kabuo said: "I don't know.  I just have to play this game."
Nels said: "What is that, Keb
Kabuo said: "What makes you think I play?"
Nels said: "I'll play, if you like."
Von Kremen said: "I play, I am a part of the game.  A little bit of the game...it's an adventure."
Nels said: "What games?                               
Kabuo said: "What makes you think I play?"
Nels said: "I'm not trying to help you, Mama."
Mumford said: "I am an adult. But I don't get married."
Nels said: "And you don't get any kids."Nels said: "You don't have any kids."
Mumford said: "So... it's for a girl who's already pregnant."
Nels said: "It
