In [1]:
!git clone https://github.com/wtaisner/atla-generator.git

Cloning into 'atla-generator'...
remote: Enumerating objects: 143, done.[K
remote: Counting objects: 100% (143/143), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 143 (delta 51), reused 108 (delta 29), pack-reused 0[K
Receiving objects: 100% (143/143), 1.26 MiB | 23.03 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 27.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 65.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalli

In [3]:
import importlib.util       
 
import_data = importlib.util.spec_from_file_location(
  "mod", "/content/atla-generator/src/data.py")   
data = importlib.util.module_from_spec(import_data)       
import_data.loader.exec_module(data)

import_dialoGPT = importlib.util.spec_from_file_location(
  "mod", "/content/atla-generator/src/DialoGPT.py")
dialoGPT = importlib.util.module_from_spec(import_dialoGPT)
import_dialoGPT.loader.exec_module(dialoGPT)

Add zip with data (if needed change the name - default is archive.zip)

In [4]:
import torch
from torch import cuda
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Set parameters

In [5]:
SEED = 42
MODEL = "microsoft/DialoGPT-small"

NAME = "Iroh"
N = 5

TRAIN_SIZE = 0.8

TRAIN_BATCH = 4
EVAL_BATCH = 4
EPOCHS = 10
OUTPUT_DIR = "/content/atla-generator/outputs/DialoGPT"
OVERWRITE_OUTPUT_DIR = True
EVAL_STRATEGY="epoch"
LEARNING_RATE = 5e-5
LOAD_BEST_MODEL_AT_THE_END = True
PREDICTION_LOSS_ONLY = True

STEPS = 7

In [6]:
torch.manual_seed(SEED)
device = 'cuda' if cuda.is_available() else 'cpu'

# Load model and tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = 0

model = AutoModelForCausalLM.from_pretrained(MODEL)
model.to(device)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/335M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

# Read and reformat data, create train and eval split

In [8]:
train_size = TRAIN_SIZE

df = data.read_dataframe(path='/content/atla-generator/data/avatar.csv')
df = dialoGPT.create_context(df, name=NAME, n=N)

train_dataset = df.sample(frac=train_size, random_state=SEED)
eval_dataset = df.drop(train_dataset.index).reset_index(drop = True)
train_dataset = train_dataset.reset_index(drop = True)

print(f"df shape: {df.shape}")
print(f"train shape: {train_dataset.shape}")
print(f"eval shape: {eval_dataset.shape}")

df shape: (337, 6)
train shape: (270, 6)
eval shape: (67, 6)


In [9]:
train_set = dialoGPT.ConversationDataset(train_dataset, tokenizer)
eval_set = dialoGPT.ConversationDataset(eval_dataset, tokenizer)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Fine-tune model (training)

In [10]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    prediction_loss_only=PREDICTION_LOSS_ONLY,
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    load_best_model_at_end=LOAD_BEST_MODEL_AT_THE_END,
)

In [11]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    data_collator=data_collator,
)

In [12]:
trainer.train()

***** Running training *****
  Num examples = 270
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 680


Epoch,Training Loss,Validation Loss
1,No log,3.466888
2,No log,3.171084
3,No log,3.0006
4,No log,2.878943
5,No log,2.810711
6,No log,2.749407
7,No log,2.715672
8,2.799800,2.703493
9,2.799800,2.688128
10,2.799800,2.690668


***** Running Evaluation *****
  Num examples = 67
  Batch size = 4
Saving model checkpoint to /content/atla-generator/outputs/DialoGPT/checkpoint-68
Configuration saved in /content/atla-generator/outputs/DialoGPT/checkpoint-68/config.json
Model weights saved in /content/atla-generator/outputs/DialoGPT/checkpoint-68/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 67
  Batch size = 4
Saving model checkpoint to /content/atla-generator/outputs/DialoGPT/checkpoint-136
Configuration saved in /content/atla-generator/outputs/DialoGPT/checkpoint-136/config.json
Model weights saved in /content/atla-generator/outputs/DialoGPT/checkpoint-136/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 67
  Batch size = 4
Saving model checkpoint to /content/atla-generator/outputs/DialoGPT/checkpoint-204
Configuration saved in /content/atla-generator/outputs/DialoGPT/checkpoint-204/config.json
Model weights saved in /content/atla-generator/outputs/DialoGPT/checkpoint-204/pyto

TrainOutput(global_step=680, training_loss=2.549476578656365, metrics={'train_runtime': 212.7507, 'train_samples_per_second': 12.691, 'train_steps_per_second': 3.196, 'total_flos': 213518458368000.0, 'train_loss': 2.549476578656365, 'epoch': 10.0})

In [13]:
trainer.save_model(OUTPUT_DIR) #remeber to download the model

Saving model checkpoint to /content/atla-generator/outputs/DialoGPT
Configuration saved in /content/atla-generator/outputs/DialoGPT/config.json
Model weights saved in /content/atla-generator/outputs/DialoGPT/pytorch_model.bin


# Perplexity

In [14]:
eval_results = trainer.evaluate(eval_dataset=eval_set)
perplexity =  2 ** eval_results['eval_loss']
print(f"perplexity: {perplexity}")
eval_results

***** Running Evaluation *****
  Num examples = 67
  Batch size = 4


perplexity: 6.444763946455313


{'epoch': 10.0,
 'eval_loss': 2.6881275177001953,
 'eval_runtime': 1.0462,
 'eval_samples_per_second': 64.039,
 'eval_steps_per_second': 16.249}

# Load saved model and chat

In [15]:
model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

loading configuration file /content/atla-generator/outputs/DialoGPT/config.json
Model config GPT2Config {
  "_name_or_path": "/content/atla-generator/outputs/DialoGPT",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "use_cache": true,
 

In [18]:
dialoGPT.chat_with_me(model, tokenizer, steps=STEPS)

>> User:hi, how are you?
Bot: I'm fine.
>> User:do you know the Avatar?
Bot: No, but I can tell!  I sense he's special!  And special!
>> User:But I will defeat him in the name of Fire Nation
Bot: You must be proud.
>> User:I want to make my father proud
Bot: He's right. He's the Avatar.  But he's not the only one who can defeat him.
>> User:I can, I'm an excellent Fire Bender
Bot: I know.
>> User:Do you know who I am?
Bot: You're the Avatar, and you're the only reason I'm going to this meeting.
>> User:And who are you?
Bot: Who knows more than you do?
