<a href="https://colab.research.google.com/github/xaesalvaje/me.ai/blob/main/create.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade jupyter_http_over_ws>=0.0.7 && \
  jupyter serverextension enable --py jupyter_http_over_ws

Enabling: jupyter_http_over_ws
- Writing config: /root/.jupyter
    - Validating...
      jupyter_http_over_ws 0.0.7 [32mOK[0m


In [2]:
# Install the libraries used for fine-tuning and generation below.
# No versions are pinned here; the recorded run resolved transformers 4.28.1,
# accelerate 0.19.0, huggingface-hub 0.14.1 and tokenizers 0.13.3.
# NOTE(review): consider pinning these so a re-run reproduces the same environment.
!pip install transformers torch accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.2

In [6]:
from transformers import pipeline, set_seed
import random

In [7]:
# Seed through transformers' `set_seed` helper so the stochastic steps below
# (training, sampled generation) start from a fixed seed.
set_seed(42)

In [8]:
# Read the corpus as a list of lines. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): `texts` is not referenced again in the cells visible here.
with open("mywords.txt", "r", encoding="utf-8") as f:
    texts = f.read().splitlines()

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import TrainerCallback
from torch.nn.utils import clip_grad_norm_
from transformers.utils.versions import require_version
from accelerate import Accelerator

# `require_version` takes one pip-style requirement string; its second
# positional argument is only a hint appended to the error message. The minimum
# version therefore has to be part of the requirement itself to be enforced.
require_version("accelerate>=0.3.0")

class MixedPrecisionCallback(TrainerCallback):
    """Trainer callback that drives optimizer stepping and gradient clipping
    through an Accelerate gradient scaler, when one is available.

    Every hook is a no-op while ``self.scaler`` is None.

    NOTE(review): ``on_before_step`` and ``on_after_backward`` are not among
    the events listed in the transformers ``TrainerCallback`` documentation,
    so the Trainer presumably never dispatches them. ``Accelerator()`` is also
    built with default arguments here, so ``scaler`` is expected to stay None
    unless mixed precision is configured elsewhere. If mixed precision is the
    goal, ``TrainingArguments(fp16=True)`` is the supported route; confirm and
    consider dropping this callback.
    """

    def __init__(self):
        # Filled in by `on_init_end`; None means "no scaler, do nothing".
        self.scaler = None

    def on_init_end(self, args, state, control, **kwargs):
        """Capture the gradient scaler of a freshly built `Accelerator`."""
        self.scaler = Accelerator().scaler

    def on_before_step(self, args, state, control, **kwargs):
        """Step the optimizer through the scaler, then update the scale."""
        # The optimizer is handed to callbacks as a keyword argument; it is not
        # an attribute of the trainer state object.
        optimizer = kwargs.get("optimizer")
        if self.scaler is None or optimizer is None:
            return
        self.scaler.step(optimizer)
        self.scaler.update()

    def on_after_backward(self, args, state, control, **kwargs):
        """Unscale the gradients and clip them to `args.max_grad_norm`."""
        optimizer = kwargs.get("optimizer")
        model = kwargs.get("model")
        if self.scaler is None or optimizer is None or model is None:
            return
        # Gradients must be unscaled before clipping so the norm threshold is
        # applied to their true magnitude.
        self.scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(), args.max_grad_norm)

# Base checkpoint: tokenizer and causal-LM weights both come from DialoGPT-medium.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Collator configured for causal language modelling (masked-LM disabled).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training examples are built from the raw text file with block_size=128.
# The recorded run logs a sequence-length warning (86719 > 1024) at this point.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="mywords.txt",
    block_size=128,
)

training_args = TrainingArguments(
    output_dir="./models",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,   # one example per forward/backward pass
    gradient_accumulation_steps=32,  # accumulate to enlarge the effective batch
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    callbacks=[MixedPrecisionCallback()],
)

# Last expression of the cell: the notebook displays the returned TrainOutput.
trainer.train()


Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (86719 > 1024). Running this sequence through the model will result in indexing errors


Step,Training Loss


TrainOutput(global_step=21, training_loss=4.753547668457031, metrics={'train_runtime': 4991.9042, 'train_samples_per_second': 0.136, 'train_steps_per_second': 0.004, 'total_flos': 156021716680704.0, 'train_loss': 4.753547668457031, 'epoch': 0.99})

In [11]:
# Build the text-generation pipeline around the fine-tuned `model`, reusing the
# tokenizer object already loaded from the same checkpoint rather than loading
# it a second time by name.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def generate_response(prompt, length=50):
    """Sample one piece of text for `prompt` from the module-level `generator`.

    Args:
        prompt: Text handed to the generation pipeline.
        length: Forwarded to the pipeline as `max_length`.

    Returns:
        The first candidate's `generated_text`, with surrounding whitespace
        stripped.
    """
    candidates = generator(
        prompt,
        max_length=length,
        do_sample=True,
        temperature=0.7,
    )
    first_text = candidates[0]['generated_text']
    return first_text.strip()

In [13]:
# Smoke-test the generator with a single greeting and print what comes back.
# NOTE(review): in the recorded output the printed text is identical to the
# prompt — worth checking whether any new tokens are actually being generated.
prompt = "Hi, how are you?"
response = generate_response(prompt)
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hi, how are you?


In [14]:
# Save the model
model.save_pretrained("models/model")
tokenizer.save_pretrained("models/model")

('models/model/tokenizer_config.json',
 'models/model/special_tokens_map.json',
 'models/model/vocab.json',
 'models/model/merges.txt',
 'models/model/added_tokens.json',
 'models/model/tokenizer.json')