In [1]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import datasets

def load_dataset(file_path, tokenizer, block_size=128):
    """Read a CSV of input/response pairs and tokenize it for training.

    Args:
        file_path: Path (or paths) handed to the ``datasets`` CSV loader as
            ``data_files``.
        tokenizer: Callable tokenizer; it is invoked with the batch of
            ``input`` texts and the batch of ``response`` texts.
        block_size: Upper bound on tokens per example; longer pairs are
            truncated.

    Returns:
        The tokenized ``train`` split, formatted so that ``input_ids`` and
        ``attention_mask`` come back as torch tensors.
    """
    def _encode_pairs(batch):
        # Each input is encoded together with its response as one sequence.
        return tokenizer(
            batch['input'],
            batch['response'],
            truncation=True,
            max_length=block_size,
        )

    # The two columns are named explicitly rather than read from the file.
    # NOTE(review): if the CSV has a header row, confirm it is not being
    # ingested as a training example.
    raw_pairs = datasets.load_dataset(
        'csv',
        data_files=file_path,
        split='train',
        delimiter=',',
        column_names=['input', 'response'],
    )

    tokenized_pairs = raw_pairs.map(_encode_pairs, batched=True, num_proc=4)
    tokenized_pairs.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    return tokenized_pairs

def load_data_collator(tokenizer, mlm = False):
    """Build the language-modeling data collator for the given tokenizer.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded unchanged as the collator's ``mlm`` flag; defaults to
            ``False``.

    Returns:
        A ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a CSV of input/response pairs.

    Args:
        train_file_path: CSV file passed to ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer files, the
            checkpoints, and the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval. Previously this argument was accepted but silently
            dropped, so the library default was always used.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Use the end-of-sequence token for padding as well.
    tokenizer.pad_token = tokenizer.eos_token
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir can later be loaded
    # as a single self-contained checkpoint directory.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Writes the not-yet-fine-tuned weights; trainer.save_model() below saves
    # to the same directory again once training has finished.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()


In [3]:
# Run configuration -- adjust these values before launching training.

# Base checkpoint to fine-tune.
model_name = 'gpt2'

# Where the training CSV lives and where results are written.
train_file_path = './data/cleaned/conversations.csv'
output_dir = './output/'

# Settings handed to the training function.
num_train_epochs = 50
per_device_train_batch_size = 4
save_steps = 500
overwrite_output_dir = True

In [4]:
# Collect the configured values under the keyword names train() expects,
# then start the fine-tuning run.
training_settings = dict(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
train(**training_settings)

Found cached dataset csv (C:/Users/zhuqu/.cache/huggingface/datasets/csv/default-bbcb8f1a69490e34/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Loading cached processed dataset at C:\Users\zhuqu\.cache\huggingface\datasets\csv\default-bbcb8f1a69490e34\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-2f0d3cde7d706f7a_*_of_00004.arrow
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: input, response. If input, response are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1134
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 14200
  Number of trainable parameters = 124439808
Automatic Weights & Biases logging enabled, to disable set os.env

  4%|▎         | 500/14200 [00:58<32:31,  7.02it/s]Saving model checkpoint to ./output/checkpoint-500
Configuration saved in ./output/checkpoint-500\config.json


{'loss': 3.2655, 'learning_rate': 4.8239436619718316e-05, 'epoch': 1.76}


Model weights saved in ./output/checkpoint-500\pytorch_model.bin
  7%|▋         | 1000/14200 [01:54<32:53,  6.69it/s] Saving model checkpoint to ./output/checkpoint-1000
Configuration saved in ./output/checkpoint-1000\config.json


{'loss': 2.4801, 'learning_rate': 4.647887323943662e-05, 'epoch': 3.52}


Model weights saved in ./output/checkpoint-1000\pytorch_model.bin
 11%|█         | 1500/14200 [02:50<35:20,  5.99it/s]  Saving model checkpoint to ./output/checkpoint-1500
Configuration saved in ./output/checkpoint-1500\config.json


{'loss': 1.917, 'learning_rate': 4.471830985915493e-05, 'epoch': 5.28}


Model weights saved in ./output/checkpoint-1500\pytorch_model.bin
 14%|█▍        | 2000/14200 [03:48<18:27, 11.02it/s]  Saving model checkpoint to ./output/checkpoint-2000
Configuration saved in ./output/checkpoint-2000\config.json


{'loss': 1.4214, 'learning_rate': 4.295774647887324e-05, 'epoch': 7.04}


Model weights saved in ./output/checkpoint-2000\pytorch_model.bin
 18%|█▊        | 2500/14200 [04:43<26:01,  7.49it/s]  Saving model checkpoint to ./output/checkpoint-2500
Configuration saved in ./output/checkpoint-2500\config.json


{'loss': 1.0371, 'learning_rate': 4.119718309859155e-05, 'epoch': 8.8}


Model weights saved in ./output/checkpoint-2500\pytorch_model.bin
 21%|██        | 3000/14200 [05:37<26:31,  7.04it/s]  Saving model checkpoint to ./output/checkpoint-3000
Configuration saved in ./output/checkpoint-3000\config.json


{'loss': 0.7656, 'learning_rate': 3.943661971830986e-05, 'epoch': 10.56}


Model weights saved in ./output/checkpoint-3000\pytorch_model.bin
 25%|██▍       | 3500/14200 [06:33<23:21,  7.63it/s]  Saving model checkpoint to ./output/checkpoint-3500
Configuration saved in ./output/checkpoint-3500\config.json


{'loss': 0.6087, 'learning_rate': 3.767605633802817e-05, 'epoch': 12.32}


Model weights saved in ./output/checkpoint-3500\pytorch_model.bin
 28%|██▊       | 4000/14200 [07:32<26:56,  6.31it/s]  Saving model checkpoint to ./output/checkpoint-4000
Configuration saved in ./output/checkpoint-4000\config.json


{'loss': 0.4786, 'learning_rate': 3.5915492957746486e-05, 'epoch': 14.08}


Model weights saved in ./output/checkpoint-4000\pytorch_model.bin
 32%|███▏      | 4500/14200 [08:27<27:36,  5.85it/s]  Saving model checkpoint to ./output/checkpoint-4500
Configuration saved in ./output/checkpoint-4500\config.json


{'loss': 0.392, 'learning_rate': 3.4154929577464786e-05, 'epoch': 15.85}


Model weights saved in ./output/checkpoint-4500\pytorch_model.bin
 35%|███▌      | 5000/14200 [09:24<20:15,  7.57it/s]  Saving model checkpoint to ./output/checkpoint-5000
Configuration saved in ./output/checkpoint-5000\config.json


{'loss': 0.3348, 'learning_rate': 3.23943661971831e-05, 'epoch': 17.61}


Model weights saved in ./output/checkpoint-5000\pytorch_model.bin
 39%|███▊      | 5500/14200 [10:20<18:47,  7.72it/s]  Saving model checkpoint to ./output/checkpoint-5500
Configuration saved in ./output/checkpoint-5500\config.json


{'loss': 0.3053, 'learning_rate': 3.063380281690141e-05, 'epoch': 19.37}


Model weights saved in ./output/checkpoint-5500\pytorch_model.bin
 42%|████▏     | 6000/14200 [11:17<23:44,  5.76it/s]  Saving model checkpoint to ./output/checkpoint-6000
Configuration saved in ./output/checkpoint-6000\config.json


{'loss': 0.2756, 'learning_rate': 2.887323943661972e-05, 'epoch': 21.13}


Model weights saved in ./output/checkpoint-6000\pytorch_model.bin
 46%|████▌     | 6500/14200 [12:15<18:02,  7.11it/s]  Saving model checkpoint to ./output/checkpoint-6500
Configuration saved in ./output/checkpoint-6500\config.json


{'loss': 0.2624, 'learning_rate': 2.711267605633803e-05, 'epoch': 22.89}


Model weights saved in ./output/checkpoint-6500\pytorch_model.bin
 49%|████▉     | 7000/14200 [13:13<16:48,  7.14it/s]  Saving model checkpoint to ./output/checkpoint-7000
Configuration saved in ./output/checkpoint-7000\config.json


{'loss': 0.2345, 'learning_rate': 2.535211267605634e-05, 'epoch': 24.65}


Model weights saved in ./output/checkpoint-7000\pytorch_model.bin
 53%|█████▎    | 7500/14200 [14:09<14:59,  7.45it/s]  Saving model checkpoint to ./output/checkpoint-7500
Configuration saved in ./output/checkpoint-7500\config.json


{'loss': 0.2282, 'learning_rate': 2.359154929577465e-05, 'epoch': 26.41}


Model weights saved in ./output/checkpoint-7500\pytorch_model.bin
 56%|█████▋    | 8000/14200 [15:04<14:42,  7.02it/s]  Saving model checkpoint to ./output/checkpoint-8000
Configuration saved in ./output/checkpoint-8000\config.json


{'loss': 0.2162, 'learning_rate': 2.1830985915492956e-05, 'epoch': 28.17}


Model weights saved in ./output/checkpoint-8000\pytorch_model.bin
 60%|█████▉    | 8500/14200 [15:59<14:28,  6.56it/s]Saving model checkpoint to ./output/checkpoint-8500
Configuration saved in ./output/checkpoint-8500\config.json


{'loss': 0.207, 'learning_rate': 2.007042253521127e-05, 'epoch': 29.93}


Model weights saved in ./output/checkpoint-8500\pytorch_model.bin
 63%|██████▎   | 9000/14200 [16:56<12:07,  7.14it/s]Saving model checkpoint to ./output/checkpoint-9000
Configuration saved in ./output/checkpoint-9000\config.json


{'loss': 0.2002, 'learning_rate': 1.830985915492958e-05, 'epoch': 31.69}


Model weights saved in ./output/checkpoint-9000\pytorch_model.bin
 67%|██████▋   | 9500/14200 [17:51<10:34,  7.41it/s]Saving model checkpoint to ./output/checkpoint-9500
Configuration saved in ./output/checkpoint-9500\config.json


{'loss': 0.1982, 'learning_rate': 1.6549295774647887e-05, 'epoch': 33.45}


Model weights saved in ./output/checkpoint-9500\pytorch_model.bin
 70%|███████   | 10000/14200 [18:46<10:14,  6.83it/s]Saving model checkpoint to ./output/checkpoint-10000
Configuration saved in ./output/checkpoint-10000\config.json


{'loss': 0.1884, 'learning_rate': 1.4788732394366198e-05, 'epoch': 35.21}


Model weights saved in ./output/checkpoint-10000\pytorch_model.bin
 74%|███████▍  | 10500/14200 [19:42<08:44,  7.06it/s]Saving model checkpoint to ./output/checkpoint-10500
Configuration saved in ./output/checkpoint-10500\config.json


{'loss': 0.185, 'learning_rate': 1.3028169014084506e-05, 'epoch': 36.97}


Model weights saved in ./output/checkpoint-10500\pytorch_model.bin
 77%|███████▋  | 11000/14200 [20:38<07:44,  6.89it/s]Saving model checkpoint to ./output/checkpoint-11000
Configuration saved in ./output/checkpoint-11000\config.json


{'loss': 0.1803, 'learning_rate': 1.1267605633802817e-05, 'epoch': 38.73}


Model weights saved in ./output/checkpoint-11000\pytorch_model.bin
 81%|████████  | 11500/14200 [21:34<06:13,  7.23it/s]Saving model checkpoint to ./output/checkpoint-11500
Configuration saved in ./output/checkpoint-11500\config.json


{'loss': 0.1765, 'learning_rate': 9.507042253521127e-06, 'epoch': 40.49}


Model weights saved in ./output/checkpoint-11500\pytorch_model.bin
 85%|████████▍ | 12000/14200 [22:31<05:48,  6.31it/s]Saving model checkpoint to ./output/checkpoint-12000
Configuration saved in ./output/checkpoint-12000\config.json


{'loss': 0.1683, 'learning_rate': 7.746478873239436e-06, 'epoch': 42.25}


Model weights saved in ./output/checkpoint-12000\pytorch_model.bin
 88%|████████▊ | 12500/14200 [23:27<04:00,  7.06it/s]Saving model checkpoint to ./output/checkpoint-12500
Configuration saved in ./output/checkpoint-12500\config.json


{'loss': 0.1698, 'learning_rate': 5.9859154929577465e-06, 'epoch': 44.01}


Model weights saved in ./output/checkpoint-12500\pytorch_model.bin
 92%|█████████▏| 13000/14200 [24:23<02:33,  7.80it/s]Saving model checkpoint to ./output/checkpoint-13000
Configuration saved in ./output/checkpoint-13000\config.json


{'loss': 0.1735, 'learning_rate': 4.225352112676056e-06, 'epoch': 45.77}


Model weights saved in ./output/checkpoint-13000\pytorch_model.bin
 95%|█████████▌| 13500/14200 [25:21<01:37,  7.19it/s]Saving model checkpoint to ./output/checkpoint-13500
Configuration saved in ./output/checkpoint-13500\config.json


{'loss': 0.162, 'learning_rate': 2.464788732394366e-06, 'epoch': 47.54}


Model weights saved in ./output/checkpoint-13500\pytorch_model.bin
 99%|█████████▊| 14000/14200 [26:18<00:27,  7.37it/s]Saving model checkpoint to ./output/checkpoint-14000
Configuration saved in ./output/checkpoint-14000\config.json


{'loss': 0.1627, 'learning_rate': 7.042253521126761e-07, 'epoch': 49.3}


Model weights saved in ./output/checkpoint-14000\pytorch_model.bin
100%|██████████| 14200/14200 [26:43<00:00,  9.77it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 14200/14200 [26:43<00:00,  8.85it/s]
Saving model checkpoint to ./output/
Configuration saved in ./output/config.json


{'train_runtime': 1612.6411, 'train_samples_per_second': 35.16, 'train_steps_per_second': 8.805, 'train_loss': 0.5794415524979712, 'epoch': 50.0}


Model weights saved in ./output/pytorch_model.bin


In [45]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
def load_model(model_path):
    """Return the GPT-2 language-model head model stored at ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the GPT-2 tokenizer stored at ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length):
    """Generate a continuation of ``sequence`` with the fine-tuned model.

    The model and tokenizer are loaded from the module-level ``output_dir``
    on every call.

    Args:
        sequence: Prompt; it is converted to ``str`` before encoding.
        max_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed. The same text is also printed, as before; earlier versions
        printed only and returned ``None``, which left callers that use the
        return value with nothing.
    """
    model_path = output_dir
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    ids = tokenizer.encode(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        ids,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,  # pad with the EOS token id
        do_sample=True,  # sample instead of always taking the most likely token
        top_p=0.95,  # nucleus sampling threshold
        top_k=50,  # sample only among the 50 most likely tokens
        temperature=1,  # 1 leaves the predicted distribution unscaled
        num_beams=5,  # beam width (not the number of returned sequences)
        no_repeat_ngram_size=2,  # never repeat the same 2-gram
    )

    generated_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)
    print(generated_text)
    return generated_text

In [72]:
sequence = 'Who is Daenerys Targaryen?'
max_len = 100
generate_text(sequence, max_len)

loading configuration file ./output/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file

Who is Daenerys Targaryen?She is the queen of the Seven Kingdoms. She has a powerful army at her back and if this message is to be believed, three dragons.Lord Commander Mormont thought you'd never come back to Westeros. Now he has me. I must go home now. If you return to the North, I will have you hanged as a murderer. My people will hate me for all the crimes I commit. But if I do not, they will


In [None]:
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

# Create the Flask web application that the route decorators below register on.
app = Flask(__name__)
# Hand the app to flask_ngrok. NOTE(review): presumably this exposes the server
# through an ngrok tunnel once it starts -- confirm against the flask_ngrok docs.
run_with_ngrok(app)

# When a client sends a GET request to this URL, Flask runs the function below.
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint: always answers with a fixed JSON success body."""
    return jsonify({'text': 'Success'})

# Define the route for the Chatbot
@app.route('/chatbot', methods=['POST'])
def chatbot():
    """Answer a chat prompt posted as JSON.

    Expects a JSON object with a ``text`` field. The reply is a JSON object
    whose ``text`` field carries whatever ``generate_text`` returns for that
    prompt (called with ``max_length=50``).

    Returns:
        The JSON reply, or a JSON error body with HTTP status 400 when the
        request body is not a JSON object, has no ``text`` field, or the text
        is blank.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request takes the same 400 path below.
    payload = request.get_json(silent=True)
    input_text = payload.get('text') if isinstance(payload, dict) else None

    text_is_blank = isinstance(input_text, str) and not input_text.strip()
    if input_text is None or text_is_blank:
        error = {
            'error': "Request body must be a JSON object with a non-empty 'text' field."
        }
        return jsonify(error), 400

    response_text = generate_text(input_text, max_length=50)
    return jsonify({'text': response_text})

# Entry point: start the Flask application so it listens for incoming requests.
# NOTE(review): inside a notebook kernel __name__ is normally '__main__', so
# running this cell is expected to block until the server is stopped -- confirm.
if __name__ == '__main__':
    app.run()