In [26]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM

In [27]:
tf.random.set_seed(42)


In [28]:
checkpoint = "distilgpt2"


tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side='left')


tokenizer.seed = 42


model = TFAutoModelForCausalLM.from_pretrained(checkpoint)

# Seed all random generators
model.config.seed = 42

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [29]:

text = "Oh HAI, I'm just a plan 'ol input sentence prompt."

encoded_input = tokenizer(text, return_tensors='tf')
print(encoded_input)

output = model(encoded_input)

{'input_ids': <tf.Tensor: shape=(1, 15), dtype=int32, numpy=
array([[ 5812, 14558,    40,    11,   314,  1101,   655,   257,  1410,
          705,   349,  5128,  6827,  6152,    13]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 15), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [31]:
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model=checkpoint)

set_seed(42)

generator("A 5-star review of the book \"The art of war\": ", 
          max_length=64, 
          num_return_sequences=1,
          pad_token_id=tokenizer.eos_token_id)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'A 5-star review of the book "The art of war": Â\n\n\nNow that the book isn\'t out for grabs, I have to take a minute to read the first half of the book so I feel like a lot of readers are already familiar with the world of the art book and the series'}]

In [32]:
generator("A 1-star review of the book \"The Evolution of Useful Things\": ", 
          max_length=64, 
          num_return_sequences=1, 
          pad_token_id=tokenizer.eos_token_id)

[{'generated_text': 'A 1-star review of the book "The Evolution of Useful Things": ____________________\n\nI highly recommend this book, and it\'s worth a visit to those of you who are skeptical enough to accept it, and are aware this book is a big deal. But the only time an e-book needs to'}]

In [33]:
max_length = 64
batch_size = 32

In [35]:
data = load_dataset("yatharth2307/modified_llm_finetune")

In [45]:
# Set the padding token to the EOS token.
tokenizer.pad_token = tokenizer.eos_token


tokenized_data = tokenizer.batch_encode_plus(
    data['train']['data'],
    return_tensors='tf',
    padding=True,
    truncation=True,
    max_length=max_length
)

{'input_ids': <tf.Tensor: shape=(19138, 64), dtype=int32, numpy=
array([[50256, 50256,    27, ...,  7359,    82,    29],
       [   27,    82,    29, ...,  2476,    13,  7359],
       [50256, 50256, 50256, ...,  7359,    82,    29],
       ...,
       [50256, 50256, 50256, ...,  7359,    82,    29],
       [50256, 50256, 50256, ...,  7359,    82,    29],
       [50256,    27,    82, ...,  7359,    82,    29]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(19138, 64), dtype=int32, numpy=
array([[0, 0, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

In [47]:
print("Input IDs:", tokenized_data['input_ids'])
print("Attention Mask:", tokenized_data['attention_mask'])

Input IDs: tf.Tensor(
[[50256 50256    27 ...  7359    82    29]
 [   27    82    29 ...  2476    13  7359]
 [50256 50256 50256 ...  7359    82    29]
 ...
 [50256 50256 50256 ...  7359    82    29]
 [50256 50256 50256 ...  7359    82    29]
 [50256    27    82 ...  7359    82    29]], shape=(19138, 64), dtype=int32)
Attention Mask: tf.Tensor(
[[0 0 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 ...
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 [0 1 1 ... 1 1 1]], shape=(19138, 64), dtype=int32)


In [49]:
tfds = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': tokenized_data['input_ids'],
        'attention_mask': tokenized_data['attention_mask']
    },
    tokenized_data['input_ids']  # this becomes the labels, labels are just the next word 
                                 # (shifted internally inside the model)
))

tfds = tfds.batch(batch_size=batch_size)

In [70]:
total_size = sum(1 for _ in tfds)
print(f"Total dataset size: {total_size}")

Total dataset size: 599


In [50]:
for input_batch, label_batch in tfds.take(1):
    print("Input IDs:", input_batch['input_ids'])
    print("Attention Mask:", input_batch['attention_mask'])
    print("Label:", label_batch)
    print("=" * 50)

Input IDs: tf.Tensor(
[[50256 50256    27 ...  7359    82    29]
 [   27    82    29 ...  2476    13  7359]
 [50256 50256 50256 ...  7359    82    29]
 ...
 [50256 50256 50256 ...  7359    82    29]
 [50256    27    82 ...  7359    82    29]
 [   27    82    29 ...    13  7359    82]], shape=(32, 64), dtype=int32)
Attention Mask: tf.Tensor(
[[0 0 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 ...
 [0 0 0 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]], shape=(32, 64), dtype=int32)
Label: tf.Tensor(
[[50256 50256    27 ...  7359    82    29]
 [   27    82    29 ...  2476    13  7359]
 [50256 50256 50256 ...  7359    82    29]
 ...
 [50256 50256 50256 ...  7359    82    29]
 [50256    27    82 ...  7359    82    29]
 [   27    82    29 ...    13  7359    82]], shape=(32, 64), dtype=int32)


In [51]:
for input_batch, label_batch in tfds.take(1):
    print("Input IDs:", input_batch['input_ids'][0])
    print(tokenizer.batch_decode(input_batch['input_ids'][0]))
    print("Attention Mask:", input_batch['attention_mask'][0])
    print("Label:", label_batch)
    print("=" * 50)

Input IDs: tf.Tensor(
[50256 50256    27    82    29  1374   466   345 32980  6491  2139    30
 22092  2139   318   674  1353  8475    13   775  3031 19268   284 23538
   393  4786   290  4031   284  2148   257 28949  1998   329  1123  6491
    13  3954  3061   318   284  7074   534  9027   416  1016   262  3131
 10591   284  2209   534  2476    13  3406 14676   318   644 10182   514
    13  7359    82    29], shape=(64,), dtype=int32)
['<|endoftext|>', '<|endoftext|>', '<', 's', '>', ' How', ' do', ' you', ' prioritize', ' customer', ' service', '?', ' Customer', ' service', ' is', ' our', ' top', ' priority', '.', ' We', ' respond', ' promptly', ' to', ' inquiries', ' or', ' concerns', ' and', ' aim', ' to', ' provide', ' a', ' personalized', ' experience', ' for', ' each', ' customer', '.', ' Our', ' goal', ' is', ' to', ' exceed', ' your', ' expectations', ' by', ' going', ' the', ' extra', ' mile', ' to', ' address', ' your', ' needs', '.', ' Your', ' satisfaction', ' is', ' what'

In [71]:

train_size = 500
train_tfds = tfds.take(train_size)
val_tfds = tfds.skip(train_size)

print(f"Training set size: {train_size}")


Training set size: 500


In [72]:

num_epochs = 5
print(f"Epochs: {num_epochs}")

num_train_steps = train_size * num_epochs
print(f"Training steps: {num_train_steps}")

lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

model.compile(optimizer=opt,

)

Epochs: 5
Training steps: 2500


In [73]:
model.summary()


Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  81912576  
 er)                                                             
                                                                 
Total params: 81912576 (312.47 MB)
Trainable params: 81912576 (312.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [74]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  2


In [75]:
class PrintLearningRateCB(tf.keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs=None):
        lr = float(tf.keras.backend.get_value(self.model.optimizer.learning_rate))
        print(f'Epoch {epoch + 1} - Learning Rate: {lr}')

In [76]:
for data in train_tfds.take(1):
    print(data)


({'input_ids': <tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[50256, 50256,    27, ...,  7359,    82,    29],
       [   27,    82,    29, ...,  2476,    13,  7359],
       [50256, 50256, 50256, ...,  7359,    82,    29],
       ...,
       [50256, 50256, 50256, ...,  7359,    82,    29],
       [50256,    27,    82, ...,  7359,    82,    29],
       [   27,    82,    29, ...,    13,  7359,    82]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[0, 0, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}, <tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[50256, 50256,    27, ...,  7359,    82,    29],
       [   27,    82,    29, ...,  2476,    13,  7359],
       [50256, 50256, 50256, ...,  7359,    82,    29],
       ...,
       [50256, 50256, 50256, ...,  7359,    82,    29],
  

In [77]:

for data in val_tfds.take(1):
    print(data)

({'input_ids': <tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[50256, 50256, 50256, ...,  7359,    82,    29],
       [50256, 50256, 50256, ...,  7359,    82,    29],
       [   27,    82,    29, ...,  7359,    82,    29],
       ...,
       [   27,    82,    29, ..., 13205, 21811,    11],
       [   27,    82,    29, ..., 15843,   290, 14676],
       [   27,    82,    29, ...,  8280,    11,   290]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}, <tf.Tensor: shape=(32, 64), dtype=int32, numpy=
array([[50256, 50256, 50256, ...,  7359,    82,    29],
       [50256, 50256, 50256, ...,  7359,    82,    29],
       [   27,    82,    29, ...,  7359,    82,    29],
       ...,
       [   27,    82,    29, ..., 13205, 21811,    11],
  

In [78]:
model.fit(train_tfds, 
          validation_data=val_tfds, 
          epochs=num_epochs,
          callbacks=[PrintLearningRateCB()],
         )

Epoch 1 - Learning Rate: 4.999999873689376e-05
Epoch 1/5
Epoch 2 - Learning Rate: 4.00200005969964e-05
Epoch 2/5
Epoch 3 - Learning Rate: 3.0019997211638838e-05
Epoch 3/5
Epoch 4 - Learning Rate: 2.0020001102238894e-05
Epoch 4/5
Epoch 5 - Learning Rate: 1.0019999535870738e-05
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7e616c4f79d0>

In [81]:
model.save('distillgpt-finetuned.keras')




In [96]:
prompt1 = "How should we prioritize customer service "
prompt2 = "What makes a product different from others in the market? "
prompt3 = "I'm really struggling with my finances right now. I don't know how to manage my expenses. "
prompt4 = "How can I create a feedback-driven culture within my sales team?"
prompt5 = "I'm having a difficult time negotiating with a potential client. Can you give me any advice?"

In [97]:
# the input sequences all the same length.
encodings = tokenizer([prompt1, prompt2, prompt3, prompt4 , prompt5], 
                      return_tensors='tf',
                      padding=True,
                      truncation=True
                     )


In [98]:
outputs = model.generate(**encodings, 
                         max_new_tokens=64,         
                         do_sample=True,           
                         pad_token_id=tokenizer.eos_token_id,
                         top_k=250,
                         top_p=0.92,                
                         no_repeat_ngram_size=3,   
                         num_beams=5,            
                         num_return_sequences=1,
                         early_stopping=True,       
                        )

decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Easier to read if we unroll the list
for out in decoded:
    print(f"{out}\n")

How should we prioritize customer service ills? Our customer service team is dedicated to addressing any inquiries or concerns you may have. We have a dedicated support team that can assist you with any questions or

What makes a product different from others in the market? ive a great question! Let me share a story with you about a customer who had similar doubts. They were skeptical about the effectiveness of our product, but after

I'm really struggling with my finances right now. I don't know how to manage my expenses. ills can be challenging, but Im here to help. Can you tell me more about your current financial situation and what youre looking to achieve? This will help

How can I create a feedback-driven culture within my sales team? Creating a culture of feedback is crucial. Encourage open and honest communication, where feedback is welcomed and valued. This fosters a supportive and collaborative environment where feedback

I'm having a difficult time negotiating with a potenti