## GPT2 on Custom Data

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import tensorflow as tf

### Training Sentences

In [2]:
# Fine-tuning corpus: five contract clauses about termination of the Agreement.
# Every list entry is ONE string, assembled from the adjacent literals that are
# grouped inside its parentheses (the trailing space in a fragment is what
# separates it from the next fragment).
train_sentences = [
    # Automatic termination when the Main Appointment ends
    (
        "If the Main Appointment is terminated for any reason or if under the terms of the Main Appointment the "
        "Consultant is instructed to remove the Sub-Consultant from involvement in the Project, then the Agreement "
        "shall automatically terminate. "
        "The Consultant shall notify the Sub-Consultant as soon as it is reasonably able to do so."
    ),
    # Termination on notice by the Consultant
    (
        "The Consultant may terminate this Agreement at any time on giving seven (7) Days "
        "notice in writing to the Sub-Consultant."
    ),
    # Termination on insolvency
    (
        "If either party becomes insolvent or bankrupt then the other party may immediately "
        "terminate the Agreement by notice in writing."
    ),
    # Obligations upon suspension or termination
    (
        "Upon suspension or termination of the Agreement, "
        "the Sub-Consultant shall immediately take steps to bring the performance of the Sub-Consultancy "
        "Services to an end in an orderly manner. The Sub-Consultant shall deliver up to "
        "the Consultant all documents (including electronic documents) relating to the Project including any Materials."
    ),
    # Consultant's rights after termination
    (
        "Following termination of this Agreement the Consultant may carry out the Sub-Consultancy Services "
        "itself or may engage others to carry out the Sub-Consultancy Services and the Sub-Consultant shall "
        "have no claim against the Consultant for so doing."
    ),
]

In [3]:
# Load the pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Prepare the dataset
# Encode every entry of `train_sentences` (defined in an earlier cell) into a
# list of token ids. The lists have different lengths until they are padded
# in a later cell.
input_ids = [tokenizer.encode(sent, add_special_tokens = True) for sent in train_sentences]

# Nothing has been padded yet, so EVERY position holds a real token and must
# be attended to. Deriving the mask from `token_id > 0` would wrongly hide any
# real token whose vocabulary id happens to be 0, so build an all-ones mask of
# the same length as each encoded sentence instead.
attention_masks = [[1.0] * len(ids) for ids in input_ids]

In [4]:
# Zero-pad every sequence up to the length of the longest one so the batch
# becomes a rectangular array (mask value 0 marks the padded positions).
# Padding is appended at the END ('post'): GPT-2 uses absolute position
# embeddings and no explicit position ids are passed to the model, so the
# default left-padding would shift the real tokens away from position 0.
input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')
attention_masks = tf.keras.preprocessing.sequence.pad_sequences(attention_masks, padding='post')

In [5]:
# Sanity check: report the length of one mask row after padding
padded_row = attention_masks[2]
print(len(padded_row))

68


In [6]:
# Convert the data to PyTorch tensors.
# Token ids are cast to int64 explicitly: the padding helper is expected to
# hand back int32 (TODO confirm for the installed Keras version), while int64
# is the index dtype PyTorch embedding lookups and cross-entropy targets are
# specified for. `as_tensor` also makes the cell safe to re-run, because an
# input that is already a tensor of the right dtype is returned unchanged.
input_ids = torch.as_tensor(input_ids, dtype=torch.long)
attention_masks = torch.as_tensor(attention_masks)

In [7]:
# Instantiate GPT-2 with its language-modelling head from the published
# 'gpt2' checkpoint
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='gpt2')

In [8]:
from torch.nn import CrossEntropyLoss

# Switch the network to training mode before fine-tuning starts
model.train()

# Loss function: cross-entropy with its default settings
loss_fn = CrossEntropyLoss()

# Optimiser: Adam over all model parameters with a small learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [9]:
# Number of full passes over the (single-batch) dataset.
# 20 keeps a CPU run short; around 200 is practical on a GPU machine.
num_epochs = 20

# Build next-token targets once, outside the loop (they never change):
#   * padded positions (attention mask == 0) are replaced by the loss's
#     ignore_index so they contribute nothing to the loss;
#   * targets are shifted left by one, because the logits at position t are
#     the model's prediction for the token at position t + 1. Scoring them
#     against the token at position t would only teach the model to copy
#     its own input.
labels = input_ids.long().masked_fill(attention_masks == 0, loss_fn.ignore_index)
shift_labels = labels[:, 1:].reshape(-1)

for _ in range(num_epochs):
    output = model(input_ids, attention_mask = attention_masks)
    logits = output[0]  # no labels are passed, so the first output is the logits

    # Drop the last time step: there is no following token to predict there.
    shift_logits = logits[:, :-1, :]
    loss = loss_fn(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels)

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

### Save the fine-tuned model

In [10]:
# Persist the fine-tuned model into the sibling '2_ChatGPT' directory
model.save_pretrained(save_directory='../2_ChatGPT/')