In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import json



In [2]:
# method to load the data from json files
def load_exercises(file_path):
    """Flatten an exercise JSON file into plain training text.

    The file must hold a mapping of body part -> mapping with an "exercises"
    list, where each exercise is a mapping with "name" and "explanation" keys.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A single string containing, for each body part, a "<body part>:" header
        line followed by one "- <name>: <explanation>" line per exercise. Every
        line is newline-terminated; an empty mapping yields an empty string.

    Raises:
        KeyError: If a body part has no "exercises" entry or an exercise lacks
            "name" or "explanation".
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Collect the lines and join once instead of growing a string with +=.
    lines = []
    for body_part, content in data.items():
        lines.append(f"{body_part}:\n")
        lines.extend(
            f"- {exercise['name']}: {exercise['explanation']}\n"
            for exercise in content['exercises']
        )
    return "".join(lines)


In [3]:
# Flatten each JSON split into plain text (training split first, then test).
train_text = load_exercises(file_path='/content/training_data.json')
test_text = load_exercises(file_path='/content/test_data.json')

In [4]:

# Save the preprocessed text so the dataset loader can read it from disk.
# The encoding is pinned to UTF-8 so the bytes written do not depend on the
# platform's default encoding.
with open('/content/train.txt', 'w', encoding='utf-8') as f:
    f.write(train_text)

with open('/content/test.txt', 'w', encoding='utf-8') as f:
    f.write(test_text)


In [None]:

# Load the pretrained GPT-2 tokenizer and language-model head from one
# checkpoint id so the two cannot drift apart.
BASE_CHECKPOINT = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(BASE_CHECKPOINT)


In [6]:

# dataset preparation
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a TextDataset from a plain-text file.

    Args:
        file_path: Path to the text file handed to TextDataset.
        tokenizer: Tokenizer handed to TextDataset.
        block_size: Block size forwarded to TextDataset. Defaults to 128, the
            value that was previously hard-coded here.

    Returns:
        The TextDataset instance built from the file.
    """
    # NOTE(review): TextDataset is reported as deprecated in recent
    # transformers releases and appears to cache tokenized features next to
    # file_path -- confirm a stale cache is not reused after the text files
    # are regenerated.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


# Turn both text splits into datasets using the shared tokenizer.
train_dataset = load_dataset(file_path='/content/train.txt', tokenizer=tokenizer)
test_dataset = load_dataset(file_path='/content/test.txt', tokenizer=tokenizer)

# Batch collator with masked-language-modeling switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [7]:
# Training configuration: three epochs with a per-device batch size of 4,
# saving every 10,000 steps and capping the number of kept checkpoints at two.
# Results go to /content/results, overwriting whatever is already there.
training_args = TrainingArguments(
    output_dir='/content/results',
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=10_000,
)

# Wire the model, both datasets and the collator into the trainer.
# NOTE(review): no evaluation strategy is set in training_args, so
# eval_dataset is presumably only used when trainer.evaluate() is called
# explicitly -- confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [8]:

# Fine-tune the model, then write the model and the tokenizer files into the
# same directory so both can be reloaded from one path.
trained_model_dir = '/content/trained_model'
trainer.train()
trainer.save_model(trained_model_dir)
# Kept as the final expression so the cell still displays the saved file paths.
tokenizer.save_pretrained(trained_model_dir)



Step,Training Loss


('/content/trained_model/tokenizer_config.json',
 '/content/trained_model/special_tokens_map.json',
 '/content/trained_model/vocab.json',
 '/content/trained_model/merges.txt',
 '/content/trained_model/added_tokens.json')

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# directory the fine-tuned model and tokenizer are loaded from
model_path = '/content/trained_model'

# loading fine tuned model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# If the tokenizer has no pad token, reuse EOS instead of adding a new
# '[PAD]' token: a new token forces the embedding matrix to be resized, which
# appends a row the fine-tuning never trained, and generate() below pads with
# EOS regardless.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# example user input
input_text = (
    "I have lower back pain and I'm looking for exercises to help relieve it. "
    "Please suggest some effective exercises along with brief explanations.\n\n"
    "Exercises:\n"
)

# Encode the prompt and move it to the model's device. The attention mask comes
# from the tokenizer rather than from comparing ids against the pad id, which
# would also mask genuine EOS tokens now that pad and EOS share an id.
encoded = tokenizer(input_text, return_tensors='pt').to(model.device)
inputs = encoded.input_ids
attention_mask = encoded.attention_mask

# Deterministic decoding (do_sample=False); max_length=250 is the cap passed to
# generate(), and [0] selects the single returned sequence.
outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=250,
    num_return_sequences=1,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)[0]

# Skip the first `int_answer` tokens (the prompt length) so only the newly
# generated continuation is decoded.
int_answer = inputs.shape[-1]
suggestions = tokenizer.decode(outputs[int_answer:], skip_special_tokens=True)

print(f'### User Input:\n{input_text}\n\n### Assistant Output:\n{suggestions}')
