In [3]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load the raw training data from the JSON file at `file_path`.
# NOTE(review): '/content/...' looks like a Colab working directory — adjust
# the path when running elsewhere.
file_path = '/content/training_data.json'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which can mis-read non-ASCII text on some systems.
with open(file_path, encoding='utf-8') as file:
    data = json.load(file)


In [4]:

# Flatten the nested mapping (body part -> {'exercises': [...]}) into a single
# flat list holding one record per exercise.
exercises = [
    {
        "body_part": body_part,
        "exercise_name": exercise["name"],
        "explanation": exercise["explanation"],
    }
    for body_part, content in data.items()
    for exercise in content['exercises']
]


In [None]:

# The tokenizer and the language model are both loaded from the same
# pretrained checkpoint.
pretrained_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_name)
model = GPT2LMHeadModel.from_pretrained(pretrained_name)

# Register '[PAD]' as the padding token; the tokenization step pads every
# example with padding='max_length', which needs a pad token to be defined.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Grow the model's embedding matrix to match the enlarged vocabulary so the
# newly added token id has an embedding row.
model.resize_token_embeddings(len(tokenizer))


In [6]:
def tokenize_data(exercise, max_length=512):
    """Tokenize one exercise's explanation into a fixed-length training example.

    Args:
        exercise: Mapping that contains an 'explanation' string.
        max_length: Length every example is padded or truncated to.
            Defaults to 512, the value used originally.

    Returns:
        The tokenizer's encoding with an added 'labels' entry. Labels mirror
        'input_ids' on real tokens and are -100 on padding positions.
    """
    # -100 is the label value the language-model loss skips; without it the
    # loss is dominated by the padding that fills most of each example.
    ignore_index = -100

    tokenized_explanation = tokenizer(
        exercise['explanation'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # attention_mask is 1 on real tokens and 0 on padding, so it tells us
    # exactly which label positions must be excluded from the loss.
    tokenized_explanation['labels'] = [
        token_id if mask else ignore_index
        for token_id, mask in zip(
            tokenized_explanation['input_ids'],
            tokenized_explanation['attention_mask'],
        )
    ]
    return tokenized_explanation



# Encode every exercise record; the result keeps the order of `exercises`.
tokenized_data = list(map(tokenize_data, exercises))


In [7]:
class ExerciseDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized examples.

    Each example must provide 'input_ids', 'attention_mask' and 'labels'.
    The values are stored as given and converted to long tensors only when an
    item is requested.
    """

    @staticmethod
    def _column(encodings, key):
        """Return the value stored under `key` for every encoding, in order."""
        return [encoding[key] for encoding in encodings]

    def __init__(self, tokenized_data):
        """Split the per-example encodings into one list per field."""
        self.input_ids = self._column(tokenized_data, 'input_ids')
        self.attention_mask = self._column(tokenized_data, 'attention_mask')
        self.labels = self._column(tokenized_data, 'labels')

    def __len__(self):
        """Number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of torch.long tensors."""
        columns = {
            'input_ids': self.input_ids,
            'attention_mask': self.attention_mask,
            'labels': self.labels,
        }
        return {
            name: torch.tensor(values[idx], dtype=torch.long)
            for name, values in columns.items()
        }

# Wrap the tokenized examples in the ExerciseDataset container.
dataset = ExerciseDataset(tokenized_data=tokenized_data)



In [8]:
# Settings for the fine-tuning run, collected in one mapping so they can be
# inspected or tweaked before the arguments object is built.
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer
# steps if the dataset is small — confirm against the dataset size.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 2,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**training_config)


In [None]:

# Build the Trainer from the model, the training arguments and the dataset
# prepared in the earlier cells.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Run fine-tuning. Kept as the cell's final expression so its return value is
# shown as the cell output.
trainer.train()