In [1]:
# Setup (fresh Colab runtime only): uncomment to install the libraries this notebook imports.
# !pip install transformers datasets

In [2]:
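# Setup (fresh Colab runtime only): uncomment to upgrade accelerate; restart the runtime afterwards.
# !pip install accelerate -U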

In [3]:
import json
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [4]:
# Load the raw SQuAD v2.0 training file from Google Drive.
# The encoding is given explicitly so non-ASCII characters in the questions and
# answers decode the same way regardless of the runtime's default locale.
with open(r"/content/drive/MyDrive/train-v2.0.json", "r", encoding="utf-8") as read_file:
    data_raw = json.load(read_file)

In [5]:
def transform_json(input_json):
    """Flatten a SQuAD-style document into a list of question/answer records.

    Walks ``input_json['data']`` -> ``'paragraphs'`` -> ``'qas'`` and emits one
    record per question that has a non-empty ``'answers'`` list.

    Args:
        input_json: Parsed JSON dict with a top-level ``'data'`` list.

    Returns:
        A list of ``{"input": <question>, "output": <text of first answer>}``
        dicts, in document order. Questions without answers are skipped.
    """
    return [
        {
            "input": qa['question'],
            "output": qa['answers'][0]['text'],
        }
        for article in input_json['data']
        for para in article['paragraphs']
        for qa in para['qas']
        # A missing key and an empty list are both treated as "no answer".
        if qa.get('answers')
    ]

# Flatten the raw dump into question/answer records, then serialize them as
# human-readable JSON (non-ASCII characters are kept as-is, not escaped).
transformed_train_json = transform_json(data_raw)
json_data = json.dumps(
    transformed_train_json,
    indent=2,
    ensure_ascii=False,
)

In [6]:
# Persist the serialized question/answer records to Google Drive as UTF-8 text.
with open('/content/drive/MyDrive/trainGPT.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json_data)

In [7]:
# Reload the flattened question/answer records from Google Drive.
# The file is written as UTF-8 with ensure_ascii=False, so it has to be read
# back with the same encoding rather than the platform default.
with open('/content/drive/MyDrive/trainGPT.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Column-oriented view of the records, as expected by Dataset.from_dict.
columns = {
    'input': [item['input'] for item in data],
    'output': [item['output'] for item in data]
}

# Convert to Hugging Face dataset
dataset = Dataset.from_dict(columns)
# Fixed seed so the 80/20 train/test split is identical on every
# Restart-and-Run-All and evaluation numbers stay comparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [8]:
def tokenize_function(examples, max_length=128, tok=None):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each question and its answer are joined into one sequence
    (``"<question> <answer><eos>"``) and tokenized together. The labels are a
    copy of ``input_ids`` with padding positions replaced by ``-100``, so that
    labels stay aligned with the inputs and padding does not contribute to the
    loss. (Tokenizing question and answer separately, as before, produced
    labels that were unrelated to the input positions.)

    Args:
        examples: Batch dict with parallel lists under ``'input'`` (questions)
            and ``'output'`` (answers), as passed by ``Dataset.map(batched=True)``.
        max_length: Fixed sequence length to pad/truncate to. Kept small by
            default because the pairs are short; raise it if answers get cut.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``,
            which must be defined before this function is called.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry of the same shape.
    """
    if tok is None:
        # Resolved at call time: the tokenizer is created in a later cell.
        tok = tokenizer

    # Terminate every example with EOS so the model learns where answers end.
    eos = tok.eos_token or ""
    texts = [
        f"{question} {answer}{eos}"
        for question, answer in zip(examples['input'], examples['output'])
    ]

    encoded = tok(texts, padding='max_length', truncation=True, max_length=max_length)

    # Mask by attention_mask rather than by pad-token id: when the pad token is
    # the EOS token, an id-based mask would also hide the genuine EOS.
    encoded['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

In [9]:
# Load tokenizer and model
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize_function pads every example to a fixed length, so the tokenizer
# needs a padding token. Reuse EOS when available; otherwise add a new one.
added_pad_token = False
if tokenizer.eos_token:
    tokenizer.pad_token = tokenizer.eos_token
else:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    added_pad_token = True

model = AutoModelForCausalLM.from_pretrained(model_name)

if added_pad_token:
    # A newly added token has no embedding row yet; without resizing, its id
    # would index past the end of the embedding matrix.
    model.resize_token_embeddings(len(tokenizer))

# Tell the model which id is padding so generate() does not have to guess.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize both splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/69456 [00:00<?, ? examples/s]

Map:   0%|          | 0/17365 [00:00<?, ? examples/s]

In [10]:
# Drop the raw text columns so only the tokenized fields are left.
# Only columns that are still present are removed, which keeps this cell safe
# to re-run (removing an already-removed column raises an error).
raw_text_columns = [
    name
    for name in ("input", "output")
    if name in tokenized_datasets["train"].column_names
]
if raw_text_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(raw_text_columns)

# Return PyTorch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")

In [11]:
# Fine-tuning hyperparameters; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    # Schedule: three passes over the training split, evaluating after each one.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Initialize Trainer with the tokenized train/test splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)



In [None]:
# Run fine-tuning with the Trainer configured above; the training arguments
# request an evaluation pass on the test split at the end of every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate on the held-out split. The metrics are printed explicitly because
# only the last expression of a cell is displayed automatically.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Quick smoke test: generate a continuation for a sample prompt.
input_text = "Hello, how are you?"
# Move the encoded prompt to the model's device; after training the model may
# sit on an accelerator while freshly created tensors are on the CPU.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
# Pass the pad id explicitly so generate() does not fall back to a default.
outputs = model.generate(**inputs, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0]))