# **Training**

In [None]:
!pip install evaluate

In [None]:
!pip install transformers

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling ,TextDataset
import evaluate

In [4]:
import json 
train_set = []
valid_set = []
data = json.load(open("./dataset.json", "r"))
for idx, row in enumerate(data):
    if idx % 2 != 0:
        train_set.append(row)
        continue 
    row = row.split(" . ")
    train_set.append(" . ".join(row[:-1]))
    valid_set.append(row[-1])
with open("./train.json", "w") as f:
    json.dump(train_set, f)
    f.close()
with open("./valid.json", "w") as f:
    json.dump(valid_set, f)
    f.close()

In [None]:
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

In [7]:
for name, param in model.named_parameters():
    if "h.23" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [None]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)

In [9]:
for param in model.lm_head.parameters():
    param.requires_grad = True
    print(param, param.requires_grad)

Parameter containing:
tensor([[-0.0115,  0.0031, -0.0073,  ..., -0.0526, -0.1757,  0.0257],
        [-0.0086,  0.0636, -0.0182,  ..., -0.0136, -0.1215,  0.0535],
        [ 0.0585,  0.0689,  0.0262,  ..., -0.1006, -0.1979, -0.0039],
        ...,
        [ 0.0016, -0.0441, -0.0517,  ..., -0.1008, -0.0087,  0.0264],
        [-0.1437, -0.0463, -0.0065,  ...,  0.0746, -0.0472, -0.0383],
        [ 0.0207, -0.0133, -0.0259,  ...,  0.0389, -0.0023,  0.0011]],
       requires_grad=True) True


In [10]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='./train.json',
    block_size=50
)
valid_dataset = TextDataset(
    tokenizer=tokenizer, 
    file_path="./valid.json", 
    block_size=50
)



In [11]:
def compute_metrics(pred):
    global tokenizer
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=preds, references=labels)
    return {
        "bleu": results["bleu"],
        # "bleu": 1.0,
        "brevity_penalty": results["brevity_penalty"],
        "length_ratio": results["length_ratio"]
    }

In [13]:
training_args = TrainingArguments(
    output_dir="./checkpoint",
    overwrite_output_dir=True,
    num_train_epochs=20,
    save_strategy="epoch",
    logging_strategy ="epoch",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    save_total_limit=1,
    warmup_steps=2000,
    warmup_ratio = 0.1,
    lr_scheduler_type='linear',
    # max_steps=4
)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model.to("cuda"),
    compute_metrics=compute_metrics,              
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collator,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Brevity Penalty,Length Ratio
1,3.7755,3.689424,0.099489,0.969381,0.96984


In [None]:
!mkdir ./model_weight_v1 && mkdir ./tokenizer_weight_v1

In [None]:
model.save_pretrained("./model_weight_v1")
tokenizer.save_pretrained("./tokenizer_weight_v1")

In [None]:
!mv ./model_weight_v1 ./drive/MyDrive/checkpoint_gpt && mv ./tokenizer_weight_v1 ./drive/MyDrive/checkpoint_gpt 

# **Inference**

In [None]:
from transformers import pipeline


GENERATOR = pipeline('text-generation', model='./model/', tokenizer="./tokenizer")

In [None]:
output = GENERATOR(txt, max_length=50, num_return_sequences=3, num_beams=3, no_repeat_ngram_size=2, early_stopping=True)