## 安裝 Transformers

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 運用 GPT2 產生預測結果

In [None]:
# Import required libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Use the GPU only when one is actually available; otherwise stay on the CPU
# so the cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode the input text into a list of token ids
text = "What is the fastest car in the"
indexed_tokens = tokenizer.encode(text)

# Wrap the token ids in a batch of size one and move the tensor to the device
tokens_tensor = torch.tensor([indexed_tokens]).to(device)

# Load the pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

# Keep the model on the same device as its input
model.to(device)

# Run the model without tracking gradients; the first output holds the
# per-position prediction scores
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Pick the highest-scoring token at the last position as the next sub-word
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

# Print the input text followed by the predicted sub-word
print(predicted_text)

What is the fastest car in the world


## 產生更長的輸出

In [None]:
from transformers import pipeline, set_seed

# Build a text-generation pipeline backed by the 'gpt2' checkpoint
generator = pipeline(task='text-generation', model='gpt2')

# Fix the random seed before sampling
set_seed(42)

# Request five continuations of the prompt, each limited by max_length=30;
# as the last expression of the cell, the resulting list is displayed
generator("Hello,", num_return_sequences=5, max_length=30)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, but what I'm really curious about is who I actually met on my wedding night in Las Vegas. The story doesn't end there: I"},
 {'generated_text': "Hello, this is not the best and I don't like it. I hope you didn't understand this when you did these pictures. For the record"},
 {'generated_text': "Hello, if you've chosen, if you've given, if you know, it's been a privilege to have it be so for this last year"},
 {'generated_text': "Hello, you can do that, too!\n\nWhy, you said the answer was simple, and so you did it, and so I'm"},
 {'generated_text': "Hello, I've been watching a TV series called G.I. Joe and watched it for quite a while. It was one of those sitcoms"}]

## 產生 Embedding

In [None]:
# Define input sentence
sentence = "hello, world"

# Encode the sentence straight into a PyTorch tensor of token ids and place it
# on whichever device the model currently lives on (CPU or GPU)
input_ids = tokenizer.encode(sentence, return_tensors='pt').to(model.device)

# The first output of GPT2LMHeadModel is the vocabulary logits, not an
# embedding. Ask for the hidden states instead and keep the last layer, which
# gives one contextual vector per input token.
with torch.no_grad():
    embeddings = model(input_ids, output_hidden_states=True).hidden_states[-1]

In [None]:
# Display the tensor computed in the previous cell
embeddings

tensor([[[ -37.0708,  -36.4855,  -40.3520,  ...,  -46.5168,  -45.4142,
           -37.9090],
         [-108.6662, -109.3310, -110.0967,  ..., -114.7922, -112.9426,
          -106.3587],
         [ -72.9768,  -73.6322,  -75.5144,  ...,  -86.3169,  -81.2544,
           -74.9500]]], device='cuda:0')

## 微調(Fine-Tuning) GPT2

In [None]:
! wget https://github.com/ywchiu/sns_mining/raw/main/data/lord_of_rings.txt

--2023-04-21 15:24:56--  https://github.com/ywchiu/sns_mining/raw/main/data/lord_of_rings.txt
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ywchiu/sns_mining/main/data/lord_of_rings.txt [following]
--2023-04-21 15:24:56--  https://raw.githubusercontent.com/ywchiu/sns_mining/main/data/lord_of_rings.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1021631 (998K) [text/plain]
Saving to: ‘lord_of_rings.txt.1’


2023-04-21 15:24:56 (55.6 MB/s) - ‘lord_of_rings.txt.1’ saved [1021631/1021631]



In [None]:
# Use the end-of-sequence token as the padding token as well
# (presumably because this tokenizer has no pad token of its own — confirm)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer


def load_dataset(file_path, tokenizer, block_size=128):
    """Build a line-by-line dataset from a text file.

    Args:
        file_path: Path of the text file passed to ``LineByLineTextDataset``.
        tokenizer: Tokenizer passed to ``LineByLineTextDataset``.
        block_size: Value forwarded as the dataset's ``block_size``. Defaults
            to 128, the value that used to be hard-coded here.

    Returns:
        The constructed ``LineByLineTextDataset`` instance.
    """
    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


# The corpus fetched by the wget cell is named lord_of_rings.txt
# (the previous value "load_of_rings.txt" was a typo and does not exist)
file_path = "lord_of_rings.txt"
train_dataset = load_dataset(file_path, tokenizer)




In [None]:

# Batch collator for language modeling; mlm=False turns masked-LM mode off
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


In [None]:
training_args = TrainingArguments(
    # Output locations
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # Training schedule
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Checkpointing and logging cadence
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=100,
)


In [None]:
# Assemble the Trainer from the model, arguments, collator and dataset
# prepared in the earlier cells
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
    args=training_args,
)

# Run the fine-tuning loop
trainer.train()