In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
import os

In [2]:
# 設定使用單個 GPU
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# 初始化 tokenizer 並添加特殊標記
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', 
                                          bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>',
                                          pad_token='<|pad|>')
tokenizer.add_special_tokens({"sep_token": "<|sep|>"})

# 加載 GPT-2 模型並調整詞彙表大小
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [3]:
# 讀取 netflix_titles.csv 並選取 title 和 description 欄位
data = pd.read_csv('data/netflix_en.csv')
data = data[['title', 'description']]

# 將 pandas DataFrame 轉換為 Hugging Face 的 Dataset 並劃分訓練和測試集
dataset = Dataset.from_pandas(data)
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [4]:
# 將 train 和 test 資料分別轉換為 pandas DataFrame
train_df = train_test_split['train'].to_pandas()
test_df = train_test_split['test'].to_pandas()

# 將 DataFrame 保存為 CSV 文件
train_df.to_csv('data/netflix_train.csv', index=False)
test_df.to_csv('data/netflix_test.csv', index=False)

In [5]:
# 定義 tokenization 函數，處理 input_ids 和 labels
def tokenize_function(example):
    text = f"<|startoftext|>Title: {example['title']}<|sep|>Description: {example['description']}<|endoftext|>"
    tokens = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    input_ids = tokens['input_ids']
    
    # 確定 <|sep|> 的索引，並設置 labels
    sep_token_id = tokenizer.convert_tokens_to_ids("<|sep|>")
    if sep_token_id in input_ids:
        sep_index = input_ids.index(sep_token_id)
    else:
        sep_index = -1

    # 將 labels 複製自 input_ids，並忽略標題部分和填充部分的損失計算
    labels = input_ids.copy()
    if sep_index != -1:
        for i in range(sep_index + 1):
            labels[i] = -100
    pad_token_id = tokenizer.pad_token_id
    labels = [label if label != pad_token_id else -100 for label in labels]
    tokens['labels'] = labels
    return tokens

In [6]:
# 應用 tokenization 函數
tokenized_datasets = datasets.map(tokenize_function, remove_columns=["title", "description"])


Map:   0%|          | 0/7045 [00:00<?, ? examples/s]

Map:   0%|          | 0/1762 [00:00<?, ? examples/s]

In [7]:
# 或者查看隨機一筆資料
random_example = tokenized_datasets['train'].shuffle(seed=42).select([0])
print(random_example)
generated_text = tokenizer.decode(random_example['input_ids'][0], skip_special_tokens=False)
print('inputs:')
print(generated_text)
print('labels:')
print(random_example['labels'][0])
generated_text = tokenizer.decode([token for token in random_example['labels'][0] if token != -100], skip_special_tokens=False)
print(generated_text)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})
inputs:
<|startoftext|>Title: The Debt Collector<|sep|>Description: A broke martial arts instructor takes a side gig with a mobster, who pairs him with a veteran thug for a weekend of fisticuffs-fueled debt collection.<|endoftext|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>
labels:
[-100, -100, -100, -100, -100, -100, -100, 11828, 25, 317, 6265, 15618, 10848, 21187, 

In [8]:
# 設定訓練參數
training_args = TrainingArguments(
    output_dir="./NetflixGPT-english",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_steps=200,
    save_steps=200,
    save_total_limit=1,
    logging_steps=200,
    report_to="none"  # Disable wandb or other integrations
)

# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

In [9]:
# 開始訓練
trainer.train()



Step,Training Loss,Validation Loss
200,5.7235,3.2231
400,3.2824,3.175246
600,3.1861,3.157892
800,3.1165,3.148762
1000,3.051,3.142247
1200,2.9991,3.141649
1400,2.9534,3.13844
1600,2.9142,3.14292
1800,2.8666,3.152623
2000,2.8375,3.14931


Checkpoint destination directory ./gpt2-netflix/checkpoint-600 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=4420, training_loss=2.9585331040809595, metrics={'train_runtime': 1330.137, 'train_samples_per_second': 105.929, 'train_steps_per_second': 3.323, 'total_flos': 9204011827200000.0, 'train_loss': 2.9585331040809595, 'epoch': 20.0})

In [10]:
# 假設模型和 tokenizer 的保存目錄為 "./NetflixGPT-english"
tokenizer.save_pretrained("./NetflixGPT-english")

('./gpt2-netflix/tokenizer_config.json',
 './gpt2-netflix/special_tokens_map.json',
 './gpt2-netflix/vocab.json',
 './gpt2-netflix/merges.txt',
 './gpt2-netflix/added_tokens.json')

In [12]:
trainer.save_model()

In [11]:
# 定義 inference 測試函數
def generate_description(title):
    input_text = f"<|startoftext|>Title: {title} <|sep|>Description:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    
    # 使用模型進行生成
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, 
                            pad_token_id=tokenizer.eos_token_id, early_stopping=True)
    
    # 解碼生成的描述
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text.replace(input_text, "").strip()

# 測試生成效果
test_titles = ["Stranger Things", "Breaking Bad", "The Crown"]
for title in test_titles:
    print(f"Title: {title}")
    print("Generated Description:", generate_description(title))
    print("-" * 50)

Title: Stranger Things
Generated Description: Title: Stranger Things Description: When a young woman is abducted by a group of strangers, the only way to save her is to be with them.
--------------------------------------------------
Title: Breaking Bad




Generated Description: Title: Breaking Bad Description: A group of friends is caught between two rivalries when a mysterious figure threatens to destroy their friendship.
--------------------------------------------------
Title: The Crown
Generated Description: Title: The Crown Description: A young man's life is turned upside down when he's forced to marry a woman he loves, who's been cheating on him for years.
--------------------------------------------------
