In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import os

In [2]:
# 設定使用單個 GPU
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
PRETRAINMODEL = "ckiplab/gpt2-base-chinese" #"uer/gpt2-chinese-cluecorpussmall" #
# 初始化 tokenizer 並添加特殊標記
try:
    tokenizer = BertTokenizerFast.from_pretrained(PRETRAINMODEL)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print("Error loading tokenizer:", e)
    pass
# 調整 tokenizer，設置特殊標記
tokenizer.add_special_tokens({"bos_token": "<|startoftext|>", 
                              "eos_token": "<|endoftext|>", 
                              "sep_token": "<|sep|>"})

# 設置 pad_token_id 為 eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id

# 加載 GPT-2 模型並調整詞彙表大小
model = GPT2LMHeadModel.from_pretrained(PRETRAINMODEL)
# uer/gpt2-chinese-cluecorpussmall
model.resize_token_embeddings(len(tokenizer))

Tokenizer loaded successfully.


  return self.fget.__get__(instance, owner)()


Embedding(21131, 768)

In [3]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21131, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=21131, bias=False)
)

In [4]:
# 讀取 netflix_titles.csv 並選取 title 和 description 欄位
data = pd.read_csv('data/netflix_zhcn.csv',encoding="utf_8_sig")
data = data[['title', 'content']]

# 將 pandas DataFrame 轉換為 Hugging Face 的 Dataset 並劃分訓練和測試集
dataset = Dataset.from_pandas(data)
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [5]:
# 將 train 和 test 資料分別轉換為 pandas DataFrame
train_df = train_test_split['train'].to_pandas()
test_df = train_test_split['test'].to_pandas()

# 將 DataFrame 保存為 CSV 文件
train_df.to_csv('data/netflix_train_zhcn.csv', encoding="utf_8_sig", index=False)
test_df.to_csv('data/netflix_test_zhcn.csv', encoding="utf_8_sig", index=False)

In [6]:
# 定義 tokenization 函數，處理 input_ids 和 labels
def tokenize_function(example):
    text = f"<|startoftext|> 標題:{example['title']} <|sep|>描述:{example['content']}"
    tokens = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    input_ids = tokens['input_ids']
    
    # 確定 <|sep|> 的索引，並設置 labels
    sep_token_id = tokenizer.convert_tokens_to_ids("<|sep|>")
    if sep_token_id in input_ids:
        sep_index = input_ids.index(sep_token_id)
    else:
        sep_index = -1

    # 將 labels 複製自 input_ids，並忽略標題部分和填充部分的損失計算
    labels = input_ids.copy()
    if sep_index != -1:
        for i in range(sep_index + 1):
            labels[i] = -100
    pad_token_id = tokenizer.pad_token_id
    labels = [label if label != pad_token_id else -100 for label in labels]
    tokens['labels'] = labels
    return tokens

In [7]:
# 應用 tokenization 函數
tokenized_datasets = datasets.map(tokenize_function, remove_columns=["title", "content"])


Map:   0%|          | 0/7045 [00:00<?, ? examples/s]

Map:   0%|          | 0/1762 [00:00<?, ? examples/s]

In [8]:
# 或者查看隨機一筆資料
random_example = tokenized_datasets['train'].shuffle(seed=100).select([0])
print(random_example)
generated_text = tokenizer.decode(random_example['input_ids'][0], skip_special_tokens=False)
print('inputs:')
print(generated_text)
print('labels:')
print(random_example['labels'][0])
generated_text = tokenizer.decode([token for token in random_example['labels'][0] if token != -100], skip_special_tokens=False)
print(generated_text)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1
})
inputs:
[CLS] <|startoftext|> 標 題 : 掃 帚 上 的 小 房 間 <|sep|> 描 述 : 一 位 溫 和 的 女 巫 用 她 的 紅 髮 辮 子 為 各 種 動 物 提 供 搭 乘 ， 讓 她 脾 氣 暴 躁 的 貓 感 到 非 常 惱 怒 。 [SEP] <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endo

In [9]:
# 訓練參數設定
training_args = TrainingArguments(
    output_dir="./NetflixGPT-chinese",
    evaluation_strategy="steps",        # 每隔一定步數進行評估
    eval_steps=500,                     # 每200步評估一次
    save_steps=500,                     # 每200步保存一次
    load_best_model_at_end=True,        # 在訓練結束時加載最佳模型
    metric_for_best_model="eval_loss",  # 使用驗證損失作為早停和選擇最佳模型的依據
    greater_is_better=False,            # 對於損失，越低越好
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    save_total_limit=1,                 # 僅保留一個最優 checkpoint
)

# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]  # 當指標在 3 次評估步驟中無改善時早停
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# 開始訓練
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mswguo[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss
500,2.8335,2.732565
1000,2.4315,2.720229
1500,2.1371,2.753879
2000,1.8729,2.808597
2500,1.6247,2.867477
3000,1.3899,2.948426
3500,1.1772,3.02476


Checkpoint destination directory ./NetflixGPT-chinese/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3500, training_loss=1.9238209402901785, metrics={'train_runtime': 492.3703, 'train_samples_per_second': 1430.834, 'train_steps_per_second': 89.567, 'total_flos': 3653058576384000.0, 'train_loss': 1.9238209402901785, 'epoch': 7.94})

In [11]:
# 假設模型和 tokenizer 的保存目錄為 "./gpt2-netflix"
tokenizer.save_pretrained("./NetflixGPT-chinese")

('./NetflixGPT-chinese/tokenizer_config.json',
 './NetflixGPT-chinese/special_tokens_map.json',
 './NetflixGPT-chinese/vocab.txt',
 './NetflixGPT-chinese/added_tokens.json',
 './NetflixGPT-chinese/tokenizer.json')

In [12]:
trainer.save_model()

In [13]:
# 定義 inference 測試函數
def generate_description(title):
    input_text = f"標題:{title} 描述:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    
    # 使用模型進行生成
    output = model.generate(input_ids, max_length=128, num_return_sequences=1, no_repeat_ngram_size=2, 
                            pad_token_id=tokenizer.pad_token_id, early_stopping=True)
    
    # 解碼生成的描述
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    #print(generated_text)
    generated_text = ''.join(generated_text.split(' '))
    new_input_text = ''.join(input_text.split(' '))
    return generated_text.replace(new_input_text, "").strip()

# 測試生成效果
test_titles = ["精神病特工", "追星女孩", "牛奶之水"]
for title in test_titles:
    print(f"Title: {title}")
    print("Generated Description:", generate_description(title))
    print("-" * 50)

Title: 精神病特工




Generated Description: 特·格里爾斯和他的朋友們在一個小鎮上度過了一年的假期，他們的生活在他最好的時刻裡面臨著一些令人毛骨悚然的事情。
--------------------------------------------------
Title: 追星女孩
Generated Description: 在一個小鎮上，一位年輕的女子在她的家鄉度過了一年的假期，她在那裡遇到了兩個女人，他們都在尋找自己的方法，並在這個時候遇見了他。
--------------------------------------------------
Title: 牛奶之水
Generated Description: 是一個被遺忘的小鎮，一位年輕的牧場工人和一名年邁的女性在一起，他們在這個充滿活力的故事中找到了一種新的感覺，並學習了他的人生。
--------------------------------------------------


wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
