In [1]:
import sys
sys.path.append('./minGPT')
from mingpt.model import GPT

import torch
import torch.nn as nn
from torch.nn import TransformerDecoder, TransformerDecoderLayer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import re

# from mingpt.model import GPT
 
# model_config = GPT.get_default_config()
# model_config.vocab_size = len(vocab)  # 词汇表大小
# model_config.block_size = 20          # 序列最大长度
# model_config.n_layer = 6              # 参考网页6的层数配置
# model_config.n_head = 8
# model = GPT(model_config)


In [2]:
import os
print(os.getcwd())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cpu':
    file_path = '../kaggleData/data/eng-cmn.txt'
else:
    file_path = '/kaggle/input/eng-cmn/eng-cmn.txt'

d:\GITrepo\DL-MDS5122-hw2\mingpt


In [3]:
text_pairs = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:

        parts = line.strip().split('\t')
        

        if len(parts) < 3:
            continue
            
        en_text = parts[0].strip()
        zh_text = parts[1].strip()
        

        if en_text and zh_text:

            text_pairs.append((zh_text, en_text))

In [6]:
from collections import defaultdict
import re
# def build_vocab(text_pairs):
#     vocab = defaultdict(lambda: len(vocab))
#     special_tokens = ["<pad>", "<sos>", "<eos>", "<sep>"]
#     for token in special_tokens:
#         vocab[token]
    
#     for ch, en in text_pairs:
#         for char in (ch + en):
#             vocab[char]
#     return vocab
def build_vocab(text_pairs):
    vocab = defaultdict(lambda: len(vocab))
    special_tokens = ["<pad>", "<sos>", "<eos>", "<sep>"]
    for token in special_tokens:
        vocab[token]
    
    for ch, en in text_pairs:
        ch = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', ch, flags=re.UNICODE)
        for char in ch.lower():
            vocab[char]
        en = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', en, flags=re.UNICODE)
        for char in en.lower().split(' '):
            vocab[char]
    return vocab

def text_to_ids_cn(text, vocab, add_special_tokens=False):
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text=text.lower()
    ids = []
    if add_special_tokens:
        ids.append(vocab["<sos>"])
    for char in text:
        ids.append(vocab[char])
    # ids += [10, 2476, 2477, 85]
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids

def text_to_ids_eng(text, vocab, add_special_tokens=False):
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text=text.lower()
    ids = []
    if add_special_tokens:
        ids.append(vocab["<sos>"])
    for char in text.split(' '):
        ids.append(vocab[char])
    
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids


from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, pairs, vocab, max_len=50):
        self.pairs = pairs
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        ch, en = self.pairs[idx]

        input_ids = (
            [self.vocab["<sos>"]] +
            text_to_ids_cn(ch, self.vocab) +
            [self.vocab["<sep>"]] +
            text_to_ids_eng(en, self.vocab) +
            [self.vocab["<eos>"]]
        )

        input_ids = input_ids[:self.max_len]
        src = input_ids[:-1]
        tgt = input_ids[1:]
        # input_ids += [self.vocab["<pad>"]] * (self.max_len - len(input_ids))
        src += [self.vocab["<pad>"]]* (self.max_len - len(src))
        tgt += [self.vocab["<pad>"]]* (self.max_len - len(tgt))
        # return torch.tensor(input_ids[:-1]), torch.tensor(input_ids[1:]) 
        return torch.tensor(src), torch.tensor(tgt)
    

vocab = build_vocab(text_pairs)
    
vocab_size = len(vocab)
eval_num = 100 # translate eval_num of inputs
BATCH_SIZE = 32
MAX_LEN = 49
model_max_seq=128
max_epoch = 30
num_heads = 2


dataset = TranslationDataset(text_pairs, vocab, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [20]:
from mingpt.model import GPT
from mingpt.trainer import Trainer



# # 配置GPT模型参数[7](@ref)
# config = GPT.get_default_config()
# config.model_type = 'gpt-mini'
# config.vocab_size = len(dataset.vocab)  # 动态词汇量
# config.block_size = dataset.max_length  # 序列最大长度
# # config.n_layer = 4                      # 减小层数以适配翻译任务
# # config.n_head = 4
# model = GPT(config)
# from mingpt.model import GPT
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
model_config.vocab_size = len(vocab) # openai's model vocabulary
model_config.block_size = 100  # openai's model block_size (i.e. input context length)
model = GPT(model_config)
# 配置训练参数
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4      
train_config.batch_size = 32
train_config.max_iters = 5000          
train_config.num_workers = 0

# 启动训练
trainer = Trainer(train_config, model, dataset)
# trainer.run()

def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

number of parameters: 4.78M
running on device cpu
iter_dt 0.00ms; iter 0: train loss 9.39832
iter_dt 295.72ms; iter 100: train loss 2.29008
iter_dt 201.28ms; iter 200: train loss 1.56079
iter_dt 246.24ms; iter 300: train loss 1.70178
iter_dt 249.06ms; iter 400: train loss 1.96999
iter_dt 242.18ms; iter 500: train loss 1.76812
iter_dt 245.91ms; iter 600: train loss 1.61267
iter_dt 228.36ms; iter 700: train loss 1.66291
iter_dt 239.50ms; iter 800: train loss 1.69228
iter_dt 252.66ms; iter 900: train loss 1.44959
iter_dt 238.86ms; iter 1000: train loss 1.63701
iter_dt 252.69ms; iter 1100: train loss 1.57210
iter_dt 251.72ms; iter 1200: train loss 1.31319
iter_dt 219.13ms; iter 1300: train loss 1.33743
iter_dt 241.27ms; iter 1400: train loss 1.34280
iter_dt 220.41ms; iter 1500: train loss 1.45900
iter_dt 267.05ms; iter 1600: train loss 1.43678
iter_dt 249.22ms; iter 1700: train loss 1.30029
iter_dt 235.19ms; iter 1800: train loss 1.18036
iter_dt 270.38ms; iter 1900: train loss 1.28492
iter

In [8]:
id2w = {v:k for k, v in vocab.items()}
def decode(ids, id2w=id2w):
    tokens = [id2w.get(t, 'notfound') for t in ids if t !=2]
    return ' '.join(tokens)

('嗨。', 'Hi.')

In [19]:
import random
for i in range(20):
    pair = random.choice(text_pairs)
    src = pair[0]
    tgt = pair[1]
    input_ids = (
        [vocab["<sos>"]] +
        text_to_ids_cn(src, vocab) +
        [vocab["<sep>"]]
    )
    input_tensor = torch.tensor([input_ids], device=device)
    print(f'用户输入: {src}')
    print(f'正确翻译: {tgt}')

    max_new_token = 20
    model.eval()
    gen = model.generate(input_tensor,  max_new_tokens=max_new_token, do_sample=False, top_k=40)
    print('模型翻译：',decode(gen[0][-max_new_token:].tolist()))
    print('='*20)

用户输入: 我每天都给他打电话。
正确翻译: I phone him every day.
模型翻译： i dont go this book
用户输入: 你们自己吃蛋糕。
正确翻译: Please help yourself to the cake.
模型翻译： your favorite exhausted
用户输入: 法语不是汤姆的母语。
正确翻译: French isn't Tom's native language.
模型翻译： tom is a good brother
用户输入: 我的房子又舊又難看。
正确翻译: My house is old and ugly.
模型翻译： my father is a lot of the same
用户输入: 請思考一下明天。
正确翻译: Think about tomorrow.
模型翻译： please be a good week
用户输入: 因為以前見過他, 所以我立刻就認出他來。
正确翻译: Having met him before, I recognized him at once.
模型翻译： he was not to me that he was going to be
用户输入: 這一個還活著。
正确翻译: This one's still alive.
模型翻译： this is a good
用户输入: 他们就快从香港抵达了。
正确翻译: They are arriving here soon from Hong Kong.
模型翻译： they have to the train
用户输入: 她會唱歌而且舞跳得很美。
正确翻译: She can sing and dance beautifully.
模型翻译： she is not to be a good
用户输入: 汤姆睡着了。
正确翻译: Tom fell asleep.
模型翻译： tom is a good
用户输入: 你們家在哪？
正确翻译: Where is your house?
模型翻译： where are you
用户输入: 我有跟你同樣的麻煩。
正确翻译: I have the same trouble as you have.
模型翻译： i have you to the same
用户输入: 我想改善我的