In [4]:
import sys
sys.path.append('./minGPT')
from mingpt.model import GPT

import torch
import torch.nn as nn
from torch.nn import TransformerDecoder, TransformerDecoderLayer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import re

# from mingpt.model import GPT
 
# model_config = GPT.get_default_config()
# model_config.vocab_size = len(vocab)  # 词汇表大小
# model_config.block_size = 20          # 序列最大长度
# model_config.n_layer = 6              # 参考网页6的层数配置
# model_config.n_head = 8
# model = GPT(model_config)


In [8]:
import os
print(os.getcwd())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cpu':
    file_path = '../kaggleData/data/eng-cmn.txt'
else:
    file_path = '/kaggle/input/eng-cmn/eng-cmn.txt'

d:\GITrepo\DL-MDS5122-hw2\mingpt


In [9]:
text_pairs = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:

        parts = line.strip().split('\t')
        

        if len(parts) < 3:
            continue
            
        en_text = parts[0].strip()
        zh_text = parts[1].strip()
        

        if en_text and zh_text:

            text_pairs.append((zh_text, en_text))

In [10]:
from collections import defaultdict
import re
# def build_vocab(text_pairs):
#     vocab = defaultdict(lambda: len(vocab))
#     special_tokens = ["<pad>", "<sos>", "<eos>", "<sep>"]
#     for token in special_tokens:
#         vocab[token]
    
#     for ch, en in text_pairs:
#         for char in (ch + en):
#             vocab[char]
#     return vocab
def build_vocab(text_pairs):
    vocab = defaultdict(lambda: len(vocab))
    special_tokens = ["<pad>", "<sos>", "<eos>", "<sep>"]
    for token in special_tokens:
        vocab[token]
    
    for ch, en in text_pairs:
        ch = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', ch, flags=re.UNICODE)
        for char in ch.lower():
            vocab[char]
        en = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', en, flags=re.UNICODE)
        for char in en.lower().split(' '):
            vocab[char]
    return vocab

def text_to_ids_cn(text, vocab, add_special_tokens=False):
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text=text.lower()
    ids = []
    if add_special_tokens:
        ids.append(vocab["<sos>"])
    for char in text:
        ids.append(vocab[char])
    # ids += [10, 2476, 2477, 85]
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids

def text_to_ids_eng(text, vocab, add_special_tokens=False):
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text=text.lower()
    ids = []
    if add_special_tokens:
        ids.append(vocab["<sos>"])
    for char in text.split(' '):
        ids.append(vocab[char])
    
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids


from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, pairs, vocab, max_len=50):
        self.pairs = pairs
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        ch, en = self.pairs[idx]

        input_ids = (
            [self.vocab["<sos>"]] +
            text_to_ids_cn(ch, self.vocab) +
            [self.vocab["<sep>"]] +
            text_to_ids_eng(en, self.vocab) +
            [self.vocab["<eos>"]]
        )

        input_ids = input_ids[:self.max_len]
        src = input_ids[:-1]
        tgt = input_ids[1:]
        # input_ids += [self.vocab["<pad>"]] * (self.max_len - len(input_ids))
        src += [self.vocab["<pad>"]]* (self.max_len - len(src))
        tgt += [self.vocab["<pad>"]]* (self.max_len - len(tgt))
        # return torch.tensor(input_ids[:-1]), torch.tensor(input_ids[1:]) 
        return torch.tensor(src), torch.tensor(tgt)
    

vocab = build_vocab(text_pairs)
    
vocab_size = len(vocab)
eval_num = 100 # translate eval_num of inputs
BATCH_SIZE = 32
MAX_LEN = 49
model_max_seq=128
max_epoch = 10
num_heads = 2


dataset = TranslationDataset(text_pairs, vocab, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [55]:
from mingpt.model import GPT
from mingpt.trainer import Trainer



# # 配置GPT模型参数[7](@ref)
# config = GPT.get_default_config()
# config.model_type = 'gpt-mini'
# config.vocab_size = len(dataset.vocab)  # 动态词汇量
# config.block_size = dataset.max_length  # 序列最大长度
# # config.n_layer = 4                      # 减小层数以适配翻译任务
# # config.n_head = 4
# model = GPT(config)
# from mingpt.model import GPT
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
model_config.vocab_size = len(vocab) # openai's model vocabulary
model_config.block_size = 100  # openai's model block_size (i.e. input context length)
model = GPT(model_config)
# 配置训练参数
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4       # 调高学习率
train_config.batch_size = 32
train_config.max_iters = 1000          # 减少迭代次数
train_config.num_workers = 0

# 启动训练
trainer = Trainer(train_config, model, dataset)
# trainer.run()

def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

number of parameters: 4.78M
running on device cpu
iter_dt 0.00ms; iter 0: train loss 9.36291
iter_dt 329.57ms; iter 100: train loss 2.09860
iter_dt 202.07ms; iter 200: train loss 1.81397
iter_dt 248.69ms; iter 300: train loss 1.80720
iter_dt 313.52ms; iter 400: train loss 1.68396
iter_dt 228.83ms; iter 500: train loss 1.54499
iter_dt 219.02ms; iter 600: train loss 1.53388
iter_dt 230.85ms; iter 700: train loss 1.50895
iter_dt 246.45ms; iter 800: train loss 1.36626
iter_dt 248.58ms; iter 900: train loss 1.49999


In [82]:
id2w = {v:k for k, v in vocab.items()}
def decode(ids, id2w=id2w):
    tokens = [id2w.get(t, 'notfound') for t in ids if t !=2]
    return ' '.join(tokens)

In [None]:
t

In [91]:
text_to_ids_cn('我测你的马', vocab)

[23, 1859, 6, 10, 2358]

In [102]:
for i in range(5):
    src = '帮我买一橘子' # text_pairs[i][1]
    tgt = ''
    input_ids = (
        [vocab["<sos>"]] +
        text_to_ids_cn(src, vocab) +
        [vocab["<sep>"]]
    )
    input_tensor = torch.tensor([input_ids], device=device)
    print(f'用户输入: {src}')
    print(f'正确翻译: {tgt}')

    max_new_token = 20
    model.eval()
    gen = model.generate(input_tensor,  max_new_tokens=max_new_token, do_sample=False, top_k=40)
    print('模型翻译：',decode(gen[0][-max_new_token:].tolist()))
    print('='*20)

用户输入: 帮我买一橘子
正确翻译: 
模型翻译： please tell me the door
用户输入: 帮我买一橘子
正确翻译: 
模型翻译： please tell me the door
用户输入: 帮我买一橘子
正确翻译: 
模型翻译： please tell me the door
用户输入: 帮我买一橘子
正确翻译: 
模型翻译： please tell me the door
用户输入: 帮我买一橘子
正确翻译: 
模型翻译： please tell me the door
