In [1]:
import sys
sys.path.append('./minGPT')
from mingpt.model import GPT

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

from collections import defaultdict
import re

In [2]:
import os
print(os.getcwd())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cpu':
    file_path = '../kaggleData/data/eng-cmn.txt'
else:
    file_path = '/kaggle/input/eng-cmn/eng-cmn.txt'

d:\GITrepo\DL-MDS5122-hw2\mingpt


In [3]:
text_pairs = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        
        if len(parts) < 3:
            continue
            
        en_text = parts[0].strip()
        zh_text = parts[1].strip()
    
        if en_text and zh_text:
            text_pairs.append((zh_text, en_text))

In [32]:
def build_vocab(text_pairs):
    vocab = defaultdict(lambda: len(vocab))
    special_tokens = ["<pad>", "<sos>", "<eos>", "<sep>"]
    for token in special_tokens:
        vocab[token]
    
    for ch, en in text_pairs:
        ch = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', ch, flags=re.UNICODE)
        for char in ch.lower():
            vocab[char]
        en = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', en, flags=re.UNICODE)
        for char in en.lower().split(' '):
            vocab[char]
    return vocab

def text_to_ids_cn(text, vocab, add_special_tokens=False):
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text=text.lower()
    ids = []
    if add_special_tokens:
        ids.append(vocab["<sos>"])
    for char in text:
        ids.append(vocab[char])
    # ids += [10, 2476, 2477, 85]
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids

def text_to_ids_eng(text, vocab, add_special_tokens=False):
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text, flags=re.UNICODE)
    text=text.lower()
    ids = []
    if add_special_tokens:
        ids.append(vocab["<sos>"])
    for char in text.split(' '):
        ids.append(vocab[char])
    
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids

class TranslationDataset(Dataset):
    def __init__(self, pairs, vocab, max_len=50):
        self.pairs = pairs
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        ch, en = self.pairs[idx]

        input_ids = (
            [self.vocab["<sos>"]] +
            text_to_ids_cn(ch, self.vocab) +
            [self.vocab["<sep>"]] +
            text_to_ids_eng(en, self.vocab) +
            [self.vocab["<eos>"]]
        )

        input_ids = input_ids[:self.max_len]
        src = input_ids[:-1]
        tgt = input_ids[1:]
        # input_ids += [self.vocab["<pad>"]] * (self.max_len - len(input_ids))
        src += [self.vocab["<pad>"]]* (self.max_len - len(src))
        tgt += [self.vocab["<pad>"]]* (self.max_len - len(tgt))
        # return torch.tensor(input_ids[:-1]), torch.tensor(input_ids[1:]) 
        return torch.tensor(src), torch.tensor(tgt)
    

# build dataset and dataloader
vocab = build_vocab(text_pairs)
vocab_size = len(vocab)
BATCH_SIZE = 32
MAX_LEN = 45

dataset = TranslationDataset(text_pairs, vocab, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [39]:
from mingpt.model import GPT
from mingpt.trainer import Trainer

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini' # use 'gpt-mini' for better translation performance, weights ~ 26MB
model_config.vocab_size = len(vocab) # model vocabulary
model_config.block_size = 128  # model block_size (i.e. input context length)
model = GPT(model_config)

# train config
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4      
train_config.batch_size = 32
train_config.max_iters = 3000          
train_config.num_workers = 0

# start training
trainer = Trainer(train_config, model, dataset)
# trainer.run()

def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run() # gpt-mini: 10min for 3000 iters
torch.save(model.state_dict(), 'mingpt.pth') # about 26MB for gpt-min, 4.5MB for gpt-nano

number of parameters: 4.78M
running on device cpu
iter_dt 0.00ms; iter 0: train loss 9.30095
iter_dt 179.11ms; iter 100: train loss 2.17026
iter_dt 183.84ms; iter 200: train loss 1.90296
iter_dt 178.93ms; iter 300: train loss 1.97812
iter_dt 179.58ms; iter 400: train loss 1.91965
iter_dt 234.74ms; iter 500: train loss 1.86092
iter_dt 191.83ms; iter 600: train loss 1.72459
iter_dt 222.67ms; iter 700: train loss 1.92066
iter_dt 186.77ms; iter 800: train loss 1.61618
iter_dt 232.92ms; iter 900: train loss 1.81775
iter_dt 218.06ms; iter 1000: train loss 1.48267
iter_dt 179.22ms; iter 1100: train loss 1.65501
iter_dt 198.70ms; iter 1200: train loss 1.61101
iter_dt 183.27ms; iter 1300: train loss 1.62081
iter_dt 179.49ms; iter 1400: train loss 1.48914
iter_dt 211.45ms; iter 1500: train loss 1.42379
iter_dt 182.45ms; iter 1600: train loss 1.34717
iter_dt 211.42ms; iter 1700: train loss 1.37301
iter_dt 205.69ms; iter 1800: train loss 1.45805
iter_dt 173.50ms; iter 1900: train loss 1.57338
iter

In [40]:
# weights reload
model.load_state_dict(torch.load('mingpt.pth'))

<All keys matched successfully>

In [41]:
id2w = {v:k for k, v in vocab.items()}
def decode(ids, id2w=id2w):
    tokens = [id2w.get(t, 'notfound') for t in ids if t !=2]
    return ' '.join(tokens)

In [44]:
import random
max_new_token = 20 # 20 is enough for eval
model.eval()
for i in range(20):
    with torch.no_grad():
        pair = random.choice(text_pairs)
        src = pair[0]
        tgt = pair[1]
        input_ids = (
            [vocab["<sos>"]] +
            text_to_ids_cn(src, vocab) +
            [vocab["<sep>"]]
        )
        input_tensor = torch.tensor([input_ids], device=device)
        print(f'用户输入: {src}')
        print(f'正确翻译: {tgt}')

        gen = model.generate(input_tensor,  max_new_tokens=max_new_token, do_sample=False, top_k=40)
        print('模型翻译：',decode(gen[0][-max_new_token:].tolist()))
        print('='*20)

用户输入: 你可以选择任何你想要的。
正确翻译: You may choose whichever you want.
模型翻译： you can think you want to be right
用户输入: 你绝对肯定吗？
正确翻译: Are you absolutely sure?
模型翻译： are you sure about it
用户输入: 他們從我的果園偷了蘋果。
正确翻译: They stole apples from my orchard.
模型翻译： they went out of the apple apple
用户输入: 汤姆对局势一无所知。
正确翻译: Tom knows nothing about the situation.
模型翻译： tom doesnt know that mary is a thing
用户输入: 我们的车走了。
正确翻译: There goes our bus.
模型翻译： our car went
用户输入: 他們一整年都必須工作。
正确翻译: They had to work all year round.
模型翻译： they have to work all day
用户输入: 赶快回家。
正确翻译: Go home quickly.
模型翻译： get back home
用户输入: 這個柳橙太酸了。
正确翻译: This orange is too sour.
模型翻译： the apple juice
用户输入: 请说得更清楚些。
正确翻译: Please speak more clearly.
模型翻译： please say more more more
用户输入: 我喜欢你走路的方式。
正确翻译: I like the way you walk.
模型翻译： i like you to go there
用户输入: 我把他的书还给了他。
正确翻译: I returned his book to him.
模型翻译： i gave him my book
用户输入: 他娶了一位空姐。
正确翻译: He married a stewardess.
模型翻译： he gave a favorite country
用户输入: 如果值得一做，就值得做好。
正确翻译: If it is wort