In [1]:
import random
import requests
import zipfile
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file downloaded to {output_path}")
    else:
        raise Exception(f"Failed to download. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
zip_path = "fra-eng.zip"
txt_path = "fra.txt"

if not os.path.exists(txt_path):
    download_zip(url, zip_path)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall()
        print("Extracted files:", zip_ref.namelist())

pairs = []
with open(txt_path, encoding="utf-8") as f:
    for line in f.readlines():
        eng, fra, *_ = line.strip().split("\t")
        pairs.append((eng, fra))

pairs = pairs[:50000]

print("샘플 데이터:", pairs[0])

샘플 데이터: ('Go.', 'Va !')


In [2]:
pairs[:10]

[('Go.', 'Va !'),
 ('Go.', 'Marche.'),
 ('Go.', 'En route !'),
 ('Go.', 'Bouge !'),
 ('Hi.', 'Salut !'),
 ('Hi.', 'Salut.'),
 ('Run!', 'Cours\u202f!'),
 ('Run!', 'Courez\u202f!'),
 ('Run!', 'Prenez vos jambes à vos cous !'),
 ('Run!', 'File !')]

In [3]:
def tokenize(text):
    return text.lower().split()

def build_vocab(texts, min_freq=2):
    counter = Counter()
    for txt in texts:
        counter.update(tokenize(txt))
    vocab = {"<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

src_texts = [src for src, _ in pairs]
trg_texts = [trg for _, trg in pairs]

SRC_VOCAB = build_vocab(src_texts)
TRG_VOCAB = build_vocab(trg_texts)

PAD_IDX = SRC_VOCAB["<pad>"]
SOS_IDX = SRC_VOCAB["<sos>"]
EOS_IDX = SRC_VOCAB["<eos>"]

In [8]:
def numericalize(text, vocab):
    return [vocab.get(tok, vocab["<unk>"]) for tok in tokenize(text)]

def tensor_transform(tokens, vocab):
    return torch.tensor([vocab["<sos>"]] + tokens + [vocab["<eos>"]], dtype=torch.long)

class TranslationDataset(Dataset):
    def __init__(self, pairs, src_vocab, trg_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, trg = self.pairs[idx]
        src_tensor = tensor_transform(numericalize(src, self.src_vocab), self.src_vocab)
        trg_tensor = tensor_transform(numericalize(trg, self.trg_vocab), self.trg_vocab)
        return src_tensor, trg_tensor

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    trg_batch = nn.utils.rnn.pad_sequence(trg_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, trg_batch

dataset = TranslationDataset(pairs, SRC_VOCAB, TRG_VOCAB)
train_size = int(len(dataset) * 0.9)
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size])

train_iter = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
valid_iter = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [9]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hidden_dim, n_heads, dropout_ratio, device):
        super().__init__()
        assert hidden_dim % n_heads == 0

        self.hidden_dim = hidden_dim
        self.n_heads = n_heads
        self.head_dim = hidden_dim // n_heads

        self.fc_q = nn.Linear(hidden_dim, hidden_dim)
        self.fc_k = nn.Linear(hidden_dim, hidden_dim)
        self.fc_v = nn.Linear(hidden_dim, hidden_dim)

        self.fc_o = nn.Linear(hidden_dim, hidden_dim)

        self.dropout = nn.Dropout(dropout_ratio)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(self.dropout(attention), V)

        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.hidden_dim)
        x = self.fc_o(x)
        return x, attention

class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hidden_dim, pf_dim, dropout_ratio):
        super().__init__()
        self.fc_1 = nn.Linear(hidden_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, x):
        x = self.dropout(torch.relu(self.fc_1(x)))
        x = self.fc_2(x)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.ff_layer_norm = nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, src, src_mask):
        _src, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        _src = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src))
        return src

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hidden_dim)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.layers = nn.ModuleList([EncoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device) for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout_ratio)
        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)

    def forward(self, src, src_mask):
        batch_size, src_len = src.shape
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        for layer in self.layers:
            src = layer(src, src_mask)
        return src

class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.ff_layer_norm = nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.encoder_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        _trg = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        return trg, attention

class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hidden_dim)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.layers = nn.ModuleList([DecoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device) for _ in range(n_layers)])
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_ratio)
        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        batch_size, trg_len = trg.shape
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        output = self.fc_out(trg)
        return output, attention

class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output, attention

In [10]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_DIM = len(SRC_VOCAB)
OUTPUT_DIM = len(TRG_VOCAB)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
DROPOUT = 0.1
MAX_LEN = 100

enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, DROPOUT, DEVICE, MAX_LEN)
dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DROPOUT, DEVICE, MAX_LEN)
model = Transformer(enc, dec, PAD_IDX, PAD_IDX, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [18]:
def train(model, iterator, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    for src, trg in iterator:
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        optimizer.zero_grad()
        output, _ = model(src, trg[:, :-1])  # teacher forcing
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg_y = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

for epoch in range(30):
    loss = train(model, train_iter, optimizer, criterion)
    print(f"Epoch {epoch+1}, Loss: {loss:.3f}")

Epoch 1, Loss: 1.412
Epoch 2, Loss: 1.299
Epoch 3, Loss: 1.202
Epoch 4, Loss: 1.115
Epoch 5, Loss: 1.040
Epoch 6, Loss: 0.964
Epoch 7, Loss: 0.904
Epoch 8, Loss: 0.848
Epoch 9, Loss: 0.800
Epoch 10, Loss: 0.752
Epoch 11, Loss: 0.713
Epoch 12, Loss: 0.678
Epoch 13, Loss: 0.646
Epoch 14, Loss: 0.617
Epoch 15, Loss: 0.590
Epoch 16, Loss: 0.564
Epoch 17, Loss: 0.543
Epoch 18, Loss: 0.525
Epoch 19, Loss: 0.506
Epoch 20, Loss: 0.489
Epoch 21, Loss: 0.476
Epoch 22, Loss: 0.463
Epoch 23, Loss: 0.449
Epoch 24, Loss: 0.437
Epoch 25, Loss: 0.426
Epoch 26, Loss: 0.417
Epoch 27, Loss: 0.409
Epoch 28, Loss: 0.401
Epoch 29, Loss: 0.391
Epoch 30, Loss: 0.382


In [19]:
def translate_sentence(model, sentence, src_vocab, trg_vocab, device, max_len=50):
    model.eval()

    # 1. 토큰화
    tokens = sentence.lower().split()

    # 2. 인덱스로 변환
    src_indexes = [src_vocab.get(tok, src_vocab["<unk>"]) for tok in tokens]
    src_tensor = torch.tensor([src_indexes], dtype=torch.long).to(device)  # [1, src_len]

    # 3. Encoder 통과
    with torch.no_grad():
        src_mask = model.make_src_mask(src_tensor)
        enc_src = model.encoder(src_tensor, src_mask)

    # 4. Decoder 초기 입력: <sos>
    trg_indexes = [trg_vocab["<sos>"]]

    # 5. step-by-step 디코딩
    for i in range(max_len):
        trg_tensor = torch.tensor([trg_indexes], dtype=torch.long).to(device)  # [1, len]
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

        pred_token = output[:, -1, :].argmax(1).item()  # 마지막 단어 예측
        trg_indexes.append(pred_token)

        if pred_token == trg_vocab["<eos>"]:
            break

    # 6. 인덱스를 단어로 변환
    id2word = {idx: word for word, idx in trg_vocab.items()}
    trg_tokens = [id2word[i] for i in trg_indexes]

    return trg_tokens[1:]  # <sos> 제외



In [20]:
example_sentence = "i am hungry"
translation = translate_sentence(model, example_sentence, SRC_VOCAB, TRG_VOCAB, DEVICE)
print(" ".join(translation))

j'ai faim que je suis faim ! <eos>
