In [None]:
!pip uninstall spacy

In [None]:
!pip install spacy

In [None]:

import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, chinese, english, device, max_length=50):
    """Greedily translate one Chinese sentence into a list of English tokens.

    Args:
        model: Callable invoked as ``model(src, trg)`` with LongTensors shaped
            ``(src_len, 1)`` and ``(trg_len, 1)``; its result is indexed as
            ``(trg_len, 1, vocab)`` scores.
        sentence: Either a raw string (tokenized with spaCy's
            ``zh_core_web_sm`` pipeline) or an already tokenized sequence of
            strings.
        chinese: Source field exposing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: Target field exposing ``vocab.stoi`` and ``vocab.itos``.
        device: Device the input tensors are moved to.
        max_length: Maximum number of decoding steps.

    Returns:
        The predicted target tokens without the leading ``<sos>``. The list
        ends with ``<eos>`` only if the model produced it within
        ``max_length`` steps.
    """
    # Tokens are lower-cased because the vocabulary was built in lower case.
    if isinstance(sentence, str):
        # Loading a spaCy pipeline is slow, so it is loaded on first use only
        # and cached on the function object. Pre-tokenized input never needs it.
        tokenizer = getattr(translate_sentence, "_zh_tokenizer", None)
        if tokenizer is None:
            tokenizer = spacy.load("zh_core_web_sm")
            translate_sentence._zh_tokenizer = tokenizer
        tokens = [token.text.lower() for token in tokenizer(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap the sentence in the source field's <sos>/<eos> markers.
    tokens = [chinese.init_token, *tokens, chinese.eos_token]

    # Map every Chinese token to its vocabulary index.
    text_to_indices = [chinese.vocab.stoi[token] for token in tokens]

    # Shape (src_len, 1): a batch holding this single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    sos_idx = english.vocab.stoi["<sos>"]
    eos_idx = english.vocab.stoi["<eos>"]

    outputs = [sos_idx]
    with torch.no_grad():
        for _ in range(max_length):
            trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
            output = model(sentence_tensor, trg_tensor)

            # Greedy decoding: keep the highest scoring token at the last position.
            best_guess = output.argmax(2)[-1, :].item()
            outputs.append(best_guess)

            if best_guess == eos_idx:
                break

    # Skip the start token when converting indices back to strings.
    return [english.vocab.itos[idx] for idx in outputs[1:]]


def bleu(data, model, chinese, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: Iterable of examples whose attributes include ``che`` (source
            tokens) and ``eng`` (reference tokens).
        model: Model passed through to ``translate_sentence``.
        chinese: Source field passed through to ``translate_sentence``.
        english: Target field passed through to ``translate_sentence``.
        device: Device passed through to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for the predictions, using one
        reference per example.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["che"]
        trg = vars(example)["eng"]

        prediction = translate_sentence(model, src, chinese, english, device)
        # Strip the trailing <eos> only when decoding actually produced it.
        # A translation cut off at max_length ends in a real word that must
        # stay in the hypothesis.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialize ``state`` to ``filename`` with ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore model and optimizer state from a checkpoint mapping.

    ``checkpoint["state_dict"]`` is handed to ``model.load_state_dict`` first,
    then ``checkpoint["optimizer"]`` to ``optimizer.load_state_dict``.
    """
    print("=> Loading checkpoint")
    for key, target in (("state_dict", model), ("optimizer", optimizer)):
        target.load_state_dict(checkpoint[key])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy

import torchtext
#from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator,TabularDataset

In [None]:
from google.colab import files

# Upload the dataset through Colab's file picker. `uploaded` is indexed by file
# name further down and the value is decoded as UTF-8 text.
uploaded = files.upload()

Saving cmn.txt to cmn.txt


In [None]:
import numpy as np
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd

file_name = 'cmn.txt'
MAX_PAIRS = 20000  # upper bound on the number of sentence pairs kept

# Every line is tab-separated; the first two columns are used as the English
# and Chinese sentence, and any further columns are ignored.
lines = uploaded[file_name].decode("utf-8").split("\n")
rows = (line.rstrip("\r").split('\t') for line in lines)

# Skip blank or malformed lines (for example the empty string after the final
# newline): a row with fewer than two fields would break the two-column frame.
pairs = [row[:2] for row in rows if len(row) >= 2]

df = pd.DataFrame(pairs[:MAX_PAIRS], columns=['english', 'chinese'])
df.shape

(20000, 2)

In [None]:
# Create train and test set. The fixed random_state keeps the split identical
# across kernel restarts, so reported scores refer to the same test sentences.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Get train, test data to json and csv format which can be read by torchtext
train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

In [None]:


# Download the en_core_web_sm and zh_core_web_sm spaCy pipelines ...
!python -m spacy download en_core_web_sm
!python -m spacy download zh_core_web_sm

# ... and load them once; the tokenizer helpers below use these two objects.
spacy_che = spacy.load("zh_core_web_sm")
spacy_eng = spacy.load("en_core_web_sm")
def tokenize_che(text):
    """Split ``text`` with the Chinese spaCy tokenizer and return the token strings."""
    pieces = []
    for tok in spacy_che.tokenizer(text):
        pieces.append(tok.text)
    return pieces


def tokenize_eng(text):
    """Split ``text`` with the English spaCy tokenizer and return the token strings."""
    pieces = []
    for tok in spacy_eng.tokenizer(text):
        pieces.append(tok.text)
    return pieces


# Source-side field: spaCy Chinese tokens, lower-cased, wrapped in <sos>/<eos>.
chinese = Field(
    tokenize=tokenize_che,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# Target-side field: same preprocessing with the English tokenizer.
english = Field(
    tokenize=tokenize_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# Vocabularies are built further down, once the datasets have been loaded.

In [None]:
# `device` and `batch_size` are needed by the iterators below. They are also
# assigned in a later cell, but setting them here lets this cell run on a
# freshly restarted kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32

# Map each JSON key to (attribute name on the example, Field that processes it).
fields = {'english': ("eng", english), 'chinese': ("che", chinese)}

train_data, test_data = TabularDataset.splits(
    path="", train="train.json", test="test.json", format="json", fields=fields
)

# Vocabularies come from the training split only.
english.build_vocab(train_data, max_size=10000, min_freq=2)
chinese.build_vocab(train_data, max_size=10000, min_freq=2)

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.che),
    device=device,
)

# Show a single batch as a sanity check; printing every batch floods the output.
print(next(iter(train_iterator)))


[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 10x32 (GPU 0)]
	[.che]:[torch.cuda.LongTensor of size 7x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 12x32 (GPU 0)]
	[.che]:[torch.cuda.LongTensor of size 8x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 11x32 (GPU 0)]
	[.che]:[torch.cuda.LongTensor of size 7x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 11x32 (GPU 0)]
	[.che]:[torch.cuda.LongTensor of size 7x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 10x32 (GPU 0)]
	[.che]:[torch.cuda.LongTensor of size 7x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 16x32 (GPU 0)]
	[.che]:[torch.cuda.LongTensor of size 13x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.L

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

batch_size = 32

In [None]:
class Transformer(nn.Module):
    """Sequence-to-sequence model: learned token and position embeddings around ``nn.Transformer``.

    Inputs are sequence-first: ``src`` is ``(src_len, N)`` and ``trg`` is
    ``(trg_len, N)``, both holding vocabulary indices.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        """Build the embeddings, the transformer core and the output projection.

        Args:
            embedding_size: Width of the embeddings and of the transformer (``d_model``).
            src_vocab_size: Number of rows in the source token embedding.
            trg_vocab_size: Number of rows in the target token embedding and
                size of the output projection.
            src_pad_idx: Source index treated as padding when masking.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            forward_expansion: Multiplier applied to ``embedding_size`` to get
                the feed-forward width.
            dropout: Dropout probability for the embeddings and the transformer.
            max_len: Number of positions in the learned position embeddings.
            device: Device on which position indices and masks are created.
        """
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            # nn.Transformer expects the absolute feed-forward width here.
            # Passing the multiplier itself would give a hidden layer of only
            # `forward_expansion` units.
            dim_feedforward=forward_expansion * embedding_size,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean ``(N, src_len)`` mask that is True where ``src`` is padding."""
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        """Return scores of shape ``(trg_len, N, trg_vocab_size)``.

        Raises:
            ValueError: If either sequence is longer than the position
                embedding table.
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Fail with a readable message instead of an embedding index error.
        max_len = self.src_position_embedding.num_embeddings
        if src_seq_length > max_len or trg_seq_length > max_len:
            raise ValueError(
                f"Sequence lengths (src={src_seq_length}, trg={trg_seq_length}) "
                f"exceed max_len={max_len}"
            )

        # Position indices 0..len-1, repeated for every sequence in the batch.
        src_positions = (
            torch.arange(0, src_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(src_seq_length, N)
        )
        trg_positions = (
            torch.arange(0, trg_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Causal mask: position i of the target may only attend to positions <= i.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            # Encoder outputs at padded source positions must also be hidden
            # from the decoder's cross-attention.
            memory_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        return self.fc_out(out)

In [None]:

# Everything needed to train the Seq2Seq model is configured in this cell.

# Checkpointing switches
load_model = save_model = True

# Optimisation settings
num_epochs = 200
learning_rate = 3e-4
batch_size = 32

# Architecture
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
forward_expansion = 4
dropout = 0.10
max_len = 100

# Values derived from the vocabularies
src_vocab_size = len(chinese.vocab)
trg_vocab_size = len(english.vocab)
src_pad_idx = chinese.vocab.stoi["<pad>"]

# TensorBoard writer for the loss curve, plus the global step counter it uses
step = 0
writer = SummaryWriter("runs/loss_plot")

In [None]:



# Instantiate the network from the hyperparameters and move it to the device.
model = Transformer(
    embedding_size=embedding_size,
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    forward_expansion=forward_expansion,
    dropout=dropout,
    max_len=max_len,
    device=device,
)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Cut the learning rate by 10x after 10 epochs without improvement of the monitored value.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padded target positions do not contribute to the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)



In [None]:


# Fixed probe sentence, translated at the start of every epoch to watch progress.
sentence = "明天会更好"


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    # Translate the probe sentence with dropout disabled, then return to training mode.
    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, chinese, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and move them to the device
        inp_data = batch.che.to(device)
        target = batch.eng.to(device)

        # Forward prop with teacher forcing: the decoder input is the target
        # without its last token.
        output = model(inp_data, target[:-1])

        # Output is (trg_len, batch_size, output_dim) but CrossEntropyLoss expects
        # (N, classes) scores and (N,) labels, so both are flattened. The target
        # is shifted by one so the start token is never a label.
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        loss_value = loss.item()
        losses.append(loss_value)

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard (a plain float, not the graph-attached tensor)
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

# Checkpoints inside the loop are written before each epoch trains, so the
# weights produced by the final epoch are saved here.
if save_model:
    save_checkpoint(
        {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
    )

writer.flush()

# The loop leaves the model in training mode; switch dropout off before scoring.
model.eval()

# running on entire test data takes a while
# NOTE(review): the slice starts at 1, so the first test example is skipped - confirm this is intended.
score = bleu(test_data[1:100], model, chinese, english, device)
print(f"Bleu score {score * 100:.2f}")

[Epoch 0 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['it', 'will', 'be', 'better', 'tomorrow', '.', '<eos>']
[Epoch 1 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['tomorrow', ',', 'it', "'s", 'fine', 'tomorrow', '.', '<eos>']
[Epoch 2 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['tomorrow', 'will', 'be', 'better', 'tomorrow', '.', '<eos>']
[Epoch 3 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['tomorrow', 'will', 'be', 'fine', 'tomorrow', '.', '<eos>']
[Epoch 4 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['tomorrow', ',', 'it', "'s", 'going', 'to', 'be', 'tomorrow', '.', '<eos>']
[Epoch 5 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['tomorrow', 'is', 'better', 'than', 'i', "'ll", 'be', 'tomorrow', '.', '<eos>']
[Epoch 6 / 200]
=> Saving checkpoint
Translated example sentence: 
 ['tomorrow', 'is', 'better', 'than', 'you', 'better', 'tomorrow', '.', '<eos>']
[Epoch 7 / 200]
=> Saving checkpoint
Tran