In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('arithmetic.csv')
df.info()
# 看一下前幾筆資料是什麼樣子
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2632500 entries, 0 to 2632499
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   src     object
 1   tgt     int64 
dtypes: int64(1), object(1)
memory usage: 40.2+ MB


Unnamed: 0,src,tgt
0,0+0=,0
1,0-0=,0
2,0*0=,0
3,(0+0)*0=,0
4,0+0*0=,0


# 建立字典
- 無法直接利用純文字進行計算
- 將所有文字轉換成數字
- 字典大小約為 `7000`
- 特殊字
    - '&lt;pad&gt;'
        - 每個 batch 所包含的句子長度不同
        - 將長度使用 '&lt;pad&gt;' 補成 batch 中最大值者
    - '&lt;eos&gt;'
        - 指定生成的結尾
        - 沒有 '&lt;eos&gt;' 會不知道何時停止生成

In [3]:
# 一個dict把中文字符轉化成id
char_to_id = {}
# 把id轉回中文字符
id_to_char = {}

# 有一些必須要用的special token先添加進來(一般用來做padding的token的id是0)
char_to_id['<pad>'] = 0
char_to_id['<eos>'] = 1
id_to_char[0] = '<pad>'
id_to_char[1] = '<eos>'

# 把所有資料集中出現的token都記錄到dict中
for char in set(df['src'].str.cat()):
    ch_id = len(char_to_id)
    char_to_id[char] = ch_id
    id_to_char[ch_id] = char

vocab_size = len(char_to_id)
print('字典大小: {}'.format(vocab_size))

字典大小: 18


# Dataset 1: 只留下加減法資料

In [4]:
new_df = df[~df['src'].str.contains("\*")]
new_df.head()

Unnamed: 0,src,tgt
0,0+0=,0
1,0-0=,0
15,0+0+0=,0
16,0-0-0=,0
18,0+0-0=,0


In [5]:
# 把資料轉成id
new_df['src'] = new_df['src'].apply(lambda text: [char_to_id[char] for char in text])
new_df['tgt'] = new_df['tgt'].apply(lambda num: [char_to_id[char] for char in str(num)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['src'] = new_df['src'].apply(lambda text: [char_to_id[char] for char in text])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tgt'] = new_df['tgt'].apply(lambda num: [char_to_id[char] for char in str(num)])


In [6]:
# 把資料分成train, val, test = 0.8 : 0.1 : 0.1
train_data, val_data = train_test_split(new_df, test_size=0.2, random_state=250)
val_data, test_data = train_test_split(val_data, test_size=0.5, random_state=250)

# 超參數

|超參數|意義|
|-|-|
|`batch_size`|單一 batch 的資料數|
|`epochs`|總共要訓練幾個 epoch|
|`embed_dim`|文字的 embedding 維度|
|`hidden_dim`|RNN 中每個時間的 hidden state 維度|
|`lr`|Learning Rate|
|`grad_clip`|為了避免 RNN 出現梯度爆炸問題，將梯度限制範圍|

In [4]:
batch_size = 64
epochs = 5
embed_dim = 256
hidden_dim = 256
lr = 0.001
grad_clip = 1

# 資料分批
- 使用 `torch.utils.data.Dataset` 建立資料產生的工具 `dataset`
- 再使用 `torch.utils.data.DataLoader` 對資料集 `dataset` 隨機抽樣並作為一個 batch


In [5]:
# 這裏的dataset是文本生成的dataset，輸入和輸出的資料都是文章
# 舉個例子，現在的狀況是：
# input:  A B C D E F
# output: B C D E F <eos>
# 而對於加減法的任務：
# input:  1 + 2 + 3 = 6
# output: / / / / / 6 <eos>
# /的部分都不用算loss，主要是預測=的後面，這裏的答案是6，所以output是6 <eos>


class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __getitem__(self, index):
        y = self.sequences['tgt'].iloc[index]
        x = self.sequences['src'].iloc[index] + [char_to_id['<pad>']] * len(y)
        y = self.sequences['tgt'].iloc[index] + [char_to_id['<eos>']]
        y = [char_to_id['<pad>']] * (len(x) - len(y)) + y
        return x, y

    def __len__(self):
        return len(self.sequences)


def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
    batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])

    # torch.tensor
    # [[1968, 1891, 3580, ... , 0, 0, 0],
    #  [1014, 2242, 2247, ... , 0, 0, 0],
    #  [3032,  522, 1485, ... , 0, 0, 0]]
    #                       padding↑
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=char_to_id['<pad>'])

    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=char_to_id['<pad>'])

    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [9]:
dataset_train = Dataset(train_data)
dataset_val = Dataset(val_data)
dataset_test = Dataset(test_data)

In [10]:
data_loader_train_64 = torch.utils.data.DataLoader(dataset_train,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)

data_loader_val_64 = torch.utils.data.DataLoader(dataset_val,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)

data_loader_test_64 = torch.utils.data.DataLoader(dataset_test,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)

In [11]:
batch_size = 128

data_loader_train_128 = torch.utils.data.DataLoader(dataset_train,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)

data_loader_val_128 = torch.utils.data.DataLoader(dataset_val,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)

data_loader_test_128 = torch.utils.data.DataLoader(dataset_test,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)

# 模型設計

## 執行順序
1. 將句子中的所有字轉換成 embedding
2. 按照句子順序將 embedding 丟入 RNN
3. RNN 的輸出再丟給 RNN，可以接上更多層
4. 最後的 RNN 所有時間點的輸出丟進一層 Fully Connected
5. 輸出結果所有維度中的最大者即為下一個字

## 損失函數
因為是類別預測，所以使用 Cross Entropy

## 梯度更新
使用 Adam 演算法進行梯度更新

In [6]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()

        # Embedding層
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])

        # RNN層
        self.rnn_layer1 = torch.nn.RNN(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.rnn_layer2 = torch.nn.RNN(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.rnn_layer3 = torch.nn.RNN(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        # output層
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))

    def forward(self, batch_x, batch_x_lens):
        return self.encoder(batch_x, batch_x_lens)

    def encoder(self, batch_x, batch_x_lens):
        batch_x = self.embedding(batch_x)
        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,
                                                          batch_x_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)

        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)
        batch_x, _ = self.rnn_layer3(batch_x)

        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,
                                                            batch_first=True)

        batch_x = self.linear(batch_x)

        return batch_x

    def generator(self, start_char, max_len=200):

        char_list = [char_to_id[start_char]]
        next_char = None
        # 生成的長度沒達到max_len就一直生
        while len(char_list) < max_len:
            x = torch.LongTensor(char_list).unsqueeze(0)
            x = self.embedding(x)
            _, (ht, _) = self.rnn_layer1(x)
            _, (ht, _) = self.rnn_layer2(ht)
            y = self.linear(ht)

            next_char = np.argmax(y.numpy())

            # 如果看到新的token是<eos>就說明生成結束了，就停下
            if next_char == char_to_id['<eos>']:
                break

            char_list.append(next_char)

        return [id_to_char[ch_id] for ch_id in char_list]

In [7]:
torch.manual_seed(100)
torch.cuda.manual_seed(100)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

print(device)

cuda:0


In [8]:
model = CharRNN(vocab_size, embed_dim, hidden_dim)

In [9]:
criterion = torch.nn.CrossEntropyLoss(reduction='mean', ignore_index=char_to_id['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# 訓練
1. 最外層的 `for` 迴圈控制 `epoch`
    1. 內層的 `for` 迴圈透過 `data_loader` 取得 batch
    2. 丟給 `model` 進行訓練
    3. 預測結果 `batch_pred_y` 跟真正的答案 `batch_y` 進行 Cross Entropy 得到誤差 `loss`
    4. 使用 `loss.backward` 自動計算梯度
    5. 使用 `torch.nn.utils.clip_grad_value_` 將梯度限制在 `-grad_clip` &lt; &lt; `grad_clip` 之間
    6. 使用 `optimizer.step()` 進行更新（back propagation）
2. 每 `1000` 個 batch 就輸出一次當前的 loss 觀察是否有收斂的趨勢

# Model 1 Train
batch_size = 64, lr = 0.001

In [17]:
lr = 0.001

In [20]:
from tqdm import tqdm
model = model.to(device)
model.train()

for epoch in range(1, epochs+1):
    i = 0
    j = 0

    process_bar = tqdm(data_loader_train_64, desc=f"Training epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in process_bar:
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        i+=1
        if i%10==0:
            process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            process_bar.update(1)

    validation_process_bar = tqdm(data_loader_val_64, desc=f"Validation epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        j+=1
        if j%10==0:
            validation_process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            validation_process_bar.update(1)

Training epoch 1: 100%|███████████████████████████| 12563/12563 [03:22<00:00, 62.03it/s, loss=0.741]
Validation epoch 1: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 168.97it/s, loss=0.663]
Training epoch 2: 100%|███████████████████████████| 12563/12563 [03:24<00:00, 61.30it/s, loss=0.659]
Validation epoch 2: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 167.68it/s, loss=0.724]
Training epoch 3: 100%|███████████████████████████| 12563/12563 [03:22<00:00, 62.03it/s, loss=0.675]
Validation epoch 3: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 165.97it/s, loss=0.659]
Training epoch 4: 100%|███████████████████████████| 12563/12563 [03:48<00:00, 55.03it/s, loss=0.609]
Validation epoch 4: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 168.51it/s, loss=0.704]
Training epoch 5: 100%|███████████████████████████| 12563/12563 [03:19<00:00, 63.08it/s, loss=0.668]
Validation epoch 5: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 168.66it/s, lo

# Model 1 Test
batch_size = 64, lr = 0.001

In [24]:
model.to(device)
model.eval()

x = []
y = []
pred_y = []

j = 0

batch_x, batch_y, batch_x_lens, batch_y_lens = next(iter(data_loader_test_64))
batch_pred_y = model(batch_x.to(device), batch_x_lens)
batch_pred_y = batch_pred_y.view(-1, vocab_size)
batch_y = batch_y.view(-1).to(device)


x = batch_x
y = batch_y.detach().cpu()
pred_y = batch_pred_y.detach().cpu()

In [46]:
def to_char(x):
  return '' if x==char_to_id['<pad>'] else id_to_char[x]

tc = np.vectorize(to_char)
result_x = tc(x.numpy())
result_y = np.reshape(tc(y.numpy()), [x.shape[0],-1])

result_pred_y = np.empty_like(result_y)
reshape_pred_y = np.reshape(pred_y.numpy(), [x.shape[0],x.shape[1],-1])
for i in range(x.shape[0]):
  for j in range(x.shape[1]):
      result_pred_y[i,j] = tc(np.argmax(reshape_pred_y[i,j]))

In [47]:
def get_result(y, pred_y):
    output_pred_y = pred_y
    for i in range(y.shape[0]):
        for j in range(y.shape[1]):
            if y[i,j] == '':
                output_pred_y[i,j] = ''
    return y, output_pred_y

In [53]:
print("{:<20}{:<15}{:<15}".format("input", "output", "answer"))

correct = .0

output_y, output_pred_y = get_result(result_y, result_pred_y)

for i in range(result_x.shape[0]):
    if i<15:
        print("{:<20}{:<15}{:<15}{:<5}".format(''.join(result_x[i]), 
                                             ''.join(output_pred_y[i]), 
                                             ''.join(output_y[i]), 
                                             'correct' if (''.join(output_pred_y[i])==''.join(output_y[i])) else 'wrong'
                                            )
           )
    if ''.join(output_pred_y[i])==''.join(output_y[i]):
        correct = correct + 1.
        
print("...\ntotal: {:} samples".format(result_x.shape[0]))
print("accuracy = {:.2f}".format(correct / result_x.shape[0]))

input               output         answer         
47+11-36=           23<eos>        22<eos>        wrong
29-2+30=            55<eos>        57<eos>        wrong
(19-15)+28=         32<eos>        32<eos>        correct
7+(39-8)=           39<eos>        38<eos>        wrong
14+(14-23)=         6<eos>         5<eos>         wrong
(23-13)+23=         32<eos>        33<eos>        wrong
(46+6)-39=          14<eos>        13<eos>        wrong
11-19-17=           -23<eos>       -25<eos>       wrong
(46-49)+13=         10<eos>        10<eos>        correct
41-(13+40)=         -10<eos>       -12<eos>       wrong
25+45+2=            74<eos>        72<eos>        wrong
(6-37)+11=          -20<eos>       -20<eos>       correct
(33+27)-32=         27<eos>        28<eos>        wrong
(19+18)-29=         8<eos>         8<eos>         correct
26+(0-30)=          -4<eos>        -4<eos>        correct
...
total: 64 samples
accuracy = 0.31


# Model 2 Train
batch_size = 128, lr = 0.001

In [54]:
lr = 0.001

In [55]:
model = CharRNN(vocab_size, embed_dim, hidden_dim)
model = model.to(device)
model.train()

for epoch in range(1, epochs+1):
    i = 0
    j = 0

    process_bar = tqdm(data_loader_train_128, desc=f"Training epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in process_bar:
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        i+=1
        if i%10==0:
            process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            process_bar.update(1)

    validation_process_bar = tqdm(data_loader_val_128, desc=f"Validation epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        j+=1
        if j%10==0:
            validation_process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            validation_process_bar.update(1)

Training epoch 1: 100%|█████████████████████████████| 6282/6282 [02:18<00:00, 45.20it/s, loss=2.901]
Validation epoch 1: 100%|████████████████████████████| 786/786 [00:07<00:00, 107.32it/s, loss=2.899]
Training epoch 2: 100%|█████████████████████████████| 6282/6282 [01:55<00:00, 54.20it/s, loss=2.899]
Validation epoch 2: 100%|████████████████████████████| 786/786 [00:07<00:00, 106.00it/s, loss=2.902]
Training epoch 3: 100%|█████████████████████████████| 6282/6282 [02:15<00:00, 46.36it/s, loss=2.900]
Validation epoch 3: 100%|████████████████████████████| 786/786 [00:07<00:00, 104.11it/s, loss=2.899]
Training epoch 4: 100%|█████████████████████████████| 6282/6282 [02:26<00:00, 42.75it/s, loss=2.895]
Validation epoch 4: 100%|████████████████████████████| 786/786 [00:07<00:00, 107.04it/s, loss=2.899]
Training epoch 5: 100%|█████████████████████████████| 6282/6282 [01:47<00:00, 58.44it/s, loss=2.896]
Validation epoch 5: 100%|█████████████████████████████| 786/786 [00:08<00:00, 92.78it/s, lo

# Model 2 Test
batch_size = 128, lr = 0.001

In [56]:
model.to(device)
model.eval()

x = []
y = []
pred_y = []

j = 0

batch_x, batch_y, batch_x_lens, batch_y_lens = next(iter(data_loader_test_128))
batch_pred_y = model(batch_x.to(device), batch_x_lens)
batch_pred_y = batch_pred_y.view(-1, vocab_size)
batch_y = batch_y.view(-1).to(device)


x = batch_x
y = batch_y.detach().cpu()
pred_y = batch_pred_y.detach().cpu()

In [57]:
def to_char(x):
  return '' if x==char_to_id['<pad>'] else id_to_char[x]

tc = np.vectorize(to_char)
result_x = tc(x.numpy())
result_y = np.reshape(tc(y.numpy()), [x.shape[0],-1])

result_pred_y = np.empty_like(result_y)
reshape_pred_y = np.reshape(pred_y.numpy(), [x.shape[0],x.shape[1],-1])
for i in range(x.shape[0]):
  for j in range(x.shape[1]):
      result_pred_y[i,j] = tc(np.argmax(reshape_pred_y[i,j]))

In [59]:
print("{:<20}{:<15}{:<15}".format("input", "output", "answer"))

correct = .0

output_y, output_pred_y = get_result(result_y, result_pred_y)

for i in range(result_x.shape[0]):
    if i<15:
        print("{:<20}{:<15}{:<15}{:<5}".format(''.join(result_x[i]), 
                                             ''.join(output_pred_y[i]), 
                                             ''.join(output_y[i]), 
                                             'correct' if (''.join(output_pred_y[i])==''.join(output_y[i])) else 'wrong'
                                            )
           )
    if ''.join(output_pred_y[i])==''.join(output_y[i]):
        correct = correct + 1.
        
print("...\ntotal: {:} samples".format(result_x.shape[0]))
print("accuracy = {:.2f}".format(correct / result_x.shape[0]))

input               output         answer         
(14-1)+16=          (99            29<eos>        wrong
(8+7)-39=           99((           -24<eos>       wrong
30+29+43=           (4((           102<eos>       wrong
5+(46-15)=          944            36<eos>        wrong
26+18-42=           *(             2<eos>         wrong
37+(47-8)=          944            76<eos>        wrong
12-15+25=           *(9            22<eos>        wrong
(29-0)+17=          *44            46<eos>        wrong
22+(10-38)=         (49            -6<eos>        wrong
(15+41)-18=         *9(            38<eos>        wrong
10+33+15=           4((            58<eos>        wrong
(37-12)+36=         949            61<eos>        wrong
12-(3+24)=          944(           -15<eos>       wrong
17-(10+21)=         (4((           -14<eos>       wrong
49-43-17=           *4((           -11<eos>       wrong
...
total: 128 samples
accuracy = 0.00


# Model 3 Train
batch_size = 64, lr = 0.002

In [61]:
lr = 0.002

In [62]:
model = CharRNN(vocab_size, embed_dim, hidden_dim)
model = model.to(device)
model.train()

for epoch in range(1, epochs+1):
    i = 0
    j = 0

    process_bar = tqdm(data_loader_train_64, desc=f"Training epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in process_bar:
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        i+=1
        if i%10==0:
            # process_bar.set_postfix(loss=loss.item())
            process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            process_bar.update(1)

    validation_process_bar = tqdm(data_loader_val_64, desc=f"Validation epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        j+=1
        if j%10==0:
            validation_process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            validation_process_bar.update(1)

Training epoch 1: 100%|███████████████████████████| 12563/12563 [03:42<00:00, 56.59it/s, loss=2.895]
Validation epoch 1: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 165.13it/s, loss=2.897]
Training epoch 2: 100%|███████████████████████████| 12563/12563 [02:38<00:00, 79.19it/s, loss=2.888]
Validation epoch 2: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 169.23it/s, loss=2.889]
Training epoch 3: 100%|███████████████████████████| 12563/12563 [03:09<00:00, 66.29it/s, loss=2.889]
Validation epoch 3: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 167.63it/s, loss=2.893]
Training epoch 4: 100%|███████████████████████████| 12563/12563 [02:47<00:00, 74.96it/s, loss=2.887]
Validation epoch 4: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 167.52it/s, loss=2.895]
Training epoch 5: 100%|███████████████████████████| 12563/12563 [02:31<00:00, 83.01it/s, loss=2.891]
Validation epoch 5: 100%|██████████████████████████| 1571/1571 [00:09<00:00, 164.32it/s, lo

# Model 3 Test
batch_size = 64, lr = 0.025

In [64]:
model.to(device)
model.eval()

x = []
y = []
pred_y = []

j = 0

batch_x, batch_y, batch_x_lens, batch_y_lens = next(iter(data_loader_test_128))
batch_pred_y = model(batch_x.to(device), batch_x_lens)
batch_pred_y = batch_pred_y.view(-1, vocab_size)
batch_y = batch_y.view(-1).to(device)


x = batch_x
y = batch_y.detach().cpu()
pred_y = batch_pred_y.detach().cpu()

In [65]:
tc = np.vectorize(to_char)
result_x = tc(x.numpy())
result_y = np.reshape(tc(y.numpy()), [x.shape[0],-1])

result_pred_y = np.empty_like(result_y)
reshape_pred_y = np.reshape(pred_y.numpy(), [x.shape[0],x.shape[1],-1])
for i in range(x.shape[0]):
  for j in range(x.shape[1]):
      result_pred_y[i,j] = tc(np.argmax(reshape_pred_y[i,j]))

In [66]:
print("{:<20}{:<15}{:<15}".format("input", "output", "answer"))

correct = .0

output_y, output_pred_y = get_result(result_y, result_pred_y)

for i in range(result_x.shape[0]):
    if i<15:
        print("{:<20}{:<15}{:<15}{:<5}".format(''.join(result_x[i]), 
                                             ''.join(output_pred_y[i]), 
                                             ''.join(output_y[i]), 
                                             'correct' if (''.join(output_pred_y[i])==''.join(output_y[i])) else 'wrong'
                                            )
           )
    if ''.join(output_pred_y[i])==''.join(output_y[i]):
        correct = correct + 1.
        
print("...\ntotal: {:} samples".format(result_x.shape[0]))
print("accuracy = {:.2f}".format(correct / result_x.shape[0]))

input               output         answer         
45+(29-38)=         (<eos>(        36<eos>        wrong
46-3+15=            <eos>(         58<eos>        wrong
47+(4-28)=          (+(            23<eos>        wrong
34-(40+12)=         (<eos>(6       -18<eos>       wrong
(22-46)+41=         +6             17<eos>        wrong
3+10-44=            6<eos>(6       -31<eos>       wrong
23-49+14=           <eos>(6        -12<eos>       wrong
(49-27)+48=         (<eos>(        70<eos>        wrong
28-(49+32)=         (<eos>(6       -53<eos>       wrong
(47-27)+15=         <eos>(         35<eos>        wrong
22-(44+24)=         (6(6           -46<eos>       wrong
45+16-0=            6<eos>6        61<eos>        wrong
17+1-42=            +<eos>66       -24<eos>       wrong
18-15-45=           <eos>(6        -42<eos>       wrong
35-49-28=           6<eos><eos>6   -42<eos>       wrong
...
total: 128 samples
accuracy = 0.01


# Dataset 2 with Model 1

In [10]:
new_df2 = df[~df['src'].str.contains("\*")]

In [13]:
import math

upper = math.ceil(new_df2['src'].str.len().mean() + 2 * new_df2['src'].str.len().std())
lower = math.floor(new_df2['src'].str.len().mean() - 2 * new_df2['src'].str.len().std())

In [14]:
new_df2['len'] = new_df2['src'].str.len()
new_df2 = new_df2[(new_df2['len'] >= lower) & (new_df2['len'] <= upper)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['len'] = new_df2['src'].str.len()


In [15]:
# 把資料轉成id
new_df2['src'] = new_df2['src'].apply(lambda text: [char_to_id[char] for char in text])
new_df2['tgt'] = new_df2['tgt'].apply(lambda num: [char_to_id[char] for char in str(num)])

In [16]:
# 把資料分成train, val, test = 0.8 : 0.1 : 0.1
train_data2, val_data2 = train_test_split(new_df2, test_size=0.2, random_state=250)
val_data2, test_data2 = train_test_split(val_data2, test_size=0.5, random_state=250)

In [17]:
dataset2_train = Dataset(train_data2)
dataset2_val = Dataset(val_data2)
dataset2_test = Dataset(test_data2)

In [18]:
data_loader_train_64 = torch.utils.data.DataLoader(dataset2_train,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

data_loader_val_64 = torch.utils.data.DataLoader(dataset2_val,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

data_loader_test_64 = torch.utils.data.DataLoader(dataset2_test,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

## Train

In [19]:
batch_size = 64
lr = 0.001

In [26]:
from tqdm import tqdm
model = model.to(device)
model.train()

for epoch in range(1, epochs+1):
    i = 0
    j = 0

    process_bar = tqdm(data_loader_train_64, desc=f"Training epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in process_bar:
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        i+=1
        if i%10==0:
            process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            process_bar.update(1)

    validation_process_bar = tqdm(data_loader_val_64, desc=f"Validation epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        j+=1
        if j%10==0:
            validation_process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            validation_process_bar.update(1)

Training epoch 1: 100%|███████████████████████████| 12540/12540 [03:03<00:00, 68.39it/s, loss=0.641]
Validation epoch 1: 100%|██████████████████████████| 1568/1568 [00:09<00:00, 168.74it/s, loss=0.649]
Training epoch 2: 100%|███████████████████████████| 12540/12540 [02:36<00:00, 80.06it/s, loss=0.739]
Validation epoch 2: 100%|██████████████████████████| 1568/1568 [00:09<00:00, 168.68it/s, loss=0.681]
Training epoch 3: 100%|███████████████████████████| 12540/12540 [02:48<00:00, 74.22it/s, loss=0.726]
Validation epoch 3: 100%|██████████████████████████| 1568/1568 [00:10<00:00, 154.44it/s, loss=0.649]
Training epoch 4: 100%|███████████████████████████| 12540/12540 [02:49<00:00, 73.91it/s, loss=0.597]
Validation epoch 4: 100%|██████████████████████████| 1568/1568 [00:10<00:00, 144.24it/s, loss=0.690]
Training epoch 5: 100%|███████████████████████████| 12540/12540 [03:19<00:00, 62.90it/s, loss=0.657]
Validation epoch 5: 100%|██████████████████████████| 1568/1568 [00:09<00:00, 168.27it/s, lo

## Test
batch_size = 64, lr = 0.001

In [27]:
model.to(device)
model.eval()

x = []
y = []
pred_y = []

j = 0

batch_x, batch_y, batch_x_lens, batch_y_lens = next(iter(data_loader_test_64))
batch_pred_y = model(batch_x.to(device), batch_x_lens)
batch_pred_y = batch_pred_y.view(-1, vocab_size)
batch_y = batch_y.view(-1).to(device)


x = batch_x
y = batch_y.detach().cpu()
pred_y = batch_pred_y.detach().cpu()

In [28]:
def to_char(x):
  return '' if x==char_to_id['<pad>'] else id_to_char[x]

tc = np.vectorize(to_char)
result_x = tc(x.numpy())
result_y = np.reshape(tc(y.numpy()), [x.shape[0],-1])

result_pred_y = np.empty_like(result_y)
reshape_pred_y = np.reshape(pred_y.numpy(), [x.shape[0],x.shape[1],-1])
for i in range(x.shape[0]):
  for j in range(x.shape[1]):
      result_pred_y[i,j] = tc(np.argmax(reshape_pred_y[i,j]))

In [29]:
def get_result(y, pred_y):
    output_pred_y = pred_y
    for i in range(y.shape[0]):
        for j in range(y.shape[1]):
            if y[i,j] == '':
                output_pred_y[i,j] = ''
    return y, output_pred_y

In [30]:
print("{:<20}{:<15}{:<15}".format("input", "output", "answer"))

correct = .0

output_y, output_pred_y = get_result(result_y, result_pred_y)

for i in range(result_x.shape[0]):
    if i<15:
        print("{:<20}{:<15}{:<15}{:<5}".format(''.join(result_x[i]), 
                                             ''.join(output_pred_y[i]), 
                                             ''.join(output_y[i]), 
                                             'correct' if (''.join(output_pred_y[i])==''.join(output_y[i])) else 'wrong'
                                            )
           )
    if ''.join(output_pred_y[i])==''.join(output_y[i]):
        correct = correct + 1.
        
print("...\ntotal: {:} samples".format(result_x.shape[0]))
print("accuracy = {:.2f}".format(correct / result_x.shape[0]))

input               output         answer         
23+(40-48)=         16<eos>        15<eos>        wrong
44-2-46=            -4<eos>        -4<eos>        correct
14+44+10=           67<eos>        68<eos>        wrong
6-4+34=             36<eos>        36<eos>        correct
(3-10)+12=          7<eos>         5<eos>         wrong
36-15+6=            27<eos>        27<eos>        correct
(21+2)-45=          -22<eos>       -22<eos>       correct
20-6-33=            -18<eos>       -19<eos>       wrong
(38+34)-30=         41<eos>        42<eos>        wrong
4-28-45=            -60<eos>       -69<eos>       wrong
7-(32+10)=          -40<eos>       -35<eos>       wrong
35+(49-5)=          73<eos>        79<eos>        wrong
6+(38-38)=          6<eos>         6<eos>         correct
46+(1-28)=          10<eos>        19<eos>        wrong
(9-13)+49=          46<eos>        45<eos>        wrong
...
total: 64 samples
accuracy = 0.19


# Origianl Dataset with Model 1

In [18]:
# 把資料轉成id
df_original = df
df_original['src'] = df_original['src'].apply(lambda text: [char_to_id[char] for char in text])
df_original['tgt'] = df_original['tgt'].apply(lambda num: [char_to_id[char] for char in str(num)])

In [19]:
# 把資料分成train, val, test = 0.8 : 0.1 : 0.1
train_data_original, val_data_original = train_test_split(df_original, test_size=0.2, random_state=250)
val_data_original, test_data_original = train_test_split(val_data_original, test_size=0.5, random_state=250)

In [20]:
dataset_original_train = Dataset(train_data_original)
dataset_original_val = Dataset(val_data_original)
dataset_original_test = Dataset(test_data_original)

In [21]:
data_loader_train_orignal = torch.utils.data.DataLoader(dataset_original_train,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

data_loader_val_orignal = torch.utils.data.DataLoader(dataset_original_val,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

data_loader_test_orignal = torch.utils.data.DataLoader(dataset_original_test,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

## Train

In [22]:
batch_size = 64
lr = 0.001

In [23]:
from tqdm import tqdm
model = model.to(device)
model.train()

for epoch in range(1, epochs+1):
    i = 0
    j = 0

    process_bar = tqdm(data_loader_train_orignal, desc=f"Training epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in process_bar:
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        i+=1
        if i%10==0:
            process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            process_bar.update(1)

    validation_process_bar = tqdm(data_loader_val_orignal, desc=f"Validation epoch {epoch}", ncols=100)
    for batch_x, batch_y, batch_x_lens, batch_y_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, vocab_size)
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        j+=1
        if j%10==0:
            validation_process_bar.set_postfix(loss="{:.3f}".format(loss.item()))
            validation_process_bar.update(1)

Training epoch 1: 100%|███████████████████████████| 32907/32907 [13:14<00:00, 41.40it/s, loss=1.086]
Validation epoch 1: 100%|███████████████████████████| 4114/4114 [00:47<00:00, 86.95it/s, loss=1.104]
Training epoch 2: 100%|███████████████████████████| 32907/32907 [12:48<00:00, 42.79it/s, loss=1.070]
Validation epoch 2: 100%|███████████████████████████| 4114/4114 [00:43<00:00, 94.21it/s, loss=1.139]
Training epoch 3: 100%|███████████████████████████| 32907/32907 [12:19<00:00, 44.50it/s, loss=1.265]
Validation epoch 3: 100%|███████████████████████████| 4114/4114 [00:43<00:00, 95.42it/s, loss=1.046]
Training epoch 4: 100%|███████████████████████████| 32907/32907 [12:09<00:00, 45.10it/s, loss=1.078]
Validation epoch 4: 100%|███████████████████████████| 4114/4114 [00:44<00:00, 93.23it/s, loss=1.141]
Training epoch 5: 100%|███████████████████████████| 32907/32907 [12:30<00:00, 43.87it/s, loss=1.145]
Validation epoch 5: 100%|███████████████████████████| 4114/4114 [00:48<00:00, 84.24it/s, lo

## Test
batch_size = 64, lr = 0.001

In [24]:
model.to(device)
model.eval()

x = []
y = []
pred_y = []

j = 0

batch_x, batch_y, batch_x_lens, batch_y_lens = next(iter(data_loader_test_orignal))
batch_pred_y = model(batch_x.to(device), batch_x_lens)
batch_pred_y = batch_pred_y.view(-1, vocab_size)
batch_y = batch_y.view(-1).to(device)


x = batch_x
y = batch_y.detach().cpu()
pred_y = batch_pred_y.detach().cpu()

In [25]:
def to_char(x):
  return '' if x==char_to_id['<pad>'] else id_to_char[x]

tc = np.vectorize(to_char)
result_x = tc(x.numpy())
result_y = np.reshape(tc(y.numpy()), [x.shape[0],-1])

result_pred_y = np.empty_like(result_y)
reshape_pred_y = np.reshape(pred_y.numpy(), [x.shape[0],x.shape[1],-1])
for i in range(x.shape[0]):
  for j in range(x.shape[1]):
      result_pred_y[i,j] = tc(np.argmax(reshape_pred_y[i,j]))

In [26]:
def get_result(y, pred_y):
    output_pred_y = pred_y
    for i in range(y.shape[0]):
        for j in range(y.shape[1]):
            if y[i,j] == '':
                output_pred_y[i,j] = ''
    return y, output_pred_y

In [27]:
print("{:<20}{:<15}{:<15}".format("input", "output", "answer"))

correct = .0

output_y, output_pred_y = get_result(result_y, result_pred_y)

for i in range(result_x.shape[0]):
    if i<15:
        print("{:<20}{:<15}{:<15}{:<5}".format(''.join(result_x[i]), 
                                             ''.join(output_pred_y[i]), 
                                             ''.join(output_y[i]), 
                                             'correct' if (''.join(output_pred_y[i])==''.join(output_y[i])) else 'wrong'
                                            )
           )
    if ''.join(output_pred_y[i])==''.join(output_y[i]):
        correct = correct + 1.
        
print("...\ntotal: {:} samples".format(result_x.shape[0]))
print("accuracy = {:.2f}".format(correct / result_x.shape[0]))

input               output         answer         
26*14-45=           301<eos>       319<eos>       wrong
46+(38*26)=         900<eos><eos>  1034<eos>      wrong
(39+8)-36=          12<eos>        11<eos>        wrong
(31+16)*14=         778<eos>       658<eos>       wrong
(49-27)*9=          177<eos>       198<eos>       wrong
19*16*3=            100<eos>       912<eos>       wrong
(19*41)+10=         700<eos>       789<eos>       wrong
10*46-31=           401<eos>       429<eos>       wrong
40-25*15=           -200<eos>      -335<eos>      wrong
27+(45-8)=          60<eos>        64<eos>        wrong
(32+24)*9=          570<eos>       504<eos>       wrong
23*47-47=           107<eos><eos>  1034<eos>      wrong
32-(30+34)=         -32<eos>       -32<eos>       correct
32+(46*36)=         1672<eos>      1688<eos>      wrong
49*5*41=            1070<eos><eos> 10045<eos>     wrong
...
total: 64 samples
accuracy = 0.06
