## 第8章ニューラルネットワーク
第7章で取り組んだポジネガ分類を題材として、ニューラルネットワークで分類モデルを実装する。なお、この章ではPyTorchやTensorFlow、JAXなどの深層学習フレームワークを活用せよ。

### 70.単語埋め込みの読み込み



In [1]:
import gensim
import numpy as np

model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

vocab = list(model.key_to_index.keys())

word_to_id = {'<PAD>': 0}
id_to_word = {0: '<PAD>'}

for i, word in enumerate(vocab, start=1):
    word_to_id[word] = i
    id_to_word[i] = word

embedding_dim = model.vector_size  # 300
vocab_size = len(word_to_id)

embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)

for word, idx in word_to_id.items():

    if word == '<PAD>':
        continue
    embedding_matrix[idx] = model[word]

print(embedding_matrix)

[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.1291504e-03 -8.9645386e-04  3.1852722e-04 ... -1.5640259e-03
  -1.2302399e-04 -8.6307526e-05]
 [ 7.0312500e-02  8.6914062e-02  8.7890625e-02 ... -4.7607422e-02
   1.4465332e-02 -6.2500000e-02]
 ...
 [-1.9653320e-02 -9.0820312e-02 -1.9409180e-02 ... -1.6357422e-02
  -1.3427734e-02  4.6630859e-02]
 [ 3.2714844e-02 -3.2226562e-02  3.6132812e-02 ... -8.8500977e-03
   2.6977539e-02  1.9042969e-02]
 [ 4.5166016e-02 -4.5166016e-02 -3.9367676e-03 ...  7.9589844e-02
   7.2265625e-02  1.3000488e-02]]


### 71.データセットの読み込み

In [2]:
import pandas as pd
import torch

def make_data(df):
    result = []
    for i, line in df.iterrows():
        text = line['sentence']
        label = float(line['label'])
        words = text.split(' ')
        input_id = []
        for word in words:
            if word in word_to_id:
                input_id.append(word_to_id[word])
        if len(input_id) == 0:
            continue
        result.append({'text': text, 'label': torch.tensor([label], dtype=torch.float32), 'input_id': torch.tensor(input_id, dtype=torch.long)})
    return result

df_train = pd.read_csv('cp07-data/SST-2/train.tsv', sep='\t')
df_dev = pd.read_csv('cp07-data/SST-2/dev.tsv', sep='\t')

data_train = make_data(df_train)
data_dev = make_data(df_dev)

print(data_train[:5])

[{'text': 'hide new secretions from the parental units ', 'label': tensor([0.]), 'input_id': tensor([  5785,     66, 113845,     18,     12,  15095,   1594])}, {'text': 'contains no wit , only labored gags ', 'label': tensor([0.]), 'input_id': tensor([ 3475,    87, 15888,    90, 27695, 42637])}, {'text': 'that loves its characters and communicates something rather beautiful about human nature ', 'label': tensor([1.]), 'input_id': tensor([    4,  5053,    45,  3305, 31647,   348,   904,  2815,    47,  1276,
         1964])}, {'text': 'remains utterly satisfied to remain the same throughout ', 'label': tensor([0.]), 'input_id': tensor([  987, 14528,  4941,   873,    12,   208,   898])}, {'text': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', 'label': tensor([0.]), 'input_id': tensor([    6,    12,  1445, 43789,    12, 10946,    76, 41349,    42])}]


### 72.Bag of Wordsモデルの構築

In [3]:
import torch.nn as nn

class AvgEmbeddingClassifier(nn.Module):
    def __init__(self, embedding_matrix, ):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix), freeze=True)
        self.linear = nn.Linear(embedding_matrix.shape[1], 1)

    def forward(self, input_id):
        embedding = self.embedding(input_id)
        mean_embed = embedding.mean(dim=0)    
        return self.linear(mean_embed)      

### 73.モデルの学習


In [4]:
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AvgEmbeddingClassifier(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.BCEWithLogitsLoss()

def train(model, data_train):
    model.train()
    train_loss = []
    for item in tqdm.tqdm(data_train):
        optimizer.zero_grad()
        input_id = item['input_id'].to(device)
        label = item['label'].to(device)
        logit = model(input_id)
        loss = criterion(logit, label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    return np.mean(train_loss)

        
def dev(model, data_dev):
    model.eval()
    dev_loss = []
    for item in data_dev:
        input_id = item['input_id'].to(device)
        label = item['label'].to(device)
        logit = model(input_id)
        loss = criterion(logit, label)
        dev_loss.append(loss.item())
    return np.mean(dev_loss)

for epoch in range(5):
    train_loss = train(model, data_train)
    dev_loss = dev(model, data_dev)
    print(f'epoch:{epoch}\ntrain loss:{train_loss}\ndev loss:{dev_loss}')

torch.save(model.state_dict(), '/home/yokoyama/nlp-100/models/cp08/73.pt')

100%|██████████| 66650/66650 [03:31<00:00, 314.49it/s]


epoch:0
train loss:0.6517127129303631
dev loss:0.6569329247630518


100%|██████████| 66650/66650 [03:31<00:00, 315.65it/s]


epoch:1
train loss:0.5933183649263923
dev loss:0.6275609674085991


100%|██████████| 66650/66650 [03:30<00:00, 317.04it/s]


epoch:2
train loss:0.5529407278199059
dev loss:0.604758490278565


100%|██████████| 66650/66650 [03:28<00:00, 319.06it/s]


epoch:3
train loss:0.5238418927346619
dev loss:0.5870513897688222


100%|██████████| 66650/66650 [03:27<00:00, 320.58it/s]


epoch:4
train loss:0.5022094790140043
dev loss:0.5729928026851608


### 74.モデルの評価

In [5]:
from sklearn.metrics import accuracy_score

def dev_acc(model, data_dev):
    model.eval()
    pred_labels = []
    true_labels = []
    for item in data_dev:
        input_id = item['input_id'].to(device)
        label = item['label'].to(device)
        preds = (torch.sigmoid(model(input_id)) > 0.5).float()
        pred_labels.append(preds.item())
        true_labels.append(label.item())
    return accuracy_score(true_labels, pred_labels)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AvgEmbeddingClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/home/yokoyama/nlp-100/models/cp08/73.pt"))

acc = dev_acc(model, data_dev)
print(f'acc:{acc}')


acc:0.7580275229357798


### 75.パディング

In [6]:
from torch.nn.utils.rnn import pad_sequence

def collate(data):
    data.sort(key=lambda x: len(x['input_id']), reverse=True)

    input_ids = [item["input_id"] for item in data]
    labels = [item["label"] for item in data]

    padded_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    result = {'input_id': padded_ids, 'label': labels}

    return result

### 76.ミニバッチ学習

In [7]:
from torch.utils.data import DataLoader


class MeanEmbeddingClassifier(nn.Module):
    def __init__(self, embedding_matrix, freeze_embedding=True):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix), freeze=freeze_embedding
        )
        self.linear = nn.Linear(embedding_matrix.shape[1], 1)

    def forward(self, input_ids):
        mask = (input_ids != 0).unsqueeze(-1)
        embedded = self.embedding(input_ids) * mask
        mean_embed = embedded.sum(1) / mask.sum(1).clamp(min=1)
        return self.linear(mean_embed).squeeze(1)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MeanEmbeddingClassifier(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

train_loader = DataLoader(data_train, batch_size=32, shuffle=True, collate_fn=collate)
dev_loader = DataLoader(data_dev, batch_size=32, shuffle=False, collate_fn=collate)

def train(model, loader):
    model.train()
    train_loss = []
    for batch in tqdm.tqdm(loader):
        input_id = batch['input_id'].to(device)
        label = batch['label'].to(device).squeeze(1)
        optimizer.zero_grad()
        logits = model(input_id)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    return np.mean(train_loss)

def dev(model, loader):
    model.eval()
    pred_labels = []
    true_labels = []
    for batch in loader:
        input_id = batch['input_id'].to(device)
        label = batch['label'].to(device).squeeze(1)
        logit = model(input_id)
        preds = (torch.sigmoid(logit) > 0.5).float()
        pred_labels.extend(preds.cpu().numpy())
        true_labels.extend(label.cpu().numpy())
    return accuracy_score(true_labels, pred_labels)

for epoch in range(50):
    train_loss = train(model, train_loader)
    dev_acc = dev(model, dev_loader)
    print(f'epoch:{epoch}\ntrain loss:{train_loss}\ndev acc:{dev_acc}')

torch.save(model.state_dict(), '/home/yokoyama/nlp-100/models/cp08/76.pt')

100%|██████████| 2083/2083 [00:07<00:00, 288.75it/s]


epoch:0
train loss:0.6450534793429803
dev acc:0.5894495412844036


100%|██████████| 2083/2083 [00:07<00:00, 286.69it/s]


epoch:1
train loss:0.5739368901364916
dev acc:0.7064220183486238


100%|██████████| 2083/2083 [00:07<00:00, 284.20it/s]


epoch:2
train loss:0.5279845911267281
dev acc:0.7488532110091743


100%|██████████| 2083/2083 [00:07<00:00, 277.67it/s]


epoch:3
train loss:0.4965325057993013
dev acc:0.7626146788990825


100%|██████████| 2083/2083 [00:07<00:00, 281.48it/s]


epoch:4
train loss:0.4741752711958572
dev acc:0.7729357798165137


100%|██████████| 2083/2083 [00:07<00:00, 283.24it/s]


epoch:5
train loss:0.45777928464354317
dev acc:0.7786697247706422


100%|██████████| 2083/2083 [00:07<00:00, 278.50it/s]


epoch:6
train loss:0.44533418870807784
dev acc:0.7752293577981652


100%|██████████| 2083/2083 [00:07<00:00, 279.24it/s]


epoch:7
train loss:0.4356222094621901
dev acc:0.7740825688073395


100%|██████████| 2083/2083 [00:07<00:00, 288.05it/s]


epoch:8
train loss:0.4279240795728168
dev acc:0.7786697247706422


100%|██████████| 2083/2083 [00:07<00:00, 289.61it/s]


epoch:9
train loss:0.42160200385222113
dev acc:0.7821100917431193


100%|██████████| 2083/2083 [00:07<00:00, 279.79it/s]


epoch:10
train loss:0.4163671706910896
dev acc:0.783256880733945


100%|██████████| 2083/2083 [00:07<00:00, 281.91it/s]


epoch:11
train loss:0.4119558104502142
dev acc:0.7844036697247706


100%|██████████| 2083/2083 [00:07<00:00, 286.35it/s]


epoch:12
train loss:0.4081896734148965
dev acc:0.7844036697247706


100%|██████████| 2083/2083 [00:07<00:00, 285.65it/s]


epoch:13
train loss:0.4049333620832565
dev acc:0.783256880733945


100%|██████████| 2083/2083 [00:07<00:00, 286.04it/s]


epoch:14
train loss:0.4020880385839876
dev acc:0.7809633027522935


100%|██████████| 2083/2083 [00:07<00:00, 282.01it/s]


epoch:15
train loss:0.39961481612288374
dev acc:0.7809633027522935


100%|██████████| 2083/2083 [00:07<00:00, 283.36it/s]


epoch:16
train loss:0.3974190500139675
dev acc:0.7821100917431193


100%|██████████| 2083/2083 [00:07<00:00, 285.72it/s]


epoch:17
train loss:0.3954647810797936
dev acc:0.783256880733945


100%|██████████| 2083/2083 [00:07<00:00, 290.11it/s]


epoch:18
train loss:0.3937086333349469
dev acc:0.783256880733945


100%|██████████| 2083/2083 [00:07<00:00, 281.86it/s]


epoch:19
train loss:0.3921314725009398
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:07<00:00, 277.84it/s]


epoch:20
train loss:0.39067243194127965
dev acc:0.7889908256880734


100%|██████████| 2083/2083 [00:07<00:00, 283.31it/s]


epoch:21
train loss:0.3893646748089985
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:07<00:00, 280.18it/s]


epoch:22
train loss:0.3881511412171063
dev acc:0.7889908256880734


100%|██████████| 2083/2083 [00:07<00:00, 283.92it/s]


epoch:23
train loss:0.387073974561016
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:07<00:00, 289.45it/s]


epoch:24
train loss:0.3860514767314171
dev acc:0.786697247706422


100%|██████████| 2083/2083 [00:07<00:00, 276.11it/s]


epoch:25
train loss:0.38511744213115695
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:07<00:00, 287.40it/s]


epoch:26
train loss:0.3842400571030032
dev acc:0.7912844036697247


100%|██████████| 2083/2083 [00:07<00:00, 280.55it/s]


epoch:27
train loss:0.3834388077087375
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:07<00:00, 283.37it/s]


epoch:28
train loss:0.38269867465093166
dev acc:0.7912844036697247


100%|██████████| 2083/2083 [00:07<00:00, 284.93it/s]


epoch:29
train loss:0.3819918540224569
dev acc:0.7901376146788991


100%|██████████| 2083/2083 [00:07<00:00, 282.79it/s]


epoch:30
train loss:0.3813406865340955
dev acc:0.7889908256880734


100%|██████████| 2083/2083 [00:07<00:00, 279.81it/s]


epoch:31
train loss:0.3807294530489098
dev acc:0.7924311926605505


100%|██████████| 2083/2083 [00:07<00:00, 285.70it/s]


epoch:32
train loss:0.38015882286725566
dev acc:0.7935779816513762


100%|██████████| 2083/2083 [00:07<00:00, 281.01it/s]


epoch:33
train loss:0.379603920013528
dev acc:0.7924311926605505


100%|██████████| 2083/2083 [00:07<00:00, 279.34it/s]


epoch:34
train loss:0.3791064567223685
dev acc:0.7947247706422018


100%|██████████| 2083/2083 [00:07<00:00, 280.69it/s]


epoch:35
train loss:0.3785961291435757
dev acc:0.7947247706422018


100%|██████████| 2083/2083 [00:07<00:00, 283.20it/s]


epoch:36
train loss:0.37816452617348817
dev acc:0.7947247706422018


100%|██████████| 2083/2083 [00:07<00:00, 281.69it/s]


epoch:37
train loss:0.3777206904088083
dev acc:0.7935779816513762


100%|██████████| 2083/2083 [00:07<00:00, 282.75it/s]


epoch:38
train loss:0.37733127819164064
dev acc:0.7970183486238532


100%|██████████| 2083/2083 [00:07<00:00, 280.43it/s]


epoch:39
train loss:0.3769299524853679
dev acc:0.7935779816513762


100%|██████████| 2083/2083 [00:07<00:00, 283.03it/s]


epoch:40
train loss:0.37657448509193875
dev acc:0.7958715596330275


100%|██████████| 2083/2083 [00:07<00:00, 285.32it/s]


epoch:41
train loss:0.3762138281384169
dev acc:0.7947247706422018


100%|██████████| 2083/2083 [00:07<00:00, 284.66it/s]


epoch:42
train loss:0.3759014042477967
dev acc:0.7970183486238532


100%|██████████| 2083/2083 [00:07<00:00, 285.76it/s]


epoch:43
train loss:0.3755786765010896
dev acc:0.7958715596330275


100%|██████████| 2083/2083 [00:07<00:00, 284.24it/s]


epoch:44
train loss:0.3752510964269542
dev acc:0.7970183486238532


100%|██████████| 2083/2083 [00:07<00:00, 285.01it/s]


epoch:45
train loss:0.3749641426268874
dev acc:0.7935779816513762


100%|██████████| 2083/2083 [00:07<00:00, 281.31it/s]


epoch:46
train loss:0.37471434884613886
dev acc:0.7958715596330275


100%|██████████| 2083/2083 [00:07<00:00, 286.54it/s]


epoch:47
train loss:0.37442593834489457
dev acc:0.7981651376146789


100%|██████████| 2083/2083 [00:07<00:00, 283.53it/s]


epoch:48
train loss:0.374194948813747
dev acc:0.7970183486238532


100%|██████████| 2083/2083 [00:07<00:00, 288.83it/s]


epoch:49
train loss:0.3739545902150557
dev acc:0.7970183486238532


### 77.GPU上での学習
76.と同様

### 78.単語埋め込みのファインチューニング

In [8]:
from torch.utils.data import DataLoader


class MeanEmbeddingClassifier(nn.Module):
    def __init__(self, embedding_matrix, freeze_embedding=False):#freeze_embedding=Falseで学習中にembedding層の重みを更新
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix), freeze=freeze_embedding
        )
        self.linear = nn.Linear(embedding_matrix.shape[1], 1)

    def forward(self, input_ids):
        mask = (input_ids != 0).unsqueeze(-1)
        embedded = self.embedding(input_ids) * mask
        mean_embed = embedded.sum(1) / mask.sum(1).clamp(min=1)
        return self.linear(mean_embed).squeeze(1)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MeanEmbeddingClassifier(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

train_loader = DataLoader(data_train, batch_size=32, shuffle=True, collate_fn=collate)
dev_loader = DataLoader(data_dev, batch_size=32, shuffle=False, collate_fn=collate)

def train(model, loader):
    model.train()
    train_loss = []
    for batch in tqdm.tqdm(loader):
        input_id = batch['input_id'].to(device)
        label = batch['label'].to(device).squeeze(1)
        optimizer.zero_grad()
        logits = model(input_id)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    return np.mean(train_loss)

def dev(model, loader):
    model.eval()
    pred_labels = []
    true_labels = []
    for batch in loader:
        input_id = batch['input_id'].to(device)
        label = batch['label'].to(device).squeeze(1)
        logit = model(input_id)
        preds = (torch.sigmoid(logit) > 0.5).float()
        pred_labels.extend(preds.cpu().numpy())
        true_labels.extend(label.cpu().numpy())
    return accuracy_score(true_labels, pred_labels)

for epoch in range(10):
    train_loss = train(model, train_loader)
    dev_acc = dev(model, dev_loader)
    print(f'epoch:{epoch}\ntrain loss:{train_loss}\ndev acc:{dev_acc}')

torch.save(model.state_dict(), '/home/yokoyama/nlp-100/models/cp08/78.pt')

100%|██████████| 2083/2083 [04:26<00:00,  7.82it/s]


epoch:0
train loss:0.5999459676941713
dev acc:0.7454128440366973


100%|██████████| 2083/2083 [04:26<00:00,  7.81it/s]


epoch:1
train loss:0.42035491369661093
dev acc:0.8073394495412844


100%|██████████| 2083/2083 [04:26<00:00,  7.80it/s]


epoch:2
train loss:0.32645510344538503
dev acc:0.8176605504587156


100%|██████████| 2083/2083 [04:27<00:00,  7.80it/s]


epoch:3
train loss:0.2797163834458238
dev acc:0.8268348623853211


100%|██████████| 2083/2083 [04:27<00:00,  7.80it/s]


epoch:4
train loss:0.251293467909484
dev acc:0.8245412844036697


100%|██████████| 2083/2083 [04:26<00:00,  7.80it/s]


epoch:5
train loss:0.23156868961142968
dev acc:0.823394495412844


100%|██████████| 2083/2083 [04:26<00:00,  7.80it/s]


epoch:6
train loss:0.21679434004099793
dev acc:0.8165137614678899


100%|██████████| 2083/2083 [04:26<00:00,  7.80it/s]


epoch:7
train loss:0.20519471702363076
dev acc:0.8188073394495413


100%|██████████| 2083/2083 [04:26<00:00,  7.80it/s]


epoch:8
train loss:0.1958073860090682
dev acc:0.8153669724770642


100%|██████████| 2083/2083 [04:26<00:00,  7.81it/s]


epoch:9
train loss:0.18785398108844453
dev acc:0.8153669724770642


### 79.アーキテクチャの変更

In [9]:
import torch.nn as nn



class RNNnet(nn.Module):
    def __init__(self, embedding_matrix, hidden_size=64, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix), freeze=True)
        self.rnn = nn.RNN(embedding_matrix.shape[1], hidden_size=hidden_size, num_layers=num_layers)
        self.linear = nn.Linear(hidden_size*2, 1)

    def forward(self, input_id):
        embedding = self.embedding(input_id)
        embedding = embedding.permute(1,0,2)
        output, h_n = self.rnn(embedding)
        h_f = h_n[-2]
        h_b = h_n[-1]
        h = torch.cat([h_f, h_b], dim=1)
        return self.linear(h).squeeze(1)


def train(model, loader):
    model.train()
    train_loss = []
    for batch in tqdm.tqdm(loader):
        input_id = batch['input_id'].to(device)
        label = batch['label'].to(device).squeeze(1)
        optimizer.zero_grad()
        logits = model(input_id)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    return np.mean(train_loss)

def dev(model, loader):
    model.eval()
    pred_labels = []
    true_labels = []
    for batch in loader:
        input_id = batch['input_id'].to(device)
        label = batch['label'].to(device).squeeze(1)
        logit = model(input_id)
        preds = (torch.sigmoid(logit) > 0.5).float()
        pred_labels.extend(preds.cpu().numpy())
        true_labels.extend(label.cpu().numpy())
    return accuracy_score(true_labels, pred_labels)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNnet(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(10):
    train_loss = train(model, train_loader)
    dev_acc = dev(model, dev_loader)
    print(f'epoch:{epoch}\ntrain loss:{train_loss}\ndev acc:{dev_acc}')

torch.save(model.state_dict(), '/home/yokoyama/nlp-100/models/cp08/79.pt')

100%|██████████| 2083/2083 [00:19<00:00, 107.37it/s]


epoch:0
train loss:0.5618758475585907
dev acc:0.805045871559633


100%|██████████| 2083/2083 [00:18<00:00, 112.25it/s]


epoch:1
train loss:0.3752494651754487
dev acc:0.7924311926605505


100%|██████████| 2083/2083 [00:19<00:00, 109.46it/s]


epoch:2
train loss:0.3656098894842874
dev acc:0.8096330275229358


100%|██████████| 2083/2083 [00:19<00:00, 109.00it/s]


epoch:3
train loss:0.360889786773013
dev acc:0.8107798165137615


100%|██████████| 2083/2083 [00:19<00:00, 108.62it/s]


epoch:4
train loss:0.355860928449989
dev acc:0.8027522935779816


100%|██████████| 2083/2083 [00:18<00:00, 110.36it/s]


epoch:5
train loss:0.35824947197578644
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:18<00:00, 109.87it/s]


epoch:6
train loss:0.35375129176456654
dev acc:0.786697247706422


100%|██████████| 2083/2083 [00:19<00:00, 108.98it/s]


epoch:7
train loss:0.3650477338784369
dev acc:0.7878440366972477


100%|██████████| 2083/2083 [00:19<00:00, 109.08it/s]


epoch:8
train loss:0.35630555269569947
dev acc:0.7912844036697247


100%|██████████| 2083/2083 [00:19<00:00, 109.57it/s]


epoch:9
train loss:0.3516535079907009
dev acc:0.8096330275229358
