In [1]:
import os
import torch

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import pdb
import time
import datetime
import numpy as np
import pandas as pd
import os
import csv
import random

from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import torch
from torch.utils.data import DataLoader
from torch_geometric.data import Data
import torch.nn.functional as F

device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')

2021-12-01 18:01:21.497363: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-01 18:01:21.497402: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


=====data load====

In [3]:
train = pd.read_csv("./data/sst-2/train.csv")
val = pd.read_csv("./data/sst-2/val.csv")
test = pd.read_csv("./data/sst-2/test.csv")

df = pd.DataFrame(train)
train = df.sample(n=2000, random_state=7)

min_co = 5
print(train.shape, val.shape, test.shape, "min_co :", min_co)

(2000, 2) (872, 2) (1821, 2) min_co : 5


=====AUG 시작 ====

In [4]:
import re
def get_only_chars(line):

    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line

====Graph 만들기===

In [5]:
from collections import Counter
from collections import defaultdict
from scipy.sparse import csr_matrix

def scan_vocabulary(train_input, min_count=2):
    counter = Counter(w for sent in train_input for w in sent)
    counter = {w:c for w,c in counter.items() if c >= min_count}
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
    vocab_to_idx = {vocab:idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

def dict_to_mat(d, n_rows, n_cols):
    rows, cols, data = [], [], []
    for (i, j), v in d.items():
        rows.append(i)
        cols.append(j)
        data.append(v)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=min_co):
    counter = defaultdict(int)
    for s, tokens_i in enumerate(tokens):
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k:v for k,v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    return dict_to_mat(counter, n_vocabs, n_vocabs), counter

def word_graph(train_input, min_count=2, window=2, min_cooccurrence=min_co):
    idx_to_vocab, vocab_to_idx = scan_vocabulary(train_input, min_count)
    tokens = [sent for sent in train_input]
    g, counter = cooccurrence(tokens, vocab_to_idx, window, min_cooccurrence)
    return g, idx_to_vocab, vocab_to_idx, counter
def get_graph_input(data):
    vocab = []
    train_input = []
    train_label = []
    for sentence, label in zip(data["sentence"], data["label"]):
        token_sen = get_only_chars(sentence)
        words = token_sen.split(' ')
        train_input.append(words)
        train_label.append(label)
        for word in words:
            if word not in vocab:
                vocab.append(word)
            else:
                pass
    return vocab, train_input, train_label

In [6]:
vocab, train_input, train_label = get_graph_input(train)
print(len(vocab), len(train_input), len(train_label))

7098 2000 2000


In [7]:
def preprocessing(data):
    data_input = []
    data_label = []
    for no, sentence in enumerate(data["sentence"]):
        token_sen = get_only_chars(sentence)
        words = token_sen.split(' ')
        data_input.append(words)
        data_label.append(data["label"][no])
    return data_input, data_label

In [8]:
val_input, val_label = preprocessing(val)
test_input, test_label = preprocessing(test)

In [9]:
def make_graph(train_input):
    g, idx_to_vocab, vocab_to_idx, counter = word_graph(train_input)
    mat = g.toarray()
    print("vocab :", len(vocab), "   ", "idx_to_vocab :", len(idx_to_vocab))
    print("vocab_to_idx = vocab : idx")
    print("counter = (node1, node2) : number")
    
    #make a edge
    edge_list = []

    for i, j in enumerate(mat):
        for l, occurance in enumerate(j):
            if occurance != 0:
                edge_list.append([vocab_to_idx[idx_to_vocab[i]], vocab_to_idx[idx_to_vocab[l]]])
            else:
                pass
    print("edge_list_len : ", len(edge_list))
    edge_index = torch.Tensor(edge_list).long()
    
    #make a node
    node_list = []
    for i in vocab_to_idx.values():
        node_list.append([i])
    print("node_list_len : ", len(node_list))
    node_index = torch.Tensor(node_list).float()
    
    print("edge, node size : ", edge_index.size(), node_index.size())
    

    graph_data = Data(x=node_index, edge_index = edge_index.t().contiguous())
    print("graph_data : ", graph_data)
    
    print("node : ", graph_data.num_nodes, "edge : ", graph_data.num_edges,
          "isolated node have : ", graph_data.contains_isolated_nodes(), 
          "self loop node have : ", graph_data.contains_self_loops(), 
          "key : ", graph_data.keys)
    return graph_data, vocab_to_idx, idx_to_vocab

In [10]:
graph_data, vocab_to_idx, idx_to_vocab = make_graph(train_input)

vocab : 7098     idx_to_vocab : 2863
vocab_to_idx = vocab : idx
counter = (node1, node2) : number
edge_list_len :  5156
node_list_len :  2863
edge, node size :  torch.Size([5156, 2]) torch.Size([2863, 1])
graph_data :  Data(x=[2863, 1], edge_index=[2, 5156])
node :  2863 edge :  5156 isolated node have :  True self loop node have :  True key :  ['x', 'edge_index']




In [11]:
edge_dic = {}
for i in graph_data.edge_index.t().numpy():
    if i[0] not in edge_dic.keys():
        edge_dic[i[0]] = [i[1]]
    else:
        edge_dic[i[0]].append(i[1])
print(len(edge_dic.keys()))

873


===AUG 기법 적용 ===

In [12]:
def tga03(train_input, train_label, graph_data, vocab_to_idx):
    aug_train_input = []
    aug_train_label = []
    for no, sentence in enumerate(train_input):        
        init_nodes_idx=[]
        nodes_idx = []
        for word in sentence:
            if word in vocab_to_idx.keys():
                nodes_idx.append(vocab_to_idx[word])
                init_nodes_idx.append(vocab_to_idx[word])
        num_edge = 10000
        init_node_idx = 2**100
        sw_node_idx = 2**100
        for node_idx in nodes_idx:
            if node_idx not in edge_dic.keys():
                nodes_idx.remove(node_idx)
                continue
            edge_list = edge_dic[node_idx]            
            for edge_node in edge_list:
                if edge_node not in init_nodes_idx:
                    edge_list.remove(edge_node)            
                    continue
            if len(edge_list) ==0:
                nodes_idx.remove(node_idx)
                continue                    
            if len(edge_list) < num_edge:
                num_edge = len(edge_list)
                init_node_idx = node_idx
                sw_node_idx = random.choice(edge_list)
        augdel_sentence = []+sentence
        augrep_sentence = []+sentence
        augsw_sentence = []+sentence
        augins_sentence = []+sentence
        if init_node_idx == 2**100:
            continue
        augdel_sentence.remove(idx_to_vocab[init_node_idx])
        aug_train_input.append(augdel_sentence)
        aug_train_label.append(train_label[no])          
        
        if sw_node_idx != 2**100:
            init_word_index = 0
            sw_word_index = 0
            for index, word in enumerate(sentence):
                if word not in vocab_to_idx.keys():
                    continue 
                if vocab_to_idx[word] == init_node_idx:
                    init_word_index = index
                if vocab_to_idx[word] == sw_node_idx:
                    sw_word_index = index
            augsw_sentence[sw_word_index] = idx_to_vocab[init_node_idx]
            augsw_sentence[init_word_index] = idx_to_vocab[sw_node_idx]
            aug_train_input.append(augsw_sentence)
            aug_train_label.append(train_label[no])    
        
        candidate = edge_dic[init_node_idx]
        if len(candidate) == 0:
            pass
        else:
            augrep_sentence[init_word_index] = idx_to_vocab[random.choice(candidate)]
            aug_train_input.append(augrep_sentence)
            aug_train_label.append(train_label[no])    
            augrep_sentence[init_word_index] = idx_to_vocab[random.choice(candidate)]
            aug_train_input.append(augrep_sentence)
            aug_train_label.append(train_label[no])    
            augins_sentence.insert(init_word_index, idx_to_vocab[random.choice(candidate)])
            aug_train_input.append(augins_sentence)
            aug_train_label.append(train_label[no])    
            augins_sentence.insert(init_word_index, idx_to_vocab[random.choice(candidate)])
            aug_train_input.append(augins_sentence)    
            aug_train_label.append(train_label[no])    
                    
        aug_train_input.append(sentence)
        aug_train_label.append(train_label[no])    
    return aug_train_input, aug_train_label
        

In [13]:
def tga01(train_input, train_label, graph_data, vocab_to_idx):
    aug_train_input = []
    aug_train_label = []
    for no, sentence in enumerate(train_input):        
        init_nodes_idx=[]
        nodes_idx = []
        for word in sentence:
            if word in vocab_to_idx.keys():
                nodes_idx.append(vocab_to_idx[word])
                init_nodes_idx.append(vocab_to_idx[word])
        num_edge = 0
        init_node_idx = 2**100
        sw_node_idx = 2**100
        for node_idx in nodes_idx:
            if node_idx not in edge_dic.keys():
                nodes_idx.remove(node_idx)
                continue
            edge_list = edge_dic[node_idx]            
            if len(edge_list) ==0:
                nodes_idx.remove(node_idx)
                continue
            for edge_node in edge_list:
                if edge_node not in init_nodes_idx:
                    edge_list.remove(edge_node)            
                    continue
            if len(edge_list) > num_edge:
                num_edge = len(edge_list)
                init_node_idx = node_idx
                sw_node_idx = random.choice(edge_list)
        augdel_sentence = []+sentence
        augrep_sentence = []+sentence
        augsw_sentence = []+sentence
        augins_sentence = []+sentence
        if init_node_idx == 2**100:
            continue
        augdel_sentence.remove(idx_to_vocab[init_node_idx])
        aug_train_input.append(augdel_sentence)
        aug_train_label.append(train_label[no])          
        
        if sw_node_idx != 2**100:
            init_word_index = 0
            sw_word_index = 0
            for index, word in enumerate(sentence):
                if word not in vocab_to_idx.keys():
                    continue 
                if vocab_to_idx[word] == init_node_idx:
                    init_word_index = index
                if vocab_to_idx[word] == sw_node_idx:
                    sw_word_index = index
            augsw_sentence[sw_word_index] = idx_to_vocab[init_node_idx]
            augsw_sentence[init_word_index] = idx_to_vocab[sw_node_idx]
            aug_train_input.append(augsw_sentence)
            aug_train_label.append(train_label[no])    
        
        candidate = edge_dic[init_node_idx]
        if len(candidate) == 0:
            pass
        else:
            augrep_sentence[init_word_index] = idx_to_vocab[random.choice(candidate)]
            aug_train_input.append(augrep_sentence)
            aug_train_label.append(train_label[no])    
            augrep_sentence[init_word_index] = idx_to_vocab[random.choice(candidate)]
            aug_train_input.append(augrep_sentence)
            aug_train_label.append(train_label[no])    
            augins_sentence.insert(init_word_index, idx_to_vocab[random.choice(candidate)])
            aug_train_input.append(augins_sentence)
            aug_train_label.append(train_label[no])    
            augins_sentence.insert(init_word_index, idx_to_vocab[random.choice(candidate)])
            aug_train_input.append(augins_sentence)    
            aug_train_label.append(train_label[no])    
                    
        aug_train_input.append(sentence)
        aug_train_label.append(train_label[no])    
    return aug_train_input, aug_train_label
        

In [14]:
aug_train_input, aug_train_label = tga03(train_input, train_label, graph_data, vocab_to_idx)

In [15]:
print(len(train_input), len(aug_train_input), len(aug_train_label)), print(min_co)

2000 13391 13391
5


(None, None)

In [16]:
def convert_input(input_data):
    temp_inputs = []
    for sentence in input_data:
        temp_inputs.append(" ".join(sentence))
    return temp_inputs

In [17]:
aug_train_inputs = convert_input(aug_train_input)
val_inputs = convert_input(val_input)
test_inputs = convert_input(test_input)

=====data preprocessing====

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
import torch

class text(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [20]:
def make_dataset(inputs, label):
    sentences = inputs
    labels = label
    print("sentences : ", len(sentences), "labels : ", len(labels))
    max_len = 128
    tokenized = tokenizer(sentences, truncation=True, padding=True)
    dataset = text(tokenized, labels)
    return dataset

In [21]:
train_dataset = make_dataset(aug_train_inputs, aug_train_label)
val_dataset = make_dataset(val_inputs, val_label)
test_dataset = make_dataset(test_inputs, test_label)

sentences :  13391 labels :  13391
sentences :  872 labels :  872
sentences :  1821 labels :  1821


====== BERT ======

In [22]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

epochs = 10

In [23]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.3235592667368196
---training epoch took :  0:01:16
===Running Validation===
 Val_accuracy :  0.8946428571428572
 Val took :  0:01:17

===Epoch :  2 / 10 ===
---Average training loss :  0.034031161067209076
---training epoch took :  0:01:19
===Running Validation===
 Val_accuracy :  0.8964285714285715
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.016863448481031117
---training epoch took :  0:01:19
===Running Validation===
 Val_accuracy :  0.8868303571428572
 Val took :  0:01:21

===Epoch :  4 / 10 ===
---Average training loss :  0.004277948725835553
---training epoch took :  0:01:19
===Running Validation===
 Val_accuracy :  0.8863839285714287
 Val took :  0:01:21

===Epoch :  5 / 10 ===
---Average training loss :  0.005066083256609826
---training epoch took :  0:01:19
===Running Validation===
 Val_accuracy :  0.8727678571428571
 Val took :  0:01:21

===Epoch :  6 / 10 ===
---Average training loss :  0.004

In [24]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.279878058390958
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8763392857142858
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.03352419794315384
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8901785714285715
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.01218162903872629
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8879464285714286
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.009663358224844116
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8772321428571429
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.008464655793172174
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8901785714285715
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.008169

In [25]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.28456493448022574
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8810267857142857
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.034649766878491
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8877232142857142
 Val took :  0:01:19

===Epoch :  3 / 10 ===
---Average training loss :  0.013441493298714271
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8950892857142857
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.00532644794272658
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8899553571428571
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.002891246109945877
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8839285714285714
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.003195

In [26]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.2802179616388111
---training epoch took :  0:01:17
===Running Validation===
 Val_accuracy :  0.8783482142857143
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.03835117303367172
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8966517857142857
 Val took :  0:01:19

===Epoch :  3 / 10 ===
---Average training loss :  0.022833154503522173
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8944196428571428
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.007626581082253584
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8863839285714287
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.00701026773922855
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8912946428571429
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.00694

In [27]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.2718585896616181
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8883928571428571
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.035701010249821204
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.890625
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.011729505650388697
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8859374999999999
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.009952433956676119
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8895089285714286
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.006969144642408494
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8799107142857142
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.0022901411290

In [28]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.28446609752164
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8796875000000001
 Val took :  0:01:20

===Epoch :  2 / 10 ===
---Average training loss :  0.04062493936868296
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8912946428571429
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.019913315760814364
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8848214285714285
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.014232135236462844
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8883928571428571
 Val took :  0:01:20

===Epoch :  5 / 10 ===
---Average training loss :  0.008359507983550429
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8899553571428571
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.008822

In [29]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.25186142789288646
---training epoch took :  0:01:17
===Running Validation===
 Val_accuracy :  0.8946428571428572
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.030345605718459758
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8919642857142858
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.013584412442564609
---training epoch took :  0:01:19
===Running Validation===
 Val_accuracy :  0.8928571428571429
 Val took :  0:01:20

===Epoch :  4 / 10 ===
---Average training loss :  0.006836462310803611
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.9035714285714286
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.003407987941699546
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8950892857142857
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.00

In [30]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.33461703459421793
---training epoch took :  0:01:17
===Running Validation===
 Val_accuracy :  0.8763392857142858
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.04571172697247849
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8863839285714287
 Val took :  0:01:19

===Epoch :  3 / 10 ===
---Average training loss :  0.01924738162418916
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8819196428571429
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.012314435583539307
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8834821428571429
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.006908645042373488
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.884375
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.00758132888780

In [31]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.27893046539365535
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8995535714285714
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.03181663874780671
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8935267857142858
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.009376075699748028
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8953125000000001
 Val took :  0:01:19

===Epoch :  4 / 10 ===
---Average training loss :  0.007366366571347628
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8912946428571429
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.00572816553910906
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8926339285714285
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.0053

In [32]:
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased')
BERTmodel.to(device)

train_loader = DataLoader(train_dataset, batch_size = 64, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size = 64, shuffle = True)
optim = AdamW(BERTmodel.parameters(), lr=0.00001)

for epoch in range(0, epochs):
    print("")
    print("===Epoch : ", epoch+1, "/", epochs, "===")
    t0 = time.time()
    total_loss = 0
    BERTmodel.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = BERTmodel(input_ids, attention_mask=attention_mask, labels=labels)
        loss=outputs[0]
        total_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = total_loss / len(train_loader)
    print("---Average training loss : ", avg_train_loss)
    print("---training epoch took : ", format_time(time.time()-t0))
    print("===Running Validation===")    
    
    BERTmodel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in eval_loader:        
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            b_outputs = BERTmodel(b_input_ids, attention_mask=b_attention_mask)
        
        logits = b_outputs[0]
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps +=1
    print(" Val_accuracy : ", eval_accuracy/nb_eval_steps)
    print(" Val took : ", format_time(time.time()-t0))
print("Training complete")

t0 = time.time()
BERTmodel.eval()

test_loss, test_accuracy = 0, 0
test_steps, test_examples = 0, 0
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = True)
for step, batch in enumerate(test_loader):        
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = BERTmodel(input_ids, attention_mask=attention_mask)
    logits = outputs[0]       
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_steps +=1
print(" test_accuracy : ", test_accuracy/test_steps)
print(" test took : ", format_time(time.time()-t0))
print("Test complete")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


===Epoch :  1 / 10 ===
---Average training loss :  0.25892000557145195
---training epoch took :  0:01:17
===Running Validation===
 Val_accuracy :  0.8926339285714285
 Val took :  0:01:19

===Epoch :  2 / 10 ===
---Average training loss :  0.03115159507530431
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.890625
 Val took :  0:01:20

===Epoch :  3 / 10 ===
---Average training loss :  0.017839705774427523
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.884375
 Val took :  0:01:20

===Epoch :  4 / 10 ===
---Average training loss :  0.008243330575281843
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8915178571428571
 Val took :  0:01:19

===Epoch :  5 / 10 ===
---Average training loss :  0.0047283358743048405
---training epoch took :  0:01:18
===Running Validation===
 Val_accuracy :  0.8917410714285714
 Val took :  0:01:19

===Epoch :  6 / 10 ===
---Average training loss :  0.0060530613346989936
--