In [1]:
import json
import glob
import os

create_data = False

def extract_data(dir_path):
    data = []
    for _, dirs, _ in os.walk(dir_path):
        for dir in dirs:
            files = glob.glob(os.path.join(dir_path, dir, '*.json'))
            for file in files:
                with open(file, 'r') as f:
                    content = json.load(f)
                    reliable = content['labeledDataInfo']['clickbaitClass']
                    title = content['sourceDataInfo']['newsTitle'] if reliable else content['labeledDataInfo']['newTitle']
                    content = content['sourceDataInfo']['newsContent']
                    content = data['content']
                    content = content.replace("\n", " ")
                    content = content.replace("\\\'", "\'")
                    content = content.replace('\\\"', '\"')
                    content = content.replace("\\", "")
                    data.append({
                        'title': title,
                        'content': content,
                        'reliable': reliable
                    })
    return data

if create_data:
    train_data = extract_data('../data/cls/training_set_mini') 
    test_data = extract_data('../data/cls/validation_set_mini')

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

batch_size = 8

class CBDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        return data['title'], data['content'], data['reliable']

if create_data:
    train_dataloader = DataLoader(CBDataset(train_data), batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(CBDataset(test_data), batch_size=batch_size)
    torch.save({'train': train_dataloader, 'test': test_dataloader},'../data/cls_data_loader.pt')
    
else:
    loaded = torch.load('../data/cls_data_loader.pt')
    train_dataloader = loaded['train']
    test_dataloader = loaded['test']

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2).cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from torch.optim import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=1e-5)

itr = 1
p_itr = 1000
epochs = 1
total_loss = 0
total_len = 0

model.train()
for epoch in range(epochs):

    for step, data in enumerate(tqdm(train_dataloader)):
        title, content, label = data
        inputs = tokenizer(title, content, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs, label = inputs.to('cuda'), label.to('cuda')
        outputs = model(**inputs, labels=label)

        loss = outputs.loss
        total_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (step+1) % 1000 == 0:
            print('[epoch {}/{}] step {} -> loss: {:.4f}'.format(epoch+1, epochs, step+1, total_loss/(step+1)))
            torch.save(model.state_dict(), 'cls.pt')
            total_loss = 0
            total_len = 0
            
torch.save(model.state_dict(), 'cls.pt')
# Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.

  0%|          | 5/11869 [00:02<1:07:41,  2.92it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  0%|          | 18/11869 [00:05<32:39,  6.05it/s] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  0%|          | 22/11869 [00:05<40:30,  4.87it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  0%|          | 26/11869 [00:06<42:02,  4.70it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_fir

[epoch 1/1] step 1000 -> loss: 0.4742


  8%|▊         | 1005/11869 [03:27<35:49,  5.05it/s]  Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|▊         | 1007/11869 [03:27<39:22,  4.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▊         | 1010/11869 [03:28<38:02,  4.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▊         | 1011/11869 [03:28<38:50,  4.66it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'lon

[epoch 1/1] step 2000 -> loss: 0.1750


 17%|█▋        | 2002/11869 [06:55<40:41,  4.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|█▋        | 2017/11869 [06:58<32:33,  5.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|█▋        | 2026/11869 [06:59<28:00,  5.86it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|█▋        | 2036/11869 [07:01<32:38,  5.02it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 3000 -> loss: 0.1020


 25%|██▌       | 3005/11869 [10:23<37:06,  3.98it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▌       | 3016/11869 [10:25<30:24,  4.85it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▌       | 3019/11869 [10:26<31:03,  4.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list 

[epoch 1/1] step 4000 -> loss: 0.0696


 34%|███▍      | 4008/11869 [13:52<26:43,  4.90it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 34%|███▍      | 4018/11869 [13:55<26:36,  4.92it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 34%|███▍      | 4037/11869 [13:59<29:07,  4.48it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 34%|███▍      | 4049/11869 [14:01<24:19,  5.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 5000 -> loss: 0.0475


 42%|████▏     | 5003/11869 [17:21<28:33,  4.01it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|████▏     | 5007/11869 [17:21<24:08,  4.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|████▏     | 5010/11869 [17:22<26:31,  4.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|████▏     | 5012/11869 [17:23<26:40,  4.29it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 6000 -> loss: 0.0333


 51%|█████     | 6016/11869 [20:50<19:40,  4.96it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 51%|█████     | 6023/11869 [20:52<18:26,  5.28it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 51%|█████     | 6027/11869 [20:52<20:16,  4.80it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 51%|█████     | 6032/11869 [20:53<20:02,  4.86it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 7000 -> loss: 0.0246


 59%|█████▉    | 7009/11869 [24:14<17:21,  4.67it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 59%|█████▉    | 7031/11869 [24:18<16:27,  4.90it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 59%|█████▉    | 7043/11869 [24:21<17:01,  4.73it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 59%|█████▉    | 7047/11869 [24:22<17:53,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 8000 -> loss: 0.0240


 68%|██████▊   | 8017/11869 [27:39<14:36,  4.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 68%|██████▊   | 8021/11869 [27:40<13:21,  4.80it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 68%|██████▊   | 8029/11869 [27:42<13:06,  4.89it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 68%|██████▊   | 8036/11869 [27:43<12:41,  5.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 9000 -> loss: 0.0186


 76%|███████▌  | 9000/11869 [31:08<15:19,  3.12it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 76%|███████▌  | 9003/11869 [31:08<11:41,  4.09it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 76%|███████▌  | 9024/11869 [31:12<08:20,  5.69it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 76%|███████▌  | 9038/11869 [31:15<08:45,  5.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longe

[epoch 1/1] step 10000 -> loss: 0.0167


 84%|████████▍ | 10002/11869 [34:41<07:47,  3.99it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 84%|████████▍ | 10016/11869 [34:43<06:25,  4.81it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 85%|████████▍ | 10039/11869 [34:48<06:04,  5.02it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 85%|████████▍ | 10067/11869 [34:54<05:25,  5.53it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'l

[epoch 1/1] step 11000 -> loss: 0.0134


 93%|█████████▎| 11017/11869 [38:12<02:46,  5.13it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 93%|█████████▎| 11020/11869 [38:12<03:15,  4.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 93%|█████████▎| 11028/11869 [38:14<02:58,  4.70it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned li

In [5]:
model.eval()
total_correct = 0

for step, data in enumerate(tqdm(test_dataloader)):
    title, content, label = data
    inputs = tokenizer(title, content, padding=True, truncation=True, max_length=512, return_tensors="pt")
    inputs, label = inputs.to('cuda'), label.to('cuda')
    outputs = model(**inputs)

    logits = outputs.logits
    pred = torch.argmax(logits, dim=-1)
    correct = pred.eq(label)
    total_correct += correct.sum().item()
    total_len += len(label)

print('Test accuracy: ', total_correct / total_len)


  8%|▊         | 34/450 [00:02<00:25, 16.01it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 18%|█▊        | 81/450 [00:05<00:23, 15.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 20%|█▉        | 89/450 [00:05<00:22, 16.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██        | 93/450 [00:06<00:23, 15.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

Test accuracy:  0.7337409672040022



