In [None]:
!pip install d2l==1.0.0-alpha1.post0
import json
import torch
from torch import nn
from d2l import torch as d2l
from collections import Counter
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting d2l==1.0.0-alpha1.post0
  Downloading d2l-1.0.0a1.post0-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 994 kB/s 
Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting matplotlib-inline
  Downloading matplotlib_inline-0.1.6-py3-none-any.whl (9.4 kB)
Collecting qtconsole
  Downloading qtconsole-5.3.2-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 17.6 MB/s 
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 63.2 MB/s 
Collecting qtpy>=2.0.1
  Downloading QtPy-2.2.0-py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 636 kB/s 
Installing collected packages: jedi, qtpy, qtconsole, matplotlib-inline, jupyter, d2l
Successfully installed d2l-1.0.0a1.post0 jedi-0.18.1 jupyter-1.0.0 matplotlib-inline-

In [None]:
# Directory holding the JSON data files (the loading code below reads
# "alltrain.json" and "test.json" from this same folder).
# NOTE(review): looks like a mounted Google Drive path relative to the
# notebook's working directory — confirm before running elsewhere.
DATAPATH = "drive/MyDrive/SML2"
from collections import Counter
def dict_from_json(filename):
    """Read a JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a list or a dict,
        depending on the document's top-level value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename) as f:
        return json.load(f)

# Load both splits from DATAPATH so the data folder is configured in one place.
# NOTE: `valid_dict` holds the records of test.json (not a validation split).
train_dict = dict_from_json(f"{DATAPATH}/alltrain.json")
valid_dict = dict_from_json(f"{DATAPATH}/test.json")
# Pre-training corpus: the 'abstract' field of every record in both files.
tests = ([record['abstract'] for record in train_dict]
         + [record['abstract'] for record in valid_dict])

In [None]:
class myWikiTextDataset(torch.utils.data.Dataset):
    """BERT pre-training dataset built from already-tokenized paragraphs.

    Construction builds a `MYVocab` over all sentences, then uses the d2l
    helpers to create next-sentence-prediction pairs, apply masked-language-
    model masking and pad everything to `max_len`. Indexing returns a
    7-tuple in the order: token ids, segments, valid length, prediction
    positions, MLM weights, MLM labels, NSP label.
    """

    def __init__(self, paragraphs, max_len):
        # `paragraphs[i]` is iterated as a list of sentences, and each
        # sentence is handed to the vocabulary as a list of tokens.
        # Work on a shallow copy of the outer list.
        paragraphs = list(paragraphs)
        all_sentences = []
        for paragraph in paragraphs:
            all_sentences.extend(paragraph)
        self.vocab = MYVocab(
            all_sentences, min_freq=5,
            reserved_tokens=['<pad>', '<mask>', '<cls>', '<sep>'])

        # Step 1: next sentence prediction examples for every paragraph.
        nsp_examples = []
        for paragraph in paragraphs:
            nsp_examples.extend(d2l._get_nsp_data_from_paragraph(
                paragraph, paragraphs, self.vocab, max_len))

        # Step 2: masked language model data, with the NSP fields
        # (segments, is_next) appended to each example tuple.
        examples = []
        for tokens, segments, is_next in nsp_examples:
            mlm_part = d2l._get_mlm_data_from_tokens(tokens, self.vocab)
            examples.append(mlm_part + (segments, is_next))
        self.examples = examples

        # Step 3: pad all inputs to a common length.
        padded = d2l._pad_bert_inputs(examples, max_len, self.vocab)
        (self.all_token_ids, self.all_segments, self.valid_lens,
         self.all_pred_positions, self.all_mlm_weights,
         self.all_mlm_labels, self.nsp_labels) = padded

    def __getitem__(self, idx):
        columns = (self.all_token_ids, self.all_segments, self.valid_lens,
                   self.all_pred_positions, self.all_mlm_weights,
                   self.all_mlm_labels, self.nsp_labels)
        return tuple(column[idx] for column in columns)

    def __len__(self):
        return len(self.all_token_ids)

class MYVocab:
    """Vocabulary mapping tokens to integer indices and back.

    Tokens may be of mixed types (e.g. integers from the corpus next to
    string special tokens); they only need to be hashable.
    """
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens, or a list of token lists (which is
                flattened). The default list is never mutated.
            min_freq: Corpus tokens occurring fewer times than this are
                left out (and therefore map to '<unk>').
            reserved_tokens: Extra tokens that are always included.
        """
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        frequent = [token for token, freq in self.token_freqs
                    if freq >= min_freq]
        # Deduplicate while keeping first-seen order: '<unk>' (index 0), the
        # reserved tokens, then corpus tokens by descending frequency.
        # A plain set() would give a different index assignment in every
        # Python process because string hashing is randomized, and sorting
        # is not possible when int and str tokens are mixed.
        self.idx_to_token = list(
            dict.fromkeys(['<unk>'] + list(reserved_tokens) + frequent))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of distinct tokens in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map one token, or a (nested) list/tuple of tokens, to indices.

        Unknown tokens map to the index of '<unk>'.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map one index, or a sequence of indices, back to tokens.

        A list or tuple always yields a list, including the empty and
        single-element cases. Other inputs keep the length-based dispatch:
        more than one element yields a list, otherwise the value is used
        directly as an index.
        """
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[int(index)] for index in indices]
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']

In [None]:
import json

def paragraph_builder(abstract, sep_token=12):
    """Split a flat token sequence into sentences that each end in `sep_token`.

    Every occurrence of `sep_token` closes the current sentence and is kept
    as that sentence's last token. If the sequence does not end with
    `sep_token`, one is appended to the final sentence so that all returned
    sentences are terminated the same way. The input is not modified and no
    empty sentence is ever produced.

    Args:
        abstract: Iterable of tokens.
        sep_token: Token that terminates a sentence. Defaults to 12.
            NOTE(review): presumably the id of the sentence-ending period in
            this dataset's tokenization — confirm.

    Returns:
        A list of sentences (lists of tokens); ``[]`` for empty input.
    """
    sentences = []
    current = []
    for token in abstract:
        current.append(token)
        if token == sep_token:
            sentences.append(current)
            current = []
    if current:
        # Unterminated tail: close it explicitly.
        current.append(sep_token)
        sentences.append(current)
    return sentences

def load_Mydataset(batch_size, max_len, asbtracts):
    """Build the BERT pre-training data loader and vocabulary.

    Each entry of `asbtracts` is split into sentences with
    `paragraph_builder`, the result is wrapped in a `myWikiTextDataset`, and
    a shuffling `DataLoader` (no worker processes) is created over it.

    Returns:
        A ``(data_loader, vocab)`` pair, where `vocab` is the vocabulary
        built by the dataset.
    """
    dataset = myWikiTextDataset(
        [paragraph_builder(abstract) for abstract in asbtracts], max_len)
    loader = torch.utils.data.DataLoader(
        dataset, batch_size, shuffle=True, num_workers=0)
    return loader, dataset.vocab

In [None]:
# Pre-training data: batches of 32 examples, sequences padded to 128 tokens.
train_iter, vocab = load_Mydataset(batch_size=32, max_len=128, asbtracts=tests)

In [None]:
# BERT hyper-parameters.
NUMS_OF_HEAD = 8
NUMS_OF_BLKS = 8
DROPOUT = 0.2
HIDDEN = 128
FFN_num_hiddens = 256

# Every hyper-parameter, including dropout, comes from the constants above
# so that changing a constant actually changes the model.
net = d2l.BERTModel(len(vocab), num_hiddens=HIDDEN,
                    ffn_num_hiddens=FFN_num_hiddens, num_heads=NUMS_OF_HEAD,
                    num_blks=NUMS_OF_BLKS, dropout=DROPOUT)

devices = d2l.try_all_gpus()
# Criterion passed to the pre-training loop.
loss = nn.CrossEntropyLoss()

def _get_batch_loss_bert(net, loss, vocab_size, tokens_X,
                         segments_X, valid_lens_x,
                         pred_positions_X, mlm_weights_X,
                         mlm_Y, nsp_y):
    # Forward pass
    _, mlm_Y_hat, nsp_Y_hat = net(tokens_X, segments_X,
                                  valid_lens_x.reshape(-1),
                                  pred_positions_X)
    # Compute masked language model loss
    mlm_l = loss(mlm_Y_hat.reshape(-1, vocab_size), mlm_Y.reshape(-1)) *\
    mlm_weights_X.reshape(-1, 1)
    mlm_l = mlm_l.sum() / (mlm_weights_X.sum() + 1e-8)
    # Compute next sentence prediction loss
    nsp_l = loss(nsp_Y_hat, nsp_y)
    l = mlm_l + nsp_l
    return mlm_l, nsp_l, l

def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    """Pre-train `net` for `num_steps` optimizer steps.

    Batches are drawn from `train_iter` (re-iterated as often as needed),
    moved to ``devices[0]``, and optimized with Adam (lr=0.01) on the sum
    of the MLM and NSP losses from `_get_batch_loss_bert`. Running average
    losses are printed every 500 steps and once more at the end, together
    with the throughput measured by the timer.

    Args:
        train_iter: Iterable of 7-tensor batches (tokens, segments, valid
            lengths, prediction positions, MLM weights, MLM labels, NSP
            labels).
        net: The model to train; it is wrapped in `nn.DataParallel`.
        loss: Criterion forwarded to `_get_batch_loss_bert`.
        vocab_size: Vocabulary size forwarded to `_get_batch_loss_bert`.
        devices: List of devices; the first one hosts the data.
        num_steps: Number of optimizer steps to run.
    """
    main_device = devices[0]
    # One forward pass on the first batch before the model is wrapped and
    # moved. NOTE(review): presumably this materializes lazily-initialized
    # parameters — confirm against the d2l model implementation.
    net(*next(iter(train_iter))[:4])
    net = nn.DataParallel(net, device_ids=devices).to(main_device)
    trainer = torch.optim.Adam(net.parameters(), lr=0.01)
    timer = d2l.Timer()
    # Accumulates: sum of MLM losses, sum of NSP losses, number of sentence
    # pairs, number of steps.
    metric = d2l.Accumulator(4)
    step = 0
    while step < num_steps:
        for batch in train_iter:
            (tokens_X, segments_X, valid_lens_x, pred_positions_X,
             mlm_weights_X, mlm_Y, nsp_y) = [
                 tensor.to(main_device) for tensor in batch]
            trainer.zero_grad()
            timer.start()
            mlm_l, nsp_l, l = _get_batch_loss_bert(
                net, loss, vocab_size, tokens_X, segments_X, valid_lens_x,
                pred_positions_X, mlm_weights_X, mlm_Y, nsp_y)
            l.backward()
            trainer.step()
            metric.add(mlm_l, nsp_l, tokens_X.shape[0], 1)
            timer.stop()
            step += 1
            if step%500 == 0:
                print(f"The {step} has complited ,\n MLM loss {metric[0] / metric[3]:.3f}, \n NSP loss {metric[1] / metric[3]:.3f}")
            if step == num_steps:
                # Leaving the inner loop is enough: the outer condition is
                # now false as well.
                break

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')



In [None]:
# Run 5000 pre-training steps on the data loader built above.
train_bert(train_iter, net, loss, vocab_size=len(vocab), devices=devices,
           num_steps=5000)

The 500 has complited ,
 MLM loss 3.731, 
 NSP loss 0.735
The 1000 has complited ,
 MLM loss 3.700, 
 NSP loss 0.730
The 1500 has complited ,
 MLM loss 3.680, 
 NSP loss 0.722
The 2000 has complited ,
 MLM loss 3.667, 
 NSP loss 0.715
The 2500 has complited ,
 MLM loss 3.663, 
 NSP loss 0.711
The 3000 has complited ,
 MLM loss 3.662, 
 NSP loss 0.709
The 3500 has complited ,
 MLM loss 3.659, 
 NSP loss 0.707
The 4000 has complited ,
 MLM loss 3.658, 
 NSP loss 0.705
The 4500 has complited ,
 MLM loss 3.659, 
 NSP loss 0.704
The 5000 has complited ,
 MLM loss 3.659, 
 NSP loss 0.703
MLM loss 3.659, NSP loss 0.703
553.0 sentence pairs/sec on [device(type='cuda', index=0)]


In [None]:


def get_tokens_and_segments(tokens_a, tokens_b=None):
    """Assemble a BERT input sequence and its segment ids.

    The result is ``<cls> tokens_a <sep>`` with segment id 0, optionally
    followed by ``tokens_b <sep>`` with segment id 1.

    Args:
        tokens_a: List of tokens of the first segment.
        tokens_b: Optional list of tokens of the second segment.

    Returns:
        A ``(tokens, segments)`` pair of equally long lists.
    """
    tokens = ['<cls>'] + tokens_a
    tokens.append('<sep>')
    # Segment A covers <cls>, tokens_a and the first <sep>.
    segments = [0] * len(tokens)
    if tokens_b is None:
        return tokens, segments
    # Segment B covers tokens_b and the closing <sep>.
    tokens.extend(tokens_b + ['<sep>'])
    segments.extend([1] * (len(tokens_b) + 1))
    return tokens, segments

def get_bert_encoding(net, tokens_a, tokens_b=None):
    """Encode one sequence (or sequence pair) with `net` as a frozen encoder.

    The tokens are wrapped with the special tokens by
    `get_tokens_and_segments`, mapped to ids with the module-level `vocab`,
    and sent through `net` on ``devices[0]`` as a batch of size one.

    The forward pass runs in evaluation mode and without gradient tracking,
    so the returned features are deterministic and detached from the
    autograd graph. The module's previous train/eval mode is restored.

    Args:
        net: Module returning ``(encoded, _, _)`` for
            ``(token_ids, segments, valid_len)``.
        tokens_a: List of tokens of the first segment.
        tokens_b: Optional list of tokens of the second segment.

    Returns:
        The first element of `net`'s output for the single-item batch.
    """
    tokens, segments = get_tokens_and_segments(tokens_a, tokens_b)
    token_ids = torch.tensor(vocab[tokens], device=devices[0]).unsqueeze(0)
    segments = torch.tensor(segments, device=devices[0]).unsqueeze(0)
    valid_len = torch.tensor(len(tokens), device=devices[0]).unsqueeze(0)
    was_training = net.training
    net.eval()  # disable dropout while extracting features
    try:
        with torch.no_grad():
            encoded_X, _, _ = net(token_ids, segments, valid_len)
    finally:
        net.train(was_training)
    return encoded_X


# Feedforward neural net

In [None]:

class PaperDataset(Dataset):
    """Dataset of (BERT <cls> encoding, multi-hot author label) pairs.

    Records are read from a JSON file; each record provides an "abstract"
    token list and a "proauthors" index list. The BERT encoder is run on
    every `__getitem__` call, so nothing is cached between accesses.
    """

    def __init__(self, path, net):
        """Load the records from `path` and keep `net` as the encoder."""
        with open(path) as f:
            self.df = json.load(f)
        self.bert = net
        # Longest sequence given to the encoder, counting the two special
        # tokens that are added around the abstract.
        self.MAXLEN = 1000

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, label)`` for record `index`.

        `features` is the encoder output at position 0 (the <cls> token)
        with the batch dimension removed; `label` is a float vector of
        length 100 with ones at the record's "proauthors" indices.
        """
        record = self.df[index]
        label = torch.zeros(100)
        label[record['proauthors']] = 1
        abstract = list(record["abstract"])
        budget = self.MAXLEN - 2  # room left after <cls> and <sep>
        if len(abstract) >= budget:
            abstract = abstract[:budget]
            # NOTE(review): this overwrites the third-from-last token with
            # 12; if the intent was to terminate the truncated text, the
            # last position would be expected instead — confirm.
            abstract[-3] = 12
        encoded_text_cls = get_bert_encoding(self.bert, abstract)[:, 0, :]
        return encoded_text_cls.squeeze(0), label


class SentimentClassifier(nn.Module):
    """Single linear layer mapping an input feature vector to 100 logits."""

    def __init__(self, inputsize):
        super().__init__()
        # Input features of size `inputsize` -> one logit per label.
        self.cls_layer = nn.Linear(inputsize, 100)

    def forward(self, seq):
        """Return the raw (unnormalized) logits for `seq`."""
        return self.cls_layer(seq)

In [None]:

def train(train_status, model, optim, criterion, epoch_size, train_loader, valid_loader):
    """Train `model` for `epoch_size` epochs and report loss / F1 per epoch.

    Args:
        train_status: Dict collecting history; the mean training loss of
            each epoch is appended to ``train_status['train_loss']``.
            May be ``None`` to skip recording.
        model: Module producing one logit per label.
        optim: Optimizer over the model's parameters.
        criterion: Loss taking ``(logits, labels)``.
        epoch_size: Number of epochs.
        train_loader: Sized iterable of ``(inputs, labels)`` batches.
        valid_loader: Batches passed to `validate` after each epoch.
    """
    # Run on whatever device the model already lives on instead of assuming
    # GPU index 0.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    for epoch in range(epoch_size):
        model.train()
        epoch_loss = 0
        # Batches are collected in lists and concatenated once per epoch.
        # The leading empty float tensor keeps the concatenated result a
        # float tensor and makes torch.cat valid for an empty epoch.
        pred_chunks = [torch.Tensor([])]
        label_chunks = [torch.Tensor([])]
        TOTAL = len(train_loader)
        train_loop = tqdm(enumerate(train_loader), total=TOTAL)
        train_loop.set_description(f"Epoch [{epoch+1}/{epoch_size}]")

        for batch, (inputs, labels) in train_loop:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optim.zero_grad()
            loss.backward()
            optim.step()

            epoch_loss += loss.item()
            # `outputs` are logits: convert to probabilities before applying
            # the 0.5 decision threshold.
            probs = torch.sigmoid(outputs.detach()).cpu()
            pred_chunks.append((probs > 0.5).int())
            label_chunks.append(labels.cpu())

            train_loop.set_postfix_str(
                'train_loss={:.5f}'.format(loss.item())
            )

            # Epoch summary is written during the last batch so that it is
            # still shown on the progress bar before the bar is closed.
            if batch == TOTAL-1:
                epoch_loss /= TOTAL  # mean over the actual number of batches
                epoch_preds = torch.cat(pred_chunks, 0)
                epoch_labels = torch.cat(label_chunks, 0)
                train_f1 = f1_score(epoch_labels, epoch_preds, average='samples', zero_division=1)
                valid_f1 = validate(model, valid_loader)
                if train_status is not None:
                    train_status.setdefault('train_loss', []).append(epoch_loss)
                train_loop.set_postfix_str(
                    'train_loss={:.5f}, train_f1={:.5f}, valid_f1={:.5f}'.format(
                        epoch_loss, train_f1, valid_f1
                    )
                )

def validate(model, valid_loader):
    """Return the samples-averaged F1 score of `model` on `valid_loader`.

    The model is switched to evaluation mode (and left in it). Its outputs
    are treated as logits: a label is predicted when its sigmoid
    probability exceeds 0.5.

    Args:
        model: Module producing one logit per label.
        valid_loader: Iterable of ``(inputs, labels)`` batches.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming
    # GPU index 0.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    # The leading empty float tensor keeps the concatenated result a float
    # tensor and makes torch.cat valid when the loader yields no batch.
    pred_chunks = [torch.Tensor([])]
    label_chunks = [torch.Tensor([])]
    with torch.no_grad():
        for inputs, labels in valid_loader:
            outputs = model(inputs.to(device))
            probs = torch.sigmoid(outputs).cpu()
            pred_chunks.append((probs > 0.5).int())
            label_chunks.append(labels.cpu())
    valid_preds = torch.cat(pred_chunks, 0)
    valid_labels = torch.cat(label_chunks, 0)

    return f1_score(valid_labels, valid_preds, average='samples', zero_division=1)

In [None]:
epoch_size = 10
batch_size = 2
lr = 2e-3
# Use the device the BERT encodings are created on (get_bert_encoding puts
# its tensors on devices[0]) instead of a hard-coded GPU index.
device = devices[0]
# The classifier consumes BERT <cls> vectors, whose size is HIDDEN.
model = SentimentClassifier(HIDDEN).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss()

train_set = PaperDataset('drive/MyDrive/SML2/train.json',net)
valid_set = PaperDataset('drive/MyDrive/SML2/valid.json',net)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)
# Evaluation does not depend on sample order, so no shuffling is needed.
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, num_workers=0)

train_status = {'train_loss': []}
train(train_status, model, optim, criterion, epoch_size, train_loader, valid_loader)

Epoch [1/10]: 100%|██████████| 10317/10317 [09:54<00:00, 17.35it/s, train_loss=0.03671, train_f1=0.71077, valid_f1=0.71080]


torch.Size([20634, 100])


Epoch [2/10]: 100%|██████████| 10317/10317 [09:42<00:00, 17.72it/s, train_loss=0.02311, train_f1=0.71077, valid_f1=0.71080]


torch.Size([20634, 100])


Epoch [3/10]: 100%|██████████| 10317/10317 [09:44<00:00, 17.65it/s, train_loss=0.02308, train_f1=0.71077, valid_f1=0.71080]


torch.Size([20634, 100])


Epoch [4/10]: 100%|██████████| 10317/10317 [09:39<00:00, 17.81it/s, train_loss=0.02310, train_f1=0.71077, valid_f1=0.71080]


torch.Size([20634, 100])


Epoch [5/10]: 100%|██████████| 10317/10317 [09:38<00:00, 17.83it/s, train_loss=0.02308, train_f1=0.71077, valid_f1=0.71080]


torch.Size([20634, 100])


Epoch [6/10]: 100%|██████████| 10317/10317 [09:37<00:00, 17.86it/s, train_loss=0.02308, train_f1=0.71077, valid_f1=0.71080]


torch.Size([20634, 100])


Epoch [7/10]:   2%|▏         | 165/10317 [00:07<08:03, 21.00it/s, train_loss=0.00343]

In [None]:
# Give every test record an (empty) "proauthors" field, which PaperDataset
# reads for each item.
# WARNING: test.json is rewritten in place; any existing "proauthors"
# values in it are replaced by empty lists.
with open('drive/MyDrive/SML2/test.json') as f:
    dfs = json.load(f)
for df in dfs:
    df["proauthors"] = []
with open('drive/MyDrive/SML2/test.json', "w") as outfile:
    json.dump(dfs, outfile)
print(dfs[0])

In [None]:
# Sanity check: number of training records and vocabulary size.
with open("drive/MyDrive/SML2/train.json") as f:
    df = json.load(f)
print(len(df))
print(len(vocab))

In [None]:
def get_predictions(model, loader):
    """Return the thresholded predictions of `model` for every item of `loader`.

    The model is switched to evaluation mode (and left in it). Its outputs
    are treated as logits: a label is predicted when its sigmoid
    probability exceeds 0.5.

    Args:
        model: Module producing one logit per label.
        loader: Iterable of ``(inputs, labels)`` pairs; the labels are
            ignored. Passing a dataset yields one prediction tensor per
            sample, passing a batched loader one tensor per batch.

    Returns:
        A list with one int tensor (on the CPU) per iterated item.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming
    # GPU index 0.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    predict = []
    with torch.no_grad():
        for inputs, _labels in loader:
            outputs = model(inputs.to(device))
            predict.append((torch.sigmoid(outputs).cpu() > 0.5).int())
    return predict
# Predict on the test records and print, one line per record, the record
# number followed by the indices of all labels predicted as present.
TESTDataset = PaperDataset('drive/MyDrive/SML2/test.json',net)
test_result = get_predictions(model,TESTDataset)
for g, prediction in enumerate(test_result):
    print(g, end =", ")
    for i in range(100):
        is_predicted = prediction[i] != 0
        if is_predicted:
            print(i,end = " ")
    print("")


NameError: ignored

In [None]:
TESTDataset = PaperDataset('drive/MyDrive/SML2/test.json',net)
test_result = get_predictions(model,TESTDataset)

import csv
# Write one row per test record: its running number and the space-separated
# indices of the predicted labels, or -1 when no label was predicted.
with open("test.csv",'w',newline='') as f:
    csvwritter = csv.writer(f,delimiter = ',')
    csvwritter.writerow(["ID","Predict"])
    for g, prediction in enumerate(test_result):
        result = [str(i) for i in range(len(prediction))
                  if prediction[i] == 1]
        predicted = " ".join(result) if result else -1
        csvwritter.writerow([g, predicted])
        

In [None]:
# Sanity check: number of test records.
with open('drive/MyDrive/SML2/test.json') as f:
    dfs = json.load(f)
print(len(dfs))