In [1]:
!pip install pandas
!pip install tqdm
!pip install sklearn



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [3]:
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
!nvidia-smi

Thu Oct  6 18:40:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:81:00.0 Off |                  N/A |
| 30%   33C    P8    20W / 350W |      3MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
class PaperDataset(Dataset):

    def __init__(self, path, test=False):
        self.df = pd.read_json(path, orient="records")
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return self.one_hot(self.df.loc[index, :])
    
    def one_hot(self, row):
        # title = torch.zeros(4999)
        # title[[i-1 for i in row['title']]] = 1.

        # abstract = torch.zeros(4999)
        # abstract[[i-1 for i in row['abstract']]] = 1.
        text = [i-1 for i in row['title']]
        text.extend([i-1 for i in row['abstract']])
        doc = torch.zeros(4999)
        doc[[text]] = 1.

        venue = torch.zeros(465)
        venue[[row['venue']] if row['venue'] != '' else []] = 1.

        coauthor = torch.zeros(21146)
        coauthor[[i-100 for i in row['coauthors']]] = 1.

        input = torch.cat([doc, coauthor, venue], 0)

        if not self.test:
            label = torch.zeros(100)
            label[row['proauthors']] = 1.

            # label = torch.zeros(100)
            # if len(row['proauthors']) > 0:
            #     label[row['proauthors']] = 1.
            # else:
            #     label[-1] = 1.
            
            return input, label
        else:
            return input, row["identifier"]

In [16]:
class AuthorAttriClf(nn.Module):
    def __init__(self):
        super(AuthorAttriClf, self).__init__()

        self.clf_block = nn.Sequential(
            nn.Linear(26610, 2048),
            nn.Dropout(),
            nn.Linear(2048, 1024),
            nn.Dropout(),
            nn.Linear(1024, 100)
        )

    def forward(self, input):
        probs = self.clf_block(input)

        return probs    

In [17]:
def train(train_status, model, optim, scheduler, criterion, epoch_size, train_loader, valid_loader):
   
    for epoch in range(epoch_size):
        model.train()
        epoch_loss = 0
        epoch_labels = torch.Tensor([])
        epoch_preds = torch.Tensor([])

        train_loop = tqdm(enumerate(train_loader), total=len(train_loader))
        train_loop.set_description(f"Epoch [{epoch+1}/{epoch_size}]")

        for batch, (inputs, labels) in train_loop:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optim.zero_grad()
            loss.backward()
            optim.step() 

            epoch_loss += loss.item()
            epoch_preds = torch.cat(((epoch_preds, (outputs.cpu() > 0.5).int())), 0)
            epoch_labels = torch.cat((epoch_labels, labels.cpu()), 0)

            train_loop.set_postfix_str(
                'train_loss={:.5f}'.format(loss.item())
            )

            if batch == len(train_loader)-1:
                epoch_loss /= len(train_loader.dataset)/train_loader.batch_size
                valid_loss, valid_f1 = validate(model, criterion, valid_loader)
                train_loop.set_postfix_str(
                    'train_loss={:.5f}, valid_loss={:.5f}, valid_f1={:.5f}'.format(
                        epoch_loss, valid_loss, valid_f1
                    )
                )

                # train_f1 = f1_score(epoch_labels, epoch_preds, average='samples', zero_division=1)
                # valid_f1 = validate(model, valid_loader)
                # train_loop.set_postfix_str(
                #     'train_loss={:.5f}, train_f1={:.5f}, valid_f1={:.5f}'.format(
                #         epoch_loss, train_f1, valid_f1
                #     )
                # )、
                scheduler.step()

def validate(model, criterion, valid_loader):
    model.eval()
    valid_labels = torch.Tensor([])
    valid_preds = torch.Tensor([])
    valid_loss = 0
    with torch.no_grad():
        for batch, (inputs, labels) in enumerate(valid_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
            valid_preds = torch.cat(((valid_preds, (outputs.cpu() > 0.8).int())), 0)
            valid_labels = torch.cat((valid_labels, labels.cpu()), 0)
        valid_loss /= len(valid_loader.dataset)/valid_loader.batch_size
    return valid_loss, f1_score(valid_labels, valid_preds, average='samples', zero_division=1)
    

In [18]:
train_set = PaperDataset('./data/only/train.json')
valid_set = PaperDataset('./data/only/valid.json')

In [20]:
epoch_size = 20
batch_size = 32
lr = 1e-3

model = AuthorAttriClf().to(device)
optim = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optim, lr_lambda=lambda epoch: 0.95)
criterion = nn.BCEWithLogitsLoss()

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=128, shuffle=True)

train_status = {'train_loss': []}
train(train_status, model, optim, scheduler, criterion, epoch_size, train_loader, valid_loader)

Epoch [1/20]: 100%|██████████| 187/187 [00:03<00:00, 48.61it/s, train_loss=0.08274, valid_loss=0.05712, valid_f1=0.01591]
Epoch [2/20]: 100%|██████████| 187/187 [00:03<00:00, 47.88it/s, train_loss=0.04170, valid_loss=0.04549, valid_f1=0.13257]
Epoch [3/20]: 100%|██████████| 187/187 [00:03<00:00, 50.09it/s, train_loss=0.02327, valid_loss=0.04508, valid_f1=0.21881]
Epoch [4/20]: 100%|██████████| 187/187 [00:03<00:00, 47.28it/s, train_loss=0.01588, valid_loss=0.05812, valid_f1=0.24668]
Epoch [5/20]: 100%|██████████| 187/187 [00:03<00:00, 49.26it/s, train_loss=0.01686, valid_loss=0.07050, valid_f1=0.24118]
Epoch [6/20]: 100%|██████████| 187/187 [00:04<00:00, 46.36it/s, train_loss=0.01841, valid_loss=0.07781, valid_f1=0.27529]
Epoch [7/20]: 100%|██████████| 187/187 [00:03<00:00, 48.09it/s, train_loss=0.01311, valid_loss=0.07869, valid_f1=0.28302]
Epoch [8/20]: 100%|██████████| 187/187 [00:03<00:00, 49.24it/s, train_loss=0.01071, valid_loss=0.10849, valid_f1=0.20027]
Epoch [9/20]: 100%|█████

In [21]:
def get_predictions(model, test_loader):
    with torch.no_grad():
        preds = []
        identifiers = []

        for batch, (inputs, ids) in enumerate(test_loader):
            inputs = inputs.to(device)
            outputs = torch.sigmoid(model(inputs))

            for i in range(outputs.shape[0]):
                identifiers.append(int(ids[i]))
                pred = torch.nonzero((outputs[i].cpu() > 0.5).int())
                if len(pred) > 0:
                    preds.append(" ".join([str(int(i)) for i in pred]))
                else:
                    preds.append("-1")

        df = pd.DataFrame({'ID': identifiers, 'Predict': preds})
        df.to_csv('data/pred.csv', sep=',', index=False, encoding='utf-8')

test_set = PaperDataset(r"data/downsamp/test.json", test=True)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=0)
get_predictions(model, test_loader)