In [1]:
!pip install pandas
!pip install tqdm
!pip install sklearn
!pip install gensim==4.2.0
!pip install Cython



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [3]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
import gensim
import Cython
import time

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
!nvidia-smi

Wed Oct 12 16:52:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:02:00.0 Off |                  N/A |
|  0%   48C    P8    20W / 350W |      3MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
raw = json.load(open(r'data/backup/train.json'))

In [6]:
labels = []
data = []
d2v = gensim.models.doc2vec.Doc2Vec.load(r'doc2vec_256.model')
n2v = gensim.models.Word2Vec.load(r"node2vec_1024.model")

for i, record in enumerate(raw):
    if i%1000 == 0:
        print(i)
        
    proauthors = [i for i in record['authors'] if i < 100]
    label = np.zeros(101)
    # label[proauthors] = 1.
    if len(proauthors) > 0:
        label[proauthors] = 1.
    else: 
        label[-1] = 1.
    labels.append(label)
    
    text = [str(i) for i in record['title']]
    text.extend([str(i) for i in record['abstract']])
    text_vec = d2v.infer_vector(text)
    # title_vec = d2v.infer_vector([str(i) for i in record['title']])
    # abstract_vec = d2v.infer_vector([str(i) for i in record['abstract']])
    # text_vec = np.concatenate([title_vec, abstract_vec], axis=0)
    
    # coauthors = np.zeros(21146)
    # coauthors[[i-100 for i in [i for i in record['authors'] if i >= 100]]] = 1.
    coauthors = [i for i in record['authors'] if i >= 100]
    coauthor_vec = []
    if len(coauthors) > 0:
        for co in coauthors:
            coauthor_vec.append(n2v.wv[str(co)])
        coauthor_vec = np.mean(np.array(coauthor_vec), axis=0)
    else:
        coauthor_vec = np.zeros(1024)
    
    
    venue_vec = np.zeros(465)
    venue_vec[[record['venue']] if record['venue'] != '' else []] = 1.
    
    data.append(np.concatenate([text_vec, coauthor_vec, venue_vec], axis=0))
    
labels = np.array(labels)
data = np.array(data)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [7]:
ids = np.where(labels[:, -1]==0)
data_t = data[ids]
labels_t = labels[ids]

ids = np.where(labels[:, -1]==1)
data_f = data[ids]
labels_f = labels[ids]

data_t, labels_t = resample(data_t, labels_t, replace=True, n_samples=int(2 * labels_f.shape[0]), random_state=51)

data = np.concatenate([data_t, data_f])
labels = np.concatenate([labels_t, labels_f])


In [8]:
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

In [9]:
data.shape

(54999, 1745)

In [10]:
labels.shape

(54999, 101)

In [11]:
data_train, data_valid, labels_train, labels_valid = train_test_split(data, labels, test_size=0.2, random_state=51)

In [12]:
train_set = TensorDataset(torch.tensor(data_train, dtype=torch.float), torch.tensor(labels_train, dtype=torch.float))
valid_set = TensorDataset(torch.tensor(data_valid, dtype=torch.float), torch.tensor(labels_valid, dtype=torch.float))

In [13]:
class AuthorAttriClf(nn.Module):
    def __init__(self):
        super(AuthorAttriClf, self).__init__()
        
        self.clf_block = nn.Sequential(
            # nn.Linear(21867, 2048),
            # nn.Dropout(),
            # nn.ReLU(),
            # nn.Linear(2048, 1024),
            # nn.Dropout(),
            # nn.ReLU(),
            # nn.Linear(1024, 1024),
            # nn.ReLU(),
            # nn.Linear(1024, 101),
            nn.Linear(1745, 1024),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 101),
        )

    def forward(self, input):
        probs = self.clf_block(input)

        return probs    

In [14]:
def train(train_status, model, optim, scheduler, criterion, epoch_size, train_loader, valid_loader=None):
   
    # es_loss = 0
    # es_count = 0
    # es_patience = 5

    for epoch in range(epoch_size):
        model.train()
        epoch_loss = 0
        epoch_labels = torch.Tensor([]).to(device)
        epoch_preds = torch.Tensor([]).to(device)

        train_loop = tqdm(enumerate(train_loader), total=len(train_loader))
        train_loop.set_description(f"Epoch [{epoch+1}/{epoch_size}]")

        for batch, (inputs, labels) in train_loop:
            time.sleep(0.01)
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optim.zero_grad()
            loss.backward()
            optim.step() 

            epoch_loss += loss.item()
            epoch_preds = torch.cat(((epoch_preds, (outputs > 0.5).int())), 0)
            epoch_labels = torch.cat((epoch_labels, labels), 0)

            train_loop.set_postfix_str(
                'train_loss={:.5f}'.format(loss.item())
            )

            if batch == len(train_loader)-1 and valid_loader is not None:
                epoch_loss /= len(train_loader.dataset)/train_loader.batch_size
                train_f1 = f1_score(epoch_labels.detach().cpu(), epoch_preds.detach().cpu(), average='samples', zero_division=1)
                valid_loss, valid_f1 = validate(model, criterion, valid_loader)
                train_loop.set_postfix_str(
                    'train_loss={:.5f}, train_f1={:.5f}, valid_loss={:.5f}, valid_f1={:.5f}'.format(
                        epoch_loss, train_f1, valid_loss, valid_f1
                    )
                )

                scheduler.step()
                
                # Early Stopping
                # if es_loss != 0 and es_loss < valid_loss:
                #     if es_count >= es_patience:
                #         print('Early Stop')
                #         return
                #     else:
                #         es_count += 1
                # else:
                #     es_loss = valid_loss
                #     es_count = 0
                
def validate(model, criterion, valid_loader):
    model.eval()
    valid_loss = 0
    valid_labels = torch.Tensor([])
    valid_preds = torch.Tensor([])
    with torch.no_grad():
        for batch, (inputs, labels) in enumerate(valid_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
            valid_preds = torch.cat(((valid_preds, (outputs.detach().cpu() > 0.5).int())), 0)
            valid_labels = torch.cat((valid_labels, labels.detach().cpu()), 0)
    
    valid_loss /= len(valid_loader.dataset) / valid_loader.batch_size
    valid_f1 = f1_score(valid_labels, valid_preds, average='samples', zero_division=1)
    return valid_loss, valid_f1
    

In [15]:
epoch_size = 50
batch_size = 128
lr = 1e-3

model = AuthorAttriClf().to(device)
optim = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optim, lr_lambda=lambda epoch: 0.98)
criterion = nn.BCEWithLogitsLoss()

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True, num_workers=0)

train_status = {'train_loss': []}
train(train_status, model, optim, scheduler, criterion, epoch_size, train_loader, valid_loader)

Epoch [1/50]: 100%|██████████| 344/344 [00:07<00:00, 43.54it/s, train_loss=0.06193, train_f1=0.09184, valid_loss=0.02725, valid_f1=0.27097]
Epoch [2/50]: 100%|██████████| 344/344 [00:07<00:00, 46.00it/s, train_loss=0.02061, train_f1=0.45833, valid_loss=0.01378, valid_f1=0.69089]
Epoch [3/50]: 100%|██████████| 344/344 [00:07<00:00, 45.31it/s, train_loss=0.01378, train_f1=0.66985, valid_loss=0.01079, valid_f1=0.78074]
Epoch [4/50]: 100%|██████████| 344/344 [00:07<00:00, 45.40it/s, train_loss=0.01147, train_f1=0.73703, valid_loss=0.01015, valid_f1=0.79228]
Epoch [5/50]: 100%|██████████| 344/344 [00:07<00:00, 44.49it/s, train_loss=0.01014, train_f1=0.77429, valid_loss=0.00868, valid_f1=0.83942]
Epoch [6/50]: 100%|██████████| 344/344 [00:07<00:00, 45.15it/s, train_loss=0.00923, train_f1=0.79757, valid_loss=0.00820, valid_f1=0.86133]
Epoch [7/50]: 100%|██████████| 344/344 [00:07<00:00, 45.36it/s, train_loss=0.00855, train_f1=0.81781, valid_loss=0.00776, valid_f1=0.87821]
Epoch [8/50]: 100%|█

In [19]:
df_test = pd.read_json(r'data/backup/test.json')

pred_ids = []
for i in df_test['identifier']:
    pred_ids.append(i)
pred_ids = np.array(pred_ids)

data_test = []
for _, row in df_test.iterrows():
    text = [str(i) for i in row['title']]
    text.extend([str(i) for i in row['abstract']])
    text_vec = d2v.infer_vector(text)
    # title_vec = d2v.infer_vector([str(i) for i in row['title']])
    # abstract_vec = d2v.infer_vector([str(i) for i in row['abstract']])
    # text_vec = np.concatenate([title_vec, abstract_vec], axis=0)
    
    # coauthors = np.zeros(21146)
    # coauthors[[i-100 for i in [i for i in row['coauthors'] if i >= 100]]] = 1.
    coauthors = row['coauthors']
    coauthor_vec = []
    if len(coauthors) > 0:
        for co in coauthors:
            coauthor_vec.append(n2v.wv[str(co)])
        coauthor_vec = np.mean(np.array(coauthor_vec), axis=0)
    else:
        coauthor_vec = np.zeros(1024)
    
    venue_vec = np.zeros(465)
    venue_vec[[row['venue']] if row['venue'] != '' else []] = 1.
    
    data_test.append(np.concatenate([text_vec, coauthor_vec, venue_vec], axis=0))
    
data_test = np.array(data_test)

test_set = TensorDataset(torch.tensor(data_test, dtype=torch.float), torch.tensor(pred_ids, dtype=torch.float))
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=0)

In [20]:
data_test.shape

(800, 1745)

In [21]:
def get_predictions(model, test_loader):
    with torch.no_grad():
        preds = []
        identifiers = []

        for batch, (inputs, ids) in enumerate(test_loader):
            inputs = inputs.to(device)
            outputs = torch.sigmoid(model(inputs))

            for i in range(outputs.shape[0]):
                identifiers.append(int(ids[i]))
                pred = torch.nonzero((outputs[i].cpu() > 0.6)[:-1].int())
                if len(pred) > 0:
                    preds.append(" ".join([str(int(i)) for i in pred]))
                else:
                    preds.append("-1")
        df = pd.DataFrame({'ID': identifiers, 'Predict': preds})
        df.to_csv(r'data/pred.csv', sep=',', index=False, encoding='utf-8')

get_predictions(model, test_loader)