In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)
import matplotlib.pyplot as plt
import itertools
import random
import tqdm as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Import transformers package from huggingface
%%capture
!pip install transformers

from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

In [3]:
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_cased')

# Get the data

In [4]:
from google.colab import drive 
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [5]:
## Read data in
df = pd.read_csv('drive/My Drive/Team USA/dataset_corrected.csv')

In [6]:
df = df.dropna()

In [7]:
train_df = df.loc[:int(0.90*df.shape[0]),:]
test_df = df.loc[int(0.90*df.shape[0])+1:,:]
len_train = len(train_df)
len_test = len(test_df)

In [8]:
Citations_Graph = df['Citations'].dropna().to_dict()
citations = [(key, int(value)) for key, values in Citations_Graph.items() for value in values.split(',')]

train_citations_graph = train_df['Citations'].dropna().to_dict()
test_citations_graph = test_df['Citations'].dropna().to_dict()

train_citations = [(key, int(value)) for key, values in train_citations_graph.items() for value in values.split(',') if int(value) < len_train and int(value) in df.index]
test_citations = [(key, int(value)) for key, values in test_citations_graph.items() for value in values.split(',') if int(value) > len_train and int(value) in df.index]

# Get the models for the ensemble

In [9]:
class TitleBERT(nn.Module):

  def __init__(self):
    super().__init__()

    self.bertTitle = AutoModel.from_pretrained('allenai/scibert_scivocab_cased')
    self.linear = nn.Linear(768,1)

  def forward(self,inputs):

    outs = self.bertTitle(**inputs)
    outs = torch.mean(outs['last_hidden_state'],dim = 1)
    outs = self.linear(outs)
    outs = torch.sigmoid(outs)

    return outs

class VenueBERT(nn.Module):

  def __init__(self):
    super().__init__()

    self.bertVenue = AutoModel.from_pretrained('allenai/scibert_scivocab_cased')
    self.linear = nn.Linear(768,1)

  def forward(self,inputs):

    outs = self.bertVenue(**inputs)
    outs = torch.mean(outs['last_hidden_state'],dim = 1)
    outs = self.linear(outs)
    outs = torch.sigmoid(outs)

    return outs

class AbstractBERT(nn.Module):

  def __init__(self):
    super().__init__()

    self.bertAbstract = AutoModel.from_pretrained('allenai/scibert_scivocab_cased')
    self.linear = nn.Linear(768,1)

  def forward(self,inputs):

    outs = self.bertAbstract(**inputs)
    outs = torch.mean(outs['last_hidden_state'],dim = 1)
    outs = self.linear(outs)
    outs = torch.sigmoid(outs)

    return outs

In [10]:
# add model paths here to load 

# titlebert = torch.load('drive/My Drive/Team USA/models/titlebert_patsplit.pth', map_location = torch.device('cpu'))
# venuebert = torch.load('drive/My Drive/Team USA/models/venuemodel_kaggle_pat.pth', map_location = torch.device('cpu'))
# abstractbert = torch.load('drive/My Drive/Team USA/models/venuemodel_kaggle_pat.pth', map_location = torch.device('cpu'))

titlebert = torch.load('drive/My Drive/Team USA/models/titlebert_patsplit.pth')
titlebert.to(torch.device('cuda'))
venuebert = torch.load('drive/My Drive/Team USA/models/venuemodel_kaggle_pat.pth')
venuebert.to(torch.device('cuda'))
abstractbert = torch.load('drive/My Drive/Team USA/models/venuemodel_kaggle_pat.pth')
abstractbert.to(torch.device('cuda'))

VenueBERT(
  (bertVenue): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31116, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [11]:
type(titlebert), type(venuebert), type(abstractbert)

(__main__.TitleBERT, __main__.VenueBERT, __main__.VenueBERT)

In [12]:
device = 'cuda'

In [13]:
inputs = tokenizer(['ACM Journal 33', 'ACM Journal 33'],['Institute of Manufacturing','ACM Journal 33'],return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
titlebert(inputs.to(device))
venuebert(inputs.to(device))

tensor([[0.2791],
        [0.9917]], device='cuda:0', grad_fn=<SigmoidBackward0>)

# Dataset and Dataloader

In [14]:
class Citations(Dataset):

  def __init__(self,citations,df):
    super().__init__()
    
    self.citations = citations
    self.citations_mapping()
    self.length = df.shape[0]
    self.data = df

  def __len__(self):

    return len(self.citations)

  def citations_mapping(self):
    d = {}
    for c1, c2 in self.citations:
      if c1 not in d:
        d[c1] = []
      else:
        if c2 not in d[c1]:
          d[c1].append(c2)
    self.d = d

  def __getitem__(self,index):

    X = []
    X.append(self.data.loc[self.citations[index][0],:].values)
    X.append(self.data.loc[self.citations[index][0],:].values)
   

    Y = []
    Y.append(self.data.loc[self.citations[index][1],:].values)

    # # get a random citation that is not part of that paper's citations
    # rand_cit = self.citations[random.sample(list(df.index),1)[0]][1]
    # # rand_cit = self.citations[random.randint(0, len(self.citations)-1)][1]
    # while rand_cit in self.d[self.citations[index][0]]:
    #   # rand_cit = self.citations[random.randint(0, len(self.citations)-1)][1]
    #   rand_cit = self.citations[random.sample(list(df.index),1)[0]][1]

    Y.append(self.data.loc[random.sample(list(df.index),1)[0],:].values)


    labels = [1,0]
  
    return X,Y,labels

In [15]:
train_dataset = Citations(train_citations, df)
test_dataset = Citations(test_citations, df)

# Majority voting

In [16]:
def custom_collate(batch):

  # batch is list of samples : batch = [dataset[0],dataset[1], and so on...]

  # lists to store titles and labels
  title1, title2, venue1, venue2, abstract1, abstract2 = [], [], [], [], [], []
  labels = []

  for sample in batch:

    title1.append(sample[0][0][1])
    title1.append(sample[1][0][1])

    venue1.append(sample[0][0][4])
    venue1.append(sample[0][1][4])

    abstract1.append(sample[0][0][5])
    abstract1.append(sample[0][1][5])

    title2.append(sample[1][0][1])
    title2.append(sample[1][1][1])

    venue2.append(sample[1][0][4])
    venue2.append(sample[1][1][4])

    abstract2.append(sample[1][0][5])
    abstract2.append(sample[1][1][5])

    labels.extend(sample[2])

  return {"title1": np.array(title1),
          "title2": np.array(title2), 
          "venue1": np.array(venue1),
          "venue2": np.array(venue2),
          "abstract1":np.array(abstract1),
          "abstract2": np.array(abstract2),
          "labels":labels}

In [17]:
BATCH_SIZE = 64
trainloader = DataLoader(train_dataset, collate_fn = custom_collate, batch_size = BATCH_SIZE, shuffle = False)
testloader = DataLoader(test_dataset, collate_fn = custom_collate, batch_size = BATCH_SIZE, shuffle = False)

In [18]:
import time
from tqdm import tqdm
from scipy import stats

def isnotnan(s):
  return s==s

### Majority Voting

In [19]:
def compute_accuracy(titlemodel, venuemodel, abstractmodel, data_loader, device):
    
    titlemodel.eval()
    venuemodel.eval()
    abstractmodel.eval()

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            print(f"{batch_idx} of {len(data_loader)}")

            # print(batch['venue1'])
            

            start_time = time.time()

            ### Prepare data
            inputs_title = tokenizer(list(batch['title1']),list(batch['title2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            inputs_venue = tokenizer(list(batch['venue1']),list(batch['venue2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            # print(time.time() - start_time)
            
            inputs_abstract = tokenizer(list(batch['abstract1']),list(batch['abstract2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            # print(time.time() - start_time)

            inputs_title.to(device)
            inputs_venue.to(device)
            inputs_abstract.to(device)

            
            labels = torch.Tensor(batch['labels']).view(-1,1).to(device)
            
            outputs_title = titlemodel(inputs_title)
            outputs_venue = venuemodel(inputs_venue)
            outputs_abstract = abstractmodel(inputs_abstract)

            pred_labels_title = torch.round(outputs_title)
            pred_labels_venue = torch.round(outputs_venue)
            pred_labels_abstract = torch.round(outputs_abstract)

            # determine the locations of the nan features
            isnotnan_t = isnotnan(batch['title1']) & isnotnan(batch['title2'])
            isnotnan_v = isnotnan(batch['venue1']) & isnotnan(batch['venue2'])
            isnotnan_a = isnotnan(batch['abstract1']) & isnotnan(batch['abstract2'])

            # print(pred_labels_title)

            # -------- Majority Voting (where fields exist) --------------------
            pred_labels = []
            for i in range(len(batch['title1'])):
              res = []
              if isnotnan_t[i]:
                res.append(pred_labels_title[i].item())
              if isnotnan_v[i]:
                res.append(pred_labels_venue[i].item())
              if isnotnan_a[i]:
                res.append(pred_labels_abstract[i].item())

              pred_labels.append([stats.mode(res)[0][0]])
            pred_labels = torch.Tensor(pred_labels).to(device)
            # ---------------------------------------------

            # # -------- Averaging (where fields exist) --------------------
            # outputs_avg = []
            # for i in range(len(batch['title1'])):
            #   res = []
            #   if isnotnan_t[i]:
            #     res.append(outputs_title[i].item())
            #   if isnotnan_v[i]:
            #     res.append(outputs_venue[i].item())
            #   if isnotnan_a[i]:
            #     res.append(outputs_abstract[i].item())

            #   outputs_avg.append([np.mean(res)])
            # outputs_avg = torch.Tensor(outputs_avg).to(device)
            # pred_labels = torch.round(outputs_avg)
            # # ---------------------------------------------

            # print(pred_labels)
            # print(labels)

            num_examples += labels.size(0)

            correct_pred += (pred_labels == labels).sum()

            print(correct_pred.float()/num_examples * 100)
    return correct_pred.float()/num_examples * 100

In [20]:
class TitleBERT(nn.Module):

  def __init__(self):
    super().__init__()

    self.bertTitle = AutoModel.from_pretrained('allenai/scibert_scivocab_cased')
    self.linear = nn.Linear(768,1)

  def forward(self,inputs):

    outs = self.bertTitle(**inputs)
    outs = torch.mean(outs['last_hidden_state'],dim = 1)
    outs = self.linear(outs)
    outs = torch.sigmoid(outs)

    return outs

In [21]:
class EnsembleWeighted(nn.Module):

  def __init__(self):

    super().__init__()

    self.title = titlebert
    self.abstract = abstractbert
    self.venue = venuebert
    self.linear = nn.Linear(3,1)

    self.title.eval()
    self.abstract.eval()
    self.venue.eval()

  def forward(self,y):

    inputs_title = inputs_title = tokenizer(list(y['title1']),list(y['title2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
    
    inputs_abstract= inputs_title = tokenizer(list(y['abstract1']),list(y['abstract2']),return_tensors='pt',
                    max_length = 64,
                    truncation = True,
                    padding = 'max_length')
    
    inputs_venue = inputs_title = tokenizer(list(y['venue1']),list(y['venue2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')

    titlepred = self.title(inputs_title.to(device)) 
    abstractpred = self.abstract(inputs_abstract.to(device)) 
    venuepred = self.venue(inputs_venue.to(device)) 

    combine = torch.cat([titlepred,abstractpred,venuepred],dim = 1)

    outs = self.linear(combine)

    # print(outs)

    outs = torch.sigmoid(outs)

    # print(outs)

    return outs
    

In [22]:
y = next(iter(trainloader))

In [23]:
NUM_EPOCHS = 1

device = torch.device("cuda")

ensemble = EnsembleWeighted()
ensemble.to(device)
criterion = nn.BCELoss()

In [24]:
for name,param in ensemble.named_parameters():
  param.requires_grad = False

In [25]:
ensemble.linear.weight.requires_grad = True
ensemble.linear.bias.requires_grad = True

ensemble.linear.weight.data = torch.Tensor([[0.33,0.33,0.33]]).to(device)
ensemble.linear.bias.data = torch.Tensor([0]).to(device)

In [25]:
# def custom_loss(y,y_hat):
#   y = torch.Tensor(np.array(y)).view(-1,1).to(device)
#   return torch.mean(-.5 * ( (1-y)*torch.log(2-y_hat) + (y)*torch.log(y_hat) ) + torch.log(torch.Tensor(2).to(device)))

In [26]:
from transformers import AdamW
optim = AdamW(ensemble.parameters(), lr = 1e-2)

losses = []

for epoch in range(NUM_EPOCHS):
    
    ensemble.train()
    
    for batch_idx, batch in enumerate(trainloader):
        
      optim.zero_grad()

      output = ensemble(batch)
      
      # loss = custom_loss(batch['labels'],output)
      loss = criterion(output,torch.tensor(np.array(batch['labels']).reshape(-1,1)).to(torch.float32).to(device))
      loss.backward()

      # if not batch_idx % 50:
      losses.append(round(loss.item(),2))

      print(f"epoch number = {epoch}", f"batch {batch_idx}/{len(trainloader)}", f"loss = {round(loss.item(),2)}")

      print(ensemble.linear.weight,ensemble.linear.bias)

      optim.step()
    
    # with torch.set_grad_enabled(False):
    #     print(
    #           f'\ntest accuracy: '
    #           f'{compute_accuracy(abstractmodel, testabstractsloader, device):.2f}%')

epoch number = 0 batch 0/20 loss = 0.61
Parameter containing:
tensor([[0.3300, 0.3300, 0.3300]], device='cuda:0', requires_grad=True) Parameter containing:
tensor([0.], device='cuda:0', requires_grad=True)
epoch number = 0 batch 1/20 loss = 0.62
Parameter containing:
tensor([[0.3400, 0.3386, 0.3400]], device='cuda:0', requires_grad=True) Parameter containing:
tensor([-0.0100], device='cuda:0', requires_grad=True)
epoch number = 0 batch 2/20 loss = 0.6
Parameter containing:
tensor([[0.3499, 0.3313, 0.3500]], device='cuda:0', requires_grad=True) Parameter containing:
tensor([-0.0200], device='cuda:0', requires_grad=True)
epoch number = 0 batch 3/20 loss = 0.58
Parameter containing:
tensor([[0.3597, 0.3308, 0.3600]], device='cuda:0', requires_grad=True) Parameter containing:
tensor([-0.0300], device='cuda:0', requires_grad=True)
epoch number = 0 batch 4/20 loss = 0.58
Parameter containing:
tensor([[0.3691, 0.3301, 0.3700]], device='cuda:0', requires_grad=True) Parameter containing:
tensor

In [38]:
def compute_accuracy(model, data_loader, device):
    
    model.eval()
    model.to(device)

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            print(f"{batch_idx} of {len(data_loader)}")

            ### Prepare data
            inputs = tokenizer(list(batch['title1']),list(batch['title2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            labels = torch.Tensor(batch['labels']).view(-1,1).to(device)
            
            outputs = model(inputs.to(device))

            predicted_labels = torch.round(outputs)

            num_examples += labels.size(0)

            correct_pred += (predicted_labels == labels).sum()

            print(correct_pred.float()/num_examples * 100)
    return correct_pred.float()/num_examples * 100

In [35]:
y = next(iter(testloader))

In [36]:
y.keys()

dict_keys(['title1', 'title2', 'venue1', 'venue2', 'abstract1', 'abstract2', 'labels'])

In [39]:
# compute_accuracy(titlebert, venuebert, abstractbert, trainloader, 'cpu')
compute_accuracy(titlebert,testloader, 'cuda')

0 of 2422
tensor(84.3750, device='cuda:0')
1 of 2422
tensor(86.7188, device='cuda:0')
2 of 2422
tensor(87.2396, device='cuda:0')
3 of 2422
tensor(87.5000, device='cuda:0')
4 of 2422
tensor(87.6562, device='cuda:0')
5 of 2422
tensor(87.1094, device='cuda:0')
6 of 2422
tensor(87.2768, device='cuda:0')
7 of 2422
tensor(87.4023, device='cuda:0')
8 of 2422
tensor(87.5868, device='cuda:0')
9 of 2422
tensor(87.8125, device='cuda:0')
10 of 2422
tensor(87.3580, device='cuda:0')
11 of 2422
tensor(87.3698, device='cuda:0')
12 of 2422
tensor(87.5601, device='cuda:0')
13 of 2422
tensor(87.7232, device='cuda:0')
14 of 2422
tensor(88.0208, device='cuda:0')
15 of 2422
tensor(88.1836, device='cuda:0')
16 of 2422
tensor(88.2353, device='cuda:0')
17 of 2422
tensor(88.2378, device='cuda:0')
18 of 2422
tensor(88.4868, device='cuda:0')
19 of 2422
tensor(88.7500, device='cuda:0')
20 of 2422
tensor(88.6533, device='cuda:0')
21 of 2422
tensor(88.5298, device='cuda:0')
22 of 2422
tensor(88.5530, device='cuda:0'

KeyboardInterrupt: ignored

### Average ensemble

In [None]:
def compute_accuracy(titlemodel, venuemodel, abstractmodel, data_loader, device):
    
    titlemodel.eval()
    venuemodel.eval()
    abstractmodel.eval()

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            print(f"{batch_idx} of {len(data_loader)}")

            # print(batch['venue1'])
            

            start_time = time.time()

            ### Prepare data
            inputs_title = tokenizer(list(batch['title1']),list(batch['title2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            inputs_venue = tokenizer(list(batch['venue1']),list(batch['venue2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            # print(time.time() - start_time)
            
            inputs_abstract = tokenizer(list(batch['abstract1']),list(batch['abstract2']),return_tensors='pt',
                    max_length = 32,
                    truncation = True,
                    padding = 'max_length')
            
            # print(time.time() - start_time)

            inputs_title.to(device)
            inputs_venue.to(device)
            inputs_abstract.to(device)

            
            labels = torch.Tensor(batch['labels']).view(-1,1).to(device)
            
            outputs_title = titlemodel(inputs_title)
            outputs_venue = venuemodel(inputs_venue)
            outputs_abstract = abstractmodel(inputs_abstract)

            pred_labels_title = torch.round(outputs_title)
            pred_labels_venue = torch.round(outputs_venue)
            pred_labels_abstract = torch.round(outputs_abstract)

            # determine the locations of the nan features
            isnotnan_t = isnotnan(batch['title1']) & isnotnan(batch['title2'])
            isnotnan_v = isnotnan(batch['venue1']) & isnotnan(batch['venue2'])
            isnotnan_a = isnotnan(batch['abstract1']) & isnotnan(batch['abstract2'])

            # print(pred_labels_title)

            # # -------- Majority Voting (where fields exist) --------------------
            # pred_labels = []
            # for i in range(len(batch['title1'])):
            #   res = []
            #   if isnotnan_t[i]:
            #     res.append(pred_labels_title[i].item())
            #   if isnotnan_v[i]:
            #     res.append(pred_labels_venue[i].item())
            #   if isnotnan_a[i]:
            #     res.append(pred_labels_abstract[i].item())

            #   pred_labels.append([stats.mode(res)[0][0]])
            # pred_labels = torch.Tensor(pred_labels).to(device)
            # # ---------------------------------------------

            # -------- Averaging (where fields exist) --------------------
            outputs_avg = []
            for i in range(len(batch['title1'])):
              res = []
              if isnotnan_t[i]:
                res.append(outputs_title[i].item())
              if isnotnan_v[i]:
                res.append(outputs_venue[i].item())
              if isnotnan_a[i]:
                res.append(outputs_abstract[i].item())

              outputs_avg.append([np.mean(res)])
            outputs_avg = torch.Tensor(outputs_avg).to(device)
            pred_labels = torch.round(outputs_avg)
            # ---------------------------------------------

            # print(pred_labels)
            # print(labels)

            num_examples += labels.size(0)

            correct_pred += (pred_labels == labels).sum()

            print(correct_pred.float()/num_examples * 100)
    return correct_pred.float()/num_examples * 100

In [None]:
# compute_accuracy(titlebert, venuebert, abstractbert, testloader, 'cpu')
compute_accuracy(titlebert, venuebert, abstractbert, testloader, 'cuda')

0 of 128
tensor(81.2500, device='cuda:0')
1 of 128
tensor(82.8125, device='cuda:0')
2 of 128
tensor(83.0729, device='cuda:0')
3 of 128
tensor(82.6172, device='cuda:0')
4 of 128
tensor(81.2500, device='cuda:0')
5 of 128
tensor(80.8594, device='cuda:0')
6 of 128
tensor(80.4688, device='cuda:0')
7 of 128
tensor(79.8828, device='cuda:0')
8 of 128
tensor(79.8611, device='cuda:0')
9 of 128
tensor(80.2344, device='cuda:0')
10 of 128
tensor(80.1136, device='cuda:0')
11 of 128
tensor(80.4688, device='cuda:0')
12 of 128
tensor(80.7091, device='cuda:0')
13 of 128
tensor(80.4129, device='cuda:0')
14 of 128
tensor(80.3125, device='cuda:0')
15 of 128
tensor(79.9805, device='cuda:0')
16 of 128
tensor(80.3309, device='cuda:0')
17 of 128
tensor(80.0347, device='cuda:0')
18 of 128
tensor(79.9342, device='cuda:0')
19 of 128
tensor(79.6094, device='cuda:0')
20 of 128
tensor(79.5759, device='cuda:0')
21 of 128
tensor(79.7940, device='cuda:0')
22 of 128
tensor(80.1291, device='cuda:0')
23 of 128
tensor(80.0

tensor(79.2290, device='cuda:0')

# Run a forward pass on sample input

In [None]:
ind = 17 
p1 = df.iloc[17]

cits = df['Citations'][ind]
cits = cits.split(',')
cits = [int(c) for c in cits]

for c in cits:
  print(c)
  p2 = df.iloc[c]
  
  inputs_title = tokenizer([p1['Title']], [p2['Title']], return_tensors = 'pt',
                           max_length = 32, truncation = True, padding = 'max_length')
  print(titlebert(inputs_title))
  
  if type(p1['Venue'])==str and type(p2['Venue'])==str:
    inputs_venue = tokenizer([p1['Venue']], [p2['Venue']], return_tensors = 'pt',
                            max_length = 32, truncation = True, padding = 'max_length')
    print(venuebert(inputs_venue))
  
  


357875
tensor([[0.9623]], grad_fn=<SigmoidBackward0>)
tensor([[0.6248]], grad_fn=<SigmoidBackward0>)
214023
tensor([[0.9941]], grad_fn=<SigmoidBackward0>)
tensor([[0.9963]], grad_fn=<SigmoidBackward0>)
317448
tensor([[0.9946]], grad_fn=<SigmoidBackward0>)
tensor([[0.9956]], grad_fn=<SigmoidBackward0>)
319987
tensor([[0.9871]], grad_fn=<SigmoidBackward0>)
tensor([[0.6522]], grad_fn=<SigmoidBackward0>)
334185
tensor([[0.9861]], grad_fn=<SigmoidBackward0>)
tensor([[0.7613]], grad_fn=<SigmoidBackward0>)
95255
tensor([[0.9748]], grad_fn=<SigmoidBackward0>)
294124
tensor([[0.3765]], grad_fn=<SigmoidBackward0>)
tensor([[0.8313]], grad_fn=<SigmoidBackward0>)
96319
tensor([[0.0501]], grad_fn=<SigmoidBackward0>)
tensor([[0.1989]], grad_fn=<SigmoidBackward0>)
610127
tensor([[0.9993]], grad_fn=<SigmoidBackward0>)
tensor([[0.9962]], grad_fn=<SigmoidBackward0>)


In [None]:
# Averaging ensemble

prob = []

for idx1, idx2 in tqdm(test_citations):

  # get the two papers
  p1 = df.iloc[idx1]
  p2 = df.iloc[idx2]

  res = []

  # title will always exist (I think??)
  inputs_title = tokenizer([p1['Title']], [p2['Title']], return_tensors = 'pt',
                           max_length = 32, truncation = True, padding = 'max_length')
  res.append(titlebert(inputs_title).item())
  
  if type(p1['Venue'])==str and type(p2['Venue'])==str:
    inputs_venue = tokenizer([p1['Venue']], [p2['Venue']], return_tensors = 'pt',
                            max_length = 32, truncation = True, padding = 'max_length')
    # print(venuebert(inputs_venue))
    res.append(venuebert(inputs_venue).item())

  prob.append(np.mean(res))
  # print(np.mean(res))



print(prob)

  



[0.9102713167667389, 0.4544652998447418, 0.5327351689338684, 0.7293183207511902, 0.7384950518608093, 0.858288586139679, 0.9193280339241028, 0.9970178604125977, 0.9317418038845062, 0.8895634710788727, 0.8271278142929077, 0.757253110408783, 0.8239882290363312, 0.665449246764183, 0.14407627284526825, 0.9084127843379974, 0.9106732308864594, 0.9390210807323456, 0.22672046720981598, 0.979086697101593, 0.9682144522666931, 0.5376552641391754, 0.394911527633667, 0.8768737018108368, 0.6421718597412109, 0.999293327331543, 0.6416687965393066, 0.5108339190483093, 0.7773755490779877, 0.9958547949790955, 0.6665770411491394, 0.9948846697807312, 0.9737203121185303, 0.9745980501174927, 0.994413435459137, 0.8615728914737701, 0.9912108182907104, 0.999254047870636, 0.939901202917099, 0.8948940634727478, 0.9845825433731079, 0.9923205971717834, 0.9995452761650085, 0.9937540888786316, 0.5401720404624939, 0.9878716468811035, 0.9829439520835876, 0.9723591208457947, 0.8204454481601715, 0.9447020590305328, 0.8528

In [None]:
np.mean(prob)

0.7968190245625406