In [1]:
import json
import random
import csv
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import cosine_similarity
from torch.nn import TransformerEncoder, TransformerEncoderLayer

from scipy import spatial
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from tqdm import tqdm
from preprocessing import *
from collections import defaultdict
import heapq
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

In [3]:
# Read the raw train/test records. The context managers close the file
# handles as soon as parsing is done instead of leaving them to the GC.
# NOTE: the name `train` is rebound to the training function further down
# in this notebook, so cells that read this dict must run before that cell.
with open('../data/train.json') as train_file:
    train = json.load(train_file)
with open('../data/test.json') as test_file:
    test = json.load(test_file)

In [4]:
# Tabular view of the training records: after the transpose there is one row
# per record key and one column per record field.
train_df = pd.DataFrame(data=train).transpose()

In [5]:
def get_coop_record(dataset):
    '''
    Build a dictionary mapping each target author to the de-duplicated list
    of coauthors found across the preprocessed records.

    `train_preprocessing` comes from the star-import of `preprocessing`; its
    output is assumed to expose `coauthor` and `target` fields per record
    (TODO confirm against that module).

    RETURN VALUE:

    A plain dict: target author -> list of unique coauthors
    '''
    frame = pd.DataFrame(train_preprocessing(dataset)).T

    # Gather every coauthor list seen for each target author
    grouped = defaultdict(list)
    for target, coauthors in zip(frame.target, frame.coauthor):
        grouped[target].append(coauthors)

    # Flatten the per-author lists and drop duplicates
    return {
        target: list({name for group in groups for name in group})
        for target, groups in grouped.items()
    }

# Cooperation history over the full training data
# (not referenced again in the visible cells of this notebook)
coop_record = get_coop_record(dataset=train)

In [6]:
# Load a previously saved gensim Word2Vec model from disk; its vectors are
# used below to embed each record's keywords.
w2v = Word2Vec.load('word2vec.bin')

def compute_similarity_matrix(dataset):
    '''
    This function computes the cosine similarity between the mean keyword
    vectors of every pair of records.

    Each record is represented by the average of the Word2Vec vectors (taken
    from the module-level `w2v` model) of its keywords.

    PARAMETERS:
    dataset: dict of record id -> record dict holding a 'keywords' sequence

    RETURN VALUE:
    A square numpy array; entry (i, j) is the cosine similarity between the
    i-th and j-th records in the iteration order of `dataset`
    '''
    # Take the dimensionality from the model instead of hard-coding it
    dim = w2v.wv.vector_size
    mean_vectors = np.zeros((len(dataset), dim), dtype=np.float32)

    for row, record in enumerate(dataset.values()):
        keywords = record['keywords']
        # A record without keywords keeps its all-zero row. Dividing by a
        # zero keyword count would produce NaNs, which pairwise_distances
        # rejects for the whole matrix.
        if len(keywords) > 0:
            mean_vectors[row] = np.mean(
                [w2v.wv.get_vector(k) for k in keywords], axis=0)

    if len(mean_vectors) == 0:
        # Nothing to compare
        return np.zeros((0, 0))

    return 1 - pairwise_distances(mean_vectors, metric="cosine")

In [7]:
def extract_feature(json_record, author):
    '''
    Pick the model inputs for one (record, author) pair out of a JSON record.

    RETURN VALUE:

    A dictionary with the keys 'venue', 'keywords', 'year' and 'author'
    '''
    raw_venue = json_record['venue']

    # An empty venue is mapped to the arbitrary placeholder id 470
    venue_id = 470 if raw_venue == '' else raw_venue

    features = {}
    features['venue'] = venue_id
    features['keywords'] = json_record['keywords']
    features['year'] = json_record['year']
    features['author'] = author
    return features

Perform negative sampling

In [8]:
# Labels assigned to real (positive) and sampled (negative) author/record pairs
POSITIVE = 1
NEGATIVE = 0

# Record-by-record keyword similarity over the full training data,
# read by negative_sampling below
simi_matrix = compute_similarity_matrix(dataset=train)

def negative_sampling(index, n_candidates=30):
    '''
    This function collects candidate negative authors for one training record.

    The candidates are the authors of the `n_candidates` training records
    whose keyword vectors are least similar to the given record, according to
    the module-level `simi_matrix`.

    PARAMETERS:
    index: row of `simi_matrix` for the record (anything int() accepts)
    n_candidates: number of least-similar records to take authors from

    RETURN VALUE:
    A set of authors. It may still contain the record's own authors; the
    caller is expected to remove them.
    '''
    similarities = np.asarray(simi_matrix[int(index)])

    # argsort returns distinct row positions, ordered from least similar.
    # Looking values up with list.index() is a linear scan per value and
    # returns the same row over and over when similarities are tied.
    least_similar = np.argsort(similarities, kind="stable")[:n_candidates]

    # Rows of simi_matrix follow the row order of train_df, so the lookup is
    # positional (.iloc), not label-based.
    authors_by_row = train_df['author']
    negative_samples = set()
    for row in least_similar:
        negative_samples.update(authors_by_row.iloc[row])
    return negative_samples


def create_dataset(dataset, test=False):
    '''
    This function creates the dataset from the JSON records
    and performs negative sampling as well

    PARAMETERS:
    dataset: iterable of (index, record) pairs
    test: when True, emit one sample per record for record['target'] and
          skip negative sampling

    RETURN VALUE:
    X: Attributes (object array holding one feature dict per sample)
    y: labels (POSITIVE OR NEGATIVE)
    '''

    X, y = [], []

    if test:
        # Don't apply negative sampling when loading test set
        for _, record in dataset:
            X.append(extract_feature(record, record['target']))
            y.append(POSITIVE)

    else:
        for count, (index, record) in enumerate(dataset, start=1):
            authors = record['author']
            for author in authors:
                X.append(extract_feature(record, author))
                y.append(POSITIVE)

            # Remove the current set of authors from the returned negative samples.
            # The pool is turned into a sorted list because random.sample()
            # no longer accepts a set (TypeError on Python 3.11+), and a fixed
            # order keeps the draw reproducible under a seed.
            candidates = sorted(negative_sampling(index) - set(authors))

            # Negative samples are one to one to positive samples, capped at
            # the pool size so a small pool cannot raise ValueError
            n_negative = min(len(authors), len(candidates))
            for author in random.sample(candidates, n_negative):
                X.append(extract_feature(record, author))
                y.append(NEGATIVE)

            # Progress report every 1000 records
            if count % 1000 == 0:
                print(count)

    # dtype=object keeps one dict per entry and, unlike np.stack, also works
    # when no samples were produced
    return np.array(X, dtype=object), np.array(y)

Create custom dataset

In [9]:
class MyDataset(Dataset):
    '''
    Map-style dataset over the (features, label) samples produced by
    create_dataset.
    '''

    def __init__(self, dataset, test=False):
        # Materialise every sample up front; negative sampling happens here
        # unless test is True
        features, labels = create_dataset(dataset, test)
        self.X = features
        self.y = labels

    def __len__(self):
        # One label per sample
        return len(self.y)

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label


Split the training data into a training set and a validation set

In [11]:
TRAIN_SIZE = 0.8  # fraction of records used for training; the rest is the validation set
SEED = 42         # fixed seed so the split and the sampling below are reproducible

# create_dataset draws its negative samples with random.sample, so Python's
# own generator is seeded here as well
random.seed(SEED)

train_data = list(train.items())
train_size = int(TRAIN_SIZE * len(train_data))

# Random shuffle dataset and split dataset into train set and validation set
train_data = shuffle(train_data, random_state=SEED)
train_set = train_data[:train_size]
valid_set = train_data[train_size:]

In [12]:
# Build the sample sets for both splits; negative sampling runs for each
# of them (test=False)
train_ds = MyDataset(dataset=train_set, test=False)
valid_ds = MyDataset(dataset=valid_set, test=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
1000
2000
3000
4000
5000


In [None]:
def collate_fn(batch):
    '''
    Merge a list of (feature dict, label) samples into one batch dictionary.

    Venues, years and authors become LongTensors and labels a FloatTensor.
    Keywords stay a list of per-sample LongTensors because their lengths
    differ from sample to sample.
    '''
    pairs = list(batch)
    samples = [data for data, _ in pairs]
    labels = [label for _, label in pairs]

    return {
        'venues': torch.LongTensor([s['venue'] for s in samples]),
        'keywords': [torch.LongTensor(s['keywords']) for s in samples],
        'years': torch.LongTensor([s['year'] for s in samples]),
        'authors': torch.LongTensor([s['author'] for s in samples]),
        'labels': torch.FloatTensor(labels),
    }

In [13]:
# Mini-batch loaders for the two splits: both reshuffle every epoch and share
# the custom collate function.
# valid_loader is not consumed by any of the visible cells.
BATCH_SIZE = 64
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

train_loader = DataLoader(train_ds, **loader_options)
valid_loader = DataLoader(valid_ds, **loader_options)

In [14]:
# Sizes of the embedding tables. The extra keyword slot (index 500) is the id
# used to pad keyword sequences inside the model.
N_VENUES = 471
N_YEARS = 20
N_KEYWORDS = 500 + 1
N_AUTHORS = 2302

# Word2Vec vectors as a float tensor (not referenced by the model in the visible cells)
weight = torch.FloatTensor(w2v.wv.vectors)

class NNEmbeddings(nn.Module):
    '''
    Scores a (record, author) pair from learned venue, author and keyword
    embeddings.

    The keyword ids of a record are embedded and passed through a one-layer
    transformer encoder. The encoder output at the first keyword position is
    concatenated with the venue and author embeddings and fed through the
    fully connected layers to produce one logit per sample.

    PARAMETERS:
    input_size: width of the concatenated feature vector; must be
                3 * embedding_size (venue + author + keyword vectors)
    embedding_size: width of every embedding table and of the transformer
                    (must be divisible by the 2 attention heads)
    hidden_size: width of the first fully connected layer
    output_size: width of the second fully connected layer
    '''

    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(NNEmbeddings, self).__init__()

        # Keyword id reserved for padding: the last slot of the keyword table
        self.pad_idx = N_KEYWORDS - 1

        self.venue_embedding = nn.Embedding(N_VENUES, embedding_size)
        # NOTE(review): year_embedding is never used in forward(); it is kept
        # so the parameter set of the model does not change
        self.year_embedding = nn.Embedding(N_YEARS, embedding_size)
        self.keywords_embedding = nn.Embedding(N_KEYWORDS, embedding_size)
        self.author_embedding = nn.Embedding(N_AUTHORS, embedding_size)

        # The encoder consumes the keyword embeddings, so its width has to
        # follow embedding_size rather than a fixed 128
        encoder_layer = TransformerEncoderLayer(d_model=embedding_size, nhead=2, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=1)

        # First fully connected layer
        self.fc1 = nn.Linear(input_size, hidden_size)

        # Activation function
        self.activation = nn.Tanh()

        # Second fully connected layer
        # NOTE(review): there is no non-linearity between fc2 and logits
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.logits = nn.Linear(output_size, 1)

    def forward(self, batch):
        '''
        Compute one raw logit per sample.

        `batch` is the dictionary built by collate_fn; only the 'authors',
        'venues' and 'keywords' entries are read.

        RETURN VALUE:
        Tensor of shape (batch_size, 1) with unnormalised scores
        '''
        authors = batch['authors']
        venues = batch['venues']

        # Sequences vary in length; pad to the longest sequence of the current batch
        keywords = pad_sequence(batch['keywords'], batch_first=True, padding_value=self.pad_idx)
        # Padding mask for the current batch: True where a position is padding
        pad_mask = keywords == self.pad_idx
        k_vec = self.keywords_embedding(keywords)
        k_vec = self.transformer_encoder(k_vec, src_key_padding_mask=pad_mask)

        v_vec = self.venue_embedding(venues)
        a_vec = self.author_embedding(authors)

        # Use the encoder output at the first keyword position as the summary
        # of the whole keyword sequence
        first_k_vec = k_vec[:, 0, :]

        out = self.fc1(torch.cat((v_vec, a_vec, first_k_vec), dim=-1))
        out = self.activation(out)
        out = self.fc2(out)
        out = self.logits(out)
        return out

In [15]:
# Model dimensions
EMBEDDING_SIZE = 128
INPUT_SIZE = 3 * EMBEDDING_SIZE  # venue, author and keyword vectors are concatenated
HIDDEN_SIZE = 256
OUTPUT_SIZE = 256

# Training schedule
NUM_EPOCHS = 2
LEARNING_RATE = 0.001
LOG_INTERVAL = 1000  # the loss is printed every LOG_INTERVAL batches

model = NNEmbeddings(
    input_size=INPUT_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    output_size=OUTPUT_SIZE,
)
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.RMSprop(model.parameters(), lr=LEARNING_RATE)

def train(model, train_loader, optimizer):
    '''
    Run NUM_EPOCHS passes over `train_loader`, minimising the module-level
    `criterion` with the given optimizer.

    PARAMETERS:
    model: module returning one logit per sample for a collated batch
    train_loader: iterable of batches that carry a 'labels' tensor
    optimizer: optimizer built over model.parameters()

    The loss of every LOG_INTERVAL-th batch is printed. Nothing is returned;
    the model is updated in place.

    NOTE: defining this function rebinds the name `train`, which earlier
    cells use for the training records.
    '''
    # Make sure dropout and similar layers are in training mode
    model.train()

    for epoch in range(NUM_EPOCHS):
        for step, batch in enumerate(train_loader):
            # Drop only the trailing logit dimension: a bare squeeze() would
            # also drop the batch dimension of a final one-sample batch and
            # no longer match its (1,)-shaped label tensor
            logits = model(batch).squeeze(-1)
            loss = criterion(logits, batch['labels'])

            optimizer.zero_grad() # Start from clean gradients
            loss.backward()       # Backward pass (compute parameter gradients)
            optimizer.step()      # Update the parameters with the optimizer's rule

            if step % LOG_INTERVAL == 0:
                print(f"[TRAINING] epoch {epoch} | Batch #{step} | Loss {loss.item():.5f} |")

# Fit the model on the training split
train(model=model, train_loader=train_loader, optimizer=optimizer)


[TRAINING] epoch 0 | Batch #0 | Loss 0.6751 |
[TRAINING] epoch 0 | Batch #1000 | Loss 0.2885 |
[TRAINING] epoch 0 | Batch #2000 | Loss 0.4676 |
[TRAINING] epoch 0 | Batch #3000 | Loss 0.2183 |
[TRAINING] epoch 0 | Batch #4000 | Loss 0.2327 |
[TRAINING] epoch 0 | Batch #5000 | Loss 0.1132 |
[TRAINING] epoch 0 | Batch #6000 | Loss 0.2826 |
[TRAINING] epoch 0 | Batch #7000 | Loss 0.5913 |
[TRAINING] epoch 0 | Batch #8000 | Loss 0.2087 |
[TRAINING] epoch 0 | Batch #9000 | Loss 0.2979 |
[TRAINING] epoch 0 | Batch #10000 | Loss 0.1141 |
[TRAINING] epoch 0 | Batch #11000 | Loss 0.0726 |
[TRAINING] epoch 0 | Batch #12000 | Loss 0.2005 |
[TRAINING] epoch 1 | Batch #0 | Loss 0.0341 |
[TRAINING] epoch 1 | Batch #1000 | Loss 0.3030 |
[TRAINING] epoch 1 | Batch #2000 | Loss 0.2216 |
[TRAINING] epoch 1 | Batch #3000 | Loss 0.1832 |
[TRAINING] epoch 1 | Batch #4000 | Loss 0.1706 |
[TRAINING] epoch 1 | Batch #5000 | Loss 0.2628 |
[TRAINING] epoch 1 | Batch #6000 | Loss 0.0718 |
[TRAINING] epoch 1 | Ba

Load test set

In [None]:
# Test records are scored for their 'target' author only, so no negative
# sampling (test=True). The loader keeps the record order (shuffle=False).
test_set = list(test.items())
test_ds = MyDataset(dataset=test_set, test=True)
test_loader = DataLoader(dataset=test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [16]:
def sigmoid_fn(x):
    '''
    Apply the logistic sigmoid element-wise.

    PARAMETERS:
    x: one-dimensional sequence or array of logits

    RETURN VALUE:
    A list with one probability per input element
    '''
    logits = np.asarray(x)
    # sigmoid(z) = exp(-log(1 + exp(-z))). np.logaddexp(0, -z) evaluates the
    # inner log without overflowing, so large negative logits give 0.0
    # instead of an overflow RuntimeWarning from exp().
    return list(np.exp(-np.logaddexp(0, -logits)))

def make_prediction(dataloader, model):
    '''
    Score every batch of `dataloader` with `model` and return probabilities.

    PARAMETERS:
    dataloader: iterable of collated batches
    model: torch module returning one logit per sample

    RETURN VALUE:
    A list of probabilities, one per sample, in dataloader order
    (empty when the dataloader yields no batches)
    '''
    prediction = []

    # Inference must run in eval mode: otherwise the dropout inside the
    # transformer encoder layer stays active and perturbs the scores.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in dataloader:
                # reshape(-1) flattens to one logit per sample and keeps a
                # one-sample batch one-dimensional
                prediction.append(model(batch).reshape(-1).numpy())
    finally:
        # Leave the model in the mode it was handed over in
        model.train(was_training)

    if not prediction:
        return []

    # Join the per-batch arrays and apply the sigmoid function
    return sigmoid_fn(np.concatenate(prediction))

# Predicted probability for every test record, in test-loader order
probability_score = make_prediction(dataloader=test_loader, model=model)

Write Kaggle submission file

In [17]:
# One row per prediction: a sequential id starting at 0, then the probability
with open('submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id', 'Predicted'])
    for test_id, score in enumerate(probability_score):
        writer.writerow([test_id, score])