In [2]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [3]:
# Open the two dev-set CSV files as text-mode file handles.
# NOTE(review): neither handle is read or closed in the cells visible here —
# confirm they are consumed and closed further down (or switch to `with`).
dev = open("dev.csv")         # dev set source-sink information
dev_labels = open("dev-labels.csv") # dev set labelling

In [4]:
# parse nodes.json
# `author` maps an integer author id to a dict with the keys
# "first", "last", "num_paper", "keywords" and "venues".
author = defaultdict(dict)


def _suffix_ids(record, marker):
    """Collect the integer after the first '_' of every key that contains `marker`."""
    return {int(key.split('_')[1]) for key in record if marker in key}


with open('nodes.json') as file:
    data = json.load(file)

# The file handle is only needed for json.load; the records are processed
# after it has been closed.
for instance in data:
    author_id = int(instance["id"])
    author[author_id].update({
        "first": int(instance["first"]),
        "last": int(instance["last"]),
        # note: the source field is "num_papers", stored here as "num_paper"
        "num_paper": int(instance["num_papers"]),
        "keywords": _suffix_ids(instance, 'keyword'),
        "venues": _suffix_ids(instance, 'venue'),
    })

# access author data like this:
example_id = 3
print("author[exampleid][venues]:", author[example_id]["venues"])
print("author[exampleid][keywords]:", author[example_id]["keywords"])

author[exampleid][venues]: {3, 19, 180, 20}
author[exampleid][keywords]: {2, 35, 38, 6, 8, 42, 43, 47, 16, 18, 52, 24, 25, 30, 31}


In [5]:
# parse train.txt
# Each non-empty line is read as whitespace-separated integers: the first
# token is a source id and every following token is a sink id.
# train[src][sink] is set to 1 for every listed (src, sink) pair; pairs that
# were never listed read back as 0 through the nested defaultdict.
train = defaultdict(lambda: defaultdict(int))

with open('train.txt') as file:
    # Iterate the file lazily instead of loading every line with readlines().
    for row in file:
        tmp = row.split()
        if not tmp:
            # Blank line (e.g. a trailing empty line): indexing tmp[0] would
            # raise IndexError, and there is nothing to record anyway.
            continue
        src = int(tmp[0])
        for sink in tmp[1:]:
            train[src][int(sink)] = 1

In [6]:
# function to create a row of features given the information of two authors
colname = "src_first,sink_first,src_last,sink_last,first_diff,last_diff,num_shared_keyword,num_shared_venue,edge\n"

def constructRow(src, sink, label):
    """Build one feature row for a (source, sink) author pair.

    `src` and `sink` are author records exposing the keys 'first', 'last',
    'keywords' and 'venues'; 'keywords' and 'venues' must provide
    `.intersection` (sets in this notebook). `label` is appended unchanged
    as the last element.

    Returns a list whose order matches the columns named in `colname`:
    both 'first' values, both 'last' values, the src-minus-sink difference
    of each, the number of shared keywords, the number of shared venues,
    and finally the label.
    """
    src_first, sink_first = src['first'], sink['first']
    src_last, sink_last = src['last'], sink['last']

    # sizes of the overlap between the two authors' keyword / venue sets
    shared_keyword_count = len(src['keywords'].intersection(sink['keywords']))
    shared_venue_count = len(src['venues'].intersection(sink['venues']))

    return [
        src_first,
        sink_first,
        src_last,
        sink_last,
        src_first - sink_first,   # difference in 'first' between the two authors
        src_last - sink_last,     # difference in 'last' between the two authors
        shared_keyword_count,
        shared_venue_count,
        label,                    # the labelling always goes last
    ]

In [None]:
# # construct table (full)
# table = []
# # used to avoid repetition of source-sink pairs
# processed = defaultdict(lambda: defaultdict(int))

# # this nested loop constructs each row of the table
# for src_id in author.keys():
#     for sink_id in author.keys():
#         # dont include edge to self, dont repeat edge
#         if (not src_id == sink_id) and (not processed[src_id][sink_id]):
#             # generate row
#             label = train[src_id][sink_id]
#             row = constructRow(author[src_id], author[sink_id], label)
#             # add row to table
#             table.append(row)
#             # set edge as processed
#             processed[src_id][sink_id] = 1

In [13]:
# construct table (stratified sampling, undersampling)
# The table gets one row per linked (src, sink) pair found in `train`,
# followed by the same number of randomly drawn pairs that were not added as
# linked rows, so both classes end up with the same number of rows.
NEGATIVE_SAMPLING_SEED = 42  # fixed so a re-run draws the same negative pairs

table = []
# used to avoid repetition of source-sink pairs
processed = defaultdict(lambda: defaultdict(int))

# first add only instances with links
for src_id, sinks in train.items():
    for sink_id, linked in sinks.items():
        # don't include edge to self, don't repeat edge
        if linked and src_id != sink_id and not processed[src_id][sink_id]:
            # generate row
            label = linked
            row = constructRow(author[src_id], author[sink_id], label)
            # add row to table
            table.append(row)
            # set edge as processed
            processed[src_id][sink_id] = 1

# now add instances with no link
# Draw from the ids that really exist in `author`. Drawing integers in
# 0..len(author)-1 is only right when the ids are exactly that range; a miss
# would make the `author` defaultdict create an empty record, and
# constructRow would then fail on the missing 'first' key.
author_ids = list(author.keys())
num_authors = len(author_ids)
# Every row in `table` so far is a distinct ordered non-self pair, so this is
# the number of ordered non-self pairs still available. Capping the target
# guarantees the rejection-sampling loop below can always terminate.
available_pairs = num_authors * (num_authors - 1) - len(table)
to_add = max(0, min(len(table), available_pairs))

rng = random.Random(NEGATIVE_SAMPLING_SEED)
# NOTE(review): only the (src, sink) direction is checked against `train`;
# if links are meant to be undirected and train.txt lists a single direction,
# a sampled pair could be a real link in reverse — confirm.
while to_add > 0:
    # pick random src and sink
    src_id = rng.choice(author_ids)
    sink_id = rng.choice(author_ids)
    # don't include edge to self, don't repeat edge
    if src_id != sink_id and not processed[src_id][sink_id]:
        # generate row; read the label without indexing the nested
        # defaultdict, which would insert zero entries into `train`
        label = train[src_id].get(sink_id, 0) if src_id in train else 0
        row = constructRow(author[src_id], author[sink_id], label)
        # add row to table
        table.append(row)
        # set edge as processed
        processed[src_id][sink_id] = 1
        # count only draws that produced a row; rejected draws are retried,
        # otherwise the table ends up with fewer negatives than positives
        to_add -= 1

In [8]:
# Number of feature rows currently held in `table`.
len(table)

95174

In [None]:
# write to file
# Writes the header line `colname` followed by one comma-separated line per
# row of `table`. Values are converted with str() and no quoting or escaping
# is applied.
with open('train_reconstructed.csv', 'w') as file:
    ## header
    file.write(colname)
    for row in table:
        # str.join places the separators, replacing the manual index check
        # for "is this the last column"
        file.write(",".join(map(str, row)) + "\n")