In [13]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [15]:
# parse nodes.json
author = defaultdict(dict) 

with open('../data/raw/nodes.json') as file:
    data = json.load(file)
    for instance in data:
        author_id = int(instance["id"])
        author[author_id]["id"] = author_id
        author[author_id]["first"] = int(instance["first"])
        author[author_id]["last"] = int(instance["last"])
        author[author_id]["num_papers"] = int(instance["num_papers"])
        author[author_id]["keywords"] = set([int(key.split('_')[1]) for key in instance.keys() if 'keyword' in key])
        author[author_id]["venues"] = set([int(key.split('_')[1]) for key in instance.keys() if 'venue' in key])
        
# access author data like this:
example_id = 3
print("author[exampleid][venues]:", author[example_id]["venues"])
print("author[exampleid][keywords]:", author[example_id]["keywords"])

author[exampleid][venues]: {3, 19, 180, 20}
author[exampleid][keywords]: {2, 35, 38, 6, 8, 42, 43, 47, 16, 18, 52, 24, 25, 30, 31}


In [16]:
# parse train.txt
train = defaultdict(lambda: defaultdict(int)) 

with open('../data/raw/train.txt') as file:
    data = file.readlines()
    for row in data:
        tmp = row.split()
        src = tmp[0]
        sinks = tmp[1:]
        for sink in sinks:
            train[int(src)][int(sink)] = 1

In [17]:
# Parse dev
dev = pd.read_csv("../data/raw/dev.csv")         # dev set source-sink information
dev_labels = pd.read_csv("../data/raw/dev-labels.csv")['Expected'] # dev set  labelling

In [18]:
# Initialise dictionary of neighbours of nodes (for dev)
dev_src = dev['Source']
dev_sink = dev['Sink']

# Empty dictionary to store all the neighbours
neighbours = {}

# Stores each src's neighbours (Aggregate sinks) as a list in the neighbours dictionary
for src in dev_src.unique():
    neighbours[src] = set(dev[dev['Source'] == src]['Sink'])

# Does the same for sink's neighbours (aggregate sources) as a list in the neighbours dictionary
# adds on to the dictionary if key already exists.
for sink in dev_sink.unique():
    neighbours[sink] = neighbours.get(sink,set([])).union(dev[dev['Sink'] == sink]['Source'])

In [19]:
# # compute minimum distance of two nodes (degrees of seperation). Uses BFS (UNUSED)
# def compute_distance(graph, src_id, sink_id):
    
#     # record distance of visited path (used to check for visited nodes too)
#     distance = {}
#     # the queue
#     distance[src_id] = 0
#     queue = [src_id]

#     # while queue is not empty
#     while queue:
#         # get a node from the queue
#         current = queue.pop(0)
#         # target found, return distance
#         if current == sink_id:
#             return distance[current]
        
#         # find neighbours and add them into the queue
#         neighbours = graph[current]
#         if neighbours:
#             for n in neighbours:                
#                 if n not in distance.keys():    
#                     distance[n] = distance[current] + 1 # neighbour is always 1 step more
#                     queue.append(n)

#     return len(graph.keys())

# testing
# test_graph = {1:[2, 4],
#              2:[1, 4, 3],
#              3:[2, 6],
#              4:[1, 2, 5],
#              5:[4],
#              6:[3],
#              7:[]}

# compute_distance(test_graph, 1, 6)

In [20]:
# Initialise dictionary of neighbours of nodes (for train)
# Dont need to consider for sink since train data neighbours is two way.
train_neighbours = {}
for src in train.keys():
    train_neighbours[src] = set(train[src].keys())

In [21]:
# function to create a row of feautres given two author information
colname = ["sum_papers","first_diff","last_diff","overlap_years","common_keywords",
           "keyword_similarity", "common_venue","venue_similarity","common_neighbours","neighbours_similarity","edge"]
colname_str = ""
for i in range(len(colname)):
    colname_str += colname[i]
    if i+1 < len(colname):
        colname_str += ","
    else:
        colname_str += "\n"

def constructRow(src, sink, label=None):
    row = []
    # sum of papers
    row.append(src['num_papers']+sink['num_papers'])
    
    # difference in 'first' between two authors
    row.append(abs(src['first'] - sink['first']))
    # difference in 'last' between two authors
    row.append(abs(src['last'] - sink['last']))
    
    # overlap years
    src_range = set(range(src['last'], src['first']))
    sink_range = set(range(sink['last'], sink['first']))
    row.append(len(src_range.intersection(sink_range)))
    
    # common words: number of overlapping keywords between two authors
    common_keywords = src['keywords'].intersection(sink['keywords'])
    row.append(len(common_keywords))
    
    # keyword similarity: overlap keywords / union numer of keywords
    union_keywords = src['keywords'].union(sink['keywords'])
    if len(union_keywords) != 0:
        row.append(len(common_keywords) / len(union_keywords))
    else: 
        row.append(0)
    
    # common venue: number of overlapping venue between two authors
    common_venues = src['venues'].intersection(sink['venues'])
    row.append(len(common_venues))
    
    # venue similarity: overlap venue / union numer of venue
    union_venues = src['venues'].union(sink['venues'])
    if len(union_venues) != 0:
        row.append(len(common_venues) / len(union_venues))
    else:
        row.append(0)
    
    # common neighbours (accounts for nodes that are not recorded in training set)
    common_neighbours = train_neighbours.get(src['id'],set([])).intersection(train_neighbours.get(sink['id'],set([])))
    row.append(len(common_neighbours))
    
    # neighbour similarity: overlap neighbours / union number of neighbours
    union_neighbours = train_neighbours.get(src['id'],set([])).union(train_neighbours.get(sink['id'],set([])))
    row.append(len(common_neighbours)/len(union_neighbours)) if len(union_neighbours) else row.append(0)

    ## dont forget the labelling!
    if label:
        row.append(label)
    else:
        row.append(0)
    
    return row

In [22]:
# # construct table (full)
# table = []
# # used to avoid repetition of sorce sink pairs
# processed = defaultdict(lambda: defaultdict(int))

# # this nested loop constructs each row of the table
# for src_id in author.keys():
#     for sink_id in author.keys():
#         # dont include edge to self, dont repeat edge
#         if (not src_id == sink_id) and (not processed[src_id][sink_id]):
#             # generate row
#             label = train[src_id][sink_id]
#             row = constructRow(author[src_id], author[sink_id], label)
#             # add row to table
#             table.append(row)
#             # set edge as processed
#             processed[src_id][sink_id] = 1

In [23]:
# construct table (with undersampling)
table = []
# used to avoid repetition of sorce sink pairs
processed = defaultdict(lambda: defaultdict(int))

# first add only instance with links
for src_id in train.keys():
    for sink_id in train[src_id].keys():
        # dont include edge to self, dont repeat edge
        if (train[src_id][sink_id]) and (not src_id == sink_id) and (not processed[src_id][sink_id]):
            # generate row
            label = train[src_id][sink_id]
            row = constructRow(author[src_id], author[sink_id], label)
            # add row to table
            table.append(row)
            # set edge as processed
            processed[src_id][sink_id] = 1

# now add instances with no link
to_add = len(table)
while to_add > 0:
    # pick random src and sink
    src_id = random.randint(0, len(author.keys())-1)
    sink_id = random.randint(0, len(author.keys())-1)
    # dont include edge to self, dont repeat edge
    if (not src_id == sink_id) and (not processed[src_id][sink_id]):
        # generate row
        label = train[src_id][sink_id]
        row = constructRow(author[src_id], author[sink_id], label)
        # add row to table
        table.append(row)
        # set edge as processed
        processed[src_id][sink_id] = 1
        
    to_add -= 1

In [24]:
# write to file (train)
with open('../data/final/train_reconstructed.csv', 'w') as file:
    ## header
    file.write(colname_str)
    for row in table:
        row_string = ""
        for i in range(len(row)):
            features = row[i]
            row_string += str(features)
            if i+1 < len(row):
                row_string += ","
            else:
                row_string += "\n"
                
        file.write(row_string)

In [25]:
# create dev set and write to csv
dev['Expected'] = dev_labels
dev_src = dev['Source']
dev_sink = dev['Sink']

# total npapers
sum_papers = np.array([author[dev_src[i]]['num_papers']  + 
                       author[dev_sink[i]]['num_papers'] for i in range(len(dev_src))])


# # first year published
src_first = np.array([author[id]['first'] for id in dev_src])
sink_first = np.array([author[id]['first'] for id in dev_sink])

# # last year published
src_last = np.array([author[id]['last'] for id in dev_src])
sink_last = np.array([author[id]['last'] for id in dev_sink])

# Diff in first year and last year publishing between two authors.
first_diff = abs(src_first - sink_first)
last_diff = abs(src_last - sink_last)

# overlap years
src_range = [set(range(author[src]['last'], author[src]['first']))
              for src in dev_src]
sink_range = [set(range(author[sink]['last'], author[sink]['first']))
              for sink in dev_sink]
overlap_years = [len(src_range[i].intersection(sink_range[i])) for i in range(len(src_range))]

# overlap keywords / union numer of keywords
common_keywords = np.array([len(author[dev_src[i]]['keywords'].intersection(author[dev_sink[i]]['keywords'])) for i in range(len(dev_sink))])
union_keywords = np.array([len(author[dev_src[i]]['keywords'].union(author[dev_sink[i]]['keywords'])) for i in range(len(dev_sink))])
keyword_similarity = common_keywords / union_keywords

# overlapping venus/ union number of venues
common_venue = np.array([len(author[dev_src[i]]['venues'].intersection(author[dev_sink[i]]['venues'])) for i in range(len(dev_sink))])
union_venues = np.array([len(author[dev_src[i]]['venues'].union(author[dev_sink[i]]['venues'])) for i in range(len(dev_sink))])
venue_similarity = common_venue / union_venues 

# common neighbours
common_neighbours = np.array([len(neighbours[dev_src[i]].intersection(neighbours[dev_sink[i]])) for i in range(len(dev_sink))])
union_neighbours =  np.array([len(neighbours[dev_src[i]].union(neighbours[dev_sink[i]])) for i in range(len(dev_sink))])
neighbours_similarity = common_neighbours/union_neighbours

edge = dev['Expected'].apply(lambda x: 0 if x == -1 else x)

test_df = { 'sum_papers':sum_papers,
            'first_diff': first_diff,
            'last_diff': last_diff,
            'overlap_years': overlap_years,
            'common_keywords': common_keywords,
            'keyword_similarity': keyword_similarity,
            'common_venue': common_venue,
            'venue_similarity': venue_similarity,
            'common_neighbours': common_neighbours,
            'neighbours_similarity': neighbours_similarity,
            'edge': edge 
            }
dev_test = pd.DataFrame(data=test_df)
dev_test.to_csv('../data/final/dev-test.csv', index=False)

## Parsing test-public.csv

In [27]:
# Parse test-public
test_final = pd.read_csv("../data/raw/test-public.csv") 

In [28]:
# Initialise dictionary of neighbours of nodes (for test_final)
test_final_src = test_final['Source']
test_final_sink = test_final['Sink']

# Empty dictionary to store all the neighbours
neighbours = {}

# Stores each src's neighbours (Aggregate sinks) as a list in the neighbours dictionary
for src in test_final_src.unique():
    neighbours[src] = set(test_final[test_final['Source'] == src]['Sink'])

# Does the same for sink's neighbours (aggregate sources) as a list in the neighbours dictionary
# adds on to the dictionary if key already exists.
for sink in test_final_sink.unique():
    neighbours[sink] = neighbours.get(sink,set([])).union(test_final[test_final['Sink'] == sink]['Source'])

In [29]:
# create final-test set and write to csv
test_final_src = test_final['Source']
test_final_sink = test_final['Sink']

# total npapers
sum_papers = np.array([author[test_final_src[i]]['num_papers']  + 
                       author[test_final_sink[i]]['num_papers'] for i in range(len(test_final_src))])


# # first year published
src_first = np.array([author[id]['first'] for id in test_final_src])
sink_first = np.array([author[id]['first'] for id in test_final_sink])

# # last year published
src_last = np.array([author[id]['last'] for id in test_final_src])
sink_last = np.array([author[id]['last'] for id in test_final_sink])

# Diff in first year and last year publishing between two authors.
first_diff = abs(src_first - sink_first)
last_diff = abs(src_last - sink_last)

# overlap years
src_range = [set(range(author[src]['last'], author[src]['first']))
              for src in test_final_src]
sink_range = [set(range(author[sink]['last'], author[sink]['first']))
              for sink in test_final_sink]
overlap_years = [len(src_range[i].intersection(sink_range[i])) for i in range(len(src_range))]

# overlap keywords / union numer of keywords
common_keywords = np.array([len(author[test_final_src[i]]['keywords'].intersection(author[test_final_sink[i]]['keywords'])) for i in range(len(test_final_sink))])
union_keywords = np.array([len(author[test_final_src[i]]['keywords'].union(author[test_final_sink[i]]['keywords'])) for i in range(len(test_final_sink))])
keyword_similarity = common_keywords / union_keywords

# overlapping venus/ union number of venues
common_venue = np.array([len(author[test_final_src[i]]['venues'].intersection(author[test_final_sink[i]]['venues'])) for i in range(len(test_final_sink))])
union_venues = np.array([len(author[test_final_src[i]]['venues'].union(author[test_final_sink[i]]['venues'])) for i in range(len(test_final_sink))])
venue_similarity = common_venue / union_venues

# common neighbours
common_neighbours = np.array([len(neighbours[test_final_src[i]].intersection(neighbours[test_final_sink[i]])) for i in range(len(test_final_sink))])
union_neighbours =  np.array([len(neighbours[test_final_src[i]].union(neighbours[test_final_sink[i]])) for i in range(len(test_final_sink))])
neighbours_similarity = common_neighbours/union_neighbours

test_df = { 'sum_papers':sum_papers,
            'first_diff': first_diff,
            'last_diff': last_diff,
            'overlap_years': overlap_years,
            'common_keywords': common_keywords,
            'keyword_similarity': keyword_similarity,
            'common_venue': common_venue,
            'venue_similarity': venue_similarity,
            'common_neighbours': common_neighbours,
            'neighbours_similarity': neighbours_similarity,
            }
test_final_df = pd.DataFrame(data=test_df)


  venue_similarity = common_venue / union_venues


## Dealing with one null value

In [30]:
test_final_df[test_final_df['venue_similarity'].isnull()]

Unnamed: 0,sum_papers,first_diff,last_diff,overlap_years,common_keywords,keyword_similarity,common_venue,venue_similarity,common_neighbours,neighbours_similarity
1035,2,0,0,0,1,0.111111,0,,0,0.0


In [31]:
test_final_df['venue_similarity'].fillna(0,inplace=True)
test_final_df[test_final_df['venue_similarity'].isnull()]

Unnamed: 0,sum_papers,first_diff,last_diff,overlap_years,common_keywords,keyword_similarity,common_venue,venue_similarity,common_neighbours,neighbours_similarity


In [32]:

test_final_df.to_csv('../data/final/test-final.csv', index=False)