In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import pickle

# Import the data

In [3]:
# Location of the data files, relative to this notebook.
data_dir = "../../data/"

# Keep the raw training lines (trailing newline included); they are parsed in a later cell.
with open(data_dir + "train.txt", "r") as f:
    train_data = list(f)

# Data structure

In [4]:
# Each training line is read as "source sink sink ...": the first id is paired
# with every id that follows it on the same line.
#   nodes        -> set of every id seen anywhere in the file
#   edges        -> (source, sink) tuples in file order
#   sorted_edges -> the same tuples ordered by source id
nodes = set()
edges = []
for raw_line in train_data:
    ids = [int(token) for token in raw_line.split()]
    nodes.update(ids)
    # ids[0] is only evaluated when there is at least one sink, so a blank
    # line contributes nothing instead of raising.
    edges.extend((ids[0], sink) for sink in ids[1:])

# sorted() is stable, so edges sharing a source keep their file order.
sorted_edges = sorted(edges, key=lambda edge: edge[0])


In [39]:
# General node class
class Node:
    """
    A graph node paired with the ids of its neighbours.

    node_id: identifier of this node
    neighbour_id_set: ids of the nodes adjacent to this node
    """
    def __init__(self, node_id, neighbour_id_set):
        self.node_id = node_id
        self.neighbour_id_set = neighbour_id_set

    def get_node_id(self):
        """return: the id this node was created with"""
        # Return the stored attribute, not the bound method itself.
        return self.node_id

    def get_neighbour_id_set(self):
        """return: the neighbour id set this node was created with (same object, not a copy)"""
        return self.neighbour_id_set

def find_neighbours(id, edges=None):
    """
    Find all the neighbours of the node `id`, ignoring edge direction.

    1. For every edge whose source is `id`, the sink is a neighbour.
    2. For every edge whose sink is `id`, the source is a neighbour.
    3. Return the neighbour set (empty when `id` appears in no edge).

    in: id::node id to look up
    in: edges::iterable of (source, sink) pairs to search; defaults to the
        global `sorted_edges`
    return: set of neighbour ids
    """
    if edges is None:
        # Looked up at call time so a re-run of the edge-building cell is picked up.
        edges = sorted_edges

    neighbour_set = set()
    for source, sink in edges:
        if source == id:
            # add the sink node of node id
            neighbour_set.add(sink)
        elif sink == id:
            # add the source node of node id
            neighbour_set.add(source)
    return neighbour_set

def get_jaccard_coefficient(node_x, node_y):
    """
    Jaccard coefficient of two nodes' neighbour sets:
    |common neighbours| / |all neighbours of either node|.

    in: node_x::Node object
    in: node_y::Node object
    return: Jaccard coefficient::float, 0.0 when neither node has any neighbour
    """
    x_ids = node_x.get_neighbour_id_set()
    y_ids = node_y.get_neighbour_id_set()

    union_size = len(x_ids | y_ids)
    if union_size:
        return len(x_ids & y_ids) / union_size
    # Both neighbour sets are empty; avoid dividing by zero.
    return 0.0


# Make the prediction

In [41]:
# Spot check: look up the neighbours of one node id; the returned set is the cell output.
find_neighbours(2184483)


{116475,
 150037,
 163021,
 197658,
 212805,
 261200,
 293995,
 346796,
 368540,
 418720,
 475574,
 475915,
 487639,
 572208,
 579743,
 594699,
 631279,
 636045,
 696805,
 708203,
 763313,
 771731,
 848620,
 850459,
 907665,
 910755,
 911293,
 932451,
 960181,
 979757,
 982725,
 1054633,
 1061690,
 1112096,
 1146906,
 1149539,
 1161170,
 1218944,
 1253742,
 1283903,
 1310018,
 1340930,
 1440298,
 1494526,
 1500229,
 1598678,
 1622252,
 1663756,
 1687276,
 1706508,
 1729215,
 1753526,
 1763327,
 1812114,
 1840412,
 1860472,
 1869214,
 1869658,
 1903802,
 1908453,
 1942527,
 2019433,
 2023163,
 2035830,
 2089396,
 2102658,
 2120801,
 2121714,
 2122035,
 2124164,
 2146309,
 2173130,
 2174930,
 2277594,
 2300708,
 2336722,
 2371483,
 2389556,
 2393067,
 2396903,
 2422549,
 2444158,
 2460356,
 2549365,
 2633579,
 2651652,
 2700228,
 2712479,
 2739693,
 2806667,
 2819533,
 2900837,
 2916456,
 2930986,
 2935547,
 3045716,
 3057131,
 3116573,
 3117644,
 3123328,
 3193551,
 3209755,
 3222384,
 

In [None]:
# Read the public test file: drop the first (header) line and split every
# remaining line into its whitespace-separated fields.
with open(data_dir + "test-public.txt", "r") as f:
    next(f, None)  # skip the header; the default keeps an empty file from raising
    test_data = [row.split() for row in f]

def predict():
    """
    Score every pair in the global `test_data` with the Jaccard coefficient.

    Each row of `test_data` is read as [pair id, first node id, second node id].

    return: list of (pair id, jaccard coefficient) tuples, in `test_data` order
    """
    # find_neighbours walks the whole edge list on every call, so remember the
    # neighbour set of each node id and reuse it when the id shows up again.
    neighbours_by_id = {}

    def make_node(node_id):
        if node_id not in neighbours_by_id:
            neighbours_by_id[node_id] = find_neighbours(node_id)
        return Node(node_id, neighbours_by_id[node_id])

    result = []
    for line in test_data:
        # fields are strings; convert to integer so they compare equal to the edge ids
        node_x = make_node(int(line[1]))
        node_y = make_node(int(line[2]))
        jaccard_coefficient = get_jaccard_coefficient(node_x, node_y)
        result.append((line[0], jaccard_coefficient))
    return result
# Run the prediction over the whole test set; `result` is later passed to save_prediction_to_csv.
result = predict()

# Save to file

In [23]:
import csv
import time
def nowtime():
    """
    Description: get the current local time as a compact timestamp
    Input:
    Output: str formatted as YYYYMMDD-HHMM (e.g. "20240131-0945")
    """
    return time.strftime("%Y%m%d-%H%M", time.localtime())


"""
Description: Save prediction result to files
Input: (1) result
       (2) filename
Output: 
"""
def save_prediction_to_csv(result,filename):
    """
    Description: Save prediction result to a timestamped CSV file
    Input: (1) result: iterable of rows; each row is written as one CSV line
               under the header id,Prediction
           (2) filename: path prefix; the file written is
               filename + nowtime() + ".csv"
    Output:
    """
    headers = ['id','Prediction']

    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate "\n" on write end up with a blank line after
    # every row.
    with open(filename + str(nowtime()) + ".csv", 'w', encoding = 'utf8', newline = '') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)

In [None]:
# Write the predictions to a CSV whose name starts with "shawn_jc_" followed by a timestamp.
save_prediction_to_csv(result, "shawn_jc_")