# Load the data

In [2]:
import pandas as pd
import pickle
import math



In [3]:
# Directory prefix for the input files: load_obj() prepends it to every
# pickle name, and the test file is opened relative to it as well.
data_dir = "../../data/"
def save_obj(obj, name):
    """
    Pickle ``obj`` into the file ``<name>.pkl``.

    The path is built from ``name`` alone: unlike load_obj, no directory is
    prepended, so the file lands relative to the current working directory
    unless ``name`` already carries a path.

    in: obj::any picklable object
    in: name::str, target path without the ``.pkl`` extension
    return: None
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """
    Unpickle and return the object stored in ``data_dir + name + '.pkl'``.

    in: name::str, file name inside data_dir without the ``.pkl`` extension
    return: the unpickled object
    """
    source_path = data_dir + name + '.pkl'
    with open(source_path, 'rb') as handle:
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only point this at pickles produced by a trusted source.
        loaded = pickle.load(handle)
    return loaded
# Load the pickled inputs from data_dir.
Edges = load_obj("New_Edges")
Nodes = load_obj("New_Nodes")
# BasicFeatures is read as BasicFeatures[node][i] by the scoring functions
# below. Per the original author's note, position i of each entry holds:
#   0: number of neighbours        1: log(number of neighbours)
#   2: list of neighbours          3: list of in-neighbours
#   4: number of in-neighbours     5: list of out-neighbours
#   6: number of out-neighbours
# (presumably a dict keyed by node id -- TODO confirm against the pickle)
BasicFeatures = load_obj("pre_features")

In [16]:
def get_jaccard_coefficient(source, sink):
    """
    Jaccard's coefficient of the neighbour sets of two nodes.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    return: |N(source) & N(sink)| / |N(source) | N(sink)|, or 0.0 when both
            neighbour lists are empty::float
    """
    # Index 2 of a BasicFeatures entry is the node's neighbour list.
    source_nbrs = set(BasicFeatures[source][2])
    sink_nbrs = set(BasicFeatures[sink][2])

    union_size = len(source_nbrs | sink_nbrs)
    # Two nodes without any neighbours would otherwise divide by zero.
    if union_size == 0:
        return 0.0
    return len(source_nbrs & sink_nbrs) / union_size

def get_preferential_attachment(source, sink):
    """
    Preferential-attachment score of two nodes.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    return: product of the numbers of distinct neighbours of the two
            nodes::int
    """
    # Duplicates in the stored neighbour lists are collapsed before counting.
    source_degree = len(set(BasicFeatures[source][2]))
    sink_degree = len(set(BasicFeatures[sink][2]))
    return source_degree * sink_degree

def get_adamic_adar(source, sink):
    """
    Adamic-Adar index of two nodes.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    return: sum over the common neighbours z of source and sink of
            1 / log(len(neighbour list of z)); 0 when there is no
            contributing common neighbour::numeric
    """
    # Index 2 of a BasicFeatures entry is the node's neighbour list.
    source_nbrs = set(BasicFeatures[source][2])
    sink_nbrs = set(BasicFeatures[sink][2])

    score = 0
    for common_node in source_nbrs & sink_nbrs:
        degree = len(BasicFeatures[common_node][2])
        # log(1) == 0 would raise ZeroDivisionError and log(0) ValueError
        # (possible e.g. when source == sink or the lists are not
        # symmetric), so such neighbours contribute nothing.
        if degree > 1:
            score = score + 1 / math.log(degree)
    return score


# how similar are the outbound neighbors of source to sink
def get_outbound_similarity_score(source, sink, metric):
    """
    Average similarity between sink and the outbound neighbours of source.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    in: metric::callable(node_a, node_b) -> numeric, e.g. get_adamic_adar
    return: mean of metric(sink, v) over the distinct outbound neighbours v
            of source, or 0.0 when source has none::float
    """
    # Index 5 of a BasicFeatures entry is the node's outbound-neighbour list.
    outbound_nodes = set(BasicFeatures[source][5])
    # Nothing to average over: return 0.0 instead of dividing by zero.
    if not outbound_nodes:
        return 0.0
    summation = sum(metric(sink, outbound_node) for outbound_node in outbound_nodes)
    return 1 / len(outbound_nodes) * summation

def get_inbound_similarity_score(source, sink, metric):
    """
    Average similarity between source and the inbound neighbours of sink.

    This is the counterpart of get_outbound_similarity_score: there the
    outbound neighbours of source are compared with sink, here the inbound
    neighbours of sink are compared with source.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    in: metric::callable(node_a, node_b) -> numeric, e.g. get_adamic_adar
    return: mean of metric(source, u) over the distinct inbound neighbours u
            of sink, or 0.0 when sink has none::float
    """
    # Index 3 of a BasicFeatures entry is the node's inbound-neighbour list.
    # The lookup must use sink: reading BasicFeatures[source][3] ignored the
    # sink argument and compared source with its own inbound neighbours.
    inbound_nodes = set(BasicFeatures[sink][3])
    # Nothing to average over: return 0.0 instead of dividing by zero.
    if not inbound_nodes:
        return 0.0
    summation = sum(metric(source, inbound_node) for inbound_node in inbound_nodes)
    return 1 / len(inbound_nodes) * summation


In [19]:
from tqdm import tqdm
# Read the test file, skip its first line (presumably a header row) and
# split every remaining line on whitespace into a list of string fields.
with open(data_dir + "test-public.txt", "r") as f:
    test_lines = f.readlines()
test_data = [test_line.split() for test_line in test_lines[1:]]
    
def predict():
    """
    Score every row of test_data with the outbound similarity score,
    using the Adamic-Adar index as the underlying metric.

    Each row is read as [id, source, sink]; source and sink are converted
    to int before scoring.

    return: list of (id, score) tuples in the order of test_data
    """
    return [
        (
            row[0],
            get_outbound_similarity_score(int(row[1]), int(row[2]), get_adamic_adar),
        )
        for row in tqdm(test_data)
    ]
# Score all test pairs; the recorded run below took about 5 minutes for 2000 pairs.
result = predict()




  0%|          | 0/2000 [00:00<?, ?it/s][A[A

  0%|          | 3/2000 [00:00<02:17, 14.49it/s][A[A

  0%|          | 5/2000 [00:00<02:20, 14.17it/s][A[A

  0%|          | 9/2000 [00:00<01:46, 18.76it/s][A[A

  1%|          | 15/2000 [00:00<01:24, 23.56it/s][A[A

  1%|          | 18/2000 [00:00<01:23, 23.65it/s][A[A

  1%|          | 21/2000 [00:00<01:26, 23.00it/s][A[A

  1%|          | 24/2000 [00:01<02:28, 13.34it/s][A[A

  1%|▏         | 26/2000 [00:01<02:25, 13.60it/s][A[A

  2%|▏         | 31/2000 [00:02<02:12, 14.81it/s][A[A

  2%|▏         | 33/2000 [00:02<02:15, 14.53it/s][A[A

  2%|▏         | 35/2000 [00:02<02:14, 14.65it/s][A[A

  2%|▏         | 40/2000 [00:02<02:02, 16.06it/s][A[A

  2%|▏         | 43/2000 [00:02<01:59, 16.40it/s][A[A

  2%|▏         | 46/2000 [00:02<02:06, 15.43it/s][A[A

  2%|▏         | 49/2000 [00:03<02:28, 13.11it/s][A[A

  3%|▎         | 51/2000 [00:04<02:40, 12.17it/s][A[A

  3%|▎         | 53/2000 [00:04<02:44, 11

 54%|█████▎    | 1072/2000 [03:40<03:11,  4.86it/s][A[A

 54%|█████▎    | 1074/2000 [03:41<03:10,  4.86it/s][A[A

 54%|█████▍    | 1079/2000 [03:41<03:09,  4.87it/s][A[A

 54%|█████▍    | 1081/2000 [03:42<03:09,  4.86it/s][A[A

 54%|█████▍    | 1083/2000 [03:42<03:08,  4.86it/s][A[A

 54%|█████▍    | 1087/2000 [03:43<03:07,  4.87it/s][A[A

 54%|█████▍    | 1089/2000 [03:43<03:06,  4.87it/s][A[A

 55%|█████▍    | 1091/2000 [03:43<03:06,  4.88it/s][A[A

 55%|█████▍    | 1093/2000 [03:43<03:05,  4.89it/s][A[A

 55%|█████▍    | 1095/2000 [03:44<03:05,  4.89it/s][A[A

 55%|█████▌    | 1102/2000 [03:44<03:02,  4.92it/s][A[A

 55%|█████▌    | 1108/2000 [03:44<03:00,  4.94it/s][A[A

 56%|█████▌    | 1111/2000 [03:45<03:00,  4.93it/s][A[A

 56%|█████▌    | 1114/2000 [03:45<02:59,  4.94it/s][A[A

 56%|█████▌    | 1117/2000 [03:46<02:58,  4.94it/s][A[A

 56%|█████▌    | 1119/2000 [03:46<02:58,  4.95it/s][A[A

 56%|█████▌    | 1121/2000 [03:46<02:57,  4.95it/s][A[

 76%|███████▌  | 1519/2000 [04:31<01:25,  5.60it/s][A[A

 76%|███████▌  | 1522/2000 [04:31<01:25,  5.61it/s][A[A

 76%|███████▋  | 1525/2000 [04:31<01:24,  5.62it/s][A[A

 76%|███████▋  | 1529/2000 [04:31<01:23,  5.63it/s][A[A

 77%|███████▋  | 1533/2000 [04:31<01:22,  5.64it/s][A[A

 77%|███████▋  | 1536/2000 [04:32<01:22,  5.63it/s][A[A

 77%|███████▋  | 1539/2000 [04:32<01:21,  5.64it/s][A[A

 77%|███████▋  | 1542/2000 [04:33<01:21,  5.63it/s][A[A

 77%|███████▋  | 1549/2000 [04:34<01:20,  5.64it/s][A[A

 78%|███████▊  | 1551/2000 [04:35<01:19,  5.63it/s][A[A

 78%|███████▊  | 1553/2000 [04:36<01:19,  5.61it/s][A[A

 78%|███████▊  | 1555/2000 [04:36<01:19,  5.62it/s][A[A

 78%|███████▊  | 1556/2000 [04:36<01:19,  5.62it/s][A[A

 78%|███████▊  | 1559/2000 [04:37<01:18,  5.63it/s][A[A

 78%|███████▊  | 1561/2000 [04:37<01:17,  5.63it/s][A[A

 78%|███████▊  | 1563/2000 [04:37<01:17,  5.64it/s][A[A

 78%|███████▊  | 1566/2000 [04:37<01:16,  5.64it/s][A[

 99%|█████████▉| 1984/2000 [05:12<00:02,  6.36it/s][A[A

 99%|█████████▉| 1987/2000 [05:12<00:02,  6.36it/s][A[A

100%|█████████▉| 1992/2000 [05:12<00:01,  6.37it/s][A[A

100%|█████████▉| 1995/2000 [05:14<00:00,  6.34it/s][A[A

100%|█████████▉| 1999/2000 [05:14<00:00,  6.35it/s][A[A

100%|██████████| 2000/2000 [05:14<00:00,  6.35it/s][A[A

# Explanatory

# Modelling

# Please save the training set as a CSV file.

In [20]:
import csv
import time
def nowtime():
    '''
    Description: get the current local time as a "YYYYMMDD-HHMM" stamp
    Input:
    Output: time string, e.g. "20180901-1530"
    '''
    # Without an explicit time tuple, strftime formats the current local time.
    return time.strftime("%Y%m%d-%H%M")


def save_prediction_to_csv(result,filename):
    """
    Description: Save prediction result to a timestamped CSV file
    Input: (1) result: iterable of rows, each row an (id, prediction) pair
           (2) filename: path prefix; the nowtime() stamp and ".csv" are
               appended to it
    Output: None (writes the file <filename><timestamp>.csv)
    """
    headers = ['id','Prediction']
    output_path = filename + str(nowtime()) + ".csv"

    # newline='' is required by the csv module: the writer emits its own
    # "\r\n" row terminators, and without it platforms that translate
    # newlines (Windows) would write "\r\r\n", i.e. a blank line per row.
    with open(output_path, 'w', encoding = 'utf8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)
# Write the scores to "shawn_2_outbound_aa<timestamp>.csv" (path is relative to the working directory).
save_prediction_to_csv(result, "shawn_2_outbound_aa")

# Prediction