# Load the data

In [1]:
import pandas as pd
import pickle
import math



In [2]:
# Directory prefix for input files: load_obj() and the test-file read below
# prepend it to the file name (save_obj() does not).
data_dir = "../../data/"
def save_obj(obj, name):
    """
    Pickle ``obj`` into the file ``<name>.pkl``.

    in: obj::any picklable object
    in: name::path of the target file without the '.pkl' extension
    return: None

    ``name`` is used exactly as given; unlike ``load_obj`` it is not prefixed
    with ``data_dir``.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """
    Load the object pickled in ``data_dir + name + '.pkl'``.

    in: name::file name (relative to data_dir) without the '.pkl' extension
    return: the unpickled object
    """
    pickle_path = data_dir + name + '.pkl'
    # Unpickling can execute arbitrary code: only point this at trusted files.
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
# Load the pickled graph data from data_dir.
Edges = load_obj("New_Edges")
Nodes = load_obj("New_Nodes")
# BasicFeatures is indexed as BasicFeatures[node][i] by the functions below.
# Per the original author's note it is a dict whose per-node entries are, by position:
#   0: number of neighbours
#   1: log(number of neighbours)
#   2: list of neighbours
#   3: list of inbound neighbours
#   4: number of inbound neighbours
#   5: list of outbound neighbours
#   6: number of outbound neighbours
# NOTE(review): this layout cannot be verified from this notebook -- confirm
# against the code that produced "pre_features".
BasicFeatures = load_obj("pre_features")

In [13]:
def get_jaccard_coefficient(source, sink):
    """
    Jaccard coefficient of the neighbour sets of two nodes.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    return: size of the intersection of the two neighbour sets divided by the
            size of their union; 0.0 when both neighbour lists are empty
    """
    # Entry 2 of a node's features is read as its neighbour list.
    source_nbrs = set(BasicFeatures[source][2])
    sink_nbrs = set(BasicFeatures[sink][2])

    union_size = len(source_nbrs | sink_nbrs)
    if not union_size:
        # No neighbours on either side: avoid dividing by zero.
        return 0.0
    shared_size = len(source_nbrs & sink_nbrs)
    return shared_size / union_size

def get_preferential_attachment(source, sink):
    """
    Preferential-attachment score of two nodes.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    return: number of distinct neighbours of source multiplied by the number
            of distinct neighbours of sink
    """
    # Entry 2 of a node's features is read as its neighbour list; duplicates
    # are collapsed by the set before counting.
    source_degree = len(set(BasicFeatures[source][2]))
    sink_degree = len(set(BasicFeatures[sink][2]))
    return source_degree * sink_degree

def get_adamic_adar(source, sink):
    """
    Adamic-Adar index of two nodes: the sum of 1 / log(degree(z)) over every
    common neighbour z of source and sink.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    return: the index as a float; 0.0 when there is no usable common neighbour

    The degree of z is the length of its neighbour list (entry 2 of its
    features). A common neighbour whose list has fewer than two entries is
    skipped, because log(1) == 0 would divide by zero and log(0) is undefined.
    """
    source_neighbours = set(BasicFeatures[source][2])
    sink_neighbours = set(BasicFeatures[sink][2])

    score = 0.0
    for common_node in source_neighbours & sink_neighbours:
        degree = len(BasicFeatures[common_node][2])
        if degree <= 1:
            # Skip only this node; returning here would throw away the terms
            # already accumulated for the other common neighbours.
            continue
        score += 1 / math.log(degree)
    return score


# How similar, on average, are the outbound neighbours of source to sink.
# `metric` is one of the pairwise scores: Jaccard, preferential attachment or Adamic-Adar.
def get_outbound_similarity_score(source, sink, metric):
    """
    Mean of metric(sink, n) over the distinct outbound neighbours n of source.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    in: metric::callable taking two node keys and returning a number
    return: the mean score, or 0 when source has no outbound neighbours
    """
    # Entry 5 of a node's features is read as its outbound-neighbour list.
    outbound_of_source = set(BasicFeatures[source][5])
    if not outbound_of_source:
        return 0

    total = 0
    for neighbour in outbound_of_source:
        total = total + metric(sink, neighbour)
    return 1/len(outbound_of_source)*total

# How similar, on average, are the inbound neighbours of sink to source.
# `metric` is one of the pairwise scores: Jaccard, preferential attachment or Adamic-Adar.
def get_inbound_similarity_score(source, sink, metric):
    """
    Mean of metric(source, n) over the distinct inbound neighbours n of sink.

    in: source::key into BasicFeatures
    in: sink::key into BasicFeatures
    in: metric::callable taking two node keys and returning a number
    return: the mean score, or 0 when sink has no inbound neighbours
    """
    # Entry 3 of a node's features is read as its inbound-neighbour list.
    # The list must come from sink: looking it up on source made the score
    # independent of sink altogether.
    inbound_of_sink = set(BasicFeatures[sink][3])
    if not inbound_of_sink:
        return 0

    summation = 0
    for inbound_node in inbound_of_sink:
        summation = summation + metric(source, inbound_node)
    return 1/len(inbound_of_sink)*summation


In [14]:
from tqdm import tqdm
# Read the public test file, skip its first line (presumably a header) and
# split every remaining line on whitespace into a list of string fields.
with open(data_dir + "test-public.txt", "r") as f:
    test_data = [row.split() for row in f.readlines()[1:]]
    
def predict():
    """
    Score every row of test_data with the inbound similarity score, using the
    Adamic-Adar index as the pairwise metric.

    Each row is read as [id, source node, sink node]; the two node fields are
    converted to int. Returns a list of (id, score) tuples in row order.
    """
    return [
        (
            row[0],
            get_inbound_similarity_score(
                int(row[1].strip()), int(row[2].strip()), get_adamic_adar
            ),
        )
        for row in tqdm(test_data)
    ]
# Score the whole test set; `result` is the list of (id, score) tuples returned by predict().
result = predict()




  0%|          | 0/2000 [00:00<?, ?it/s][A[A

  0%|          | 3/2000 [00:00<02:04, 16.08it/s][A[A

  0%|          | 5/2000 [00:00<02:10, 15.26it/s][A[A

  0%|          | 9/2000 [00:00<01:59, 16.73it/s][A[A

  1%|          | 17/2000 [00:00<01:19, 24.85it/s][A[A

  1%|          | 20/2000 [00:00<01:30, 21.77it/s][A[A

  1%|          | 23/2000 [00:01<01:44, 18.99it/s][A[A

  1%|▏         | 25/2000 [00:02<03:19,  9.89it/s][A[A

  1%|▏         | 28/2000 [00:02<03:10, 10.35it/s][A[A

  2%|▏         | 30/2000 [00:02<03:05, 10.60it/s][A[A

  2%|▏         | 32/2000 [00:02<03:01, 10.83it/s][A[A

  2%|▏         | 35/2000 [00:03<02:52, 11.41it/s][A[A

  2%|▏         | 41/2000 [00:03<02:36, 12.54it/s][A[A

  2%|▏         | 45/2000 [00:03<02:32, 12.79it/s][A[A

  2%|▏         | 48/2000 [00:07<05:10,  6.29it/s][A[A

  3%|▎         | 51/2000 [00:08<05:15,  6.18it/s][A[A

  3%|▎         | 53/2000 [00:08<05:12,  6.24it/s][A[A

  3%|▎         | 55/2000 [00:08<05:17,  6

 24%|██▍       | 480/2000 [00:57<03:00,  8.41it/s][A[A

 24%|██▍       | 482/2000 [00:57<03:00,  8.42it/s][A[A

 24%|██▍       | 484/2000 [00:57<02:59,  8.43it/s][A[A

 24%|██▍       | 487/2000 [00:57<02:59,  8.45it/s][A[A

 25%|██▍       | 491/2000 [00:57<02:57,  8.49it/s][A[A

 25%|██▍       | 494/2000 [00:57<02:56,  8.52it/s][A[A

 25%|██▍       | 496/2000 [00:58<02:56,  8.54it/s][A[A

 25%|██▌       | 503/2000 [00:58<02:54,  8.59it/s][A[A

 25%|██▌       | 505/2000 [00:58<02:53,  8.60it/s][A[A

 25%|██▌       | 507/2000 [00:58<02:53,  8.61it/s][A[A

 26%|██▌       | 510/2000 [00:59<02:52,  8.64it/s][A[A

 26%|██▌       | 512/2000 [00:59<02:51,  8.66it/s][A[A

 26%|██▌       | 514/2000 [00:59<02:51,  8.67it/s][A[A

 26%|██▌       | 516/2000 [00:59<02:50,  8.68it/s][A[A

 26%|██▌       | 523/2000 [00:59<02:48,  8.77it/s][A[A

 26%|██▋       | 526/2000 [00:59<02:47,  8.80it/s][A[A

 26%|██▋       | 529/2000 [01:00<02:47,  8.79it/s][A[A

 27%|██▋      

 47%|████▋     | 934/2000 [01:57<02:14,  7.92it/s][A[A

 47%|████▋     | 936/2000 [01:58<02:14,  7.93it/s][A[A

 47%|████▋     | 938/2000 [01:58<02:14,  7.91it/s][A[A

 47%|████▋     | 943/2000 [01:58<02:13,  7.94it/s][A[A

 47%|████▋     | 946/2000 [01:58<02:12,  7.95it/s][A[A

 47%|████▋     | 948/2000 [01:59<02:12,  7.95it/s][A[A

 48%|████▊     | 950/2000 [01:59<02:11,  7.96it/s][A[A

 48%|████▊     | 953/2000 [01:59<02:11,  7.97it/s][A[A

 48%|████▊     | 955/2000 [01:59<02:11,  7.98it/s][A[A

 48%|████▊     | 957/2000 [02:00<02:11,  7.94it/s][A[A

 48%|████▊     | 959/2000 [02:00<02:10,  7.95it/s][A[A

 48%|████▊     | 961/2000 [02:00<02:10,  7.96it/s][A[A

 48%|████▊     | 964/2000 [02:00<02:09,  7.97it/s][A[A

 49%|████▊     | 972/2000 [02:01<02:08,  8.03it/s][A[A

 49%|████▉     | 975/2000 [02:01<02:07,  8.02it/s][A[A

 49%|████▉     | 978/2000 [02:01<02:07,  8.02it/s][A[A

 49%|████▉     | 981/2000 [02:02<02:06,  8.04it/s][A[A

 49%|████▉    

 68%|██████▊   | 1365/2000 [03:06<01:26,  7.32it/s][A[A

 68%|██████▊   | 1367/2000 [03:06<01:26,  7.32it/s][A[A

 68%|██████▊   | 1370/2000 [03:06<01:25,  7.34it/s][A[A

 69%|██████▊   | 1372/2000 [03:06<01:25,  7.34it/s][A[A

 69%|██████▉   | 1377/2000 [03:07<01:24,  7.36it/s][A[A

 69%|██████▉   | 1380/2000 [03:07<01:24,  7.37it/s][A[A

 69%|██████▉   | 1383/2000 [03:07<01:23,  7.38it/s][A[A

 69%|██████▉   | 1386/2000 [03:08<01:23,  7.36it/s][A[A

 70%|██████▉   | 1390/2000 [03:08<01:22,  7.37it/s][A[A

 70%|██████▉   | 1392/2000 [03:08<01:22,  7.37it/s][A[A

 70%|██████▉   | 1394/2000 [03:08<01:22,  7.38it/s][A[A

 70%|██████▉   | 1396/2000 [03:09<01:21,  7.38it/s][A[A

 70%|███████   | 1403/2000 [03:09<01:20,  7.42it/s][A[A

 70%|███████   | 1407/2000 [03:09<01:19,  7.43it/s][A[A

 70%|███████   | 1410/2000 [03:09<01:19,  7.43it/s][A[A

 71%|███████   | 1413/2000 [03:10<01:19,  7.43it/s][A[A

 71%|███████   | 1417/2000 [03:10<01:18,  7.44it/s][A[

 89%|████████▉ | 1789/2000 [04:56<00:35,  6.02it/s][A[A

 90%|████████▉ | 1792/2000 [04:58<00:34,  6.00it/s][A[A

 90%|████████▉ | 1793/2000 [04:58<00:34,  6.00it/s][A[A

 90%|████████▉ | 1795/2000 [04:58<00:34,  6.01it/s][A[A

 90%|████████▉ | 1798/2000 [04:59<00:33,  6.01it/s][A[A

 90%|█████████ | 1801/2000 [04:59<00:33,  6.02it/s][A[A

 90%|█████████ | 1807/2000 [04:59<00:32,  6.03it/s][A[A

 90%|█████████ | 1809/2000 [04:59<00:31,  6.03it/s][A[A

 91%|█████████ | 1813/2000 [05:00<00:30,  6.04it/s][A[A

 91%|█████████ | 1815/2000 [05:00<00:30,  6.04it/s][A[A

 91%|█████████ | 1817/2000 [05:00<00:30,  6.05it/s][A[A

 91%|█████████ | 1820/2000 [05:00<00:29,  6.06it/s][A[A

 91%|█████████ | 1822/2000 [05:00<00:29,  6.06it/s][A[A

 91%|█████████▏| 1825/2000 [05:00<00:28,  6.07it/s][A[A

 91%|█████████▏| 1827/2000 [05:01<00:28,  6.07it/s][A[A

 92%|█████████▏| 1832/2000 [05:01<00:27,  6.08it/s][A[A

 92%|█████████▏| 1835/2000 [05:01<00:27,  6.08it/s][A[

# Exploratory analysis

# Modelling

# Please save the training set as a CSV file.

In [11]:
import csv
import time
def nowtime():
    """
    Description: get the current local time as a compact timestamp
    Input: none
    Output: string formatted as YYYYMMDD-HHMM
    """
    current = time.localtime()
    return time.strftime("%Y%m%d-%H%M", current)


"""
Description: Save prediction result to files
Input: (1) result
       (2) filename
Output: 
"""
def save_prediction_to_csv(result,filename):
    """
    Write prediction rows to the CSV file ``<filename><timestamp>.csv``.

    Input: (1) result: iterable of rows, each an (id, prediction) sequence
           (2) filename: path prefix; the nowtime() stamp and ".csv" are
               appended to it
    Output: None. The file starts with the header row "id,Prediction".
    """
    headers = ['id', 'Prediction']
    output_path = filename + str(nowtime()) + ".csv"

    # newline='' leaves line endings to the csv module; without it every row
    # is followed by an empty line on platforms that translate "\n" (Windows).
    with open(output_path, 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(result)
# Write `result` to "shawn_3_inbound_pa<timestamp>.csv" (path relative to the working directory).
# NOTE(review): the "pa" suffix suggests preferential attachment, but predict()
# passes get_adamic_adar -- confirm the file name matches the metric actually used.
save_prediction_to_csv(result, "shawn_3_inbound_pa")

# Prediction