# Load the data

In [1]:
import pandas as pd
import pickle
import math



In [2]:
# Directory the pickled inputs are read from, relative to the notebook's working
# directory. The trailing slash is required: paths are built by plain string
# concatenation (data_dir + name), not with os.path.join.
data_dir = "../../data/"
def save_obj(obj, name ):
    """
    Serialise ``obj`` with pickle into the file ``<name>.pkl``.

    in: obj::any picklable object
    in: name::path of the target file without the '.pkl' extension
    return: None

    The path is used exactly as given; data_dir is NOT prepended here.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Highest protocol available to the running interpreter.
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """
    Read back the object pickled in ``<data_dir><name>.pkl``.

    in: name::file name inside data_dir, without the '.pkl' extension
    return: the unpickled object
    """
    pickle_path = data_dir + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        loaded = pickle.load(handle)
    return loaded
# Load the pickled graph data from <data_dir>/New_Edges.pkl and <data_dir>/New_Nodes.pkl.
Edges = load_obj("New_Edges")
Nodes = load_obj("New_Nodes")
# Pre-computed per-node features, loaded from <data_dir>/pre_features.pkl.
# According to the original author's note this is a dict whose values are laid out as:
#   [0] number of neighbours
#   [1] log(number of neighbours)
#   [2] list of neighbours
#   [3] list of inbound neighbours
#   [4] number of inbound neighbours
#   [5] list of outbound neighbours
#   [6] number of outbound neighbours
# NOTE(review): layout taken from that note, not verified here -- confirm against
# the code that produces pre_features.pkl.
BasicFeatures = load_obj("pre_features")

In [5]:
def get_jaccard_coefficient(source, sink):
    """
    Jaccard coefficient of the neighbour sets of two nodes.

    in: source::key of the first node in BasicFeatures
    in: sink::key of the second node in BasicFeatures
    return: |N(source) & N(sink)| / |N(source) | N(sink)|, or 0.0 when
            neither node has any neighbour::numeric
    """
    # Index 2 of a BasicFeatures entry holds the node's neighbour list.
    source_neighbours = set(BasicFeatures[source][2])
    sink_neighbours = set(BasicFeatures[sink][2])

    union_size = len(source_neighbours | sink_neighbours)
    if not union_size:
        # Both sets are empty: define the coefficient as 0 instead of dividing by zero.
        return 0.0

    shared_size = len(source_neighbours & sink_neighbours)
    return shared_size / union_size


# how similar are the outbound neighbors of source to sink
def get_outbound_similarity_score(source, sink, metric):
    """
    Average similarity between ``sink`` and the outbound neighbours of ``source``.

    in: source::key of the source node in BasicFeatures
    in: sink::key of the sink node
    in: metric::callable(node_a, node_b) -> numeric similarity
    return: mean of metric(sink, n) over the distinct outbound neighbours n of
            source, or 0.0 when source has no outbound neighbour::numeric
    """
    # Index 5 of a BasicFeatures entry holds the outbound neighbour list;
    # duplicates are collapsed so every neighbour counts once.
    outbound_nodes = set(BasicFeatures[source][5])
    if not outbound_nodes:
        # Nothing to average over; previously this raised ZeroDivisionError.
        return 0.0
    total = sum(metric(sink, outbound_node) for outbound_node in outbound_nodes)
    return total / len(outbound_nodes)

def get_inbound_similarity_score(source, sink, metric):
    """
    Average similarity between ``source`` and the inbound neighbours of ``sink``.

    in: source::key of the source node
    in: sink::key of the sink node in BasicFeatures
    in: metric::callable(node_a, node_b) -> numeric similarity
    return: mean of metric(source, n) over the distinct inbound neighbours n of
            sink, or 0.0 when sink has no inbound neighbour::numeric
    """
    # Index 3 of a BasicFeatures entry holds the inbound neighbour list.
    # The lookup is keyed on ``sink``: the earlier version read the inbound
    # list of ``source``, contradicting its own comment and variable names.
    inbound_nodes = set(BasicFeatures[sink][3])
    if not inbound_nodes:
        # Nothing to average over; previously this raised ZeroDivisionError.
        return 0.0
    total = sum(metric(source, inbound_node) for inbound_node in inbound_nodes)
    return total / len(inbound_nodes)


In [None]:
from tqdm import tqdm

# Read the public test file; the first line is dropped (presumably a header row)
# and every remaining line is split on whitespace into a list of string fields.
with open(data_dir + "test-public.txt", "r") as f:
    test_data = [row.split() for row in f.readlines()[1:]]
    
def predict():
    """
    Score every row of the module-level ``test_data``.

    Each row is read as [id, source, sink]; the score is the inbound
    similarity score of the pair, using the Jaccard coefficient as metric.

    return: list of (id, score) tuples in the order of test_data
    """
    def score_row(row):
        # Fields arrive as strings; the node ids are converted to integers.
        node_x = int(row[1].strip())
        node_y = int(row[2].strip())
        pair_score = get_inbound_similarity_score(node_x, node_y, get_jaccard_coefficient)
        return (row[0], pair_score)

    # tqdm only wraps the iterable to draw a progress bar.
    return [score_row(row) for row in tqdm(test_data)]
# Score every test pair, then print the complete list of (id, score) tuples.
result = predict()
print(result)


  0%|          | 0/2000 [00:00<?, ?it/s][A
  0%|          | 1/2000 [00:00<05:41,  5.85it/s][A
  0%|          | 3/2000 [00:00<05:07,  6.50it/s][A
  0%|          | 5/2000 [00:00<05:20,  6.22it/s][A
  0%|          | 8/2000 [00:01<04:11,  7.91it/s][A
  0%|          | 9/2000 [00:01<04:50,  6.84it/s][A
  1%|          | 16/2000 [00:01<02:56, 11.21it/s][A
  1%|          | 19/2000 [00:02<03:40,  8.97it/s][A
  1%|          | 22/2000 [00:02<04:10,  7.88it/s][A
  1%|          | 24/2000 [00:04<06:25,  5.12it/s][A
  1%|▏         | 25/2000 [00:04<06:29,  5.07it/s][A
  1%|▏         | 28/2000 [00:05<06:18,  5.21it/s][A
  2%|▏         | 30/2000 [00:05<06:12,  5.29it/s][A
  2%|▏         | 31/2000 [00:05<06:19,  5.19it/s][A
  2%|▏         | 33/2000 [00:06<06:05,  5.38it/s][A
  2%|▏         | 35/2000 [00:06<05:50,  5.60it/s][A
  2%|▏         | 41/2000 [00:06<05:19,  6.13it/s][A
  2%|▏         | 45/2000 [00:07<05:11,  6.28it/s][A
  2%|▏         | 47/2000 [00:07<05:12,  6.24it/s][A
  2%|▏

 19%|█▉        | 382/2000 [01:27<06:10,  4.37it/s][A
 19%|█▉        | 384/2000 [01:27<06:09,  4.37it/s][A
 19%|█▉        | 386/2000 [01:28<06:11,  4.34it/s][A
 19%|█▉        | 387/2000 [01:29<06:13,  4.32it/s][A
 20%|█▉        | 390/2000 [01:29<06:10,  4.35it/s][A
 20%|█▉        | 392/2000 [01:30<06:09,  4.35it/s][A
 20%|█▉        | 397/2000 [01:30<06:04,  4.40it/s][A
 20%|██        | 401/2000 [01:30<06:00,  4.43it/s][A
 20%|██        | 403/2000 [01:30<06:00,  4.43it/s][A
 20%|██        | 405/2000 [01:31<05:58,  4.44it/s][A
 20%|██        | 407/2000 [01:31<05:57,  4.45it/s][A
 21%|██        | 411/2000 [01:32<05:55,  4.46it/s][A
 21%|██        | 412/2000 [01:32<05:55,  4.47it/s][A
 21%|██        | 416/2000 [01:32<05:52,  4.49it/s][A
 21%|██        | 417/2000 [01:33<05:54,  4.47it/s][A
 21%|██▏       | 425/2000 [01:33<05:47,  4.54it/s][A
 21%|██▏       | 427/2000 [01:33<05:45,  4.55it/s][A
 21%|██▏       | 429/2000 [01:34<05:44,  4.56it/s][A
 22%|██▏       | 431/2000 [0

# Explanatory

# Modelling

# Please save the training set as a CSV file.

In [None]:
# LogisticRegression is imported from its public location; the
# sklearn.linear_model.logistic submodule is a private implementation path.
from sklearn.linear_model import LogisticRegression
# TODO: Do we need to split the train and validation data via this one?
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module, which newer
    # releases have removed.
    from sklearn.cross_validation import train_test_split
# NOTE(review): X_train / y_train are not defined in the visible part of this
# notebook -- they must be built (e.g. with train_test_split) before this runs.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)


# Prediction