In [19]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import pickle
import math
from tqdm import tqdm_notebook as tqdm


## Load data

In [4]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [5]:
Edges = load_obj("New_Edges")
Nodes = load_obj("New_Nodes")
pre_features = load_obj("pre_features")

In [6]:
pre_features[1]

[3,
 0.7213475204444817,
 [1247754, 2382107, 4588320],
 [4588320, 1247754, 2382107],
 3,
 [],
 0]

# feature engineering

In [30]:
#Adamic-Adar similarity
def AA(Node1, Node2):
    sim = 0.0
    n1 = pre_features[Node1]
    n2 = pre_features[Node2]
    common_neighors = list(set(n1[2]).intersection(n2[2]))
    #print(len(common_neighors))
    for node in common_neighors:
        sim += pre_features[node][1]
    return sim

#Jaccard
def Jaccard(Node1, Node2):
    n1 = pre_features[Node1]
    n2 = pre_features[Node2]
    common_neighors = list(set(n1[2]).intersection(n2[2]))
    lm = len(common_neighors)
    if lm == 0:
        return 0
    else:
        return (0.0+lm)/(n1[0] + n2[0] -lm)

#Cosine
def Cosine(Node1, Node2):
    n1 = pre_features[Node1]
    n2 = pre_features[Node2]
    common_neighors = list(set(n1[2]).intersection(n2[2]))
    lm = len(common_neighors)
    if lm == 0:
        return 0
    else:
        return (0.0+lm)/((n1[0])*(n2[0]))    
    
#Salton Similarity
def salton_similarity(node1, node2):
    n1 = pre_features[node1]
    n2 = pre_features[node2]
    common_neighors = list(set(n1[2]).intersection(n2[2]))
    inter = len(common_neighors)
    degree_out_flow = n1[6]
    degree_in_flow = n2[4]
    
    if inter == 0:
        return 0
    else:
        sqrt_of_degree = math.sqrt(degree_out_flow * degree_in_flow)
        salton = inter / sqrt_of_degree
        probability = 1 /(1 - math.log(salton)*0.2)
        return probability

In [23]:
import copy
#Adding feature to data
def add_feature(d, feature):
    data = copy.deepcopy(d)
    for i in tqdm(range(len(data))):
        source, slink = data[i][0]
        for ff in feature:
            data[i].append(ff(source, slink))
    return data

# Make prediction

In [24]:
import math
def sigmoid(x):
    return (1 / (1 + math.exp(-x)))
def sigmoid_n(x):
    return ((1 / (1 + math.exp(-x))-0.5)*2)

In [25]:
with open("test-public.txt", "r") as f:
    test_data = f.readlines()
    test_data = [i.split() for i in test_data[1:]]

In [26]:
def predict(method):
    result = []
    for l in tqdm(range(len(test_data))):
        line = test_data[l]
        result.append((line[0], method(int(line[1]), int(line[2]))))
    return result

In [27]:
P_salton_similarity = predict(salton_similarity)

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [31]:
test_data[1000]

['1001', '1094048', '594102']

In [32]:
# Combination of 3 method
mixture_result = []
for line in test_data:
    source = int(line[1])
    slink = int(line[2])
    try:
        aa = AA(source, slink)
        ja = Jaccard(source, slink)
        co = Cosine(source, slink)
        salton = salton_similarity(source, slink)
    except:
        aa = 0
        ja = 0
        salton = 0
        co = 0
    mixture_result.append([aa, ja, co, salton])

In [33]:
from scipy.stats import rankdata
aa = np.array(rankdata([i[0] for i in mixture_result]))
ja = np.array(rankdata([i[1] for i in mixture_result]))
co = np.array(rankdata([i[2] for i in mixture_result]))
salton = np.array(rankdata([i[3] for i in mixture_result]))

In [34]:
# Score of combination model
com_prediction = []
order = 1
for score in list((aa + ja + co + salton)/np.max((aa + ja + salton + co))):
    com_prediction.append((order, score))
    order += 1

In [36]:
com_prediction[0]

(1, 0.20364935562077324)

## Save to file

In [28]:
import csv
import time
'''
Description: get time
Input: 
Output: time
''' 
def nowtime():
    return time.strftime("%Y%m%d-%H%M", time.localtime())


"""
Description: Save prediction result to files
Input: (1) result
       (2) filename
Output: 
"""
def save_prediction_to_csv(result,filename):
    headers = ['id','Prediction']

    with open(filename + str(nowtime()) + ".csv", 'w', encoding = 'utf8') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)

In [29]:
save_prediction_to_csv(P_salton_similarity, "kuldeep")

In [37]:
save_prediction_to_csv(com_prediction, "xudong")