In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import pickle
import math

In [2]:
data_dir = "../../data/"
train_data = data_dir + "train.txt"

In [3]:
def node2index():
    node_index = dict()
    with open(train_data) as f:
        for line in f:
            line = [x.strip('\n') for x in line.split('\t')]
            node_index[line[0]] = node_index.get(line[0],[])+line[1:]
            for node in line:
                node_index[node] = node_index.get(node,[]) + [line[0]]
    return node_index
node_index = node2index()

4867136

In [30]:
# Calculation
def find_neighbours(id):
    
    """
    find all the neighbours of node by id.
    1. All the sink node of node id will be appended as the neighbor first.
    2. All the source node of node id will also be added as neighbor then.
    3. return the neighbor set.
    """
    neighbour_set = set()
    for edge in sorted_edges:
        if edge[0] == id:
            # add the sink node of node id
            neighbour_set.add(edge[1])
        elif edge[1] == id:
            # add the source node of node id
            neighbour_set.add(edge[0])
    return neighbour_set

def get_jaccard_coefficient(node_x, node_y):
    """
    in: node_x::Node object
    in: node_y::Node object
    return: jaccard's cofficient::numeric
    """
    neigbours_set_of_node_x = set(node_index[str(node_x)])
    neigbours_set_of_node_y = set(node_index[str(node_y)])
    union_neighbours = neigbours_set_of_node_x | neigbours_set_of_node_y
    common_neighbours = neigbours_set_of_node_x & neigbours_set_of_node_y
    if len(union_neighbours)==0:
        return 0.0
    return(len(common_neighbours)/len(union_neighbours))

def get_preferential_attachment(node_x, node_y):
    neigbours_set_of_node_x = set(node_index[str(node_x)])
    neigbours_set_of_node_y = set(node_index[str(node_y)])
    return len(neigbours_set_of_node_x)*len(neigbours_set_of_node_y)

def get_adamic_adar(node_x, node_y):
    neigbours_set_of_node_x = set(node_index[str(node_x)])
    neigbours_set_of_node_y = set(node_index[str(node_y)])
    common_neighbours = neigbours_set_of_node_x & neigbours_set_of_node_y
    # get the summation
    score = 0
    for common_node in common_neighbours:
        score = score + 1/math.log(len(set(node_index[str(node_x)])))
    return score

# Make the prediction

In [29]:
import numpy as np
with open(data_dir + "test-public.txt", "r") as f:
     test_data = f.readlines()
test_data = [i.split() for i in test_data[1:]]

def predict():
    """
    make the prediction using the jaccard's coefficient
    """
#     result = np.zeros()
    id_list = []
    new_result = []
    count = 0
    for line in test_data:
        # converse to integer
        node_x = int(line[1].strip())
        node_y = int(line[2].strip())
        jaccard_coefficient = get_jaccard_coefficient(node_x, node_y)
        adamic_adar = get_adamic_adar(node_x, node_y)
        preferential_attachment = get_preferential_attachment(node_x,node_y)
        # 构成新的矩阵
        new_result.append([jaccard_coefficient, adamic_adar, preferential_attachment])
#         result = np.vstack([result, new_result])
        id_list.append(line[0])
    return id_list, new_result
id_list, result = predict()
# 预测结果
result_array = np.asarray(result)

# Data preprocessing:

In [92]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# data 需要为array
def rescale_min_max(data): 
    """
    min-max normalisation
    """
    scaler = MinMaxScaler()
    scaler.fit(data)
    result = scaler.transform(data)
    return result

def standardise(data):
    """remove the mean and transform to unit variance"""
    scaler = StandardScaler()
    scaler.fit(data)
    result = scaler.transform(data)
    return result

preprocessed_data = rescale_min_max(result_array)

# 计算三个测度的均值，等

In [93]:
# return the min, max, mean and median of different score of a single test edge.
import statistics
def get_min(input_row):
    return min(input_row)

def get_max(input_row):
    return max(input_row)

def get_mean(input_row):
    return statistics.mean(input_row)

def get_median(input_row):
    return statistics.median(input_row)


In [95]:
explantory_result = np.apply_along_axis(get_min, 1, preprocessed_data)
id_array = np.transpose(np.asarray(id_list))
# 拼接矩阵
result_to_write = np.column_stack((id_array, explantory_result))
print(result_to_write)

[['1' '6.450000546912489e-07']
 ['2' '0.008333333333333333']
 ['3' '6.182489003492156e-07']
 ..., 
 ['1998' '8.379650480117828e-05']
 ['1999' '1.8042167428460278e-05']
 ['2000' '2.95005785382979e-06']]


# Save the result to the csv

In [90]:
import csv
import time
'''
Description: get time
Input: 
Output: time
''' 
def nowtime():
    return time.strftime("%Y%m%d-%H%M", time.localtime())


"""
Description: Save prediction result to files
Input: (1) result
       (2) filename
Output: 
"""
def save_prediction_to_csv(result,filename):
    headers = ['id','Prediction']

    with open(filename + str(nowtime()) + ".csv", 'w', encoding = 'utf8') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)

In [91]:
save_prediction_to_csv(result_to_write, "shawn_normalise_min")