In [33]:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd
import datetime
import time
import numpy as np
import matplotlib
import seaborn as sns
import math
import os
import xgboost as xgb
import networkx as nx
import pdb
import pickle
from pandas import HDFStore,DataFrame
from scipy.sparse.linalg import svds, eigs
import gc
from tqdm import tqdm

In [34]:
train_graph = nx.read_edgelist('data/train_set.csv', delimiter = ',', create_using = nx.DiGraph(), nodetype = int)
print(nx.info(train_graph))


DiGraph with 1780722 nodes and 7550015 edges


In [20]:
# 1. Jaccard distance similarity measure:

def jaccard_for_followed(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0  | len(set(train_graph.successors(b))) == 0:
            return 0
        dist = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
                                    (len(set(train_graph.successors(a)).union(set(train_graph.successors(b)))))
    except:
        return 0
    return dist

In [12]:
print(jaccard_for_followed(912810,1678443))

0.058823529411764705


In [9]:
def jaccard_for_following(a,b):
    try:
        if len(set(train_graph.predecessors(a))) == 0  | len(set(g.predecessors(b))) == 0:
            return 0
        dist = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
                                 (len(set(train_graph.predecessors(a)).union(set(train_graph.predecessors(b)))))
        return dist
    except:
        return 0

In [14]:
print(jaccard_for_following(1722833,544361))

0


In [19]:
# 2. Cosine distance similarity measure:
def cosine_for_followed(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0  | len(set(train_graph.successors(b))) == 0:
            return 0
        dist = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
                                    (math.sqrt(len(set(train_graph.successors(a)))*len((set(train_graph.successors(b))))))
        return dist
    except:
        return 0

In [16]:
print(cosine_for_followed(273084,1505602))

0.0


In [17]:
def cosine_for_following(a,b):
    try:  
        if len(set(train_graph.predecessors(a))) == 0  | len(set(train_graph.predecessors(b))) == 0:
            return 0
        dist = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
                                     (math.sqrt(len(set(train_graph.predecessors(a))))*(len(set(train_graph.predecessors(b)))))
        return dist
    except:
        return 0

In [18]:
print(cosine_for_following(2,470294))

0.02886751345948129


In [21]:
# 3. Page ranking measure:
rank = pickle.load(open('data/rank.p','rb'))

In [23]:
print('min',rank[min(rank, key=rank.get)])
print('max',rank[max(rank, key=rank.get)])
print('mean',float(sum(rank.values())) / len(rank))

min 1.6556497245737814e-07
max 2.7098251341935827e-05
mean 5.615699699389075e-07


In [27]:
# Calculating shortest path between two nodes:
def compute_shortest_path_length(a,b):
    p = -1
    try:
        if train_graph.has_edge(a,b):
            train_graph.remove_edge(a,b)
            p = nx.shortest_path_length(train_graph,source = a,target = b)
            train_graph.add_edge(a,b)
        else:
            p = nx.shortest_path_length(train_graph,source = a,target = b)
        return p
    except:
        return -1

In [28]:
# 4. Checking for same community feature:
wcc = list(nx.weakly_connected_components(train_graph))
def belongs_to_same_wcc(a,b):
    index = []
    if train_graph.has_edge(b,a):
        return 1
    if train_graph.has_edge(a,b):
            for i in wcc:
                if a in i:
                    index = i
                    break
            if (b in index):
                train_graph.remove_edge(a,b)
                if compute_shortest_path_length(a,b) == -1:
                    train_graph.add_edge(a,b)
                    return 0
                else:
                    train_graph.add_edge(a,b)
                    return 1
            else:
                return 0
    else:
            for i in wcc:
                if a in i:
                    index = i
                    break
            if(b in index):
                return 1
            else:
                return 0

In [29]:
belongs_to_same_wcc(861, 1659750)

0

In [30]:
# 5. Adamic/Adar index
def adamic_adar(a,b):
    sum = 0
    try:
        n = list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
        if len(n) != 0:
            for i in n:
                sum = sum + (1/np.log10(len(list(train_graph.predecessors(i)))))
            return sum
        else:
            return 0
    except:
        return 0

In [32]:
adamic_adar(582634,1048523)

2.9284406484993215