In [1]:
import numpy as np
import pandas as pd

# 1.1 Data Input

In [2]:
# Damping factor (alpha); the identifier is spelled `å` and is read by calPiK1.
å = 0.85
# Convergence tolerance: iteration() keeps looping while the residual norm exceeds it.
e = 1e-5

# 1.2 Creating an Adjacency Matrix

In [3]:
# Toy 6x6 matrix used for the worked example in this section.
am = np.matrix(
    [
        [1, 0, 2, 0, 4, 3],
        [3, 0, 1, 1, 0, 0],
        [2, 0, 4, 0, 1, 0],
        [0, 0, 1, 0, 0, 1],
        [8, 0, 3, 0, 5, 2],
        [0, 0, 0, 0, 0, 0],
    ],
    dtype=float,
)
print(am)

[[ 1.  0.  2.  0.  4.  3.]
 [ 3.  0.  1.  1.  0.  0.]
 [ 2.  0.  4.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.  1.]
 [ 8.  0.  3.  0.  5.  2.]
 [ 0.  0.  0.  0.  0.  0.]]


# 1.3 Modifying the Adjacency Matrix

## Set the diagonal of the matrix to zero

In [4]:
def digonalZero(am):
    """Overwrite the main diagonal of ``am`` with zeros.

    The change is made in place on the array that is passed in; nothing is
    returned.
    """
    np.fill_diagonal(a=am, val=0)

In [5]:
# Zero the diagonal in place (this modifies `am` itself), then display the
# updated matrix as the cell output.
digonalZero(am)
am

matrix([[ 0.,  0.,  2.,  0.,  4.,  3.],
        [ 3.,  0.,  1.,  1.,  0.,  0.],
        [ 2.,  0.,  0.,  0.,  1.,  0.],
        [ 0.,  0.,  1.,  0.,  0.,  1.],
        [ 8.,  0.,  3.,  0.,  0.,  2.],
        [ 0.,  0.,  0.,  0.,  0.,  0.]])

## normalize the columns of the matrix

In [6]:
def normalizeCol(am):
    """Divide every column of ``am`` by that column's sum.

    Columns whose sum is zero are not divided; they come back as all zeros.
    The result is always floating point, so integer-valued input is
    accepted as well. ``am`` itself is left unchanged.

    Parameters
    ----------
    am : 2-D array or matrix
        Values to normalize column by column.

    Returns
    -------
    Array of the same shape (and array subclass) as ``am`` holding the
    column-normalized values.
    """
    # Named `col_totals` rather than `sum` so the builtin is not shadowed.
    col_totals = am.sum(axis=0)
    # Float output buffer pre-filled with zeros: entries of zero-sum columns
    # are skipped by `where` and therefore keep these zeros.
    result = np.zeros_like(am, dtype=float)
    return np.divide(am, col_totals, out=result, where=col_totals != 0)

In [7]:
# Column-normalize the (diagonal-zeroed) matrix and display the result.
H = normalizeCol(am)
H

matrix([[ 0.        ,  0.        ,  0.28571429,  0.        ,  0.8       ,
          0.5       ],
        [ 0.23076923,  0.        ,  0.14285714,  1.        ,  0.        ,
          0.        ],
        [ 0.15384615,  0.        ,  0.        ,  0.        ,  0.2       ,
          0.        ],
        [ 0.        ,  0.        ,  0.14285714,  0.        ,  0.        ,
          0.16666667],
        [ 0.61538462,  0.        ,  0.42857143,  0.        ,  0.        ,
          0.33333333],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ]])

# 1.4 Identifying the Dangling Nodes

In [8]:
def danglingNode(am):
    """Mark the columns of ``am`` whose entries sum to zero.

    Returns a float indicator with one entry per column: 1.0 where the
    column sum equals 0, otherwise 0.0.
    """
    column_totals = am.sum(axis=0)
    is_dangling = column_totals == 0
    return is_dangling.astype(float)

In [9]:
# NOTE(review): this rebinds the name `danglingNode` from the function to its
# result, so running this cell a second time fails until the defining cell is
# executed again. Later cells read `danglingNode` as the indicator vector.
danglingNode = danglingNode(am)
danglingNode

matrix([[ 0.,  1.,  0.,  0.,  0.,  0.]])

# 1.5 Calculating the Stationary Vector

## Article Vector

In [10]:
def articleVector(npMatrix):
    """Rescale ``npMatrix`` so that its entries add up to 1.

    ``npMatrix`` holds one count per journal (the notebook passes the number
    of articles each journal published); every entry is divided by the
    total of all entries.
    """
    total = npMatrix.sum()
    return npMatrix / total

In [11]:
# Normalize the per-journal article counts of the toy example and display them.
a = articleVector(np.matrix("3; 2; 5; 1; 2; 1"))
a

matrix([[ 0.21428571],
        [ 0.14285714],
        [ 0.35714286],
        [ 0.07142857],
        [ 0.14285714],
        [ 0.07142857]])

## Initial Vector

In [12]:
def initialVector(a, am):
    """Build a uniform start vector.

    The result has the shape of ``a`` and every entry equals
    ``1 / am.shape[0]``.
    """
    node_count = am.shape[0]
    uniform = np.ones_like(a)
    return uniform / node_count

In [13]:
# Uniform starting vector for the iteration; displayed as the cell output.
pi0 = initialVector(a, am)
pi0

matrix([[ 0.16666667],
        [ 0.16666667],
        [ 0.16666667],
        [ 0.16666667],
        [ 0.16666667],
        [ 0.16666667]])

## Influence Vector

In [14]:
def calPiK1(H, piK, danglingNode, a, alpha=None):
    """Apply one update of the influence-vector recurrence.

    Evaluates ``alpha * H.piK + (alpha * danglingNode.piK + (1 - alpha)) * a``.

    Parameters
    ----------
    H : 2-D array or matrix
        Matrix multiplied with ``piK``.
    piK : vector
        Current estimate of the influence vector.
    danglingNode : vector
        Vector whose dot product with ``piK`` enters the scalar term.
    a : vector
        Vector scaled element-wise by the scalar term.
    alpha : float, optional
        Damping factor. When omitted, the notebook-level constant ``å`` is
        used, which is what the function always did before.

    Returns
    -------
    The next estimate, with the shape of ``H.dot(piK)``.
    """
    if alpha is None:
        alpha = å
    # Scale the matrix-vector product instead of the matrix: equal up to
    # floating-point rounding, but no scaled copy of H is built per call.
    p1 = alpha * H.dot(piK)
    p2 = alpha * danglingNode.dot(piK) + (1 - alpha)
    return p1 + np.multiply(p2, a)


def iteration(H, pi0, danglingNode, a, alpha=None, epsilon=None):
    """Repeat calPiK1 until two successive estimates are close enough.

    Parameters
    ----------
    H, danglingNode, a
        Passed through unchanged to calPiK1.
    pi0 : vector
        Starting estimate.
    alpha : float, optional
        Damping factor; defaults to the notebook-level constant ``å``.
    epsilon : float, optional
        Tolerance on ``np.linalg.norm`` of the difference between successive
        estimates; defaults to the notebook-level constant ``e``.

    Returns
    -------
    (counter, piK1)
        ``piK1`` is the last estimate computed. ``counter`` is 0 when the
        very first update already meets the tolerance, otherwise the number
        of updates that were applied.
    """
    if alpha is None:
        alpha = å
    if epsilon is None:
        epsilon = e

    piK = pi0
    piK1 = calPiK1(H, piK, danglingNode, a, alpha)
    norm = np.linalg.norm(piK1 - piK)
    if not norm > epsilon:
        return 0, piK1

    # The update above is reused as the first step rather than recomputed.
    counter = 1
    # Keep updating while the residual is still above the tolerance.
    while norm > epsilon:
        piK = piK1
        piK1 = calPiK1(H, piK, danglingNode, a, alpha)
        norm = np.linalg.norm(piK1 - piK)
        counter += 1
    return counter, piK1

In [15]:
# Run the iteration on the toy example; `danglingNode` here is the indicator
# vector bound by an earlier cell, not the function of the same name.
counter, iv = iteration(H, pi0, danglingNode, a)
print("Iterated %i times" %(counter))
print("Influence vector is: {0}".format(iv))

Iterated 17 times
Influence vector is: [[ 0.30402454]
 [ 0.16360216]
 [ 0.18979672]
 [ 0.04661902]
 [ 0.2753102 ]
 [ 0.02064736]]


# 1.6 Calculating the Eigenfactor (EF) Score

In [16]:
def calEFSco(H, pi):
    """Turn ``H.pi`` into percentage scores.

    The product of ``H`` and ``pi`` is divided by the sum of its own entries
    and multiplied by 100, so the returned scores add up to 100.
    """
    weighted = H.dot(pi)
    share = weighted / weighted.sum()
    return 100 * share

In [17]:
# Scores for the toy example, displayed as the cell output.
calEFSco(H, iv)

matrix([[ 34.05071853],
        [ 17.20381588],
        [ 12.17543157],
        [  3.65317104],
        [ 32.91686298],
        [  0.        ]])

# Real data

In [21]:
import numpy as np
import pandas as pd
import time

def createAM(nodes, link):
    """Read a comma-separated link file into a dense square matrix.

    Each non-blank line of the file is expected to hold three
    comma-separated integers ``c,r,v``; the value ``v`` is stored at
    ``am[r][c]``.

    Parameters
    ----------
    nodes : int
        Side length of the square matrix; every index in the file must be
        smaller than this, otherwise an IndexError is raised.
    link : str
        Path of the text file to read.

    Returns
    -------
    (journal, am)
        ``journal`` lists, in file order, the second field of every line as
        text; ``am`` is the filled ``nodes`` x ``nodes`` float array.
    """
    am = np.zeros((nodes, nodes))
    journal = []
    with open(link) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(',')
            # Split first: indexing the raw line would pick single characters,
            # not fields.
            row_id = fields[1].strip()
            journal.append(row_id)
            am[int(row_id)][int(fields[0])] = int(fields[2])
    return journal, am

def digonalZero(am):
    """Overwrite the main diagonal of ``am`` with zeros.

    The change is made in place on the array that is passed in; nothing is
    returned.
    """
    np.fill_diagonal(a=am, val=0)

def normalizeCol(am):
    """Divide every column of ``am`` by that column's sum.

    Columns whose sum is zero are not divided; they come back as all zeros.
    The result is always floating point, so integer-valued input is
    accepted as well. ``am`` itself is left unchanged.

    Parameters
    ----------
    am : 2-D array or matrix
        Values to normalize column by column.

    Returns
    -------
    Array of the same shape (and array subclass) as ``am`` holding the
    column-normalized values.
    """
    # Named `col_totals` rather than `sum` so the builtin is not shadowed.
    col_totals = am.sum(axis=0)
    # Float output buffer pre-filled with zeros: entries of zero-sum columns
    # are skipped by `where` and therefore keep these zeros.
    result = np.zeros_like(am, dtype=float)
    return np.divide(am, col_totals, out=result, where=col_totals != 0)

def danglingNode(am):
    """Mark the columns of ``am`` whose entries sum to zero.

    Returns a float indicator with one entry per column: 1.0 where the
    column sum equals 0, otherwise 0.0.
    """
    column_totals = am.sum(axis=0)
    is_dangling = column_totals == 0
    return is_dangling.astype(float)

def articleVector(npMatrix):
    """Rescale ``npMatrix`` so that its entries add up to 1.

    ``npMatrix`` holds one count per journal (the notebook passes the number
    of articles each journal published); every entry is divided by the
    total of all entries.
    """
    total = npMatrix.sum()
    return npMatrix / total

def initialVector(a, am):
    """Build a uniform start vector.

    The result has the shape of ``a`` and every entry equals
    ``1 / am.shape[0]``.
    """
    node_count = am.shape[0]
    uniform = np.ones_like(a)
    return uniform / node_count

def calPiK1(H, piK, danglingNode, a, alpha=None):
    """Apply one update of the influence-vector recurrence.

    Evaluates ``alpha * H.piK + (alpha * danglingNode.piK + (1 - alpha)) * a``.

    Parameters
    ----------
    H : 2-D array or matrix
        Matrix multiplied with ``piK``.
    piK : vector
        Current estimate of the influence vector.
    danglingNode : vector
        Vector whose dot product with ``piK`` enters the scalar term.
    a : vector
        Vector scaled element-wise by the scalar term.
    alpha : float, optional
        Damping factor. When omitted, the notebook-level constant ``å`` is
        used, which is what the function always did before.

    Returns
    -------
    The next estimate, with the shape of ``H.dot(piK)``.
    """
    if alpha is None:
        alpha = å
    # Scale the matrix-vector product instead of the matrix: equal up to
    # floating-point rounding, but no scaled copy of H is built per call.
    p1 = alpha * H.dot(piK)
    p2 = alpha * danglingNode.dot(piK) + (1 - alpha)
    return p1 + np.multiply(p2, a)


def iteration(H, pi0, danglingNode, a, alpha=None, epsilon=None):
    """Repeat calPiK1 until two successive estimates are close enough.

    Parameters
    ----------
    H, danglingNode, a
        Passed through unchanged to calPiK1.
    pi0 : vector
        Starting estimate.
    alpha : float, optional
        Damping factor; defaults to the notebook-level constant ``å``.
    epsilon : float, optional
        Tolerance on ``np.linalg.norm`` of the difference between successive
        estimates; defaults to the notebook-level constant ``e``.

    Returns
    -------
    (counter, piK1)
        ``piK1`` is the last estimate computed. ``counter`` is 0 when the
        very first update already meets the tolerance, otherwise the number
        of updates that were applied.
    """
    if alpha is None:
        alpha = å
    if epsilon is None:
        epsilon = e

    piK = pi0
    piK1 = calPiK1(H, piK, danglingNode, a, alpha)
    norm = np.linalg.norm(piK1 - piK)
    if not norm > epsilon:
        return 0, piK1

    # The update above is reused as the first step rather than recomputed.
    counter = 1
    # Keep updating while the residual is still above the tolerance.
    while norm > epsilon:
        piK = piK1
        piK1 = calPiK1(H, piK, danglingNode, a, alpha)
        norm = np.linalg.norm(piK1 - piK)
        counter += 1
    return counter, piK1

def calEFSco(H, pi):
    """Turn ``H.pi`` into percentage scores.

    The product of ``H`` and ``pi`` is divided by the sum of its own entries
    and multiplied by 100, so the returned scores add up to 100.
    """
    weighted = H.dot(pi)
    share = weighted / weighted.sum()
    return 100 * share

In [22]:
start_time = time.time()
# Matrix size, stated once and reused for the matrix and the article vector.
n_journals = 10748
journal, am = createAM(n_journals, './links.txt')
digonalZero(am)
H = normalizeCol(am)
# Results get their own names so the functions danglingNode / articleVector
# are not overwritten and this cell can be re-run.
dangling_vec = danglingNode(am)
# Every journal is given the same article count (1) before normalization.
article_vec = articleVector(np.ones(n_journals))
intialVector = initialVector(article_vec, am)
counter, influenceVector = iteration(H, intialVector, dangling_vec, article_vec)
eiganfactor = calEFSco(H, influenceVector)

In [23]:
# Indices of the 20 largest scores, highest first.
top_20 = eiganfactor.argsort()[::-1][:20]
for j in top_20:
    print("{0}: {1}".format(j, eiganfactor[j]))

4408: 1.4475384035784298
4801: 1.412037574246075
6610: 1.2346058183328048
2056: 0.6793346438705757
6919: 0.6646919701918372
6667: 0.634252774272873
4024: 0.5768669411286474
6523: 0.4806087244452295
8930: 0.4775893614030741
6857: 0.43962239580152657
5966: 0.42962702460107494
1995: 0.38598353313554246
1935: 0.3850483689431774
3480: 0.3795244737968881
4598: 0.372625313922009
2880: 0.33019385958105496
3314: 0.3273062302440311
6569: 0.319195230539843
5035: 0.31659068819564273
1212: 0.3112124767619225


In [24]:
# Number of iterations reported by iteration().
print("Iteration time: %i" %(counter))
# Seconds elapsed since start_time was set; measured when this cell runs, so
# any pause between cells is included in the figure.
elapsed = time.time() - start_time
print("Run time: %s seconds." %(elapsed))

Iteration time: 21
Run time: 48.41633892059326 seconds.
