In [1]:
import os

import numpy as np
import pandas as pd
import scipy
from scipy import sparse
from scipy.stats import kendalltau

In [2]:
# connect to Google drive
from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/MyDrive/ml_w_graphs_final_project'

Mounted at /content/drive


# Parse citation data

In [3]:
# read data is slow (~3mins)
citation_df0 = pd.read_json(os.path.join(data_path, "dblp-ref-0.json"), lines=True);
# print("citation data0 shape:", citation_df0.shape); display(citation_df0.head(2))

citation_df1 = pd.read_json(os.path.join(data_path, "dblp-ref-1.json"), lines=True);
# print("citation data1 shape:", citation_df1.shape); display(citation_df1.head(2))

citation_df2 = pd.read_json(os.path.join(data_path, "dblp-ref-2.json"), lines=True);
# print("citation data2 shape:", citation_df2.shape); display(citation_df2.head(2))

citation_df3 = pd.read_json(os.path.join(data_path, "dblp-ref-3.json"), lines=True);
# print("citation data3 shape:", citation_df3.shape); display(citation_df3.head(2))

In [4]:
citation_df = pd.concat([citation_df0, citation_df1, citation_df2, citation_df3], axis=0)
print("citation data shape:", citation_df.shape)
display(citation_df.head(2))

citation data shape: (3079007, 8)


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013,00127ee2-cb05-48ce-bc49-9de556b93346
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",50,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821


In [5]:
citation_df.venue.nunique()

5079

In [8]:
citation_df.venue.unique()[-20:]

array(['Synthesis', 'Seibutsu Butsuri', 'Oceanology',
       'International Journal of Biomathematics', 'Applied Geomatics',
       'Crisis-the Journal of Crisis Intervention and Suicide Prevention',
       'BMC Infectious Diseases',
       'Canadian Journal of Neurological Sciences',
       'Environmental Monitoring and Assessment',
       'Genetic Resources and Crop Evolution',
       'Australian Journal of Early Childhood', 'Stroke',
       'Journal of Residuals Science & Technology',
       'Structural Control & Health Monitoring', 'Stress', 'Geobios',
       'Carbon', 'Applied Computational Intelligence and Soft Computing',
       'Advanced Nonlinear Studies',
       'International Journal of Robotics & Automation'], dtype=object)

In [None]:
# filter data in the following venues: NIPS, ICML, KDD, IJCAI, UAI, ICLR, and COLT
ml_paper_df = citation_df[(citation_df.venue=="neural information processing systems") |  # 6355
                          (citation_df.venue=="international conference on machine learning") |  # 3963
                          (citation_df.venue=="knowledge discovery and data mining") | # 4839
                          (citation_df.venue=="international joint conference on artificial intelligence") | # 6882
                          (citation_df.venue=="uncertainty in artificial intelligence") | # 1984
                          (citation_df.venue=="international conference on learning representations") | # 157
                          (citation_df.venue=="computational learning theory")  # 1243
                          ]
ml_paper_df.shape # each hyperedge is a paper (25423 hyperedges)

(25423, 8)

# Collect information for hyperedges

In [None]:
# collect unique authors
author_list = []
for i in ml_paper_df.authors:
    author_list.extend(i)
author_list = list(set(author_list))

print("Number of authors:", len(author_list))
print(author_list[:5])

Number of authors: 28551
['Thomas Desautels', 'Fabien Cardinaux', 'Howard E. Motteler', 'Renata Slota', 'Seymour Douglas']


In [None]:
# collect information for hyperedge generation
pi_list = list(zip(ml_paper_df.authors, ml_paper_df.n_citation))
print("Number of hyperedges:", len(pi_list))
print(pi_list[:5])

Number of hyperedges: 25423
[(['Minoru Shigenaga', 'Yoshihiro Sekiguchi'], 0), (['B. K. Bog', 'K. Sparck Jones'], 50), (['Pierre E. Bonzon'], 0), (['Stephan Busemann'], 0), (['Siddharth Gopal', 'Yiming Yang'], 46)]


# Helper function for computing PageRank rankings

In [None]:
##################################################
# COMPUTE PAGERANK
##################################################

# given probability transition matrix P
# where P_{v,w} = Prob(w -> v)
# find pagerank scores with restart probability r
def compute_pr(P, r, n, eps=1e-8):
    x = np.ones(n) / n*1.0
    flag = True
    t=0
    while flag:
        if t % 100 == 0:
            print("time step:", t)

        x_new = (1-r)*P*x
        x_new = x_new + np.ones(n) * r / n
        # gap = np.linalg.norm(x_new - x,ord=1)
        gap = np.nanmax(np.nansum(abs(x_new - x), axis=0))
        # Note: the code computes L1 norm and ignores nan values

        diff = np.linalg.norm(x_new - x)
        if gap < eps and t > 100:
            flag = False
        t=t+1
        x = x_new
    return x

# Create hypergraph rankings

## Hypergraph with trivial weights

In [None]:
# running time: ~2mins
universe = np.array(author_list)
# first create these matrices
# R = |E| x |V|, R(e, v) = lambda_e(v)
# W = |V| x |E|, W(v, e) = w(e) 1(v in e)

m = len(pi_list) # number of hyperedges
n = len(universe) # number of authors to be ranked
R = np.zeros([m, n])
W = np.zeros([n, m])

for i in range(len(pi_list)):
    authors, n_citations = pi_list[i]
    # print(authors)
    # print(n_citations)
    if len(authors) > 1:
        for j in range(len(authors)):
            v = authors[j]
            # print(v)
            v = np.where(universe == v)[0][0] #equivalent to universe.index(v) but for np arrays
            # print(v)
            R[i, v] = 1.0
            W[v,i] = (n_citations + 1.0)
        R[i, :] = R[i,:] / sum(R[i,:])

# first, normalize W
# Wnorm=W/(W.sum(axis=1)[:,None])
Wnorm = np.divide(W, W.sum(axis=1)[:,None], out=np.zeros(W.shape, dtype=float), where=W.sum(axis=1)[:,None]!=0)
# Note: the code sets Wnorm to be 0 instead of nan if divide by 0.
Ws = sparse.csr_matrix(Wnorm)
Rs = sparse.csr_matrix(R)

In [None]:
# running time:
# create prob trans matrices
P = np.transpose(Ws.dot(Rs))
# P = Ws.dot(Rs)

# create rankings
r=0.40
rankings_hg_t = compute_pr(P, r, n, eps=1e-10).flatten()

time step: 0
time step: 100


## Hypergraph with edge-dependent weights

In [None]:
# check whether authors are alphabetically ordered
def isInAlphabeticalOrder(author_names):
    for i in range(len(author_names) - 1):
        if author_names[i] > author_names[i + 1]:
            return False
    return True

In [None]:
# running time: 1.5mins
universe = np.array(author_list)
# first create these matrices
# R = |E| x |V|, R(e, v) = lambda_e(v)
# W = |V| x |E|, W(v, e) = w(e) 1(v in e)

m = len(pi_list) # number of hyperedges
n = len(universe) # number of authors to be ranked
R = np.zeros([m, n])
W = np.zeros([n, m])

for i in range(len(pi_list)):
    authors, n_citations = pi_list[i]
    if len(authors) > 1:
        # check whether authors are alphabetically ordered
        is_alphabetic = isInAlphabeticalOrder(authors)

        for j in range(len(authors)):
            v = authors[j]
            v = np.where(universe == v)[0][0] #equivalent to universe.index(v) but for np arrays
            if is_alphabetic:
                R[i, v] = 1
            else:
                if (j==0) | (j==len(authors)-1):
                    R[i, v] = 2
                else:
                    R[i, v] = 1
            W[v,i] = (n_citations + 1.0)
        R[i, :] = R[i,:] / sum(R[i,:])

# first, normalize W
# Wnorm=W/W.sum(axis=1)[:,None]
Wnorm = np.divide(W, W.sum(axis=1)[:,None], out=np.zeros(W.shape, dtype=float), where=W.sum(axis=1)[:,None]!=0)
# Note: the code sets Wnorm to be 0 instead of nan if divide by 0.
Ws = sparse.csr_matrix(Wnorm)
Rs = sparse.csr_matrix(R)

In [None]:
# create prob trans matrices
P = np.transpose(Ws.dot(Rs))
# P = Ws.dot(Rs)

# create rankings
r=0.40
rankings_hg_d = compute_pr(P, r, n, eps=1e-10).flatten()

time step: 0
time step: 100


# Replicate Results

In [None]:
# compute Kendall tau correlation coefficient
print(kendalltau(rankings_hg_t, rankings_hg_d).statistic)
print(kendalltau(rankings_hg_t, rankings_hg_d).pvalue)

0.760478129157942
0.0


In [None]:
# recreate table 1
sorted_rankings_hg_t = sorted(rankings_hg_t, reverse=True)
sorted_rankings_hg_d = sorted(rankings_hg_d, reverse=True)

selected_author_list = ["Richard Socher", "Zhongzhi Shi", "Daniel Rueckert",
                        "Lars Schmidt-Thieme", "Tat-Seng Chua", "Ian J. Goodfellow"]
selected_author_trivial_ranking = []
selected_author_weighted_ranking = []
for author in selected_author_list:
    pos = np.where(universe == author)[0][0]
    selected_author_trivial_ranking.append(sorted_rankings_hg_t.index(rankings_hg_t[pos]))
    selected_author_weighted_ranking.append(sorted_rankings_hg_d.index(rankings_hg_d[pos]))

In [None]:
pd.DataFrame({"Name": selected_author_list,
              "Rank in H_T": selected_author_trivial_ranking,
              "Rank in H_D": selected_author_weighted_ranking})

Unnamed: 0,Name,Rank in H_T,Rank in H_D
0,Richard Socher,680,366
1,Zhongzhi Shi,606,350
2,Daniel Rueckert,618,383
3,Lars Schmidt-Thieme,673,436
4,Tat-Seng Chua,645,435
5,Ian J. Goodfellow,609,407
