In [1]:
import numpy as np
import pandas as pd
import time
import os
import random
from collections import defaultdict
import deepwalk_graph as graph
from gensim.models.word2vec import Word2Vec

In [2]:
# Load the paper records from disk. Later cells read paper[1] as the
# author list and paper[2] as the venue of each record.
# NOTE(review): if this array has object dtype, recent NumPy releases
# need allow_pickle=True here -- confirm against the data file.
papers = np.load('extr-titles.npy')

In [3]:
# Conference table: tab-separated file without a header row, so the two
# column names are supplied here.
conference_list = pd.read_csv(
    'conf_list.csv',
    sep='\t',
    header=None,
    names=['abbrev', 'fullname'],
)

In [4]:
# author name -> list of venues; a missing author starts with an empty list.
author_conference = defaultdict(list)

In [5]:
# For every paper, record its venue once under each of its authors, so an
# author's list holds one entry per paper (repeated venues are kept).
# Note: this appends to author_conference, so re-running the cell without
# re-creating the dict doubles every entry.
for paper in papers:
    authors, venue = paper[1], paper[2]
    for author in authors:
        author_conference[author].append(venue)

# Remove the bucket collected under the empty author name. The default
# keeps this from raising KeyError when no record has an empty author.
noname = author_conference.pop('', [])

In [6]:
# Conference abbreviations as an array. `.values` is used instead of
# Series.as_matrix(), which was deprecated and later removed from pandas.
conf_matrix = conference_list['abbrev'].values
# abbreviation -> node index; conferences take the indices 0 .. conf_num-1.
conf2indx = {abbrev: indx for indx, abbrev in enumerate(conf_matrix)}
# author name -> node index, filled in when the adjacency list is written.
author2indx = {}
# conference node index -> list of author node indices linked to it.
conf_link_author = defaultdict(list)
conf_num = len(conf_matrix)
# Authors with fewer venue entries than this are left out of the graph.
minimum_publications = 5

In [7]:
# Open the adjacency-list output file for writing. The following cells
# write the author and conference lines to it, and a later cell closes it.
f = open('author_conference_net.adjlist', 'w')

In [8]:
# Write one adjacency line per qualifying author:
#   "<author_index> <conf_index> <conf_index> ..."
# Author indices start at conf_num so they never collide with the
# conference indices 0 .. conf_num-1. A venue missing from conf2indx
# raises KeyError.
counter = 0
# .items() (rather than the Python-2-only .iteritems()) runs on 2 and 3.
for author, confs in author_conference.items():
    if len(confs) < minimum_publications:
        continue
    at_indx = conf_num + counter
    author2indx[author] = at_indx
    cf_indxes = [conf2indx[cf] for cf in confs]
    # Remember the reverse links so the conference lines can be written later.
    for cf_indx in cf_indxes:
        conf_link_author[cf_indx].append(at_indx)
    f.write(' '.join(str(indx) for indx in [at_indx] + cf_indxes) + '\n')
    counter += 1

In [9]:
# Write one adjacency line per conference that has at least one linked
# author: "<conf_index> <author_index> <author_index> ...".
# .items() (rather than the Python-2-only .iteritems()) runs on 2 and 3.
for cf_indx, at_indxes in conf_link_author.items():
    f.write(' '.join(str(indx) for indx in [cf_indx] + at_indxes) + '\n')

In [10]:
# Close the adjacency-list file so that everything written to it is
# flushed to disk before the graph is loaded from the same path.
f.close()

In [11]:
# Build the graph from the adjacency-list file written by the cells above.
G = graph.load_adjacencylist('author_conference_net.adjlist')

In [12]:
# Generate the random-walk corpus and report how long it took (seconds).
# The explicitly seeded Random instance is passed in as the walk's
# randomness source.
st = time.time()
walks = graph.build_deepwalk_corpus(G, num_paths=10, path_length=40,
                                    alpha=0, rand=random.Random(1234))

# Function-call form of print works on both Python 2 and Python 3.
print(time.time() - st)

17.1699998379


In [13]:
# Turn every walk into a list of string tokens by prefixing each node
# number with "n". This works around deepwalk issue #3:
# TypeError: unsupported operand type(s) for +: 'int' and 'str'
sentences = [['n' + str(node) for node in walk] for walk in walks]

In [14]:
# Train the embedding on the walk "sentences" and report the elapsed
# time in seconds.
# NOTE(review): newer gensim releases renamed `size` to `vector_size`;
# adjust the keyword if the installed gensim rejects it.
st = time.time()
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
# Function-call form of print works on both Python 2 and Python 3.
print(time.time() - st)

38.1300001144


In [15]:
# Reverse lookup: node index -> author name. .items() (rather than the
# Python-2-only .iteritems()) runs on both Python 2 and Python 3.
indx2author = {indx: name for name, indx in author2indx.items()}

In [16]:
def author_conf_similarity(model, author2indx, indx2author, conf2indx, author, conf1, conf2, topn=10):
    '''
        Return up to topn authors closest to the vector author - conf1 + conf2.

        model       -- embedding lookup supporting model[token] and
                       model.most_similar([vector], topn=k), which yields
                       (token, similarity) pairs
        author2indx -- author name -> node index
        indx2author -- node index -> author name
        conf2indx   -- conference abbreviation -> node index
        author      -- name of the query author
        conf1       -- conference whose vector is subtracted
        conf2       -- conference whose vector is added
        topn        -- maximum number of authors to return

        Returns a list of (author name, similarity) tuples, in the order
        reported by model.most_similar. The query author is not excluded.
        Raises KeyError when author, conf1 or conf2 is missing from its
        mapping.
    '''
    def node_token(indx):
        # Tokens in the model are node indices with an 'n' prefix.
        return 'n' + str(indx)

    query = (model[node_token(author2indx[author])]
             - model[node_token(conf2indx[conf1])]
             + model[node_token(conf2indx[conf2])])

    # Over-fetch by the number of conferences: even if every conference
    # node outranks the authors, topn author hits can still be collected.
    results = model.most_similar([query], topn=topn + len(conf2indx))

    sim_authors = []
    for token, score in results:
        if len(sim_authors) >= topn:
            break
        index = int(token[1:])
        # A hit is an author exactly when its index is a known author
        # index; this avoids hard-coding the number of conferences.
        if index in indx2author:
            sim_authors.append((indx2author[index], score))

    return sim_authors

In [17]:
# Which authors relate to SIGIR the way Jiawei Han relates to SIGKDD?
a1, c1, c2 = "Jiawei Han", "SIGKDD", "SIGIR"
author_conf_similarity(model, author2indx, indx2author, conf2indx,
                       author=a1, conf1=c1, conf2=c2)

[('James Allan', 0.6710748672485352),
 ('Tetsuya Sakai', 0.6528191566467285),
 ('Alistair Moffat', 0.6473925113677979),
 ('Charles L.A. Clarke', 0.6466618776321411),
 ('Jamie Callan', 0.6432892084121704),
 ('Milad Shokouhi', 0.6432796716690063),
 ('Stephen Robertson', 0.6386977434158325),
 ('Arjen P. de Vries', 0.63796067237854),
 ('David Hawking', 0.637642502784729),
 ('Nicholas J. Belkin', 0.6345908045768738)]

In [18]:
# Which authors relate to SIGMOD the way Jiawei Han relates to SIGKDD?
a1, c1, c2 = "Jiawei Han", "SIGKDD", "SIGMOD"
author_conf_similarity(model, author2indx, indx2author, conf2indx,
                       author=a1, conf1=c1, conf2=c2)

[('Peter Muth', 0.6977096199989319),
 ('Richard Hull', 0.6892060041427612),
 ('Nesime Tatbul', 0.6891045570373535),
 ('Alkis Simitsis', 0.6846166849136353),
 ('Wenfei Fan', 0.6813915967941284),
 ('Betty Salzberg', 0.6813511848449707),
 ('Michael J. Carey', 0.6809432506561279),
 ('Arie Segev', 0.6800684928894043),
 ('Maurice van Keulen', 0.679888129234314),
 ('D. Agrawal', 0.6789271831512451)]

In [19]:
# Which authors relate to SIGKDD the way Michael I. Jordan relates to ICML?
a1, c1, c2 = "Michael I. Jordan", "ICML", "SIGKDD"
author_conf_similarity(model, author2indx, indx2author, conf2indx,
                       author=a1, conf1=c1, conf2=c2)

[('Padhraic Smyth', 0.4324393570423126),
 ('Claudia Perlich', 0.4297933578491211),
 ('Junjie Wu', 0.42488935589790344),
 ('Gregory Piatetsky-Shapiro', 0.4159601032733917),
 ('Yehuda Koren', 0.41522032022476196),
 ('Jieping Ye', 0.410919189453125),
 ('Aristides Gionis', 0.40918970108032227),
 ('Ravi Kumar', 0.40248775482177734),
 ('Deepayan Chakrabarti', 0.4009716510772705),
 ('Jaideep Srivastava', 0.39661943912506104)]