In [None]:
#Necessary Libraries
import xml.etree.ElementTree as ET
from urllib.parse import urlencode
from urllib.request import urlopen
from urllib.error import HTTPError
import ssl
import pandas as pd

In [None]:
#Lookup table: arXiv Computer Science subcategory code -> human-readable name
cs_master = {
    "cs.AI": "Artificial Intelligence",
    "cs.AR": "Hardware Architecture",
    "cs.CC": "Computational Complexity",
    "cs.CE": "Computational Engineering, Finance, and Science",
    "cs.CG": "Computational Geometry",
    "cs.CL": "Computation and Language",
    "cs.CR": "Cryptography and Security",
    "cs.CV": "Computer Vision and Pattern Recognition",
    "cs.CY": "Computers and Society",
    "cs.DB": "Databases",
    "cs.DC": "Distributed, Parallel, and Cluster Computing",
    "cs.DL": "Digital Libraries",
    "cs.DM": "Discrete Mathematics",
    "cs.DS": "Data Structures and Algorithms",
    "cs.ET": "Emerging Technologies",
    "cs.FL": "Formal Languages and Automata Theory",
    "cs.GL": "General Literature",
    "cs.GR": "Graphics",
    "cs.GT": "Computer Science and Game Theory",
    "cs.HC": "Human-Computer Interaction",
    "cs.IR": "Information Retrieval",
    "cs.IT": "Information Theory",
    "cs.LG": "Learning",
    "cs.LO": "Logic in Computer Science",
    "cs.MA": "Multiagent Systems",
    "cs.MM": "Multimedia",
    "cs.MS": "Mathematical Software",
    "cs.NA": "Numerical Analysis",
    "cs.NE": "Neural and Evolutionary Computing",
    "cs.NI": "Networking and Internet Architecture",
    "cs.OH": "Other Computer Science",
    "cs.OS": "Operating Systems",
    "cs.PF": "Performance",
    "cs.PL": "Programming Languages",
    "cs.RO": "Robotics",
    "cs.SC": "Symbolic Computation",
    "cs.SD": "Sound",
    "cs.SE": "Software Engineering",
    "cs.SI": "Social and Information Networks",
    "cs.SY": "Systems and Control",
}

#Category codes only, in the same order as the table above
cat = list(cs_master)

In [None]:
#Turns the author list of one paper into the list of co-authorship edges
def gen_edge_list(sub_list):
    """Return every pair of entries of ``sub_list`` as a list of tuples.

    Pairs are produced in index order: each element is paired with every
    element that follows it, so ``[a, b, c]`` gives
    ``[(a, b), (a, c), (b, c)]``.

    A list holding exactly one element yields a single 1-tuple containing
    that element; an empty list yields an empty list.
    """
    count = len(sub_list)

    #A lone author cannot form a pair; it is returned wrapped in a 1-tuple
    if count == 1:
        return [tuple(sub_list)]

    return [
        (sub_list[first], sub_list[second])
        for first in range(count)
        for second in range(first + 1, count)
    ]

In [None]:
#Helper downloading one page of arXiv results and extracting the author names of each entry
def _fetch_author_lists(cat, start, page_size):
    """Fetch one page of the arXiv listing for category ``cat``.

    Parameters
    ----------
    cat : str
        arXiv category code placed in the ``search_query=cat:`` part of the URL.
    start : int
        Value sent as the ``start`` query parameter.
    page_size : int
        Value sent as the ``max_results`` query parameter.

    Returns
    -------
    tuple
        ``(total, authors_list)``: ``total`` is the integer found in the
        feed's opensearch ``totalResults`` element, ``authors_list`` holds one
        list of author-name strings per Atom ``entry`` of the page.
    """
    atom = '{http://www.w3.org/2005/Atom}'
    opensearch = '{http://a9.com/-/spec/opensearch/1.1/}'

    url = ('http://export.arxiv.org/api/query?search_query=cat:' + cat +
           '&start=' + str(start) +
           '&max_results=' + str(page_size) +
           '&sortBy=submittedDate&sortOrder=ascending')

    #Close the HTTP response as soon as the body has been read
    with urlopen(url) as uh:
        data = uh.read()

    tree = ET.fromstring(data)
    total = int(tree.find('./' + opensearch + 'totalResults').text)

    authors_list = []
    for entry in tree.findall('.//' + atom + 'entry'):
        authors = entry.findall(atom + 'author')
        authors_list.append([author.find(atom + 'name').text for author in authors])

    return total, authors_list


#Main function doing the scraping and returning the edge list of the co-authorship network and the list of nodes
def output(cat,**extra):
    """Scrape one arXiv category and build its co-authorship data.

    Parameters
    ----------
    cat : str
        Category code; must be a key of ``cs_master``.
    **extra
        Optional ``start=<int>``. When any keyword is given, only the single
        page beginning at ``extra['start']`` is fetched. Without keywords the
        first page is fetched and, if the feed reports more results than one
        page holds, every following page as well.

    Returns
    -------
    tuple
        ``(edge_list, df_nodes)``. ``edge_list`` is a list of author-name
        pairs produced by ``gen_edge_list`` for every entry with more than one
        author. ``df_nodes`` is a DataFrame with a ``Node`` column listing
        each distinct author once, in first-seen order, and a ``Category``
        column holding the category's name from ``cs_master``.

    Raises
    ------
    KeyError
        If ``cat`` is not in ``cs_master``, or keywords are passed without
        ``start``.
    """
    #NOTE(review): arXiv's API manual documents limits on page size and request
    #rate - confirm that 10000 results per request is honoured by the service
    page_size = 10000

    category = cs_master[cat]

    single_page = bool(extra)
    first_start = extra['start'] if single_page else 0
    total, authors_list = _fetch_author_lists(cat, first_start, page_size)

    #Walk the remaining pages; the range is empty when everything fit in the first one
    if not single_page:
        for start in range(page_size, total, page_size):
            authors_list.extend(_fetch_author_lists(cat, start, page_size)[1])

    #Collect every author exactly once, keeping the order of first appearance
    seen = set()
    nodes_list = []
    for sub_list in authors_list:
        for author in sub_list:
            if author not in seen:
                seen.add(author)
                nodes_list.append(author)

    #Only entries with at least two authors contribute edges
    edge_list = [edge
                 for sub_list in authors_list if len(sub_list) > 1
                 for edge in gen_edge_list(sub_list)]

    df_nodes = pd.DataFrame(nodes_list, columns = ['Node'])
    df_nodes['Category'] = category

    return(edge_list,df_nodes)

In [None]:
#Scrape the cs.AI category and report the size of the resulting network
edge_list_ai, df_nodes_ai = output("cs.AI")

print("Number of edges: ", len(edge_list_ai))
print("Number of nodes: ", len(df_nodes_ai))

Number of edges:  206667
Number of nodes:  26513


In [None]:
#Save the cs.AI node table (columns Node and Category) to Nodes_list.csv in the working directory
df_nodes_ai.to_csv("Nodes_list.csv")

In [None]:
#Creation of a networkx Graph from the cs.AI edge list
#output() only emits edges for papers with more than one author, so authors
#without any co-author contribute no edges here
import networkx as nx
G = nx.Graph()
G.add_edges_from(edge_list_ai)

In [None]:
#Writing the cs.AI network G to the file test.gexf so it can be used for analysis
nx.write_gexf(G, "test.gexf")

In [None]:
#Build the combined co-authorship data for every Computer Science subcategory
edge_list_cs = []
node_frames_cs = []

for code in cs_master.keys():
    #output() returns exactly two values (edge list, nodes DataFrame); it reports no
    #publication-year range, so none is collected or printed here
    (edges, nodes) = output(code)
    edge_list_cs.extend(edges)
    node_frames_cs.append(nodes)

#Concatenate once after the loop: DataFrame.append was removed in pandas 2.0 and
#growing a frame inside the loop copies it on every iteration
if node_frames_cs:
    df_nodes_cs = pd.concat(node_frames_cs)
else:
    df_nodes_cs = pd.DataFrame(columns=['Node','Category'])

print("Number of edges: ", len(edge_list_cs))
print("Number of nodes: ", len(df_nodes_cs))