### Imports:

In [None]:
import os
import pickle
import sys
from itertools import combinations

import networkx as nx
import pandas as pd
from lxml import etree
from lxml.etree import XMLSyntaxError
from natsort import index_natsorted, order_by_index

### Parse data

In [None]:
# Stream-parse the DBLP XML dump into two tables: one row per
# (author, title, year) and one row per multi-author publication holding the
# list of its co-authors.
source = "dblp.xml" #dataset of publications
dtd = etree.DTD(file="dblp.dtd") #read DTD (not referenced again in this cell)
publication_data = [] #fill this list with author/title/year records
edge_list = [] #fill this with co-author lists, one list per publication
counter = 0 #number of publications recorded so far
#titles that are replaced by an empty string when they match exactly
#NOTE(review): entries such as "The", "The " and "S" look like titles cut short
#because child.text stops at the first nested tag -- confirm
bad_titles = ["Home Page", "Title Page", "Welcome message.", "Workshop preface.", "Vorwort.", "The", "The ", "Session Summary.", "Reviewers.", "Program Committee.", 
    "Preface.", "Organizing Committee.", "Workshop Organization.", "Workshop Organizers' Message.", "Title Page.", "Steering Committee.", "Session details: Keynote Address.",
    "S"]
#iterate through nodes (iterparse yields every element once it has been fully read)
for _event, element in etree.iterparse(source, load_dtd=True):
    title = ""
    author_list = []
    year = 0
    recorded = False #becomes True once this element has been stored as a publication
    #iterate through children: author, title, year, etc.
    for child in element:
        if child.tag == "author":
            #strip here so the names in edge_list are identical to the names
            #stored in publication_data (which were already stripped)
            author_list.append(str(child.text).strip())
        elif child.tag == "title":
            title = str(child.text)
            if title in bad_titles:
                title = ""
        elif child.tag == "year":
            #the year child closes the record: only authors and the title seen
            #before it are stored, later children are ignored
            year = int(child.text)
            for author in author_list:
                publication_data.append({"Author" : author, "Title" : title.strip(), "Year" : year}) #add author/title/year pair to data list
            if len(author_list) > 1:
                edge_list.append(author_list)
            counter += 1
            recorded = True
            break
    #release every finished direct child of the document root, together with
    #the already-processed siblings before it; without this the whole tree
    #stays in memory while the dump is read. Nested elements are left alone
    #because their parent still needs them when its own turn comes.
    parent = element.getparent()
    if parent is not None and parent.getparent() is None:
        element.clear()
        while element.getprevious() is not None:
            del parent[0]
    #report progress once per milestone instead of once per parsed element
    if recorded and counter % 100000 == 0:
        sys.stdout.write(f'\r  progress:  {counter}  publications')
    #hard cap on the number of publications read
    if counter > 2000000:
        print("\nbroke")
        break
#create dataframe
publication_df = pd.DataFrame(publication_data)
edges_df = pd.DataFrame(edge_list)
element.clear()

### Converting DataFrames to CSV

In [None]:
# Write both tables to disk first, then print the row count of each one.
publication_df.to_csv('add_data.csv')
edges_df.to_csv('add_edges.csv')
for frame in (publication_df, edges_df):
    print(len(frame))

### Converting CSV to graph: adding authors as nodes

In [None]:
# Load the publication table, put its rows in natural sort order of the
# author name, and write the result back to the same file.
publication_df = pd.read_csv("data.csv")
natural_order = index_natsorted(publication_df['Author'], reverse=False)  # sorting by author name
sorted_index = order_by_index(publication_df.index, natural_order)
publication_df = publication_df.reindex(index=sorted_index)
publication_df.to_csv('data.csv')


In [None]:
# Build the author graph: one node per distinct author, carrying the set of
# that author's titles in the "Publications" node attribute.
author_graph = nx.Graph()
publication_df = pd.read_csv("data.csv")
# Walk the two columns in lockstep instead of looking rows up by label one at
# a time; this also copes with an empty table.
for author, title in zip(publication_df["Author"], publication_df["Title"]):
    if author in author_graph:
        # known author: extend the existing set rather than replacing it, so
        # the result no longer depends on equal names being adjacent rows
        author_graph.nodes[author]["Publications"].add(title) #adding title to list of publications from that author
    else:
        author_graph.add_node(author, Publications={title}) #adding author to graph with title as attribute
print("Number of unique authors: " + str(len(author_graph.nodes)))

### Adding edges based on CSV of edge list

In [None]:
# Load the co-author table; each row lists the authors of one publication.
edges_csv_path = "edges.csv"
edges_list = pd.read_csv(edges_csv_path)

In [None]:
# Connect every pair of authors that appear together on one row of the
# co-author table.
count = 0
# Position 0 of each row is skipped (presumably the index column that
# DataFrame.to_csv wrote -- confirm); author names are read from positions
# 1 to 14, so at most 14 co-authors per publication are linked. Slicing keeps
# the read inside the row even when the table has fewer than 15 columns.
for record in edges_list.itertuples(index=False, name=None):
    coauthors = []
    for name in record[1:15]:
        # the author list ends at the first cell that is not text (an empty
        # cell is read back as NaN) or that holds the literal string "None"
        if not isinstance(name, str) or name == "None":
            break
        coauthors.append(name)
    author_graph.add_edges_from(combinations(coauthors, 2))
    count += 1
    if count % 10000 == 0:
        sys.stdout.write(f'\r  rows processed: {count}')

In [None]:
# Report how many co-authorship edges the graph holds.
print(author_graph.number_of_edges())

### Save graph as a gpickle

In [None]:
# nx.write_gpickle was removed in NetworkX 3.0. Dump the graph with the
# standard pickle module instead, which works on every NetworkX version.
with open("author_graph.gpickle", "wb") as gpickle_file:
    pickle.dump(author_graph, gpickle_file, pickle.HIGHEST_PROTOCOL)

### Read gpickle back into networkX graph

In [None]:
# nx.read_gpickle was removed in NetworkX 3.0; the file is loaded with the
# standard pickle module instead.
# SECURITY: unpickling can execute arbitrary code -- only load a file that
# this notebook (or another trusted source) produced.
with open("author_graph.gpickle", "rb") as gpickle_file:
    author_graph = pickle.load(gpickle_file)