In [32]:
import re
import io
import unicodecsv as csv
import pandas as pd
from unidecode import unidecode

In [33]:
# Directory and base file names for the DBLP / ACM dataset pair.
fout_dir = "output/DBLP_ACM/"
acm_csv = "ACM.csv"
dblp_csv = "DBLP2.csv"

# The 'utf8_'-prefixed versions of the two source CSVs, located in fout_dir.
futf_set1 = '{}utf8_{}'.format(fout_dir, acm_csv)
futf_set2 = '{}utf8_{}'.format(fout_dir, dblp_csv)

# Graph export targets (node list and edge list).
fnodes = '{}nodes.csv'.format(fout_dir)
fedges = '{}edges.csv'.format(fout_dir)

In [34]:
# Load the 'utf8_'-prefixed ACM CSV (path in futf_set1) into a DataFrame,
# decoding the file as UTF-8.
df_acm = pd.read_csv(futf_set1, encoding='utf-8')

In [35]:
# Load the 'utf8_'-prefixed DBLP2 CSV (path in futf_set2) into a DataFrame,
# decoding the file as UTF-8.
df_dblp2 = pd.read_csv(futf_set2, encoding='utf-8')

In [36]:
def checkifstr(obj):
    """
    Tell whether `obj` is a non-empty iterable made up only of `str` items.

    A non-empty string qualifies (each of its characters is a `str`), and so
    does a non-empty list of strings. Floats are rejected up front, and any
    other falsy value (None, '', [], ...) yields False.

    Returns:
        bool: True when `obj` is truthy and every element is a `str`.
    """
    # Floats are rejected before any truthiness test or iteration.
    if isinstance(obj, float):
        return False
    # Empty / None values cannot contain any strings.
    if not obj:
        return False
    for elem in obj:
        if not isinstance(elem, str):
            return False
    return True

In [37]:
def preProcess(column):
    """
    Normalise a text value so that superficial differences are ignored.

    The value is passed through `unidecode`, a fixed list of punctuation
    substitutions is applied, runs of spaces are collapsed, surrounding
    whitespace and quote characters are stripped, and the text is lower-cased.

    Returns:
        The cleaned string, or None when nothing is left after cleaning.
    """
    # (pattern, replacement) pairs; they are applied strictly in this order.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
        ('  +', ' '),   # collapse two or more spaces into one
    )
    text = unidecode(column)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip().strip('"').strip("'").lower().strip()
    return text if text else None

In [38]:
def create_node_dict(node_id, node_typ, node_label, node_extref_id=None):
    """
    Build the record for a single graph node.

    Args:
        node_id: value stored under 'Id'.
        node_typ: value stored under 'Type'.
        node_label: value stored under 'Label'.
        node_extref_id: value stored under 'Extref_Id' (defaults to None).

    Returns:
        dict with the keys 'Id', 'Type', 'Label' and 'Extref_Id'.
    """
    return {
        'Id': node_id,
        'Type': node_typ,
        'Label': node_label,
        'Extref_Id': node_extref_id,
    }

In [39]:
def create_edge_dict(node1, rel, node2):
    """
    Build the record for a single directed edge.

    Args:
        node1: value stored under 'Source'.
        rel: relationship label stored under 'Rel'.
        node2: value stored under 'Target'.

    Returns:
        dict with the keys 'Source', 'Rel' and 'Target'.
    """
    return {
        'Source': node1,
        'Rel': rel,
        'Target': node2,
    }

In [40]:
def populate_coauthor_edge(authors_dict, authors_list):
    """
    Build one 'co-authored with' edge for every pair of authors in a list.

    Pairs are taken in list order (earlier author is the source, later author
    is the target), so each unordered pair of positions yields one edge.

    Args:
        authors_dict: map {processed author name: node id}.
        authors_list: processed author names of a single paper.

    Returns:
        list of edge dicts as produced by `create_edge_dict`. Pairs in which
        an author has no entry in `authors_dict` are skipped and the missing
        author's name is printed.
    """
    rel = 'co-authored with'
    co_authorship = []
    for i, name1 in enumerate(authors_list):
        # The source lookup does not depend on the inner loop; do it once.
        node1 = authors_dict.get(name1)
        for name2 in authors_list[i + 1:]:
            node2 = authors_dict.get(name2)
            if node1 is not None and node2 is not None:
                co_authorship.append(create_edge_dict(node1, rel, node2))
            elif node1 is None:
                # Report the author *name*; the looked-up id is always None here.
                print('populate_coauthor_edge:', name1, ' not found')
            else:
                print('populate_coauthor_edge:', name2, ' not found')
    return co_authorship

In [41]:
def populate_write_edge(authors_dict, authors_list, paper_dict, title_proc):
    """
    Build one 'wrote' edge from each listed author to a paper.

    Args:
        authors_dict: map {processed author name: node id}.
        authors_list: processed author names of the paper.
        paper_dict: map {processed paper title: node id}.
        title_proc: processed title used to look the paper up in `paper_dict`.

    Returns:
        list of edge dicts as produced by `create_edge_dict`. Authors with no
        entry in `authors_dict` are skipped and their name is printed. The
        target is whatever `paper_dict.get(title_proc)` returns, which is
        None when the title is unknown.
    """
    rel = 'wrote'
    node2 = paper_dict.get(title_proc)
    authoring = []
    for author in authors_list:
        node1 = authors_dict.get(author)
        # Compare against None explicitly: a node id of 0 is falsy, so a plain
        # truthiness test would wrongly treat that author as missing.
        if node1 is not None:
            authoring.append(create_edge_dict(node1, rel, node2))
        else:
            print('populate_write_edge:', author, ' not found')
    return authoring

In [42]:
def write_csv(fname, headernames, data_arr, write_header):
    """
    Write a sequence of dict records to a CSV file.

    Args:
        fname: path of the CSV file.
        headernames: column names, in output order.
        data_arr: iterable of dicts keyed by `headernames`.
        write_header: when true, the file is started from scratch and a
            header row is written first; when false, the rows are appended
            to the existing file without a header.
    """
    # A header marks the start of a new file, so truncate in that case.
    # Always appending would stack a second header and duplicate rows onto
    # the previous output every time the notebook is re-run.
    mode = 'wb' if write_header else 'ab'
    with io.open(fname, mode) as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headernames)
        if write_header:
            writer.writeheader()
        for r in data_arr:
            writer.writerow(r)
            

In [44]:

global_id                = 0      # next unused node id; shared by both datasets

# Node-type codes stored in the 'Type' field of every node record.
NODE_TYP_AUTHOR_ACM      = 0
NODE_TYP_AUTHOR_DBLP2    = 1
NODE_TYP_PAPER_ACM       = 2
NODE_TYP_PAPER_DBLP2     = 3

dict_node                = {}     # map { id: original author or paper name}
dict_node_acm_author     = {}     # map { author_proc_name: id}
dict_node_dblp2_author   = {}
dict_node_acm_paper      = {}     # map { paper_proc_name: id}
dict_node_dblp2_paper    = {}

node_arr                 = []     # to create nodes.csv (including author and paper)
author_paper_arr         = []     # to create 'author [writes] paper' rel.
coauthor_link_arr        = []     # to create 'author [co-authored with] author' rel.


def _ingest_dataset(df, paper_typ, author_typ, paper_dict, author_dict, next_id):
    """
    Add the paper nodes, author nodes and edges of one dataset to the graph.

    Reads the 'authors', 'title' and 'id' columns of `df`. Appends to the
    module-level `node_arr`, `author_paper_arr` and `coauthor_link_arr`, and
    records every new node in `dict_node`, `paper_dict` and `author_dict`.

    Args:
        df: DataFrame holding one dataset.
        paper_typ: node-type code for this dataset's paper nodes.
        author_typ: node-type code for this dataset's author nodes.
        paper_dict: map {processed title: node id}, updated in place.
        author_dict: map {processed author name: node id}, updated in place.
        next_id: first node id to hand out.

    Returns:
        The next unused node id after the whole dataset has been processed.
    """
    for _, row in df.iterrows():
        authors = row['authors']
        title = row['title']
        extref_id = row['id']

        # Create the paper node; every row gets its own node id.
        title_proc = preProcess(title)
        paper_dict[title_proc] = next_id
        dict_node[next_id] = title
        node_arr.append(create_node_dict(next_id, paper_typ, title, extref_id))
        next_id += 1

        # Create one author node per distinct processed name within this
        # dataset; the authors field is split on commas.
        authors_proc = []
        if checkifstr(authors):
            for raw_name in authors.split(','):
                name_proc = preProcess(raw_name)
                authors_proc.append(name_proc)
                if name_proc not in author_dict:
                    author_dict[name_proc] = next_id
                    dict_node[next_id] = raw_name
                    node_arr.append(create_node_dict(next_id, author_typ, raw_name))
                    next_id += 1

        # Co-authorship edges between all authors of this paper.
        coauthor_link_arr.extend(populate_coauthor_edge(author_dict, authors_proc))

        # Authoring edges from each author to this paper.
        author_paper_arr.extend(
            populate_write_edge(author_dict, authors_proc, paper_dict, title_proc))
    return next_id


# ACM first, then DBLP2, so that node ids keep the same order as before.
global_id = _ingest_dataset(df_acm, NODE_TYP_PAPER_ACM, NODE_TYP_AUTHOR_ACM,
                            dict_node_acm_paper, dict_node_acm_author, global_id)
global_id = _ingest_dataset(df_dblp2, NODE_TYP_PAPER_DBLP2, NODE_TYP_AUTHOR_DBLP2,
                            dict_node_dblp2_paper, dict_node_dblp2_author, global_id)

# Every node consumes exactly one id, so the counts must agree.
assert len(node_arr) == global_id

print(len(node_arr))
print(len(author_paper_arr))
print(len(coauthor_link_arr))
write_csv(fnodes, ['Id', 'Extref_Id', 'Type', 'Label'], node_arr, True)
# Both edge kinds go into the same file: the first call writes the header,
# the second appends without one.
write_csv(fedges, ['Source', 'Rel', 'Target'], author_paper_arr, True)
write_csv(fedges, ['Source', 'Rel', 'Target'], coauthor_link_arr, False)
                    

11703
14612
25176
