In [1]:
import re
import io
import unicodecsv as csv
import pandas as pd
from unidecode import unidecode

In [2]:
# Directory holding both the prepared inputs and the generated graph files.
fout_dir = "output/DBLP_ACM/"

# Base file names of the two bibliographic datasets.
acm_csv = "ACM.csv"
dblp_csv = "DBLP2.csv"

# 'utf8_'-prefixed versions of the datasets; these are the files read below.
futf_set1 = "".join([fout_dir, 'utf8_', acm_csv])
futf_set2 = "".join([fout_dir, 'utf8_', dblp_csv])

# Destination files for the node list and the edge list.
fnodes = "".join([fout_dir, 'nodes.csv'])
fedges = "".join([fout_dir, 'edges.csv'])

In [3]:
# Load the ACM records from the 'utf8_'-prefixed CSV. Later cells read the
# 'id', 'authors' and 'title' columns of this frame.
df_acm = pd.read_csv(futf_set1, encoding='utf-8')

In [4]:
# Load the DBLP2 records from the 'utf8_'-prefixed CSV. A later cell indexes
# this frame by its 'id' column.
df_dblp2 = pd.read_csv(futf_set2, encoding='utf-8')

In [5]:
def checkifstr(obj):
    """
    Tell whether ``obj`` is usable as a non-empty piece of text.

    Returns True for a non-empty string (or a non-empty iterable whose
    elements are all ``str``) and False for everything else: floats
    (presumably how missing cells arrive from pandas, i.e. NaN), ``None``,
    empty values, and any other non-iterable object such as an int.
    """
    if isinstance(obj, float):
        return False
    if not obj:
        return False
    try:
        elements = iter(obj)
    except TypeError:
        # Truthy but not iterable (e.g. an int): previously this raised
        # instead of answering the question.
        return False
    return all(isinstance(elem, str) for elem in elements)

In [6]:
def preProcess(column):
    """
    Normalise a text value so that superficial differences are ignored.

    The value is transliterated with Unidecode; newlines, slashes and colons
    become spaces; hyphens, apostrophes and commas are dropped; runs of
    spaces are collapsed; and the result is trimmed, stripped of surrounding
    quotes and lower-cased. Returns None when nothing is left.
    """
    # (old, new) pairs applied in this exact order.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
    )
    text = unidecode(column)
    for old, new in substitutions:
        text = text.replace(old, new)
    # Collapse any run of two or more spaces into a single one.
    text = re.sub('  +', ' ', text)
    text = text.strip().strip('"').strip("'").lower().strip()
    return text if text else None

In [7]:
def create_node_dict(node_id, node_typ, node_label):
    """Build one node record with the keys 'Id', 'Type' and 'Label'."""
    return {
        'Id': node_id,
        'Type': node_typ,
        'Label': node_label,
    }

In [8]:
def create_edge_dict(node1, rel, node2):
    """Build one edge record with the keys 'Source', 'Rel' and 'Target'."""
    return {
        'Source': node1,
        'Rel': rel,
        'Target': node2,
    }

In [9]:
def populate_coauthor_edge(authors_dict, authors_list):
    """
    Build the 'co-authored with' edges between the authors of one paper.

    authors_dict maps a processed author name to its node id; authors_list
    holds the processed author names of the paper. One edge is produced for
    every pair of positions (i, j) with i < j, in list order, provided both
    names resolve to a node id. For a pair that cannot be resolved, the name
    that failed the lookup is printed (the first one when both are missing).

    Returns the list of edge records created by create_edge_dict.
    """
    rel = 'co-authored with'
    co_authorship = []
    for i, author1 in enumerate(authors_list):
        # The first endpoint does not change inside the inner loop.
        node1 = authors_dict.get(author1)
        for author2 in authors_list[i + 1:]:
            node2 = authors_dict.get(author2)
            if node1 is not None and node2 is not None:
                co_authorship.append(create_edge_dict(node1, rel, node2))
            elif node1 is None:
                # Report the name that failed, not the (always None) lookup result.
                print ('populate_coauthor_edge:', author1, ' not found')
            else:
                print ('populate_coauthor_edge:', author2, ' not found')
    return co_authorship

In [10]:
def populate_write_edge(authors_dict, authors_list, paper_dict, title_proc):
    """
    Build the 'wrote' edges from each author of a paper to that paper.

    authors_dict maps a processed author name to its node id, paper_dict maps
    a processed title to its node id, authors_list holds the processed author
    names and title_proc is the processed title of the paper.

    Authors whose name is not in authors_dict are reported and skipped. If
    the paper itself is not in paper_dict, this is reported and no edges are
    produced, so that no edge with an empty target is emitted.

    Returns the list of edge records created by create_edge_dict.
    """
    rel = 'wrote'
    paper_node = paper_dict.get(title_proc)
    if paper_node is None:
        print ('populate_write_edge:', title_proc, ' not found')
        return []

    authoring = []
    for author in authors_list:
        author_node = authors_dict.get(author)
        # Compare against None explicitly: node id 0 is a valid id.
        if author_node is not None:
            authoring.append(create_edge_dict(author_node, rel, paper_node))
        else:
            print ('populate_write_edge:', author, ' not found')
    return authoring

In [11]:
def write_csv(fname, headernames, data_arr, write_header):
    """
    Write a list of row dictionaries to a CSV file.

    fname is the target path, headernames the column names (DictWriter
    fieldnames) and data_arr the rows to write.

    When write_header is true the file is started from scratch and a header
    row is written first; otherwise the rows are appended to whatever the
    file already contains. Starting fresh on a header write keeps re-runs of
    the notebook from piling a second header and duplicate rows onto the
    output of a previous run.
    """
    mode = 'wb' if write_header else 'ab'
    with io.open(fname, mode) as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headernames)
        if write_header:
            writer.writeheader()
        for r in data_arr:
            writer.writerow(r)
            

In [12]:
# Build the ACM author/paper graph: one node per paper row, one node per
# distinct (processed) author name, 'wrote' edges from authors to papers and
# 'co-authored with' edges between the authors of the same paper.
df_acm_idx               = df_acm.set_index(['id'])
df_acm_idstr             = df_acm['id'].astype(str).tolist()
df_dblp2_idx             = df_dblp2.set_index(['id'])

global_id                = 0      # next free node id, shared by papers and authors
NODE_TYP_AUTHOR_ACM      = 0
NODE_TYP_AUTHOR_DBLP2    = 1
NODE_TYP_PAPER_ACM       = 2
NODE_TYP_PAPER_DBLP2     = 3

dict_node                = {}     # map { id: original author or paper name}
dict_node_acm_author     = {}     # map { author_proc_name: id}
dict_node_dblp2_author   = {}
dict_node_acm_paper      = {}     # map { paper_proc_name: id}
dict_node_dblp2_paper    = {}     

node_arr                 = []     # to create nodes.csv (including author and paper)
author_paper_arr         = []     # to create 'author [writes] paper' rel. 
coauthor_link_arr        = []     # to create 'author [co-authored with] author' rel.

for index, row in df_acm.iterrows():
    acm_authors   = row['authors']
    acm_title     = row['title']
    
    # Create paper node
    # NOTE(review): preProcess is applied to the title unconditionally; a
    # missing (non-string) title would presumably fail there -- confirm the
    # input never has empty titles.
    proc_acm_title = preProcess(acm_title)
    dict_node_acm_paper[proc_acm_title] = global_id
    dict_node[global_id] = acm_title
    node_prop = create_node_dict(global_id, NODE_TYP_PAPER_ACM, acm_title)
    node_arr.append(node_prop)
    global_id += 1
    
    # Processed author names of this paper, in order of appearance
    acm_authors_proc = []
    
    # Create author node
    if checkifstr(acm_authors):
        for oa in acm_authors.split(','):
            a = preProcess(oa)
            if a is None:
                # Empty segment (e.g. trailing or doubled comma): without this
                # guard every such paper would be linked to one shared,
                # nameless author node keyed by None.
                continue
            # Drop the whitespace left around the name by the comma split so
            # that node labels are consistent.
            oa = oa.strip()
            acm_authors_proc.append(a)
            if a not in dict_node_acm_author:
                dict_node_acm_author[a] = global_id
                dict_node[global_id] = oa
                node_prop = create_node_dict(global_id, NODE_TYP_AUTHOR_ACM, oa)
                node_arr.append(node_prop)
                global_id += 1
                
    # Populate co-author edges
    edges_list = populate_coauthor_edge(dict_node_acm_author, acm_authors_proc)
    coauthor_link_arr.extend(edges_list)

    # Populate authoring edge
    edges_list = populate_write_edge(dict_node_acm_author, acm_authors_proc, dict_node_acm_paper, proc_acm_title)
    author_paper_arr.extend(edges_list)
    
    
# Nodes go to their own file; both edge kinds share edges.csv, so only the
# first write to it emits the header.
write_csv(fnodes, ['Id', 'Type', 'Label'], node_arr, True)
write_csv(fedges, ['Source', 'Rel', 'Target'], author_paper_arr, True)
write_csv(fedges, ['Source', 'Rel', 'Target'], coauthor_link_arr, False)
                    

In [13]:
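# NOTE: disabled draft of a graph builder that only keeps record pairs whose
# predicted match equals the true match; kept for reference and not executed.
# It calls create_node_dict with four arguments, whereas the current
# definition accepts three, so it cannot be re-enabled as-is.
# def populate_graph(res_map, true_map):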
#     # Get a list of author nodes (name, paper-id, node-typ)
#     df_acm_idx               = df_acm.set_index(['id'])
#     df_acm_idstr             = df_acm['id'].astype(str).tolist()
#     df_dblp2_idx             = df_dblp2.set_index(['id'])

#     global_id                = 0
#     NODE_TYP_AUTHOR_ACM      = 0
#     NODE_TYP_AUTHOR_DBLP2    = 1
#     NODE_TYP_PAPER_ACM       = 2
#     NODE_TYP_PAPER_DBLP2     = 3

#     dict_node                = {}     # map { id: original author or paper name}
#     dict_node_acm_author     = {}     # map { author_proc_name: id}
#     dict_node_dblp2_author   = {}
#     dict_node_acm_paper      = {}     # map { paper_proc_name: id}
#     dict_node_dblp2_paper    = {}     

#     node_arr                 = []     # to create nodes.csv (including author and paper)
#     author_paper_arr         = []     # to create 'author [writes] paper' rel. 
#     coauthor_link_arr        = []     # to create 'author [co-authored with] author' rel.

#     for k, pred_v in res_map.items():
#         if true_map.get(k) is not None:
#             true_v = true_map.get(k)        
#             if true_v == pred_v:
#                 if k in df_acm_idstr:   # Filter only one-way
#                     acm_authors   = df_acm_idx.loc[int(k)].authors
#                     dblp2_authors = df_dblp2_idx.loc[pred_v].authors
#                     acm_title     = df_acm_idx.loc[int(k)].title
#                     dblp2_title   = df_dblp2_idx.loc[pred_v].title

#                     # Create paper node
#                     proc_acm_title = preProcess(acm_title)
#                     dict_node_acm_paper[proc_acm_title] = global_id
#                     dict_node[global_id] = acm_title
#                     node_prop = create_node_dict(global_id, NODE_TYP_PAPER_ACM, acm_title, k)
#                     node_arr.append(node_prop)
#                     global_id += 1

#                     proc_dblp2_title = preProcess(dblp2_title)
#                     dict_node_dblp2_paper[proc_dblp2_title] = global_id
#                     dict_node[global_id] = dblp2_title
#                     node_prop = create_node_dict(global_id, NODE_TYP_PAPER_DBLP2, dblp2_title, pred_v)
#                     node_arr.append(node_prop)
#                     global_id += 1

#                     # Populate co-author list
#                     acm_authors_proc = []
#                     dblp2_authors_proc = []

#                     # Create author node
#                     if checkifstr(acm_authors):
#                         for oa in acm_authors.split(','):
#                             a = preProcess(oa)
#                             acm_authors_proc.append(a)
#                             if a not in dict_node_acm_author:
#                                 dict_node_acm_author[a] = global_id
#                                 dict_node[global_id] = oa
#                                 node_prop = create_node_dict(global_id, NODE_TYP_AUTHOR_ACM, oa, k)
#                                 node_arr.append(node_prop)
#                                 global_id += 1

#                     if checkifstr(dblp2_authors):
#                         for oa in dblp2_authors.split(','):
#                             a = preProcess(oa)
#                             dblp2_authors_proc.append(a)
#                             if a not in dict_node_dblp2_author:
#                                 dict_node_dblp2_author[a] = global_id
#                                 dict_node[global_id] = oa
#                                 node_prop = create_node_dict(global_id, NODE_TYP_AUTHOR_DBLP2, oa, pred_v)
#                                 node_arr.append(node_prop)
#                                 global_id += 1

#                     # Populate co-author list
#                     edges_list = populate_coauthor_edge(dict_node_acm_author, acm_authors_proc)
#                     coauthor_link_arr.extend(edges_list)
#                     edges_list = populate_coauthor_edge(dict_node_dblp2_author, dblp2_authors_proc)
#                     coauthor_link_arr.extend(edges_list)
                    
#                     # Populate authoring edge
#                     edges_list = populate_write_edge(dict_node_acm_author, acm_authors_proc, dict_node_acm_paper, proc_acm_title)
#                     author_paper_arr.extend(edges_list)
#                     edges_list = populate_write_edge(dict_node_dblp2_author, dblp2_authors_proc, dict_node_dblp2_paper, proc_dblp2_title)
#                     author_paper_arr.extend(edges_list)
                    
#     return [node_arr, coauthor_link_arr, author_paper_arr]
        