In [4]:
import csv
import re
import os, sys

In [5]:
# Input: author table (tab-separated).
# Fields: 0-based index, normalized name, list of raw variants of the name
author_pathname = "author_normalization_table_reversed_indexed.tsv"

# Input: document table (tab-separated).
# Fields: 0-based index, title, auth1/aff1|auth2/aff2|..., year, txt_pathname, url
document_pathname = "all_hpec_records_cleaned_normalized_authors_indexed.tsv"

# Output: coauthor graph (tab-separated).
# Fields: author1-index, author2-index, document-index1[|document-index2]*
coauthor_graph_pathname = "hpec_coauthors.tsv"

# Output: coauthor graph in Matrix Market format, where each edge value is the
# number of document indices shared by the two authors.
coauthor_graph_mtx_pathname = "hpec_coauthors.mtx"

In [7]:
# Build the author lookup table from the author index file.
# Maps each normalized full name to its integer author index.
author_lookup = {}

with open(author_pathname, 'r') as infile:
    line_number = 0
    for line_number, record in enumerate(csv.reader(infile, delimiter='\t'), start=1):
        # A valid record has exactly three fields: index, normalized name, variants.
        if len(record) != 3:
            print("ERROR: wrong number of fields in line ",line_number,", record=",record)
            continue

        author_index, full_name, name_variants = record

        # The first occurrence of a name wins; later duplicates are reported and ignored.
        if full_name not in author_lookup:
            author_lookup[full_name] = int(author_index)
        else:
            print("ERROR: duplicate key (full name) in line ",line_number,", name =",full_name)

# The number of distinct authors is used later as the graph's node count.
num_nodes = len(author_lookup)
print("Number of author entries: ", num_nodes)

# Build the directed coauthor edge list from the document table.
# For every document, each ordered pair of distinct authors (a, b) gets the
# document index appended, so both (a, b) and (b, a) are present.
edge_list = {} # key = (auth1_index, auth2_index), value = (list of document indices)

with open(document_pathname, 'r') as infile:
    reader = csv.reader(infile, delimiter='\t')
    line_number = 0
    for line_number, row in enumerate(reader, start=1):
        if (len(row) != 6):
            print("ERROR: bad record length in line ",line_number)
            continue

        (document_index, title, authors, pub_year, txt_filepath, url) = row
        doc_index = int(document_index)

        # Resolve each "name/affiliation" entry to a unique author index.
        author_index_list = []
        for author in authors.split("|"):
            # partition() splits on the first "/" only, so an affiliation that
            # itself contains "/" (or an entry with no "/" at all) no longer
            # raises ValueError; an unknown name is reported below instead.
            (full_name, _sep, affiliation) = author.partition("/")
            if (full_name not in author_lookup):
                print("ERROR: '",full_name,"' not found in lookup table, line = ",line_number)
                continue

            auth_idx = int(author_lookup[full_name])
            # De-duplicate here: an author listed twice on one document would
            # otherwise have this document appended twice to every edge that
            # involves them, inflating the edge weights.
            if (auth_idx in author_index_list):
                print("WARNING: skipping duplicate author entries, line = ", line_number,
                      ", author index = ", auth_idx)
                continue
            author_index_list.append(auth_idx)

        # Create bidirectional pairs of all distinct author indices.
        for auth_idx1 in author_index_list:
            for auth_idx2 in author_index_list:
                if (auth_idx1 != auth_idx2):
                    edge_list.setdefault((auth_idx1, auth_idx2), []).append(str(doc_index))

num_edges = len(edge_list)
print("Number of coauthor edges: ", num_edges)

# Write the coauthor graph in two formats:
#   * TSV: author1-index, author2-index, "|"-joined document indices (0-based indices)
#   * Matrix Market coordinate file: edge value = number of shared documents

# The declared matrix dimension must cover every index that appears in an edge,
# even if the lookup table has fewer entries than the largest author index
# (e.g. when duplicate names were skipped while loading the author table).
max_author_index = max((max(pair) for pair in edge_list), default=-1)
matrix_dim = max(num_nodes, max_author_index + 1)

# Context managers guarantee both files are closed even if a write fails.
# newline='' is what the csv module requires for files handed to csv.writer.
with open(coauthor_graph_pathname, 'w', newline='') as outfile, \
     open(coauthor_graph_mtx_pathname, 'w') as mtx_outfile:
    writer = csv.writer(outfile, delimiter='\t')

    # The Matrix Market banner needs all four qualifiers; "general" because both
    # (a, b) and (b, a) are stored explicitly.
    mtx_outfile.write("%%MatrixMarket matrix coordinate integer general\n")
    mtx_outfile.write("%d %d %d\n" % (matrix_dim, matrix_dim, num_edges))

    for ((auth_idx1, auth_idx2), doc_indices) in sorted(edge_list.items()):
        writer.writerow((auth_idx1, auth_idx2, "|".join(doc_indices)))
        # Matrix Market coordinates are 1-based, while author indices are 0-based.
        mtx_outfile.write("%d %d %d\n" % (auth_idx1 + 1, auth_idx2 + 1, len(doc_indices)))


Number of author entries:  1747
Number of coauthor edges:  10072
