# Generating a Citation Network Graph
**Author:** Tim Denzler

This notebook converts the previously identified references to the .gexf format so that the citation graph can be visualized in Gephi.

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0,"../")
import lib
import warnings
from tqdm import tqdm

from pprint import pprint
warnings.filterwarnings('ignore')

In [2]:
# Keys of papers that are excluded from the corpus by the filter below.
different_language_keys = ['K9KJVYIR','Y4MGKEBB','JE66BNAN','FSP7ANYJ','C7GB4MNG','6BIQJRKD','NRFYP7MT','XTCH9IQV','Q48XB5ZN','I3HDJH54','YPLTERAG','BLPTWEH5']
# Keys of literature reviews (defined here, not used by the filter in this cell).
literature_reviews_keys = ['DKBW9ABI','TJMJYHAG', 'BNS43SCZ', 'CMB96B57', 'T96GMF6N', 'AISSN8RV', 'TPQI9BTD', 'GKL3GBBR', '8PUA6WJ3', '9P8AAUVM', '5UHAEFIT','6QXIW79J', 'NI4RMT4H', '4GDNPC75', '927ACLCW', 'B5FK7VNP', 'TU9MYDMI', 'TEQ8MVA5', '6FT5QEXR', '4ZU54FDA', '3YVNNY8Y', 'PHDIVR6K', 'AMA5QADB', 'BPGK8V7N', '34FWNV6J', 'JVMYN4TV', '2C997CVV', 'UIAS2E56', 'MRTEUZPG', 'N7YVDTF2', 'U9UCIP9I', 'I29GGRLL', '7V6GJVRN', 'KT6KZGBB', 'J7ZDC6BP', 'L99VI2FU']

# Relevance reasons that disqualify a paper from the corpus.
excluded_reasons = ("Duplicate entry", 'not in English')

# Keep a paper only if it has a PDF, is not flagged with an excluded reason,
# and its key is not listed in different_language_keys.
corpus = [
    paper
    for paper in lib.Paper.objects()
    if paper.pdf_p
    and paper._relevant_reason not in excluded_reasons
    and paper.key not in different_language_keys
]
print("Imported ", len(corpus), " documents from the corpus.")

Imported  761  documents from the corpus.


In [3]:
import networkx as nx
# Directed graph that the following cells populate with paper nodes and
# citation edges (edges are added as citing paper -> referenced paper).
G = nx.DiGraph()

In [4]:
from fuzzywuzzy import fuzz

def get_title_string(container_title):
    """Join the string elements of ``container_title`` with single spaces.

    Args:
        container_title: iterable of strings (e.g. the list stored under a
            reference's 'container-title' entry).

    Returns:
        The elements separated by one space each; an empty string when
        ``container_title`` is empty.
    """
    return " ".join(container_title)

def check_internal(doi,reference_title):
    """Look up a reference among the papers already held in ``corpus``.

    Papers are scanned in corpus order. A paper counts as a match when its
    non-empty Crossref entry carries the same DOI as ``doi``, or when the
    fuzzy partial ratio between its title and ``reference_title`` is above 99.

    Args:
        doi: DOI of the reference being looked up.
        reference_title: title string of the reference being looked up.

    Returns:
        The ``key`` of the first matching corpus paper, or ``None`` when no
        paper in the corpus matches.
    """
    for candidate in corpus:
        crossref = candidate._crossref_entry
        # The DOI comparison runs first; the fuzzy title comparison is only
        # evaluated when the DOI check did not already succeed.
        if (crossref and crossref['DOI'] == doi) or fuzz.partial_ratio(candidate._title, reference_title) > 99:
            return candidate.key
    return None

In [5]:
checked_author_dict = {}  # dict of authors already added

# Add one node per corpus paper whose relevance flag equals True or False.
# The flag only decides the node's `type` attribute; all other attributes are
# copied from the paper. Papers whose flag is neither value get no node here.
for corpus_paper in corpus:
    # Explicit ==True / ==False comparisons are kept on purpose so that any
    # other flag value (e.g. None) is skipped.
    if corpus_paper._relevant_p == True:
        node_type = 'paper_relevant'
    elif corpus_paper._relevant_p == False:
        node_type = 'paper_irrelevant'
    else:
        continue
    G.add_node(
        corpus_paper.key,
        label=corpus_paper._title,
        _relevant_reason=corpus_paper._relevant_reason,
        _item_type=corpus_paper._item_type,
        type=node_type,
    )

In [6]:
# Add nodes and edges for the external references of every corpus paper.
# Each reference is first checked against the corpus: if it resolves to a
# corpus paper the edge points at that paper's key, otherwise a new
# 'paper_external' node keyed by the reference DOI is created.
for paper in tqdm(corpus):
    if paper._external_references is None:
        continue
    key = paper.key
    for reference in paper._external_references:
        # Prefer the reference's own title; fall back to its container title,
        # and to an empty string when neither is present.
        if 'title' in reference:
            reference_title = reference['title']
        elif 'container-title' in reference:
            reference_title = get_title_string(reference['container-title'])
        else:
            reference_title = ""

        reference_doi = reference['DOI']
        # check_internal scans the whole corpus with fuzzy matching, so it is
        # called exactly once per reference and its result is reused.
        referenced_paper_key = check_internal(reference_doi, reference_title)
        if referenced_paper_key is None:  # paper referenced is not in corpus
            G.add_node(reference_doi, label=reference_title, type='paper_external')
            G.add_edge(key, reference_doi)
        else:  # paper is in corpus but not yet identified
            G.add_edge(key, referenced_paper_key)

100%|██████████| 761/761 [32:17<00:00,  2.55s/it]  


In [7]:
# Add an edge from each corpus paper to every entry of its identified
# references. The first element of each entry is used as the target node
# (presumably the key of the cited corpus paper - verify against lib.Paper).
for citing_paper in corpus:
    citing_key = citing_paper.key
    for identified in citing_paper._identified_references:
        G.add_edge(citing_key, identified[0])

In [8]:
# Report the size of the assembled citation graph.
G_nodes, G_edges = G.number_of_nodes(), G.number_of_edges()
print("Nodes = ", G_nodes, " Edges = ",G_edges)

Nodes =  9618  Edges =  13654


In [9]:
# Write the citation graph to a GEXF file so it can be opened in Gephi.
nx.write_gexf(G, "../data/citation_network.gexf") #save for gephi