# Imports

In [1]:
import pandas as pd
import numpy as np

# Utility functions

In [2]:
import re

# Normalizing titles and names makes it easier to compare titles and names found from different sources.

def normalize_title(title):
    """Return *title* in title case, single-spaced, with every dot removed."""
    words = title.strip().title().split()
    single_spaced = " ".join(words)
    return single_spaced.replace(".", "")

# Normalizing names:
# All the components (e.g first, middle and last name) will be in title case.
# A dot will be added to single letter components.
# All the components will be separated by one space.
# Dashes will not have spaces around them.

def normalize_name(name):
    """Return *name* with title-cased components separated by single spaces.

    A component made of one word character (optionally preceded by dashes)
    gets a trailing dot, and a dash-led component is glued back onto the
    component before it.
    """
    # Detach dots and dashes so that "C.-P." splits into "C." and "-P.".
    spaced = name.replace(".", ". ").replace("-", " -")

    normalized_parts = []
    for part in re.split(r"\s+", spaced.strip()):
        if re.fullmatch(r"-*\w", part):
            # Bare initial such as "c" or "-p": add the missing dot.
            part = part + "."
        normalized_parts.append(part.title())

    # Re-attach the dash-led components that were detached above.
    return " ".join(normalized_parts).replace(" -", "-")

# Examples:
#    parse_first_name("Alejandro Corichi") == "Alejandro"
#    parse_first_name("P. M. Nadolsky") == "P. M."
#    parse_last_name("P. M. Nadolsky") == "Nadolsky"

def parse_first_name(normalized_name):
    """Return every component of *normalized_name* except the last, space-joined."""
    components = normalized_name.split()
    leading_components = components[:-1]
    return " ".join(leading_components)

def parse_last_name(normalized_name):
    """Return the last whitespace-separated component of *normalized_name*.

    Returns an empty string when the name has no components (empty or
    blank input) instead of raising ``IndexError``.
    """
    components = normalized_name.split()
    return components[-1] if components else ""

# "Yoshifumi" and "Chi-Peng" are full names, but "C." and "C. -P." are not.

def is_full_name(normalized_name):
    """Return True when the name is longer than one character and has no dot."""
    if len(normalized_name) <= 1:
        return False
    return re.search(r"\.", normalized_name) is None

# API functions

Each function is able to query the real API or return mock data, based on the global variable `MOCK_API`. The mock data is in exactly the same format as the data from real API.

In [24]:
# When True, the API helper functions return mock data instead of
# querying the real services.
MOCK_API = True
# Directory that the mock response files are loaded from.
MOCK_FOLDER = "API_mock_data"

In [3]:
from urllib.request import urlopen
import os
import json

def get_JSON(URL):
    """Fetch *URL* and return its body parsed as JSON.

    The body is decoded as UTF-8. The response is closed as soon as it has
    been read, including when reading or decoding fails.
    """
    with urlopen(URL) as response:
        decoded = response.read().decode("utf-8")
    return json.loads(decoded)

### Gender API

In [4]:
import random

def query_gender_API(first_name):
    """Return the Gender API response for *first_name* as a dict.

    With ``MOCK_API`` set, a randomly generated response with the same keys
    is returned and no request is made.
    """
    if MOCK_API:
        return {
            "name": first_name,
            "gender": random.choice(["male", "female"]),
            "samples": random.randint(10, 10000),
            "accuracy": random.randint(1, 100),
            "duration": str(random.randint(10, 50)) + "ms",
        }

    from urllib.parse import quote

    # NOTE(review): the API key is hard-coded in the notebook; consider
    # loading it from the environment instead.
    key = "WX923RkrwWYQE4UGSt4GHEFk7EZYgwUJ5adt"
    # Percent-encode the name: first names such as "P. M." contain spaces and
    # others contain non-ASCII letters, neither of which is valid in a URL.
    url = "https://gender-api.com/get?key=" + key + "&name=" + quote(first_name)
    return get_JSON(url)
        

### University domain API

In [5]:
def query_university_domain_API(domain):
    """Return the university domain API result for *domain*.

    With ``MOCK_API`` set, the result is read from
    ``university_<domain>.json`` in ``MOCK_FOLDER``; a domain without a mock
    file yields an empty list.
    """
    if not MOCK_API:
        return get_JSON("http://universities.hipolabs.com/search?domain=" + domain)

    mock_path = os.path.join(MOCK_FOLDER, "university_" + domain + ".json")
    if not os.path.exists(mock_path):
        return []

    with open(mock_path) as mock_file:
        return json.load(mock_file)

### SerpAPI

In [6]:
# API key appended to the SerpAPI request URLs built below.
# NOTE(review): the key is hard-coded in the notebook; consider loading it
# from the environment instead.
serpapi_key = "799376a440acaaaf1a9fedfb955dba15c59fd831751ad3a175b5b3fe84fddeb9"

def query_serpapi_title(title):
    """Search Google Scholar through SerpAPI for *title* and return the JSON.

    With ``MOCK_API`` set, the content of ``serpapi_title.json`` in
    ``MOCK_FOLDER`` is returned regardless of *title*.
    """
    if MOCK_API:
        file_name = os.path.join(MOCK_FOLDER, "serpapi_title.json")
        with open(file_name) as f:
            return json.load(f)

    from urllib.parse import quote_plus

    url = "https://serpapi.com/search.json?engine=google_scholar&q="

    # Collapse runs of whitespace (titles may contain line breaks), then
    # encode the whole title: spaces become "+" as before, and characters
    # such as "&", "#" or "+" can no longer corrupt the query string.
    query_title = quote_plus(" ".join(title.split()))
    return get_JSON(url + query_title + "&hl=en&api_key=" + serpapi_key)

def query_serpapi_author(author_id):
    """Return the SerpAPI Google Scholar author JSON for *author_id*.

    With ``MOCK_API`` set, the result is read from
    ``serpapi_author_<author_id>.json`` in ``MOCK_FOLDER``. When that file
    does not exist, the mock file of a fixed sample author is used instead.
    """
    if MOCK_API:
        file_name = os.path.join(MOCK_FOLDER, "serpapi_author_" + author_id + ".json")

        if not os.path.exists(file_name):
            # The fallback file is looked up in MOCK_FOLDER like every other
            # mock file, not in the current working directory.
            file_name = os.path.join(MOCK_FOLDER, "serpapi_author_GOqnRJcAAAAJ.json")

        with open(file_name) as f:
            return json.load(f)

    url = "https://serpapi.com/search.json?engine=google_scholar_author&author_id="
    return get_JSON(url + author_id + "&hl=en&api_key=" + serpapi_key)

### Semantic scholar API

This requires `pip install semanticscholar`. The result of calling these functions is not JSON, but a Python object.

In [7]:
import pickle
from semanticscholar import SemanticScholar

# Shared Semantic Scholar client used by the search helpers below.
sch = SemanticScholar()

def sch_find_papers(paper_title):
    """Search Semantic Scholar for papers matching *paper_title*.

    With ``MOCK_API`` set, a fixed pickled search result from ``MOCK_FOLDER``
    is returned regardless of *paper_title*.
    """
    if not MOCK_API:
        return sch.search_paper(paper_title)

    mock_path = os.path.join(
        MOCK_FOLDER,
        "sch_paper_sparsity-certifying_graph_decompositions.pickle",
    )
    # The pickle is a mock file stored next to the notebook.
    with open(mock_path, "rb") as mock_file:
        return pickle.load(mock_file)

def sch_find_authors(author_name):
    """Search Semantic Scholar for authors matching *author_name*.

    With ``MOCK_API`` set, a fixed pickled search result from ``MOCK_FOLDER``
    is returned regardless of *author_name*.
    """
    if not MOCK_API:
        return sch.search_author(author_name)

    mock_path = os.path.join(MOCK_FOLDER, "sch_author_c.-p._yuan.pickle")
    # The pickle is a mock file stored next to the notebook.
    with open(mock_path, "rb") as mock_file:
        return pickle.load(mock_file)

# Functions for augmenting and processing data from APIs

In [8]:
def get_university_name(email_domain):
    """Return the university name for *email_domain*, or "" when none is found."""
    # The university domain API returns nothing for a subdomain such as
    # pa.msu.edu, while msu.edu gives Michigan State University, so only the
    # last two labels of the domain are queried.
    labels = email_domain.split(".")
    lookup_domain = ".".join(labels[-2:]) if len(labels) > 2 else email_domain

    matches = query_university_domain_API(lookup_domain)
    return matches[0]["name"] if len(matches) > 0 else ""

In [9]:
def sch_doi(paper_title):
    """Return the DOI of the first search result whose title matches *paper_title*.

    Titles are compared after ``normalize_title``. Returns None when no
    result matches or when the matching paper has no "DOI" external id.
    """
    papers = sch_find_papers(paper_title)
    wanted_title = normalize_title(paper_title)

    # Lazily scan the results and stop at the first title match.
    matching = (p for p in papers if normalize_title(p.title) == wanted_title)
    first_match = next(matching, None)

    if first_match is None:
        return None
    return first_match.externalIds.get("DOI", None)

In [10]:
def sch_full_name(author_name):
    """Look up a spelled-out version of *author_name* on Semantic Scholar.

    For each author returned by the search, the author's name and aliases
    are normalized. The first author that has both a name equal to the
    normalized *author_name* and a name whose first-name part is a full name
    wins; the last such full-name entry is returned, normalized. Returns
    None when no author qualifies.
    """
    candidates = sch_find_authors(author_name)
    target = normalize_name(author_name)

    for author in candidates:
        known_names = [author.name] + author.aliases if author.aliases else [author.name]

        is_match = False
        spelled_out = None

        for known_name in known_names:
            normalized = normalize_name(known_name)

            if is_full_name(parse_first_name(normalized)):
                # Later entries overwrite earlier ones on purpose.
                spelled_out = known_name

            is_match = is_match or normalized == target

        if is_match and spelled_out:
            return normalize_name(spelled_out)

    return None

In [11]:
def full_name_and_gender(name):
    """Return ``(name, gender)`` for a normalized author *name*.

    When the first-name part is not a full name, Semantic Scholar is asked
    for a spelled-out version. The gender is queried only when a full name
    is available; otherwise the original *name* is returned with gender
    None. A gender reported as "unknown" is also returned as None.
    """
    if is_full_name(parse_first_name(name)):
        full_name = name
    else:
        full_name = sch_full_name(name)

    if not full_name:
        return name, None

    reported = query_gender_API(parse_first_name(full_name))["gender"]
    gender = None if reported == "unknown" else reported
    return full_name, gender

In [12]:
def get_author_info(google_scholar_id):
    """Return ``(name, gender, university, role)`` for a Google Scholar author.

    The affiliation text is split at its last comma: everything before it is
    the role and the last part is the university. When there is no comma,
    the whole affiliation is the role and the university is looked up from
    the domain of the author's email. A missing affiliation or email is
    treated as an empty string.
    """
    # Named `author` rather than `json` so the json module is not shadowed.
    author = query_serpapi_author(google_scholar_id)["author"]

    name, gender = full_name_and_gender(normalize_name(author["name"]))

    affiliation = author.get("affiliations", "")
    university = ""

    if "," in affiliation:
        parts = affiliation.split(",")
        # NOTE: parts are re-joined with ", " without being stripped, so the
        # space that followed each comma in the affiliation is doubled.
        role = ", ".join(parts[:-1])
        university = parts[-1].strip()
    else:
        role = affiliation

        email_info = author.get("email", "").strip()
        if email_info:
            # Get rid of "Verified email at"
            email_domain = email_info.split()[-1]
            university = get_university_name(email_domain)

    return name, gender, university, role

In [13]:
def get_info_from_serpapi(paper_title):
    """Return ``(title, authors)`` for the first SerpAPI result of *paper_title*.

    ``title`` is the title of the first organic result, or *paper_title*
    when there are no results. ``authors`` maps each author name to a dict
    with the keys "google_scholar_id", "gender", "university" and "role".
    It is empty when there are no results or when the first result lists no
    authors.
    """
    # Named `response` / `author_entry` so the json module is not shadowed.
    response = query_serpapi_title(paper_title)
    organic_results = response.get("organic_results", [])

    title = paper_title
    authors = {}

    if organic_results:
        first_result = organic_results[0]
        title = first_result["title"]
        authors_json = first_result.get("publication_info", {}).get("authors", [])

        # Some authors of the paper might be missing
        for author_entry in authors_json:
            author_id = author_entry["author_id"]

            name, gender, university, role = get_author_info(author_id)
            authors[name] = {
                "google_scholar_id": author_id,
                "gender": gender,
                "university": university,
                "role": role,
            }

    return title, authors

# Testing

Here are some tests for the functions from the previous sections. They give correct information using mock data (except gender, which is randomly generated). Set `MOCK_API = False` to test with the real APIs.

In [14]:
sch_find_papers("Sparsity-certifying Graph Decompositions")[16].title

'Sparsity-certifying Graph Decompositions'

In [15]:
sch_doi("Sparsity-certifying Graph Decompositions")

'10.1007/S00373-008-0834-4'

In [16]:
sch_find_authors("C.-P. Yuan")[2].name

'C. Yuan'

In [17]:
sch_full_name("C.-P. Yuan")

'Chien-Peng Yuan'

In [18]:
get_university_name("monash.edu")

'Monash University'

In [19]:
get_university_name("smu.edu")

'Southern Methodist University'

In [20]:
get_university_name("pa.msu.edu")

'Michigan State University'

In [21]:
# Since C.-P. Yuan doesn't have a full name in SerpAPI, it turns to 
# Semantic scholar API to find that the full name is Chien-Peng Yuan.
get_author_info("CtDh12YAAAAJ")

('Chien-Peng Yuan',
 'female',
 'Michigan State University',
 'Professor,  Department of Physics and Astronomy')

In [22]:
# Since Csaba Balazs doesn't have the name of university in SerpAPI,
# it takes the email domain monash.edu to find out that it's Monash University.
get_author_info("boYjNZQAAAAJ")

('Csaba Balazs', 'female', 'Monash University', 'Professor of Physics')

In [23]:
get_info_from_serpapi("Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies")

('Calculation of prompt diphoton production cross sections at Fermilab Tevatron and CERN LHC energies',
 {'Csaba Balazs': {'google_scholar_id': 'boYjNZQAAAAJ',
   'gender': 'female',
   'university': 'Monash University',
   'role': 'Professor of Physics'},
  'Pavel Nadolsky': {'google_scholar_id': 'GOqnRJcAAAAJ',
   'gender': 'male',
   'university': 'Southern Methodist University',
   'role': 'Professor of Theoretical Physics'},
  'Chien-Peng Yuan': {'google_scholar_id': 'CtDh12YAAAAJ',
   'gender': 'female',
   'university': 'Michigan State University',
   'role': 'Professor,  Department of Physics and Astronomy'}})

# The data pipeline

In [None]:
# Read the raw dataset; every line is parsed as one JSON document.
with open("papers.json") as f:
    lines = f.readlines()

# Keep the first N papers and leave out the abstract column.
N = 50000
papers = pd.DataFrame(list(map(json.loads, lines)))
papers = papers.head(N).drop('abstract', axis=1)

The following code block loops through the rows of original data and does the following:

1) Send the title of the paper to SerpAPI and fetch info about the authors.
2) SerpAPI also gives the role of the author and possibly the name of the university.
3) If there is no university from SerpAPI, take the email domain and send it to the University domain API to get the university.
4) If the author doesn't have a full first name in SerpAPI, try to get it from the Semantic Scholar API.
5) If the full name exists (either from SerpAPI or Semantic Scholar), send it to the Gender API to find the gender.
6) In addition to the author information from SerpAPI, the original dataset also has a list of author names. There might be some that do not exist in SerpAPI. Fetch their full names and genders and merge them with the SerpAPI authors.

In [None]:
# For every paper: fetch title and author info from SerpAPI, look up the DOI
# on Semantic Scholar, then merge in the authors listed in the dataset itself.
for index, row in papers.iterrows():
    title = row["title"]
    title_serpapi, authors = get_info_from_serpapi(title)
    paper_doi = sch_doi(title_serpapi)

    for author in row["authors_parsed"]:
        first_name = author[1].replace(",", "")
        last_name = author[0].replace(",", "")

        if first_name.strip() != "":
            name = normalize_name(first_name + " " + last_name)
            name, gender = full_name_and_gender(name)
        else:
            # When the first name is empty, it usually
            # means that it's an organization,
            # e.g PHENIX Collaboration, The DELPHI Collaboration etc.
            # The organization's own name is the key; reusing `name` from the
            # previous author would overwrite that author's entry (or fail
            # when the organization comes first).
            name = last_name.strip()
            gender = None

        if not name:
            # Neither a first nor a last name: nothing to record.
            continue

        # Entries that came from SerpAPI carry more information; keep them.
        if name not in authors:
            authors[name] = {
                "google_scholar_id": None,
                "gender": gender,
                "university": None,
                "role": None,
            }

    # TODO: do something with paper title, DOI and author information.

# Neo4J example

In [None]:
from neo4j import GraphDatabase

# Address of the Neo4j instance this example connects to.
URL = "bolt://localhost:7687"

# To connect without username and password, you need this in neo4j.conf:
# dbms.security.auth_enabled=false
driver = GraphDatabase.driver(URL)

# If dbms.security.auth_enabled=true,
# then you need to write:
# GraphDatabase.driver(URL, auth=(user, password))

def run_query(tx, query):
    """Run *query* on transaction *tx* and return the first column of its single record."""
    record = tx.run(query).single()
    return record[0]

# Create a test node and print the greeting assembled from its properties.
query = """
        CREATE (a:Test)
        SET a.hello = "Hello"
        SET a.world = "World"
        RETURN a.hello + ' ' + a.world + ' from node ' + id(a)
    """

# Close the driver even when opening the session or running the query fails.
try:
    with driver.session() as session:
        result = session.execute_write(run_query, query)
        print(result)
finally:
    driver.close()