# Get Article Metadata from PubMed

This notebook gets metadata from PubMed using metapub python library, create Article nodes and link the TM nodes to Article nodes.

## Install METAPUB python library

In [103]:
pip install metapub

Note: you may need to restart the kernel to use updated packages.


## Import Libraries

In [104]:
#import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# import matplotlib.pyplot as plt 
# import seaborn as sns
# sns.set_style('darkgrid')
# sns.set_palette("colorblind")
# sns.set(rc={'figure.figsize':(11,6)})

import os 
import configparser
import csv
from collections import defaultdict
from metapub import PubMedFetcher
fetch = PubMedFetcher()

In [105]:
# install or import Neo4j GraphDataScience library
try: 
  from graphdatascience import GraphDataScience
  print('Successfully imported GraphDataScience')
except ModuleNotFoundError:
  !pip3 install graphdatascience
  from graphdatascience import GraphDataScience
  print('installed and imported GraphDataScience')

Successfully imported GraphDataScience


# Custom Functions

In [106]:
# function adapted from Neo4j GDS Fraud Demo Notebook (h/t Zach B.)
def read_neo4j_properties(NEO4J_PROPERTIES_FILE: str=None) -> str:
  '''Parses Neo4j database or Aura connection details from provided .ini filepath.
  Requirements:
    configparser

  Args:
    NEO4J_PROPERTIES_FILE: path to a .ini file
  
  Returns:
    HOST: link to Neo4j or Aura host 
    USERNAME: login username
    PASSWORD: login password 

  Note: The .ini file should use the following syntax
    [NEO4J]
    PASSWORD=<password>
    USERNAME=<database name>
    HOST=<host uri>

  If no path is passed, the function will return the defaults:
    HOST = 'neo4j://localhost'
    USERNAME = 'neo4j'
    PASSWORD = 'password'
  '''

  if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
      config = configparser.RawConfigParser()
      config.read(NEO4J_PROPERTIES_FILE)
      HOST = config['NEO4J']['HOST']
      USERNAME = config['NEO4J']['USERNAME']
      PASSWORD = config['NEO4J']['PASSWORD']
      print('Using HOST, USERNAME, PASSWORD from .ini file')
      return HOST, USERNAME, PASSWORD
  else:
      print('Could not find database properties file, using defaults:')
      HOST = 'neo4j://localhost'
      USERNAME = 'neo4j'
      PASSWORD = 'password'
      print(f'HOST: {HOST} \nUSERHAME: {USERNAME} \nPASSWORD: {PASSWORD}')
      return HOST, USERNAME, PASSWORD 

In [116]:
# Get article journal full name using pubmed api
import requests

def get_article_journal_fullname_pubmed_api(pubmed_id):

    # Base URL for the PubMed API
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    
    # Parameters for the request
    params = {
        "db": "pubmed",
        "id": pubmed_id,
        "retmode": "json",
        "resulttype": "full",
    }
    
    # Send GET request and get response
    response = requests.get(base_url, params=params)
    
    # Check for successful response
    if response.status_code == 200:
        data = response.json()
        #print(data)
        return data["result"][str(pubmed_id)]['fulljournalname']
    else:
        print(f"Error: {response.status_code}")
        return None

In [117]:
# Create merged dataframe for article metadata

from random import randint
from time import sleep


def get_article_metadata_df(fetch, article_ids_df):
    records = []
    #for pmid in pmids:
    for index, row in article_ids_df.iterrows():
        pmid = row['pmid']
        pmcid = row['pmcid']
        print(pmid)
        sleep(randint(1,5))
        meta =[]
        meta.append(pmid)
        meta.append(pmcid)
        article = fetch.article_by_pmid(pmid)
        meta.append(article.title)
        meta.append(article.abstract)
        meta.append(', '.join(article.authors))
        meta.append(article.year)
        meta.append(article.volume)
        meta.append(article.issue)
        journal_full_name = get_article_journal_fullname_pubmed_api(pmid)
        if journal_full_name is None:
            journal_full_name = article.journal
        print(journal_full_name)
        meta.append(journal_full_name)
        
        meta.append(article.citation)
        meta.append("https://pubmed.ncbi.nlm.nih.gov/"+str(pmid)+"/")
        
        records.append(meta)

    df = pd.DataFrame(records, columns=['PMID', 'PMCID', 'Title', 'Abstract', 'Author', 'Year', 'Volume', 'Issue', 'Journal', 'Citation', 'Link'])
    
    return df

# Connect to Neo4j DB
It is recommended to store authentication credentials in a separate file and read them in to the notebook as variables. This code assumes the files are stored in a local auth directory.

In [118]:
# get authentication credentials from local auth file
NEO4J_PROPERTIES_FILE = 'auth/immerse_kg_auth.ini'
HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE=NEO4J_PROPERTIES_FILE)

Using HOST, USERNAME, PASSWORD from .ini file


In [None]:
# connect to neo4j instance 
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=False)

In [120]:
# confirm connection with GDS version 
gds.version()

'2.5.0'

# Clean up Article nodes

In [121]:
# if necessary, clear out the Article nodes and relationships
gds.run_cypher('''
                MATCH (n:Article)
                DETACH DELETE n
                ''')

# Get Article Metadata

In [122]:
# Get pmids for all TM nodes
article_ids_df = gds.run_cypher('''
                MATCH (t:TextMining)
                WHERE t.pmid is not null
                RETURN DISTINCT t.pmid as pmid, t.pmcid as pmcid
                ''')

In [123]:
# Get article meta data
article_meta = get_article_metadata_df(fetch, article_ids_df)
#article_meta

25042542
Biotechnology and bioengineering
25641927
Biotechnology progress
25875452
mAbs
26910040
Applied microbiology and biotechnology
27559765
mAbs
27752770
Bioprocess and biosystems engineering
28597152
Protein & cell
28921534
Biotechnology and bioengineering
29511312
Scientific reports
30552760
Biotechnology and bioengineering
30682623
iScience
31425633
Biotechnology progress
31487120
Biotechnology progress
31544815
Antibodies (Basel, Switzerland)
32170810
Biotechnology journal
32748555
Biotechnology progress
33476097
Biotechnology progress
33656168
Biotechnology and bioengineering
33682619
mAbs
33738790
Biotechnology and bioengineering
33742789
Biotechnology progress
33804825
International journal of molecular sciences
34542245
Biotechnology progress
34575094
Life (Basel, Switzerland)
34935124
Biotechnology and bioengineering
35087805
Frontiers in bioengineering and biotechnology
35182428
Biotechnology and bioengineering
35441833
Biotechnology progress
35470430
Biotechnology and b

In [124]:
# Check artilce metadata size
article_meta.shape

(89, 11)

In [126]:
# Spot checking
article_meta[['PMID', 'PMCID', 'Journal']].head()

Unnamed: 0,PMID,PMCID,Journal
0,25042542,PMC4282109,Biotechnology and bioengineering
1,25641927,PMC4492121,Biotechnology progress
2,25875452,PMC4622614,mAbs
3,26910040,PMC4947490,Applied microbiology and biotechnology
4,27559765,PMC5098448,mAbs


In [127]:
article_meta.head(2).to_dict(orient='records')

[{'PMID': '25042542',
  'PMCID': 'PMC4282109',
  'Title': 'Use of a small molecule cell cycle inhibitor to control cell growth and improve specific productivity and product quality of recombinant proteins in CHO cell cultures.',
  'Abstract': 'The continued need to improve therapeutic recombinant protein productivity has led to ongoing assessment of appropriate strategies in the biopharmaceutical industry to establish robust processes with optimized critical variables, that is, viable cell density (VCD) and specific productivity (product per cell, qP). Even though high VCD is a positive factor for titer, uncontrolled proliferation beyond a certain cell mass is also undesirable. To enable efficient process development to achieve consistent and predictable growth arrest while maintaining VCD, as well as improving qP, without negative impacts on product quality from clone to clone, we identified an approach that directly targets the cell cycle G1-checkpoint by selectively inhibiting the f

In [137]:
# if necessary, clear out the Article nodes and their relationships
gds.run_cypher('''
                MATCH (a:Article)
                DETACH DELETE a
                ''')

In [138]:
# Create Article nodes
gds.run_cypher('''
               UNWIND $article_meta AS node
               //CALL apoc.merge.node (["Article"], {pmid: node.PMID, pmcid: node.PMCID, title: node.Title, abstract: node.Abstract, authors: node.Author, year: toInteger(node.Year), voulme: node.Volume, issue: coalesce(node.Issue, 'None'), journal: node.Journal, citation: node.Citation, link: node.Link, impact_factor: node.ImpactFactor})
               
               CALL apoc.merge.node (["Article"], {pmid: node.PMID, pmcid: node.PMCID, title: node.Title, abstract: node.Abstract, authors: node.Author, year: toInteger(node.Year), voulme: node.Volume, issue: coalesce(node.Issue, 'None'), journal: node.Journal, citation: node.Citation, link: node.Link})
               YIELD node as n 
               RETURN n
              ''', {'article_meta': article_meta.to_dict('records')})

Unnamed: 0,n
0,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
1,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
2,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
3,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
4,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
5,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
6,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
7,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
8,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"
9,"(journal, issue, citation, year, voulme, link, abstract, pmid, pmcid, title, authors)"


In [139]:
# Remove property 'None' issue
gds.run_cypher('''
    MATCH(A:Article { issue: 'None' })
    REMOVE A.issue
    ''')

In [140]:
# Link Textming nodes to articles
gds.run_cypher('''
    MATCH (a:Article), (t:TextMining)
    WHERE a.pmid = t.pmid
    MERGE (t)-[:IS_MENTIONED_IN]->(a)
    ''')

2024-04-21 21:31:41 UD-Q3J7G7FQ7J-D root[35492] INFO {'severity': 'INFORMATION', 'description': 'If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (t))', 'code': 'Neo.ClientNotification.Statement.CartesianProduct', 'position': {'column': 1, 'offset': 5, 'line': 2}, 'title': 'This query builds a cartesian product between disconnected patterns.', 'category': 'PERFORMANCE'}


In [141]:
# Use the entities from the text mining to get the ontological concepts as the context for a given Article
context_df = gds.run_cypher('''match(n:TextMining)-[:HAS_CANONICAL_NAME]->(d:DictionaryConcept)-[:IS_INSTANCE_OF]->(o:OntologicalConcept)-[:SUB_CLASS_OF*]->(o2:OntologicalConcept)
where n.pmid is not null
with n, collect (distinct o.name) as child, collect (distinct o2.name) as ancestor
with n, child, ancestor, apoc.coll.unionAll(child, ancestor) as all
return distinct n.pmid as pmid, apoc.coll.sort(all) as context order by pmid''')

In [142]:
data = defaultdict(set)
# Loop through each row
for index, row in context_df.iterrows():
  #print(f"Row index: {index}")
  #print(f"PMID: {row['pmid']}, Context: {row['context']}")
  for val in row['context']:
      #print(val)
      data[row['pmid']].add(val)
merged_dict =[]
for key in data:
    #print(key, (", ").join([element for element in sorted(data[key])]))
    value = (", ").join([element for element in sorted(data[key])])
    article = {}
    article['pmid'] = key
    article['context'] = value
    merged_dict.append(article)
merged_dict
# type(merged_dict)
merged_context_df = pd.DataFrame.from_dict(merged_dict)
merged_context_df

Unnamed: 0,pmid,context
0,11150551,"PTM, biological process, glycosylation, metabolic pathways"
1,12950230,"carbon source, cell line, chinese hamster ovary (CHO), primary raw materials, saccharides, secondary raw materials, sugar"
2,15593097,"PTM, amino acid, biological process, carbon source, glycosylation, metabolic pathways, process outcome, product quality attributes, secondary raw materials, substrates, sugar"
3,15903239,"PTM, biological process, cell line, glycosylation, metabolic pathways, primary raw materials"
4,16609957,"PTM, antibody, biological process, carbon source, cell line, cell line characteristics , genotype, glycosylation, metabolic pathways, non-fucosylated antibody, primary raw materials, raw material characteristics, secondary raw materials, sugar"
5,19224598,"PTM, biological process, cell line, glycosylation, metabolic pathways, primary raw materials"
6,20159578,"cell culture parameters, process parameters or process control"
7,20589669,"PTM, biological process, glycosylation, metabolic pathways, process outcome, product quality profile"
8,20639190,"PTM, biological process, carbon source, glycosylation, metabolic pathways, secondary raw materials, sugar"
9,22699308,"carbon source, secondary raw materials, sugar"


In [143]:
# Create Article context
gds.run_cypher('''
               UNWIND $context_list AS cl
               MATCH (a:Article)
               WHERE a.pmid = cl.pmid
               SET a.doc_context = cl.context
               //RETURN a
              ''', {'context_list': merged_context_df.to_dict('records')})

In [148]:
# Create contraint on Concept nodes
gds.run_cypher('''CREATE CONSTRAINT article IF NOT EXISTS FOR (a:Article) REQUIRE a.pmid IS UNIQUE''')

In [149]:
# Check existing constraints
gds.run_cypher('''SHOW CONSTRAINTS''')

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex
0,11,article,UNIQUENESS,NODE,[Article],[pmid],article
1,5,dictionary_concept,UNIQUENESS,NODE,[DictionaryConcept],[id],dictionary_concept
2,2,ontological_concept,UNIQUENESS,NODE,[OntologicalConcept],[id],ontological_concept
3,8,text_mining,UNIQUENESS,NODE,[TextMining],[id],text_mining


In [150]:
# Create fulltext index on Article nodes
gds.run_cypher('''CREATE FULLTEXT INDEX article_search IF NOT EXISTS FOR (a:Article) ON EACH [a.title, a.abstract, a.authors, a.citation, a.journal, a.issue, a.volume, a.year]''')

In [151]:
# exports the whole database incl. indexes as cypher statements to the provided file
gds.run_cypher('''CALL apoc.export.cypher.all('kg_export_after_load_article.cypher',{format:'cypher-shell'})''')

Unnamed: 0,file,batches,source,format,nodes,relationships,properties,time,rows,batchSize,cypherStatements,nodeStatements,relationshipStatements,schemaStatements,cleanupStatements
0,kg_export_after_load_article.cypher,1,"database: nodes(3488), rels(7373)",cypher,3488,7373,48948,240,10861,20000,,,,,
