<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/pubmed/Pubmed%20NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import os

from google.colab import files

# Ask the user to upload the license JSON through the Colab file picker;
# the returned mapping is keyed by the name of each uploaded file.
uploaded_files = files.upload()
license_file_name = list(uploaded_files)[0]

# Parse the first uploaded file into the license key/value mapping.
with open(license_file_name) as license_file:
    license_keys = json.load(license_file)

# Expose every license entry as a notebook-level variable ...
locals().update(license_keys)

# ... and as an environment variable, so that the `!` shell lines in the
# following cell can expand them.
os.environ.update(license_keys)

Saving spark_nlp_for_healthcare_spark_ocr_4435.json to spark_nlp_for_healthcare_spark_ocr_4435 (1).json


In [2]:
# Installing pyspark and spark-nlp
# $PUBLIC_VERSION is expanded by the shell; presumably it is one of the license
# entries exported to the environment in the first cell -- TODO confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The package is fetched from the John Snow Labs index; $JSL_VERSION and $SECRET
# are shell-expanded (presumably also license entries -- TODO confirm).
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display
# Installing neo4j driver and xml parser
# NOTE(review): these two packages are not version-pinned, unlike the ones above.
! pip install neo4j xmltodict



In [3]:
import gzip
import io
import urllib
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded; import it explicitly so the cell works on a fresh kernel.
import urllib.request
from datetime import date

import xmltodict

# Get latest pubmed articles
# https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
url = "https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed22n1211.xml.gz"

# Download the gzipped update file, decompress it in memory and parse the XML
# into nested dicts/lists. Context managers make sure the HTTP response and the
# gzip reader are closed once parsing is done.
with urllib.request.urlopen(url) as response:
    compressed_payload = io.BytesIO(response.read())

with gzip.GzipFile(fileobj=compressed_payload) as xml_stream:
    oec = xmltodict.parse(xml_stream)

In [4]:
# Export pubmed article params


def _as_list(value):
    """Return `value` as a list.

    xmltodict yields a list only when a tag is repeated; a lone element comes
    back as a single dict/str. `None` becomes an empty list.
    """
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _parse_date(raw):
    """Build a `date` from a mapping with 'Year', 'Month' and 'Day'; None if `raw` is empty."""
    if not raw:
        return None
    return date(int(raw['Year']), int(raw['Month']), int(raw['Day']))


params = list()

for row in _as_list(oec['PubmedArticleSet']['PubmedArticle']):
    citation = row['MedlineCitation']
    article = citation['Article']

    # Skip articles without abstract or other text
    if not article.get('Abstract'):
        continue

    # Article id
    pmid = citation['PMID']['#text']

    # Abstract: either a plain string, a single labelled section (dict) or a
    # list of sections. Every variant is normalised to a list of
    # {'label', 'text'} dicts.
    abstract_raw = article['Abstract']['AbstractText']

    if isinstance(abstract_raw, str):
        text = [{'label': 'SINGLE', 'text': abstract_raw}]
    elif isinstance(abstract_raw, list):
        # Only dict sections that carry text are kept; plain-string and empty
        # (None) entries are skipped, the latter no longer crash on `.get`.
        text = [{'label': el.get('@Label', 'SINGLE'), 'text': el['#text']}
                for el in abstract_raw if isinstance(el, dict) and el.get('#text')]
    elif isinstance(abstract_raw, dict):
        text = [{'label': abstract_raw.get('@Label', 'SINGLE'),
                 'text': abstract_raw.get('#text')}]
    else:
        # e.g. an empty <AbstractText/> element parsed as None
        text = []

    # Completed date (optional)
    completed_date = _parse_date(citation.get('DateCompleted'))

    # Revised date
    revised_date = _parse_date(citation['DateRevised'])

    # title: plain string, or a dict when the element has attributes/children
    title_raw = article['ArticleTitle']
    if isinstance(title_raw, str):
        title = title_raw
    else:
        title = title_raw.get('#text') if title_raw else None

    # Country
    country = citation['MedlineJournalInfo'].get('Country')

    # Mesh headings (a single heading is a dict, several are a list)
    mesh_raw = citation.get('MeshHeadingList')
    mesh = []
    if mesh_raw:
        for heading in _as_list(mesh_raw['MeshHeading']):
            descriptor = heading['DescriptorName']
            mesh.append({'mesh_id': descriptor['@UI'],
                         'text': descriptor['#text'],
                         'major_topic': descriptor['@MajorTopicYN']})

    # Authors: always a list of "ForeName LastName" strings. A single author is
    # parsed as a dict rather than a list, so it is wrapped first; previously
    # the raw dict itself ended up in the list.
    authors_raw = article.get('AuthorList')
    authors = []
    if authors_raw:
        authors = [f"{el['ForeName']} {el['LastName']}"
                   for el in _as_list(authors_raw['Author']) if el.get('ForeName')]

    params.append({'pmid': pmid, 'text': text, 'completed_date': completed_date,
                   'revised_date': revised_date, 'title': title, 'country': country,
                   'mesh': mesh, 'authors': authors})


In [5]:
# Define Neo4j connections
import os
from getpass import getpass

import pandas as pd
from neo4j import GraphDatabase

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. NEO4J_URI / NEO4J_USER fall back to the values this
# notebook used originally; the password is prompted for when NEO4J_PASSWORD
# is not set.
host = os.environ.get('NEO4J_URI', 'bolt://18.214.25.95:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')
driver = GraphDatabase.driver(host, auth=(user, password))

def run_query(query, params=None):
    """Run a Cypher query and return its rows as a pandas DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional mapping of query parameters. Defaults to no
            parameters (an empty dict is passed to the driver).

    Returns:
        A DataFrame with one row per returned record and one column per
        key reported by the result.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    query_params = {} if params is None else params
    with driver.session() as session:
        result = session.run(query, query_params)
        # Materialise the records inside the session, while the result is
        # still available.
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [6]:
# Define constraints

# One uniqueness constraint per node label; each statement is sent to Neo4j in
# its own query, in this order.
uniqueness_constraints = (
    "CREATE CONSTRAINT IF NOT EXISTS ON (a:Article) ASSERT a.pmid IS UNIQUE;",
    "CREATE CONSTRAINT IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE;",
    "CREATE CONSTRAINT IF NOT EXISTS ON (m:Mesh) ASSERT m.id IS UNIQUE;",
    "CREATE CONSTRAINT IF NOT EXISTS ON (s:Sentence) ASSERT s.id IS UNIQUE;",
)

for constraint_statement in uniqueness_constraints:
    run_query(constraint_statement)

In [7]:
# Cypher statement that imports one batch of parsed articles. `$data` is a list
# of maps with the keys pmid, text, completed_date, revised_date, title,
# country, mesh and authors.
#
# The author loop reads `row.authors` (the key used in the payload); it
# previously read the non-existent `row.author`, which is null, so no Author
# node was ever created. Entries that are not plain strings are filtered out
# with APOC (already required by the relation import) so that one malformed
# author entry cannot make the whole batch fail.
#
# NOTE(review): Text nodes are CREATEd, not MERGEd, so running the import twice
# duplicates them.
import_pubmed_query = """
UNWIND $data AS row
MERGE (a:Article {pmid: row.pmid})
SET a.completed_date = date(row.completed_date),
    a.revised_date = date(row.revised_date),
    a.title = row.title,
    a.country = row.country
FOREACH (map IN row.text |
    CREATE (a)-[r:HAS_TEXT]->(text:Text)
    SET text.text = map.text,
        r.type = map.label)
FOREACH (heading IN row.mesh |
    MERGE (m:Mesh {id: heading.mesh_id})
    ON CREATE SET m.text = heading.text
    MERGE (a)-[r:MENTIONS_MESH]->(m)
    SET r.isMayor = heading.major_topic)
FOREACH (author IN [x IN row.authors WHERE apoc.meta.cypher.type(x) = 'STRING'] |
    MERGE (au:Author {name: author})
    MERGE (a)<-[:AUTHORED]-(au))

"""

In [8]:
# Import pubmed articles into Neo4j
step = 1000  # number of articles sent per query

# Offsets of the batches that could not be imported, for follow-up.
failed_batches = []

for x in range(0, len(params), step):
    chunk = params[x:x+step]
    try:
        run_query(import_pubmed_query, {'data': chunk})
    except Exception as e:
        # Best-effort import: a failing batch must not abort the remaining
        # ones, but report which articles were affected so the loss is visible.
        failed_batches.append(x)
        print(f"Import of articles {x}-{x + len(chunk) - 1} failed: {e}")

if failed_batches:
    print(f"{len(failed_batches)} batch(es) failed; starting offsets: {failed_batches}")
    


In [9]:
# NLP step

# Fetch every Text node together with its internal Neo4j id; the id is what
# later links extracted relations back to the node they came from.
text_nodes_query = """
MATCH (t:Text)
RETURN id(t) AS nodeId, t.text as text
"""

nlp_input = run_query(text_nodes_query)

In [10]:
# Preview the first rows of the NLP input (columns: nodeId, text).
nlp_input.head()

Unnamed: 0,nodeId,text
0,950,Breast Care Nurses (BCNs) are now established ...
1,951,To assess the effectiveness of individual inte...
2,952,We searched the Cochrane Breast Cancer Group S...
3,953,Randomised controlled trials assessing the eff...
4,954,Two authors independently assessed relevant st...


In [11]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp
import pyspark.sql.functions as F

import warnings
warnings.filterwarnings('ignore')

# Spark configuration passed to the session builder. Kept under its own name:
# `params` already holds the parsed PubMed articles, and overwriting it here
# would break the Neo4j import cell if it were re-run after this one.
spark_params = {"spark.driver.memory": "5G",
                "spark.kryoserializer.buffer.max": "2000M",
                "spark.driver.maxResultSize": "2000M"}

# Start the licensed Spark NLP session using the secret from the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=spark_params)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.4.1
Spark NLP_JSL Version : 3.4.1


In [12]:
# Display the object returned by sparknlp_jsl.start(...) as the cell output.
spark

In [13]:
# ---------------------------------------------------------------------------
# Shared preprocessing: raw text -> document -> sentences -> tokens
# ---------------------------------------------------------------------------
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# ---------------------------------------------------------------------------
# Named entity recognition feeding the ReDL (drug/protein) relation model
# ---------------------------------------------------------------------------
redl_words_embedder = (
    WordEmbeddingsModel()
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("redl_embeddings")
)

redl_drugprot_ner_tagger = (
    MedicalNerModel.pretrained("ner_drugprot_clinical", "en", "clinical/models")
    .setInputCols("sentences", "tokens", "redl_embeddings")
    .setOutputCol("redl_ner_tags")
)

redl_ner_converter = (
    NerConverter()
    .setInputCols(["sentences", "tokens", "redl_ner_tags"])
    .setOutputCol("redl_ner_chunks")
)

# ---------------------------------------------------------------------------
# Named entity recognition feeding the ADE (adverse drug event) relation model
# ---------------------------------------------------------------------------
ade_words_embedder = (
    BertEmbeddings()
    .pretrained("biobert_pubmed_base_cased", "en")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("ade_embeddings")
)

ade_ner_tagger = (
    MedicalNerModel()
    .pretrained("ner_ade_biobert", "en", "clinical/models")
    .setInputCols(["sentences", "tokens", "ade_embeddings"])
    .setOutputCol("ade_ner_tags")
)

ade_ner_converter = (
    NerConverter()
    .setInputCols(["sentences", "tokens", "ade_ner_tags"])
    .setOutputCol("ade_ner_chunks")
)

# ---------------------------------------------------------------------------
# Part-of-speech tags and dependency parse, used by both relation extractors
# ---------------------------------------------------------------------------
pos_tagger = (
    PerceptronModel()
    .pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("pos_tags")
)

dependency_parser = (
    DependencyParserModel()
    .pretrained("dependency_conllu", "en")
    .setInputCols(["sentences", "pos_tags", "tokens"])
    .setOutputCol("dependencies")
)

# ---------------------------------------------------------------------------
# ReDL relation extraction
# ---------------------------------------------------------------------------
# Filter the pairs of named entities that are treated as relation candidates.
# The candidates could be narrowed further with
# .setRelationPairs(['CHEMICAL-GENE']), which is left disabled here.
drugprot_re_ner_chunk_filter = (
    RENerChunksFilter()
    .setInputCols(["redl_ner_chunks", "dependencies"])
    .setOutputCol("redl_re_ner_chunks")
    .setMaxSyntacticDistance(4)
)

drugprot_re_Model = (
    RelationExtractionDLModel()
    .pretrained('redl_drugprot_biobert', "en", "clinical/models")
    .setPredictionThreshold(0.9)
    .setInputCols(["redl_re_ner_chunks", "sentences"])
    .setOutputCol("redl_relations")
)

# ---------------------------------------------------------------------------
# ADE relation extraction
# ---------------------------------------------------------------------------
ade_re_model = (
    RelationExtractionModel()
    .pretrained("re_ade_biobert", "en", 'clinical/models')
    .setInputCols(["ade_embeddings", "pos_tags", "ade_ner_chunks", "dependencies"])
    .setOutputCol("ade_relations")
    .setMaxSyntacticDistance(3)
    .setPredictionThreshold(0.9)
    # Possible relation pairs. Default: All Relations.
    .setRelationPairs(["drug-ade"])
)

# ---------------------------------------------------------------------------
# Whole pipeline; stage order follows the column dependencies declared above
# ---------------------------------------------------------------------------
pipeline = Pipeline(
    stages=[
        documenter,
        sentencer,
        tokenizer,
        redl_words_embedder,
        redl_drugprot_ner_tagger,
        redl_ner_converter,
        ade_words_embedder,
        ade_ner_tagger,
        ade_ner_converter,
        pos_tagger,
        dependency_parser,
        drugprot_re_ner_chunk_filter,
        drugprot_re_Model,
        ade_re_model,
    ]
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_drugprot_clinical download started this may take some time.
Approximate size to download 14 MB
[OK!]
biobert_pubmed_base_cased download started this may take some time.
Approximate size to download 386.4 MB
[OK!]
ner_ade_biobert download started this may take some time.
Approximate size to download 15.3 MB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.5 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]
redl_drugprot_biobert download started this may take some time.
Approximate size to download 386.6 MB
[OK!]
re_ade_biobert download started this may take some time.
Approximate size to download 17.1 MB
[OK!]


In [14]:
def extract_rel_params(df):
    """
    Turn the relation annotations of a pipeline output frame into Neo4j import rows.

    For every row of `df`, the 'redl_relations' annotations are emitted first
    (relation type = annotation result with '-' replaced by '_'), followed by
    the 'ade_relations' annotations whose result is not '0' (relation type
    'ADE'). Each emitted dict carries the row's 'nodeId' plus the confidence,
    entity types and entity chunks found in the annotation metadata.
    """
    def build_record(text_node_id, relation_type, annotation):
        # Flatten one annotation into the map shape used by the import query.
        meta = annotation['metadata']
        return {
            'node_id': text_node_id,
            'rel_type': relation_type,
            'confidence': meta['confidence'],
            'entity_1_type': meta['entity1'],
            'entity_1_label': meta['chunk1'],
            'entity_2_type': meta['entity2'],
            'entity_2_label': meta['chunk2'],
        }

    records = []
    for _, output_row in df.iterrows():
        text_node_id = output_row['nodeId']

        # ReDL relations: the predicted label becomes the relation type.
        for annotation in output_row['redl_relations'] or []:
            relation_type = annotation['result'].replace('-', '_')
            records.append(build_record(text_node_id, relation_type, annotation))

        # ADE relations: a result of '0' means no adverse event was found.
        for annotation in output_row['ade_relations'] or []:
            if annotation['result'] != '0':
                records.append(build_record(text_node_id, 'ADE', annotation))

    return records


In [15]:
# Define neo4j import query
#
# `$data` is a list of maps with the keys node_id, rel_type, confidence,
# entity_1_type, entity_1_label, entity_2_type and entity_2_label (the shape
# produced by extract_rel_params). For every map the query:
#   1. finds the Text node whose internal id equals node_id,
#   2. merges both entities via apoc.merge.node, labelled 'Entity' plus the
#      entity type and keyed by name,
#   3. merges an intermediate Relationship node of the given type between the
#      two entities,
#   4. merges MENTIONS relationships from the Text node to both entities and
#      to the Relationship node, storing the confidence on the latter.
# Requires the APOC plugin on the Neo4j server.
import_rels_query = """
UNWIND $data AS row
MATCH (a:Text)
WHERE id(a) = toInteger(row.node_id)
WITH row, a 
CALL apoc.merge.node(
  ['Entity', row.entity_1_type],
  {name: row.entity_1_label},
  {},
  {}
) YIELD node AS startNode
CALL apoc.merge.node(
  ['Entity', row.entity_2_type],
  {name: row.entity_2_label},
  {},
  {}
) YIELD node AS endNode

MERGE (startNode)-[:RELATIONSHIP]->(rel:Relationship {type: row.rel_type})-[:RELATIONSHIP]->(endNode)

MERGE (a)-[:MENTIONS]->(startNode)
MERGE (a)-[:MENTIONS]->(endNode)
MERGE (a)-[rm:MENTIONS]->(rel)
SET rm.confidence = row.confidence

"""

In [16]:
from datetime import datetime

# Only the first 1000 Text nodes are processed to keep the runtime manageable.
nlp_input = nlp_input.head(1000)

step = 200  # batch size

# The pipeline is fitted once, on the first batch, and the fitted model is
# reused for all later batches instead of calling fit() in every iteration.
# NOTE(review): this assumes no stage learns anything from the batch data
# (the models are loaded via pretrained()) -- confirm if stages are changed.
pipeline_model = None

for i in range(0, len(nlp_input), step):
    print(f"Start processing row {i} at {datetime.now()}")
    # Create a chunk from the original Pandas Dataframe
    chunk_df = nlp_input[i: i + step]
    # Convert Pandas into Spark Dataframe
    sparkDF = spark.createDataFrame(chunk_df)
    # Run through NLP pipeline
    if pipeline_model is None:
        pipeline_model = pipeline.fit(sparkDF)
    result = pipeline_model.transform(sparkDF)
    df = result.toPandas()
    # Extract REL params
    rel_params = extract_rel_params(df)
    # Store to Neo4j; skip the round trip when the batch produced no relations
    if rel_params:
        run_query(import_rels_query, {'data': rel_params})



Start processing row 0 at 2022-03-09 10:23:22.653259
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
1
ADE
