In [None]:
import json

from google.colab import files

# Upload the license JSON. The mapping returned by files.upload() is keyed by
# the uploaded file names, which are used below as paths to open.
uploaded_files = files.upload()
if not uploaded_files:
    raise ValueError("No license file was uploaded")

# Only the first uploaded file is read.
license_path = next(iter(uploaded_files))
with open(license_path) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables.
# globals() is updated explicitly: writing through locals() is only reliable
# at module scope and is documented as something not to depend on.
globals().update(license_keys)

# Adding license key-value pairs to environment variables.
# os.environ only accepts strings, so values are coerced defensively.
import os
os.environ.update({key: str(value) for key, value in license_keys.items()})

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [None]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp
import pyspark.sql.functions as F

import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Spark settings handed to sparknlp_jsl.start. They live under a dedicated
# name because `params` is reused further down for the parsed PubMed records.
spark_params = {"spark.driver.memory": "16G",
                "spark.kryoserializer.buffer.max": "2000M",
                "spark.driver.maxResultSize": "2000M"}

# Start the licensed session using the SECRET entry of the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=spark_params)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

In [None]:
# Show the object returned by sparknlp_jsl.start as the cell output
# (presumably the SparkSession summary — a quick check that the session is up).
spark

In [1]:
import gzip
import io
import urllib
import urllib.request
from datetime import date

import xmltodict

# https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
url = "https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed22n1211.xml.gz"

# Download the gzipped file fully into memory; the HTTP response is closed
# as soon as its bytes have been read.
with urllib.request.urlopen(url) as response:
    compressed_xml = io.BytesIO(response.read())

# Decompress while parsing; the resulting nested mapping is kept in `oec`.
with gzip.GzipFile(fileobj=compressed_xml) as xml_stream:
    oec = xmltodict.parse(xml_stream)

In [2]:
def _as_list(value):
    """Return ``value`` as a list: ``None`` -> ``[]``, a list unchanged, anything else wrapped."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _parse_date(node):
    """Build a ``date`` from a mapping with 'Year', 'Month', 'Day'; ``None`` when the node is missing."""
    if not node:
        return None
    return date(int(node['Year']), int(node['Month']), int(node['Day']))


def _abstract_sections(abstract_raw):
    """Normalise an AbstractText value into ``[{'label': ..., 'text': ...}, ...]``.

    The value may be a plain string, a mapping carrying '@Label' / '#text',
    or a list mixing both. Sections without text are dropped; sections
    without a label get the label 'SINGLE'.
    """
    sections = []
    for element in _as_list(abstract_raw):
        if isinstance(element, str):
            label, body = 'SINGLE', element
        elif isinstance(element, dict):
            label, body = element.get('@Label', 'SINGLE'), element.get('#text')
        else:
            # Neither text nor a mapping: nothing usable in this section.
            continue
        if body:
            sections.append({'label': label, 'text': body})
    return sections


def _node_text(node):
    """Return the text of a node that is either a plain string or a mapping with '#text'."""
    if isinstance(node, str):
        return node
    return node.get('#text') if node else None


def _mesh_headings(mesh_raw):
    """Flatten a MeshHeadingList into dicts with 'mesh_id', 'text' and 'major_topic'."""
    if not mesh_raw:
        return []
    descriptors = (heading['DescriptorName']
                   for heading in _as_list(mesh_raw['MeshHeading']))
    return [{'mesh_id': descriptor['@UI'],
             'text': descriptor['#text'],
             'major_topic': descriptor['@MajorTopicYN']}
            for descriptor in descriptors]


def _author_names(authors_raw):
    """Return "ForeName LastName" strings for every author entry that has a ForeName.

    A single author and a list of authors are handled the same way, so the
    result is always a list of strings.
    """
    if not authors_raw:
        return []
    return [f"{author['ForeName']} {author['LastName']}"
            for author in _as_list(authors_raw.get('Author'))
            if isinstance(author, dict) and author.get('ForeName')]


def _parse_article(row):
    """Convert one PubmedArticle mapping into an import record.

    Returns ``None`` for articles without an abstract; such articles are skipped.
    """
    citation = row['MedlineCitation']
    article = citation['Article']

    # Skip articles without abstract or other text
    abstract = article.get('Abstract')
    if not abstract:
        return None

    journal_info = citation.get('MedlineJournalInfo') or {}
    return {
        'pmid': citation['PMID']['#text'],
        'text': _abstract_sections(abstract.get('AbstractText')),
        'completed_date': _parse_date(citation.get('DateCompleted')),
        'revised_date': _parse_date(citation.get('DateRevised')),
        'title': _node_text(article.get('ArticleTitle')),
        'country': journal_info.get('Country'),
        'mesh': _mesh_headings(citation.get('MeshHeadingList')),
        'authors': _author_names(article.get('AuthorList')),
    }


# One record per article that has an abstract. _as_list covers the case where
# the set holds a single article (a mapping rather than a list).
params = []
for row in _as_list(oec['PubmedArticleSet']['PubmedArticle']):
    record = _parse_article(row)
    if record is not None:
        params.append(record)


In [3]:
# Define Neo4j connections
import pandas as pd
from neo4j import GraphDatabase
import getpass
import os

# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Host and user fall back to the previous values.
host = os.environ.get('NEO4J_URI', 'bolt://18.214.25.95:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
# No default for the password: prompt for it when NEO4J_PASSWORD is unset.
password = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: ')
driver = GraphDatabase.driver(host, auth=(user, password))

def run_query(query, params=None):
    """Run a Cypher query on the module-level ``driver`` and return the rows as a DataFrame.

    Args:
        query: Cypher statement text.
        params: Optional mapping of query parameters; ``None`` means no parameters.

    Returns:
        pandas.DataFrame with one row per result record and one column per result key.
    """
    # A None default replaces the shared mutable `{}` default argument.
    query_params = {} if params is None else params
    with driver.session() as session:
        result = session.run(query, query_params)
        # Records are consumed inside the session, before it is closed.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())

In [4]:
# Uniqueness constraints, one statement per node label; each is created only
# if it does not exist yet.
constraint_statements = (
    "CREATE CONSTRAINT IF NOT EXISTS ON (a:Article) ASSERT a.pmid IS UNIQUE;",
    "CREATE CONSTRAINT IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE;",
    "CREATE CONSTRAINT IF NOT EXISTS ON (m:Mesh) ASSERT m.id IS UNIQUE;",
)

for constraint_statement in constraint_statements:
    constraint_result = run_query(constraint_statement)

# The result of the final statement stays the cell's last expression.
constraint_result

In [5]:
# Cypher statement that imports a batch of article records passed as $data.
# For each record it:
#   * merges the Article node on pmid and sets its dates, title and country;
#   * creates one Text node per abstract section (CREATE, so running the
#     import twice adds the sections twice);
#   * merges Mesh nodes and MENTIONS relationships, storing the major-topic
#     flag under the existing property name `isMayor` (sic — kept unchanged
#     because stored data or other queries may already rely on it);
#   * merges Author nodes and AUTHORED relationships.
# The author list is read from `row.authors`, the key the records carry.
# Reading `row.author` yields null, so the FOREACH never runs. The
# `STARTS WITH ''` filter keeps only string entries: it is true for every
# string and null for anything else, so a non-string author entry is skipped
# instead of failing the whole batch.
import_pubmed_query = """
UNWIND $data AS row
MERGE (a:Article {pmid: row.pmid})
SET a.completed_date = date(row.completed_date),
    a.revised_date = date(row.revised_date),
    a.title = row.title,
    a.country = row.country
FOREACH (map IN row.text | 
    CREATE (a)-[r:HAS_TEXT]->(text:Text)
    SET text.text = map.text,
        r.type = map.label)
FOREACH (heading IN row.mesh | 
    MERGE (m:Mesh {id: heading.mesh_id})
    ON CREATE SET m.text = heading.text
    MERGE (a)-[r:MENTIONS]->(m)
    SET r.isMayor = heading.major_topic)
FOREACH (author IN [name IN row.authors WHERE name STARTS WITH ''] | 
    MERGE (au:Author {name: author})
    MERGE (a)<-[:AUTHORED]-(au))

"""

In [6]:
# Number of article records sent to Neo4j per run_query call.
step = 1000

# Start offsets of the chunks whose import raised, kept so they can be
# inspected or retried afterwards.
failed_offsets = []

for x in range(0, len(params), step):
    chunk = params[x:x+step]
    try:
        run_query(import_pubmed_query, {'data': chunk})
    except Exception as e:
        # Best effort: keep importing the remaining chunks, but say which
        # records were affected instead of printing the bare error.
        failed_offsets.append(x)
        print(f"Import failed for records {x}-{x + len(chunk) - 1}: {e}")

if failed_offsets:
    print(f"{len(failed_offsets)} chunk(s) failed; start offsets: {failed_offsets}")
    


{code: Neo.ClientError.Statement.EntityNotFound} {message: Unable to load NODE with id 101.}


In [7]:
# NLP step: fetch every Text node with its internal node id and its text.
text_nodes_query = """
MATCH (t:Text)
RETURN id(t) AS nodeId, t.text as text
"""

nlp_input = run_query(text_nodes_query)

In [8]:
# Preview the Text nodes fetched from Neo4j. As the cell's last expression the
# DataFrame is rendered as output (long frames are shown truncated).
nlp_input

Unnamed: 0,nodeId,text
0,863,: Remarkable advances have been made in acute ...
1,864,Hernias spanning both chest and abdominal wall...
2,865,The cross sectional imaging of patients presen...
3,866,Six patients were identified. All hernias occu...
4,867,The typical injury pattern of thoracoabdominal...
...,...,...
48307,100080,"The hypothesis requires experimental testing, ..."
48308,100081,Ultra-processed food (UPF) and Ultra-processed...
48309,100082,We conducted a mixed methods synthesis review....
48310,100083,We project that the combined sales volume of U...


In [18]:
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp
import pyspark.sql.functions as F

In [22]:
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Spark settings handed to sparknlp_jsl.start. They use a dedicated name so
# that the global `params` holding the parsed PubMed records is not overwritten.
spark_params = {"spark.driver.memory": "8G",
                "spark.kryoserializer.buffer.max": "2000M",
                "spark.driver.maxResultSize": "2000M"}

# Start the licensed session using the SECRET entry of the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=spark_params)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

Exception: Java gateway process exited before sending its port number

In [19]:
# Each text is wrapped as a single document; the output column is named
# "ner_chunk" because that is the input column the two stages below read.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

# Sentence embeddings consumed by the resolver.
sbert_embedder = (
    BertSentenceEmbeddings
    .pretrained('sbiobert_base_cased_mli', 'en', 'clinical/models')
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# Resolver writing its output to the "mesh_code" column.
mesh_resolver = (
    SentenceEntityResolverModel
    .pretrained("sbiobertresolve_mesh", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("mesh_code")
    .setDistanceFunction("EUCLIDEAN")
)

mesh_pipeline = Pipeline(
    stages=[
        documentAssembler,
        sbert_embedder,
        mesh_resolver])

# run_query returns a pandas DataFrame, but the pipeline stages need a Spark
# DataFrame, so convert first. Rows without text are dropped: there is
# nothing to resolve for them.
nlp_input_sdf = spark.createDataFrame(nlp_input.dropna(subset=["text"]))

result = mesh_pipeline.fit(nlp_input_sdf).transform(nlp_input_sdf)


AttributeError: Cannot load _jvm from SparkContext. Is SparkContext initialized?