![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)

# Spark Setup

In [5]:
import json
import os
import random

# Relative location of the Spark NLP for Healthcare license file inside the shared folder.
license_key = "cabir/keys/6.0.2.spark_nlp_for_healthcare.json"

# Parse the license JSON into a plain dict.
with open(os.path.join('/home/jovyan/work/shared/', license_key)) as license_file:
    license_keys = json.loads(license_file.read())

# Expose every license entry twice: as a notebook-level variable and as an
# environment variable of the current process.
locals().update(license_keys)
os.environ.update(license_keys)

In [2]:
# Install a pinned PySpark together with the public Spark NLP release.
# NOTE(review): $PUBLIC_VERSION, $JSL_VERSION and $SECRET are not defined in this
# cell - presumably they come from the license JSON exposed by the setup cell; confirm.
%pip install --upgrade -q pyspark==3.5.0  spark-nlp==$PUBLIC_VERSION

# Install Spark NLP for Healthcare from the John Snow Labs package index
# (the index URL is suffixed with the license secret).
%pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Install the Spark NLP Display library used for visualization.
# NOTE(review): this package is not version-pinned, unlike the two installs above.
%pip install -q spark-nlp-display

In [7]:
import os
import json
import numpy as np
import pandas as pd

# pd.set_option("display.max_colwidth", 100)
# pd.set_option("display.max_colwidth",0)

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel

import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Spark configuration handed to the licensed session starter.
params = {
    # Amount of memory to use for the driver process, i.e. where SparkContext is initialized.
    "spark.driver.memory": "58G",
    # Maximum allowable size of the Kryo serialization buffer, in MiB unless otherwise specified.
    "spark.kryoserializer.buffer.max": "2000M",
    # Limit of total size of serialized results of all partitions for each Spark action
    # (e.g. collect). Should be at least 1M, or 0 for unlimited.
    "spark.driver.maxResultSize": "2000M",
}

# Start the Spark session with the license secret and keep Spark logging at ERROR level.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)
spark.sparkContext.setLogLevel("ERROR")

# Report the library versions that are actually loaded.
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

spark

Spark NLP Version : 6.0.2
Spark NLP_JSL Version : 6.0.2


# Versions

In [1]:
# Record the Python interpreter version found on the shell PATH.
!python --version

Python 3.10.9


In [2]:
# Record the Java runtime version found on the shell PATH.
!java -version

openjdk version "11.0.17" 2022-10-18
OpenJDK Runtime Environment (build 11.0.17+8-post-Ubuntu-1ubuntu222.04)
OpenJDK 64-Bit Server VM (build 11.0.17+8-post-Ubuntu-1ubuntu222.04, mixed mode, sharing)


In [3]:
# Show the metadata (version, location, dependencies) of the installed tensorflow package.
%pip show tensorflow

Name: tensorflow
Version: 2.12.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /home/jovyan/work/shared/venvs/cabir-ds/lib/python3.10/site-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, jax, keras, libclang, numpy, opt-einsum, packaging, protobuf, setuptools, six, tensorboard, tensorflow-estimator, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# Sample text

In [8]:
# Sample report used to exercise the parsers and the de-identification pipeline below.
# It contains names, record/specimen/accession numbers, dates, a phone number,
# an address and a CPT code.
# NOTE(review): the values look synthetic - confirm no real patient data is included.
text = """
(NOTE) Patient Name: John Lee. MR#: 7789201 Location: LERE Date Reported: 2025-05-12 16:30
Specimen #RD23-4897 Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A
Electronically Signed Out By Dr. Smith, Dr. Carter, CT(ASCP) Date Reported: 2025-05-12 16:30
General Hospital Dr. Fan Gabriel 90210 CPT Code(s) A: 88305

General Hospital in New York City Dr. Williams, NYC, NY
(212) 555-7890 Patient Name: John Lee Accession #: GH-556672
Patient ID #: 7789201 Collected: 2025-05-10 Address:
123 Main Street, FALL RIVER
NIAGARA FALLS, NY 14304
Received: 2025-05-10 Reported: 2025-05-12
Soc. Sec. #: XXX-XX-1234 DOB/Age/Sex: 1973 (Age: 52) M
Physician(s): Dr. Jameson. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.
The following special studies were performed at Barstow Heights Christus Southeast, NY – St Elizabeth; New York City.
· Chromosome analysis cytogenetics. (ADDENDUM REPORT TO FOLLOW.)
· Leukemic immunophenotyping flow cytometry.

...., and there is no evidence of dysplasia.
Fr/ap MATERIAL RECEIVED 6 SLIDES LABELED 032-1902, COLLECTED 2025-05-10
SPECIMEN SOURCE: GASTRIC, ILEUM AND RANDOM COLON, BIOPSIES
REFERRING FACILITY: NY
"""

# Pretrained Pipeline

In [8]:
from sparknlp.pretrained import PretrainedPipeline

# Load the English `clinical_deidentification_docwise_benchmark` pretrained pipeline
# from the `clinical/models` location; this is the pipeline modified in the cells below.
deid_pipeline = PretrainedPipeline("clinical_deidentification_docwise_benchmark", "en", "clinical/models")

clinical_deidentification_docwise_benchmark download started this may take some time.
Approx size to download 2.3 GB
[OK!]


In [9]:
# List the stages of the pretrained pipeline; later cells address stages by their position here.
deid_pipeline.model.stages

[DocumentAssembler_ae0f203deedd,
 InternalDocumentSplitter_cc36578ceda6,
 REGEX_TOKENIZER_2e85686aea12,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_1a8637089929,
 NER_CONVERTER_1aef7e9d2de5,
 MedicalNerModel_d92d47622e85,
 MedicalNerModel_32184c1db80b,
 MedicalNerModel_ada39ac0d359,
 NER_CONVERTER_a99db4e6a79d,
 NER_CONVERTER_4a9436714344,
 NER_CONVERTER_ea6433988e18,
 PretrainedZeroShotNER_5f30ab9002f1,
 NER_CONVERTER_c97040caf7b3,
 MedicalNerModel_b8b167ec3114,
 NER_CONVERTER_06db473f3215,
 ContextualEntityRuler_11ff6711ef6b,
 ChunkMergeModel_95d6827691bb,
 CONTEXTUAL-PARSER_bf2a6abaf5fa,
 CONTEXTUAL-PARSER_ff6bad379d91,
 CONTEXTUAL-PARSER_89341cae7221,
 CONTEXTUAL-PARSER_c6b9eded8d31,
 CONTEXTUAL-PARSER_9480c24bd9f8,
 CONTEXTUAL-PARSER_3886bce391c8,
 CONTEXTUAL-PARSER_0bb3fb75cd01,
 ENTITY_EXTRACTOR_6792f2f6e85a,
 ENTITY_EXTRACTOR_74ace4be4f73,
 CONTEXTUAL-PARSER_dfb32adc7555,
 REGEX_MATCHER_5003669d6422,
 CONTEXTUAL-PARSER_746a25662aa6,
 CONTEXTUAL-PARSER_079220479a3d,
 C

## New Stages

In [15]:
# Turn the raw "text" column into a "document" annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document recursively into overlapping chunks written to "splitter".
splitter = (
    InternalDocumentSplitter()
    .setInputCols("document")
    .setOutputCol("splitter")
    .setSplitMode("recursive")
    # Raw string: "\s" is not a valid Python escape sequence, so the non-raw form
    # triggers an invalid-escape warning. The pattern value itself is unchanged.
    .setSplitPatterns([r"\s+"])  # split on runs of whitespace (token based)
    .setPatternsAreRegex(True)
    .setChunkSize(512)  # 512 character length
    .setChunkOverlap(50)
    .setEnableSentenceIncrement(True)  # like SentenceDetector
)

# Tokenize every chunk produced by the splitter.
tokenizer = (
    Tokenizer()
    .setInputCols("splitter")
    .setOutputCol("token")
)

### **cpt_parser**

In [16]:
# Contextual-parser rule: capture a five-digit code starting with 88, optionally
# preceded by a "CPT" label, and tag it as CPT_CODE.
cpt_rule = dict(
    entity="CPT_CODE",
    ruleScope="sentence",
    regex=r"(?:CPT(?: Code\(s\)?|#|:)?\s*:?[\s#]*)?(\b88[0-9]{3}\b)",
    matchScope="token",
)

# The parser reads its rule from a JSON file, so write the rule to disk first.
with open('cpt.json', 'w') as rule_file:
    rule_file.write(json.dumps(cpt_rule))

cpt_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_cpt")
    .setJsonPath("cpt.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
)

cpt_parser_pipeline = Pipeline(
    stages=[document_assembler, splitter, tokenizer, cpt_parser]
)

# Fit on a single empty row to obtain a fitted model of the rule-based parser.
empty_data = spark.createDataFrame([[""]]).toDF("text")
cpt_parser_model = cpt_parser_pipeline.fit(empty_data)

# Save only the fitted parser (the last stage), then reload it as a standalone model.
# From here on `cpt_parser` refers to the loaded model, not the approach above.
cpt_parser_model.stages[-1].write().overwrite().save("./parsers/cpt_parser")

cpt_parser = (
    ContextualParserModel.load("parsers/cpt_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_cpt")
)

In [17]:
# Quick check: annotate the sample text with the fitted CPT parser pipeline
# and display what was written to the "entity_cpt" column.
annotations = LightPipeline(cpt_parser_model).annotate(text)

annotations["entity_cpt"]

['88305']

### **specimen_parser**

In [18]:
# Contextual-parser rule for specimen identifiers, tagged as IDNUM: an optional
# "Specimen" label, an optional "#", 1-5 capital letters, 2-4 digits, an optional
# hyphen and 3-6 digits.
# The regex is a raw string: "\s" and "\." are not valid Python escape sequences,
# so the non-raw form triggers invalid-escape warnings. The pattern value is unchanged.
specimen_rule = {
    "entity": "IDNUM",
    "ruleScope": "sentence",
    "regex": r"(?:Specimen(?:\s*(?:ID|Number|Code|#|No\.?)?:?)?\s*)?#?[A-Z]{1,5}[0-9]{2,4}-?[0-9]{3,6}",
    "contextLength": 25,
    "matchScope": "token"
}

# The parser reads its rule from a JSON file, so write the rule to disk first.
with open('specimen.json', 'w') as f:
    json.dump(specimen_rule, f)

specimen_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_specimen")
    .setJsonPath("specimen.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
)

specimen_parser_pipeline = Pipeline(stages=[
    document_assembler,
    splitter,
    tokenizer,
    specimen_parser
])

# Fit on a single empty row to obtain a fitted model of the rule-based parser.
empty_data = spark.createDataFrame([[""]]).toDF("text")

specimen_parser_model = specimen_parser_pipeline.fit(empty_data)

# Save only the fitted parser (the last stage), then reload it as a standalone model.
# From here on `specimen_parser` refers to the loaded model, not the approach above.
specimen_parser_model.stages[-1].write().overwrite().save("./parsers/specimen_parser")

specimen_parser = (
    ContextualParserModel.load("./parsers/specimen_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_specimen")
)

In [19]:
# Quick check: annotate the sample text with the fitted specimen parser pipeline
# and display what was written to the "entity_specimen" column.
annotations = LightPipeline(specimen_parser_model).annotate(text)

annotations["entity_specimen"]

['#RD23-4897']

### **IOBTagger**

In [20]:
# Tagger that consumes the "token" and "ner_chunk" columns and writes "ner_label";
# that column is later used as the label column for NER training.
iobTagger = (
    sparknlp_jsl.annotator.IOBTagger()
    .setInputCols(["token", "ner_chunk"])
    .setOutputCol("ner_label")
)

## **Update Mergers**

In [21]:
# Inspect the input columns of the stage at index 35 of the pretrained pipeline
# (the merger that is extended with the new parser columns in the next cell).
merger_input_cols = deid_pipeline.model.stages[35].getInputCols()
#merger_input_cols.remove("entity_zip")
merger_input_cols

['entity_icd10',
 'entity_email',
 'entity_ip_address',
 'entity_age',
 'entity_medicalrecord',
 'entity_ssn',
 'entity_account',
 'entity_vin',
 'entity_date',
 'entity_phone',
 'entity_phone2',
 'entity_country',
 'entity_state',
 'entity_zip',
 'entity_plate',
 'entity_dln',
 'entity_license']

In [22]:
# Output columns of the two new contextual parsers that the merger must also consume.
new_parser_cols = ["entity_cpt", "entity_specimen"]

# `setInputCols` mutates the stage in place. Drop any of the new columns that are
# already registered so that re-running this cell does not prepend duplicates.
merger_input_cols = [
    col
    for col in deid_pipeline.model.stages[35].getInputCols()
    if col not in new_parser_cols
]

# Same stage object as stages[35], now reading the new parser outputs first.
chunk_merge_rulebase = deid_pipeline.model.stages[35]\
      .setInputCols(new_parser_cols + merger_input_cols)

**black list**

In [23]:
# Inspect the stage at index 38; the next cell sets its blacklist.
deid_pipeline.model.stages[38]

ChunkMergeModel_5a3f1e608447

In [24]:
# Blacklist the CPT_CODE label on the stage at index 38 and store the configured
# stage back into the same slot.
blacklist_merger = deid_pipeline.model.stages[38]
deid_pipeline.model.stages[38] = blacklist_merger.setBlackList(['CPT_CODE'])

## **Stages**

In [25]:
# Rebuild the stage list: keep stages 0-34, insert the two new parsers ahead of the
# updated merger (which replaces the old stage 35), keep everything after it, and
# append the IOB tagger as the final stage.
original_stages = deid_pipeline.model.stages
deid_pipeline.model.stages = [
    *original_stages[:35],
    cpt_parser,
    specimen_parser,
    chunk_merge_rulebase,
    *original_stages[36:],
    iobTagger,
]

In [26]:
# List the stages again after the splice above.
deid_pipeline.model.stages

[DocumentAssembler_ae0f203deedd,
 InternalDocumentSplitter_cc36578ceda6,
 REGEX_TOKENIZER_2e85686aea12,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_1a8637089929,
 NER_CONVERTER_1aef7e9d2de5,
 MedicalNerModel_d92d47622e85,
 MedicalNerModel_32184c1db80b,
 MedicalNerModel_ada39ac0d359,
 NER_CONVERTER_a99db4e6a79d,
 NER_CONVERTER_4a9436714344,
 NER_CONVERTER_ea6433988e18,
 PretrainedZeroShotNER_5f30ab9002f1,
 NER_CONVERTER_c97040caf7b3,
 MedicalNerModel_b8b167ec3114,
 NER_CONVERTER_06db473f3215,
 ContextualEntityRuler_11ff6711ef6b,
 ChunkMergeModel_95d6827691bb,
 CONTEXTUAL-PARSER_bf2a6abaf5fa,
 CONTEXTUAL-PARSER_ff6bad379d91,
 CONTEXTUAL-PARSER_89341cae7221,
 CONTEXTUAL-PARSER_c6b9eded8d31,
 CONTEXTUAL-PARSER_9480c24bd9f8,
 CONTEXTUAL-PARSER_3886bce391c8,
 CONTEXTUAL-PARSER_0bb3fb75cd01,
 ENTITY_EXTRACTOR_6792f2f6e85a,
 ENTITY_EXTRACTOR_74ace4be4f73,
 CONTEXTUAL-PARSER_dfb32adc7555,
 REGEX_MATCHER_5003669d6422,
 CONTEXTUAL-PARSER_746a25662aa6,
 CONTEXTUAL-PARSER_079220479a3d,
 C

In [27]:
# Run the modified pipeline over a single empty row, then persist the modified
# pipeline model to the local "modified_pipeline" directory.
empty_input = spark.createDataFrame([[""]]).toDF("text")
empty_result = deid_pipeline.transform(empty_input)

deid_pipeline.model.write().overwrite().save("modified_pipeline")

In [20]:
# Reload the pipeline saved to the local `modified_pipeline` directory
# using the `PretrainedPipeline.from_disk` method.
from sparknlp.pretrained import PretrainedPipeline

modified_pipeline = PretrainedPipeline.from_disk('modified_pipeline')

## Sample Result

In [21]:
# Wrap the sample text in a one-row DataFrame with a "text" column.
samples_df = spark.createDataFrame([[text]]).toDF("text")

# Run the reloaded pipeline and mark the result as cached, since the following
# cells query it more than once.
result = modified_pipeline.transform(samples_df).cache()

In [22]:
# Zip the parallel fields of the "ner_chunk" annotations so that each detected
# chunk becomes one struct, then explode to one row per chunk.
chunk_fields = F.arrays_zip(
    result.ner_chunk.result,
    result.ner_chunk.begin,
    result.ner_chunk.end,
    result.ner_chunk.metadata,
)

# Show chunk text, offsets, entity label and confidence for up to 50 chunks.
(
    result
    .select(F.explode(chunk_fields).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['confidence']").alias("confidence"),
    )
    .show(50, truncate=False)
)



Using CPUs
+----------------------------------+-----+----+---------+----------+
|chunk                             |begin|end |ner_label|confidence|
+----------------------------------+-----+----+---------+----------+
|John Lee                          |22   |29  |NAME     |0.9999912 |
|7789201                           |37   |43  |IDNUM    |0.72      |
|LERE                              |55   |58  |LOCATION |0.8618449 |
|2025-05-12                        |75   |84  |DATE     |NULL      |
|#RD23-4897                        |101  |110 |IDNUM    |0.50      |
|Smith                             |232  |236 |NAME     |0.9992543 |
|Carter                            |243  |248 |NAME     |0.9988757 |
|2025-05-12                        |275  |284 |DATE     |NULL      |
|General Hospital                  |292  |307 |LOCATION |0.9980348 |
|Fan Gabriel                       |313  |323 |NAME     |0.98504204|
|90210                             |325  |329 |IDNUM    |0.5666    |
|General Hospital      

                                                                                

In [23]:
# Widen pandas column display so the long text columns are not cut off early.
pd.set_option("display.max_colwidth", 1000)

# Original text next to its masked and obfuscated versions, collected to pandas.
deid_view_exprs = [
    "text",
    "mask_entity.result as masked_result",
    "obfuscated.result as obfuscated_result",
]
result_df = result.selectExpr(*deid_view_exprs).toPandas()
result_df

Unnamed: 0,text,masked_result,obfuscated_result
0,"\n(NOTE) Patient Name: John Lee. MR#: 7789201 Location: LERE Date Reported: 2025-05-12 16:30\nSpecimen #RD23-4897 Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A\nElectronically Signed Out By Dr. Smith, Dr. Carter, CT(ASCP) Date Reported: 2025-05-12 16:30\nGeneral Hospital Dr. Fan Gabriel 90210 CPT Code(s) A: 88305\n\nGeneral Hospital in New York City Dr. Williams, NYC, NY\n(212) 555-7890 Patient Name: John Lee Accession #: GH-556672\nPatient ID #: 7789201 Collected: 2025-05-10 Address:\n123 Main Street, FALL RIVER\nNIAGARA FALLS, NY 14304\nReceived: 2025-05-10 Reported: 2025-05-12\nSoc. Sec. #: XXX-XX-1234 DOB/Age/Sex: 1973 (Age: 52) M\nPhysician(s): Dr. Jameson. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.\nThe following special studies were performed at Barstow Heights Christus Southeast, NY – St Elizabeth; New York City.\n· Chromosome analysis cytogene...","[\n(NOTE) Patient Name: <NAME>. MR#: <IDNUM> Location: <LOCATION> Date Reported: <DATE> 16:30\nSpecimen <IDNUM> Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A\nElectronically Signed Out By Dr. <NAME>, Dr. <NAME>, CT(ASCP) Date Reported: <DATE> 16:30\n<LOCATION> Dr. <NAME> <IDNUM> CPT Code(s) A: 88305\n\n<LOCATION> in <LOCATION> City Dr. <NAME>, <LOCATION>, <LOCATION>\n<CONTACT> Patient Name: <NAME> Accession #: <IDNUM>\nPatient ID #: <IDNUM> Collected: <DATE> Address:\n<LOCATION>, <LOCATION>\n<LOCATION>, <LOCATION> <LOCATION>\nReceived: <DATE> Reported: <DATE>\nSoc. Sec. #: <IDNUM> DOB/Age/Sex: <DATE> (Age: <AGE>) M\nPhysician(s): Dr. <NAME>. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.\nThe following special studies were performed at <LOCATION>, <LOCATION> – <LOCATION>; <LOCATION> City.\n· <LOCATION> analysis cytogenetics. 
(ADDENDUM REPORT TO FOLLOW.)\n·...","[\n(NOTE) Patient Name: Gillie Allan. MR#: 0074518 Location: 4500 MEMORIAL DRIVE Date Reported: 2025-06-29 16:30\nSpecimen #SA52-9740 Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A\nElectronically Signed Out By Dr. Wanna, Dr. Malvin, CT(ASCP) Date Reported: 2025-06-29 16:30\n310 Ellis Street Dr. Marcelo Danes 41581 CPT Code(s) A: 88305\n\n310 Ellis Street in 2000 Boise Ave City Dr. Duwaine, 427 GUY PARK AVE, 16100 SOUTH FREEWAY\n(585) 666-0741 Patient Name: Gillie Allan Accession #: PU-663305\nPatient ID #: 0074518 Collected: 2025-06-27 Address:\n3255 Independence Street, 302 W MCNEESE ST\n4101 NW 89TH BLVD, 16100 SOUTH FREEWAY 59 KOCH AVE\nReceived: 2025-06-27 Reported: 2025-06-29\nSoc. Sec. #: WWW-WW-8529 DOB/Age/Sex: 1974 (Age: 44) M\nPhysician(s): Dr. Marchelle. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.\nThe following special studies were performed..."


## Dataset

In [31]:
# !wget -q https://github.com/JohnSnowLabs/spark-nlp-workshop/raw/master/tutorials/academic/NER_Benchmarks/ner_cloud_benchmark_text_df.xlsx
# text_df = pd.read_excel("ner_cloud_benchmark_text_df.xlsx")
# #text_df.head(2)
# input_spark_df = spark.createDataFrame(text_df)

In [131]:
# !wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/refs/heads/master/data/ner/eng.train -O eng.train

from sparknlp.training import CoNLL
# Read the CoNLL file at ./eng.train into a Spark DataFrame, including document ids
# and with sentences exploded. The download command above is commented out, so the
# file must already exist locally.
data_conll = CoNLL(includeDocId=True,explodeSentences=True).readDataset(spark, "./eng.train")
# Preview two rows, then display the total row count.
data_conll.show(2)
data_conll.count()


+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|doc_id|                text|            document|            sentence|               token|                 pos|               label|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     X|EU rejects German...|[{document, 0, 47...|[{document, 0, 47...|[{token, 0, 1, EU...|[{pos, 0, 1, NNP,...|[{named_entity, 0...|
|     X|     Peter Blackburn|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 4, Pe...|[{pos, 0, 4, NNP,...|[{named_entity, 0...|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



14041

In [132]:
# Keep only the document id and raw text; the pipeline recomputes all annotations.
input_spark_df = data_conll.select("doc_id", "text")
input_spark_df.show(2, truncate=50)

+------+------------------------------------------------+
|doc_id|                                            text|
+------+------------------------------------------------+
|     X|EU rejects German call to boycott British lamb .|
|     X|                                 Peter Blackburn|
+------+------------------------------------------------+
only showing top 2 rows



In [133]:
# Apply the modified pipeline to the CoNLL text and list the resulting columns.
results = modified_pipeline.transform(input_spark_df)
results.columns

['doc_id',
 'text',
 'document',
 'splitter',
 'token',
 'embeddings',
 'ner_clinical_large',
 'ner_chunk_clinical_large',
 'ner_deid_generic_docwise',
 'ner_deid_docwise_subentity',
 'ner_deid_generic_docwise_merged_conll',
 'ner_chunk_generic_docwise',
 'ner_chunk_subentity_docwise',
 'ner_chunk_merged_docwise',
 'ner_zero_shot',
 'ner_chunk_zero_shot_raw',
 'ner_deid_subentity_docwise_new',
 'ner_chunk_subentity_docwise_new_chunk',
 'ner_chunk_zero_shot',
 'deid_merged_ner_chunk',
 'entity_icd10',
 'entity_ssn',
 'entity_account',
 'entity_dln',
 'entity_plate',
 'entity_vin',
 'entity_license',
 'entity_country',
 'entity_state',
 'entity_age',
 'entity_date',
 'entity_phone',
 'entity_phone2',
 'entity_zip',
 'entity_medicalrecord',
 'entity_email',
 'entity_ip_address',
 'entity_cpt',
 'entity_specimen',
 'deid_merged_ner_rulebased',
 'ner_chunk_raw',
 'ner_chunk_processed',
 'ner_chunk',
 'mask_entity',
 'obfuscated',
 'ner_label']

In [134]:
# Keep only the columns used for NER training below: chunks, tokens, embeddings and
# the IOB labels written by the IOBTagger stage.
# NOTE: this rebinds `result_df`, which an earlier cell used for a pandas DataFrame.
result_df = results.select('doc_id','text','document','splitter',
                          'token',"embeddings", 'ner_label')

In [135]:
# Preview two rows of the training columns, truncating each cell to 40 characters.
result_df.show(2, truncate=40)

[Stage 1012:>                                                       (0 + 1) / 1]

+------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|doc_id|                                    text|                                document|                                splitter|                                   token|                              embeddings|                               ner_label|
+------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|     X|EU rejects German call to boycott Bri...|[{document, 0, 47, EU rejects German ...|[{document, 0, 48, EU rejects German ...|[{token, 0, 1, EU, {sentence -> 0}, [...|[{word_embeddings, 0, 1, EU, {isOOV -...|[{named_entity, 0, 1, 

                                                                                

In [136]:
%%time

n_partitions = 48
result_parquet_path = f"./data/result_df_{n_partitions}.parquet"

# Write the pipeline output to parquet.
(
    result_df
    .repartition(n_partitions)
    .write
    .mode("overwrite")
    .format("parquet")
    .save(result_parquet_path)
)

# Read it back so later cells work from the stored data; `result_df` is rebound
# to the parquet-backed DataFrame.
# NOTE(review): after this rebinding, re-running the cell would overwrite the path
# that `result_df` is read from - confirm this is safe before re-executing.
result_df = spark.read.parquet(result_parquet_path).repartition(n_partitions)

                                                                                

CPU times: user 122 ms, sys: 29.1 ms, total: 151 ms
Wall time: 12min 16s


# NER Training

In [140]:
# Row count of the data read back from parquet.
result_df.count()

14041

In [141]:
# Random 80/20 train/test split with a fixed seed.
(train_df, test_df) = result_df.randomSplit([0.8, 0.2], seed = 42)

In [142]:
# Persist the held-out split as parquet. The path is a plain string literal:
# the original f-string had no placeholders, so the `f` prefix was dropped.
(
    test_df
    .repartition(n_partitions)
    .write
    .mode("overwrite")
    .format("parquet")
    .save("./data/test_df.parquet")
)

                                                                                

## NERDL Graph

We will use the `TFGraphBuilder` annotator, which creates the TensorFlow graph as part of the model training pipeline. `TFGraphBuilder` inspects the data and creates the proper graph if a suitable version of TensorFlow is available. The graph is stored in the defined folder and loaded by the `MedicalNerApproach` annotator.

In [None]:
# Install the TensorFlow packages needed by TFGraphBuilder.
# NOTE(review): tensorflow-addons is not version-pinned, unlike tensorflow.
%pip install -q tensorflow==2.12.0
%pip install -q tensorflow-addons

In [143]:
from sparknlp_jsl.annotator import TFGraphBuilder

# Folder where the graph is written and from which the NER approach loads it.
graph_folder_path = "medical_ner_graphs"

# Graph builder stage; it reads the same input columns and label column as the
# NER approach defined in the next cell.
ner_graph_builder = (
    TFGraphBuilder()
    .setModelName("ner_dl")
    .setInputCols(["splitter", "token", "embeddings"])
    .setLabelColumn("ner_label")
    .setGraphFolder(graph_folder_path)
    .setGraphFile("auto")
    .setHiddenUnitsNumber(50)
    .setIsLicensed(True)  # False -> if you want to use TFGraphBuilder with NerDLApproach
)

In [144]:
# NER training stage. The chain is parenthesized rather than joined with trailing
# backslashes: the original ended with a backslash that continued into a
# commented-out line, so removing or editing that comment could break the statement.
nerTagger = (
    MedicalNerApproach()
    .setInputCols(["splitter", "token", "embeddings"])
    .setLabelColumn("ner_label")
    .setOutputCol("ner")
    .setMaxEpochs(30)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setOutputLogsPath('ner_logs')
    .setEarlyStoppingCriterion(0.01)
    .setEarlyStoppingPatience(5)
    .setGraphFolder(graph_folder_path)
    .setUseBestModel(False)
    # Optional settings, currently disabled:
    # .setTestDataset("./data/test_df.parquet")
    # .setEnableMemoryOptimizer(True)  # if you have limited memory and a large conll file, set this True to train batch by batch
    # .setDatasetInfo("NCBI_sample_short dataset")  # you can add details regarding the dataset
)

# The graph builder runs before the tagger, which loads its graph from the same folder.
ner_pipeline = Pipeline(
    stages=[
        ner_graph_builder,
        nerTagger,
    ]
)

In [145]:
%%time
# Build the graph and train the NER model on the training split.
ner_model = ner_pipeline.fit(train_df)

                                                                                

TF Graph Builder configuration:
Model name: ner_dl
Graph folder: medical_ner_graphs
Graph file name: auto
Build params: {'ntags': 13, 'embeddings_dim': 200, 'nchars': 85, 'is_medical': True, 'lstm_size': 50}
ner_dl graph exported to medical_ner_graphs/blstm_13_200_50_85.pb
Training started - total epochs: 30 - lr: 0.001 - batch size: 8 - labels: 13 - chars: 84 - training examples: 11190


                                                                                

Epoch 1/30 started, lr: 0.001, dataset size: 11190


                                                                                

Epoch 1/30 - 26.58s - total training loss: 6000.8135 - avg training loss: 5.372259 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 2.07s
Total validation loss: 923.2025	Avg validation loss: 3.2280
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 663	 115	 195	 0.8521851	 0.77272725	 0.81051344
I-CONTACT	 0	 0	 58	 0.0	 0.0	 0.0
I-AGE	 0	 0	 3	 0.0	 0.0	 0.0
I-IDNUM	 0	 0	 5	 0.0	 0.0	 0.0
B-DATE	 597	 19	 126	 0.96915585	 0.82572615	 0.8917102
I-DATE	 57	 6	 53	 0.9047619	 0.5181818	 0.6589595
I-LOCATION	 221	 67	 535	 0.7673611	 0.29232803	 0.4233716
B-NAME	 751	 233	 243	 0.76321137	 0.7555332	 0.75935286
B-AGE	 18	 13	 95	 0.58064514	 0.15929204	 0.25
B-LOCATION	 1485	 242	 663	 0.85987264	 0.6913408	 0.7664516
B-IDNUM	 0	 0	 26	 0.0	 0.0	 0.0
B-CONTACT	 1	 1	 65	 0.5	 0.015151516	 0.029411767
tp: 3793 fp: 696 fn: 2067 labels: 12
Macro-average	 prec: 0.51643276, rec: 0.33585677, f1: 0.40701526
Micro-average	 prec: 0.8449543, rec: 

                                                                                

Epoch 2/30 - 24.92s - total training loss: 3631.148 - avg training loss: 3.250804 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.77s
Total validation loss: 723.8810	Avg validation loss: 2.5311
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 698	 103	 160	 0.8714107	 0.81351984	 0.8414708
I-CONTACT	 29	 31	 29	 0.48333332	 0.5	 0.4915254
I-AGE	 0	 0	 3	 0.0	 0.0	 0.0
I-IDNUM	 0	 0	 5	 0.0	 0.0	 0.0
B-DATE	 634	 58	 89	 0.91618496	 0.8769018	 0.8961131
I-DATE	 74	 10	 36	 0.88095236	 0.6727273	 0.7628866
I-LOCATION	 348	 132	 408	 0.725	 0.46031746	 0.5631068
B-NAME	 776	 179	 218	 0.81256545	 0.7806841	 0.7963058
B-AGE	 46	 36	 67	 0.5609756	 0.40707964	 0.47179487
B-LOCATION	 1598	 259	 550	 0.86052775	 0.74394786	 0.79800254
B-IDNUM	 0	 1	 26	 0.0	 0.0	 0.0
B-CONTACT	 18	 34	 48	 0.34615386	 0.27272728	 0.30508476
tp: 4221 fp: 843 fn: 1639 labels: 12
Macro-average	 prec: 0.538092, rec: 0.4606588, f1: 0.4963737
Micro-average	 pre

                                                                                

Epoch 3/30 - 25.26s - total training loss: 3031.2466 - avg training loss: 2.7137392 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.78s
Total validation loss: 637.5287	Avg validation loss: 2.2291
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 712	 118	 146	 0.8578313	 0.82983685	 0.8436019
I-CONTACT	 34	 40	 24	 0.45945945	 0.5862069	 0.51515156
I-AGE	 0	 1	 3	 0.0	 0.0	 0.0
I-IDNUM	 0	 0	 5	 0.0	 0.0	 0.0
B-DATE	 657	 92	 66	 0.87716955	 0.9087137	 0.8926631
I-DATE	 86	 10	 24	 0.8958333	 0.7818182	 0.83495146
I-LOCATION	 347	 98	 409	 0.77977526	 0.45899472	 0.5778518
B-NAME	 799	 180	 195	 0.8161389	 0.80382293	 0.80993414
B-AGE	 57	 33	 56	 0.6333333	 0.50442475	 0.56157637
B-LOCATION	 1589	 232	 559	 0.87259746	 0.7397579	 0.80070543
B-IDNUM	 0	 2	 26	 0.0	 0.0	 0.0
B-CONTACT	 19	 28	 47	 0.40425533	 0.28787878	 0.33628318
tp: 4300 fp: 834 fn: 1560 labels: 12
Macro-average	 prec: 0.5496995, rec: 0.4917879, f1: 0.51913357
Mic

                                                                                

Epoch 4/30 - 25.38s - total training loss: 2695.8843 - avg training loss: 2.4135044 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 2.08s
Total validation loss: 587.0292	Avg validation loss: 2.0525
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 698	 90	 160	 0.88578683	 0.81351984	 0.84811664
I-CONTACT	 43	 30	 15	 0.5890411	 0.7413793	 0.65648854
I-AGE	 0	 0	 3	 0.0	 0.0	 0.0
I-IDNUM	 0	 0	 5	 0.0	 0.0	 0.0
B-DATE	 662	 87	 61	 0.88384515	 0.9156293	 0.89945656
I-DATE	 88	 15	 22	 0.8543689	 0.8	 0.82629114
I-LOCATION	 417	 212	 339	 0.6629571	 0.5515873	 0.6021661
B-NAME	 795	 163	 199	 0.82985383	 0.7997988	 0.81454915
B-AGE	 53	 22	 60	 0.70666665	 0.46902654	 0.5638298
B-LOCATION	 1550	 195	 598	 0.88825214	 0.7216015	 0.79630107
B-IDNUM	 1	 2	 25	 0.33333334	 0.03846154	 0.068965524
B-CONTACT	 23	 40	 43	 0.36507937	 0.34848484	 0.35658914
tp: 4330 fp: 856 fn: 1530 labels: 12
Macro-average	 prec: 0.58326536, rec: 0.51662415, 

                                                                                

Epoch 5/30 - 26.19s - total training loss: 2471.7085 - avg training loss: 2.2128098 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.87s
Total validation loss: 560.2350	Avg validation loss: 1.9589
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 718	 97	 140	 0.8809816	 0.83682984	 0.85833836
I-CONTACT	 42	 13	 16	 0.76363635	 0.7241379	 0.74336284
I-AGE	 2	 8	 1	 0.2	 0.6666667	 0.30769232
I-IDNUM	 0	 0	 5	 0.0	 0.0	 0.0
B-DATE	 647	 52	 76	 0.92560804	 0.89488244	 0.90998596
I-DATE	 90	 14	 20	 0.86538464	 0.8181818	 0.8411215
I-LOCATION	 395	 141	 361	 0.7369403	 0.52248675	 0.61145514
B-NAME	 820	 192	 174	 0.8102767	 0.8249497	 0.8175473
B-AGE	 58	 33	 55	 0.63736266	 0.5132743	 0.5686275
B-LOCATION	 1723	 297	 425	 0.8529703	 0.80214155	 0.8267755
B-IDNUM	 4	 5	 22	 0.44444445	 0.15384616	 0.22857143
B-CONTACT	 23	 23	 43	 0.5	 0.34848484	 0.41071427
tp: 4522 fp: 875 fn: 1338 labels: 12
Macro-average	 prec: 0.6348004, rec: 0.5

                                                                                

Epoch 6/30 - 25.38s - total training loss: 2302.556 - avg training loss: 2.0613751 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.78s
Total validation loss: 541.1011	Avg validation loss: 1.8920
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 736	 114	 122	 0.86588234	 0.8578088	 0.8618266
I-CONTACT	 36	 8	 22	 0.8181818	 0.62068963	 0.7058823
I-AGE	 0	 1	 3	 0.0	 0.0	 0.0
I-IDNUM	 0	 3	 5	 0.0	 0.0	 0.0
B-DATE	 653	 61	 70	 0.9145658	 0.9031812	 0.90883785
I-DATE	 88	 11	 22	 0.8888889	 0.8	 0.84210527
I-LOCATION	 365	 114	 391	 0.7620042	 0.48280424	 0.5910931
B-NAME	 832	 191	 162	 0.81329423	 0.8370221	 0.8249876
B-AGE	 56	 24	 57	 0.7	 0.49557522	 0.5803108
B-LOCATION	 1712	 264	 436	 0.8663968	 0.7970205	 0.8302619
B-IDNUM	 1	 3	 25	 0.25	 0.03846154	 0.06666667
B-CONTACT	 25	 24	 41	 0.5102041	 0.37878788	 0.4347826
tp: 4504 fp: 818 fn: 1356 labels: 12
Macro-average	 prec: 0.6157849, rec: 0.51761264, f1: 0.562447
Micro-aver

                                                                                

Epoch 7/30 - 25.34s - total training loss: 2185.92 - avg training loss: 1.956956 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.76s
Total validation loss: 517.6503	Avg validation loss: 1.8100
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 742	 112	 116	 0.86885244	 0.8648019	 0.8668224
I-CONTACT	 43	 29	 15	 0.5972222	 0.7413793	 0.6615384
I-AGE	 2	 4	 1	 0.33333334	 0.6666667	 0.44444448
I-IDNUM	 0	 2	 5	 0.0	 0.0	 0.0
B-DATE	 653	 50	 70	 0.9288762	 0.9031812	 0.9158485
I-DATE	 91	 10	 19	 0.9009901	 0.8272727	 0.8625592
I-LOCATION	 370	 109	 386	 0.7724426	 0.489418	 0.59919024
B-NAME	 838	 193	 156	 0.8128031	 0.84305835	 0.82765436
B-AGE	 60	 28	 53	 0.6818182	 0.53097343	 0.5970149
B-LOCATION	 1742	 273	 406	 0.86451614	 0.81098694	 0.83689654
B-IDNUM	 2	 1	 24	 0.6666667	 0.07692308	 0.13793105
B-CONTACT	 29	 48	 37	 0.37662336	 0.43939394	 0.4055944
tp: 4572 fp: 859 fn: 1288 labels: 12
Macro-average	 prec: 0.6503454, rec

                                                                                

Epoch 8/30 - 25.08s - total training loss: 2070.598 - avg training loss: 1.8537134 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.74s
Total validation loss: 506.5224	Avg validation loss: 1.7711
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 721	 89	 137	 0.8901235	 0.84032637	 0.8645084
I-CONTACT	 35	 4	 23	 0.8974359	 0.6034483	 0.72164947
I-AGE	 3	 8	 0	 0.27272728	 1.0	 0.42857146
I-IDNUM	 0	 10	 5	 0.0	 0.0	 0.0
B-DATE	 656	 64	 67	 0.9111111	 0.9073306	 0.90921694
I-DATE	 89	 9	 21	 0.90816325	 0.8090909	 0.8557692
I-LOCATION	 401	 136	 355	 0.7467412	 0.5304233	 0.620263
B-NAME	 829	 178	 165	 0.82323736	 0.83400404	 0.82858574
B-AGE	 52	 18	 61	 0.74285716	 0.460177	 0.568306
B-LOCATION	 1781	 324	 367	 0.8460808	 0.8291434	 0.83752644
B-IDNUM	 3	 6	 23	 0.33333334	 0.115384616	 0.17142858
B-CONTACT	 24	 22	 42	 0.5217391	 0.36363637	 0.42857143
tp: 4594 fp: 868 fn: 1266 labels: 12
Macro-average	 prec: 0.65779585, rec: 0.

                                                                                

Epoch 9/30 - 24.75s - total training loss: 1988.499 - avg training loss: 1.780214 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.74s
Total validation loss: 502.7877	Avg validation loss: 1.7580
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 757	 127	 101	 0.85633487	 0.8822844	 0.86911595
I-CONTACT	 46	 24	 12	 0.6571429	 0.79310346	 0.71875006
I-AGE	 2	 3	 1	 0.4	 0.6666667	 0.5
I-IDNUM	 0	 5	 5	 0.0	 0.0	 0.0
B-DATE	 650	 63	 73	 0.91164094	 0.8990318	 0.90529245
I-DATE	 89	 8	 21	 0.91752577	 0.8090909	 0.85990345
I-LOCATION	 391	 93	 365	 0.80785125	 0.51719576	 0.63064516
B-NAME	 843	 188	 151	 0.81765276	 0.8480885	 0.8325926
B-AGE	 68	 30	 45	 0.6938776	 0.6017699	 0.6445498
B-LOCATION	 1756	 267	 392	 0.8680178	 0.81750464	 0.8420043
B-IDNUM	 2	 3	 24	 0.4	 0.07692308	 0.12903227
B-CONTACT	 33	 44	 33	 0.42857143	 0.5	 0.46153846
tp: 4637 fp: 855 fn: 1223 labels: 12
Macro-average	 prec: 0.64655125, rec: 0.61763823, f1: 0.

                                                                                

Epoch 10/30 - 24.99s - total training loss: 1934.6532 - avg training loss: 1.7320082 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.79s
Total validation loss: 507.3811	Avg validation loss: 1.7741
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 725	 96	 133	 0.88306946	 0.84498835	 0.86360925
I-CONTACT	 27	 0	 31	 1.0	 0.46551725	 0.63529414
I-AGE	 3	 6	 0	 0.33333334	 1.0	 0.5
I-IDNUM	 0	 3	 5	 0.0	 0.0	 0.0
B-DATE	 664	 76	 59	 0.8972973	 0.9183956	 0.9077239
I-DATE	 92	 13	 18	 0.8761905	 0.8363636	 0.8558139
I-LOCATION	 417	 129	 339	 0.76373625	 0.5515873	 0.64055294
B-NAME	 836	 198	 158	 0.80851066	 0.8410463	 0.82445765
B-AGE	 70	 26	 43	 0.7291667	 0.61946905	 0.6698565
B-LOCATION	 1779	 309	 369	 0.8520115	 0.82821226	 0.8399434
B-IDNUM	 3	 4	 23	 0.42857143	 0.115384616	 0.18181819
B-CONTACT	 23	 22	 43	 0.51111114	 0.34848484	 0.4144144
tp: 4639 fp: 882 fn: 1221 labels: 12
Macro-average	 prec: 0.6735832, rec: 0.6141208

                                                                                

Epoch 11/30 - 26.59s - total training loss: 1846.9413 - avg training loss: 1.6534837 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.80s
Total validation loss: 503.6275	Avg validation loss: 1.7609
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 697	 75	 161	 0.90284973	 0.8123543	 0.8552148
I-CONTACT	 44	 15	 14	 0.7457627	 0.7586207	 0.75213677
I-AGE	 2	 5	 1	 0.2857143	 0.6666667	 0.4
I-IDNUM	 0	 3	 5	 0.0	 0.0	 0.0
B-DATE	 648	 58	 75	 0.91784704	 0.89626557	 0.9069279
I-DATE	 90	 12	 20	 0.88235295	 0.8181818	 0.8490566
I-LOCATION	 412	 134	 344	 0.75457877	 0.54497355	 0.6328725
B-NAME	 777	 119	 217	 0.8671875	 0.7816901	 0.8222222
B-AGE	 63	 27	 50	 0.7	 0.5575221	 0.62068963
B-LOCATION	 1748	 235	 400	 0.8814927	 0.81378025	 0.84628415
B-IDNUM	 2	 4	 24	 0.33333334	 0.07692308	 0.12500001
B-CONTACT	 29	 24	 37	 0.5471698	 0.43939394	 0.48739496
tp: 4512 fp: 711 fn: 1348 labels: 12
Macro-average	 prec: 0.651524, rec: 0.5971

                                                                                

Epoch 12/30 - 25.28s - total training loss: 1803.9219 - avg training loss: 1.6149703 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.76s
Total validation loss: 491.9632	Avg validation loss: 1.7202
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 741	 106	 117	 0.8748524	 0.8636364	 0.8692082
I-CONTACT	 38	 5	 20	 0.88372093	 0.6551724	 0.75247526
I-AGE	 2	 6	 1	 0.25	 0.6666667	 0.36363637
I-IDNUM	 1	 12	 4	 0.07692308	 0.2	 0.11111111
B-DATE	 670	 92	 53	 0.87926507	 0.92669433	 0.90235686
I-DATE	 92	 14	 18	 0.8679245	 0.8363636	 0.8518518
I-LOCATION	 428	 152	 328	 0.737931	 0.56613755	 0.6407185
B-NAME	 816	 137	 178	 0.85624343	 0.82092553	 0.8382126
B-AGE	 66	 18	 47	 0.78571427	 0.5840708	 0.67005074
B-LOCATION	 1753	 268	 395	 0.86739236	 0.816108	 0.8409691
B-IDNUM	 4	 7	 22	 0.36363637	 0.15384616	 0.21621622
B-CONTACT	 26	 29	 40	 0.47272727	 0.3939394	 0.42975208
tp: 4637 fp: 846 fn: 1223 labels: 12
Macro-average	 prec:

                                                                                

Epoch 13/30 - 25.22s - total training loss: 1735.0734 - avg training loss: 1.5533334 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.78s
Total validation loss: 494.6951	Avg validation loss: 1.7297
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 766	 140	 92	 0.8454746	 0.89277387	 0.8684807
I-CONTACT	 50	 25	 8	 0.6666667	 0.86206895	 0.7518797
I-AGE	 2	 5	 1	 0.2857143	 0.6666667	 0.4
I-IDNUM	 0	 3	 5	 0.0	 0.0	 0.0
B-DATE	 661	 60	 62	 0.91678226	 0.9142462	 0.91551244
I-DATE	 88	 9	 22	 0.9072165	 0.8	 0.85024154
I-LOCATION	 387	 115	 369	 0.77091634	 0.5119048	 0.6152623
B-NAME	 840	 177	 154	 0.8259587	 0.8450704	 0.8354053
B-AGE	 58	 21	 55	 0.73417723	 0.5132743	 0.6041666
B-LOCATION	 1720	 245	 428	 0.87531805	 0.8007449	 0.8363725
B-IDNUM	 4	 15	 22	 0.21052632	 0.15384616	 0.17777777
B-CONTACT	 31	 28	 35	 0.5254237	 0.46969697	 0.496
tp: 4607 fp: 843 fn: 1253 labels: 12
Macro-average	 prec: 0.6303479, rec: 0.6191911, f1

                                                                                

Epoch 14/30 - 25.31s - total training loss: 1691.0828 - avg training loss: 1.5139506 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.76s
Total validation loss: 506.0290	Avg validation loss: 1.7693
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 729	 91	 129	 0.8890244	 0.8496503	 0.86889154
I-CONTACT	 33	 5	 25	 0.8684211	 0.5689655	 0.6875
I-AGE	 2	 6	 1	 0.25	 0.6666667	 0.36363637
I-IDNUM	 1	 7	 4	 0.125	 0.2	 0.15384616
B-DATE	 666	 69	 57	 0.90612245	 0.92116183	 0.9135802
I-DATE	 90	 9	 20	 0.90909094	 0.8181818	 0.861244
I-LOCATION	 378	 83	 378	 0.8199566	 0.5	 0.62119967
B-NAME	 834	 188	 160	 0.81604695	 0.8390342	 0.82738096
B-AGE	 64	 16	 49	 0.8	 0.5663717	 0.6632125
B-LOCATION	 1763	 276	 385	 0.8646395	 0.8207635	 0.84213036
B-IDNUM	 6	 10	 20	 0.375	 0.23076923	 0.2857143
B-CONTACT	 28	 34	 38	 0.4516129	 0.42424244	 0.4375
tp: 4594 fp: 794 fn: 1266 labels: 12
Macro-average	 prec: 0.6729095, rec: 0.6171506, f1: 0.6

                                                                                

Epoch 15/30 - 25.66s - total training loss: 1641.4646 - avg training loss: 1.4695296 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.78s
Total validation loss: 493.5042	Avg validation loss: 1.7255
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 736	 106	 122	 0.87410927	 0.8578088	 0.86588234
I-CONTACT	 43	 10	 15	 0.8113208	 0.7413793	 0.77477485
I-AGE	 2	 4	 1	 0.33333334	 0.6666667	 0.44444448
I-IDNUM	 1	 17	 4	 0.055555556	 0.2	 0.086956516
B-DATE	 662	 60	 61	 0.91689754	 0.9156293	 0.916263
I-DATE	 92	 18	 18	 0.8363636	 0.8363636	 0.8363636
I-LOCATION	 388	 109	 368	 0.7806841	 0.5132275	 0.6193136
B-NAME	 843	 195	 151	 0.81213874	 0.8480885	 0.8297244
B-AGE	 72	 23	 41	 0.75789475	 0.63716817	 0.6923077
B-LOCATION	 1735	 232	 413	 0.8820539	 0.8077281	 0.8432564
B-IDNUM	 4	 13	 22	 0.23529412	 0.15384616	 0.18604651
B-CONTACT	 34	 44	 32	 0.43589744	 0.5151515	 0.4722222
tp: 4612 fp: 831 fn: 1248 labels: 12
Macro-average	

                                                                                

Epoch 16/30 - 25.00s - total training loss: 1598.9236 - avg training loss: 1.4314445 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.78s
Total validation loss: 482.6834	Avg validation loss: 1.6877
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 759	 126	 99	 0.8576271	 0.88461536	 0.8709122
I-CONTACT	 49	 24	 9	 0.6712329	 0.8448276	 0.74809164
I-AGE	 2	 8	 1	 0.2	 0.6666667	 0.30769232
I-IDNUM	 0	 5	 5	 0.0	 0.0	 0.0
B-DATE	 666	 76	 57	 0.8975741	 0.92116183	 0.90921503
I-DATE	 92	 7	 18	 0.9292929	 0.8363636	 0.88038284
I-LOCATION	 439	 220	 317	 0.6661608	 0.5806878	 0.62049466
B-NAME	 843	 193	 151	 0.8137066	 0.8480885	 0.83054185
B-AGE	 71	 30	 42	 0.7029703	 0.6283186	 0.66355145
B-LOCATION	 1742	 260	 406	 0.8701299	 0.81098694	 0.8395181
B-IDNUM	 4	 8	 22	 0.33333334	 0.15384616	 0.21052632
B-CONTACT	 30	 33	 36	 0.47619048	 0.45454547	 0.4651163
tp: 4697 fp: 990 fn: 1163 labels: 12
Macro-average	 prec: 0.61818486, rec: 

                                                                                

Epoch 17/30 - 25.15s - total training loss: 1569.9241 - avg training loss: 1.4054826 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.74s
Total validation loss: 498.1250	Avg validation loss: 1.7417
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 711	 81	 147	 0.89772725	 0.82867134	 0.8618182
I-CONTACT	 45	 6	 13	 0.88235295	 0.7758621	 0.82568806
I-AGE	 2	 5	 1	 0.2857143	 0.6666667	 0.4
I-IDNUM	 0	 4	 5	 0.0	 0.0	 0.0
B-DATE	 676	 92	 47	 0.8802083	 0.9349931	 0.906774
I-DATE	 93	 13	 17	 0.8773585	 0.8454546	 0.8611111
I-LOCATION	 389	 94	 367	 0.805383	 0.51455027	 0.62792575
B-NAME	 798	 139	 196	 0.85165423	 0.8028169	 0.8265148
B-AGE	 55	 11	 58	 0.8333333	 0.48672566	 0.61452514
B-LOCATION	 1747	 233	 401	 0.8823232	 0.81331474	 0.84641474
B-IDNUM	 2	 6	 24	 0.25	 0.07692308	 0.11764707
B-CONTACT	 29	 25	 37	 0.537037	 0.43939394	 0.48333332
tp: 4547 fp: 709 fn: 1313 labels: 12
Macro-average	 prec: 0.66525763, rec: 0.598781

                                                                                

Epoch 18/30 - 25.49s - total training loss: 1517.5596 - avg training loss: 1.358603 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.79s
Total validation loss: 493.3323	Avg validation loss: 1.7249
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 736	 118	 122	 0.8618267	 0.8578088	 0.8598131
I-CONTACT	 42	 5	 16	 0.89361703	 0.7241379	 0.79999995
I-AGE	 0	 3	 3	 0.0	 0.0	 0.0
I-IDNUM	 1	 4	 4	 0.2	 0.2	 0.20000002
B-DATE	 658	 58	 65	 0.9189944	 0.9100968	 0.91452396
I-DATE	 92	 6	 18	 0.93877554	 0.8363636	 0.88461536
I-LOCATION	 398	 132	 358	 0.7509434	 0.52645504	 0.61897355
B-NAME	 790	 130	 204	 0.8586956	 0.79476863	 0.8254963
B-AGE	 60	 14	 53	 0.8108108	 0.53097343	 0.6417112
B-LOCATION	 1718	 241	 430	 0.87697804	 0.7998138	 0.83662033
B-IDNUM	 6	 8	 20	 0.42857143	 0.23076923	 0.29999998
B-CONTACT	 27	 27	 39	 0.5	 0.4090909	 0.45000002
tp: 4528 fp: 746 fn: 1332 labels: 12
Macro-average	 prec: 0.66993445, rec: 0.5683565, 

                                                                                

Epoch 19/30 - 25.42s - total training loss: 1514.4695 - avg training loss: 1.3558366 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.79s
Total validation loss: 504.7399	Avg validation loss: 1.7648
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 732	 83	 126	 0.8981595	 0.85314685	 0.87507474
I-CONTACT	 52	 28	 6	 0.65	 0.8965517	 0.7536232
I-AGE	 3	 7	 0	 0.3	 1.0	 0.4615385
I-IDNUM	 0	 4	 5	 0.0	 0.0	 0.0
B-DATE	 674	 85	 49	 0.88801056	 0.93222684	 0.90958166
I-DATE	 92	 13	 18	 0.8761905	 0.8363636	 0.8558139
I-LOCATION	 411	 121	 345	 0.77255636	 0.5436508	 0.63819873
B-NAME	 825	 155	 169	 0.84183675	 0.8299799	 0.83586633
B-AGE	 75	 34	 38	 0.6880734	 0.6637168	 0.6756757
B-LOCATION	 1800	 323	 348	 0.8478568	 0.83798885	 0.842894
B-IDNUM	 5	 13	 21	 0.2777778	 0.1923077	 0.22727273
B-CONTACT	 38	 41	 28	 0.48101267	 0.57575756	 0.524138
tp: 4707 fp: 907 fn: 1153 labels: 12
Macro-average	 prec: 0.62678957, rec: 0.6801409, f1

                                                                                

Epoch 20/30 - 25.55s - total training loss: 1484.1133 - avg training loss: 1.32866 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.72s
Total validation loss: 495.7651	Avg validation loss: 1.7334
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 744	 97	 114	 0.88466114	 0.86713284	 0.8758093
I-CONTACT	 48	 19	 10	 0.7164179	 0.82758623	 0.76799995
I-AGE	 2	 6	 1	 0.25	 0.6666667	 0.36363637
I-IDNUM	 1	 9	 4	 0.1	 0.2	 0.13333334
B-DATE	 662	 58	 61	 0.91944444	 0.9156293	 0.917533
I-DATE	 92	 12	 18	 0.88461536	 0.8363636	 0.8598131
I-LOCATION	 435	 158	 321	 0.7335582	 0.57539684	 0.6449222
B-NAME	 848	 191	 146	 0.8161694	 0.8531187	 0.8342351
B-AGE	 73	 29	 40	 0.71568626	 0.6460177	 0.67906976
B-LOCATION	 1793	 274	 355	 0.86744076	 0.83472997	 0.850771
B-IDNUM	 4	 9	 22	 0.30769232	 0.15384616	 0.20512821
B-CONTACT	 36	 58	 30	 0.38297874	 0.54545456	 0.45000002
tp: 4738 fp: 920 fn: 1122 labels: 12
Macro-average	 prec: 0.631555

                                                                                

Epoch 21/30 - 25.63s - total training loss: 1445.0181 - avg training loss: 1.2936598 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.86s
Total validation loss: 500.0023	Avg validation loss: 1.7483
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 724	 79	 134	 0.90161896	 0.84382284	 0.871764
I-CONTACT	 49	 10	 9	 0.8305085	 0.8448276	 0.83760685
I-AGE	 2	 3	 1	 0.4	 0.6666667	 0.5
I-IDNUM	 1	 3	 4	 0.25	 0.2	 0.22222224
B-DATE	 657	 51	 66	 0.9279661	 0.9087137	 0.918239
I-DATE	 91	 14	 19	 0.8666667	 0.8272727	 0.84651154
I-LOCATION	 422	 172	 334	 0.7104377	 0.5582011	 0.6251852
B-NAME	 825	 163	 169	 0.83502024	 0.8299799	 0.8324924
B-AGE	 68	 23	 45	 0.74725276	 0.6017699	 0.66666675
B-LOCATION	 1796	 302	 352	 0.8560534	 0.8361266	 0.8459727
B-IDNUM	 4	 12	 22	 0.25	 0.15384616	 0.19047621
B-CONTACT	 31	 28	 35	 0.5254237	 0.46969697	 0.496
tp: 4670 fp: 860 fn: 1190 labels: 12
Macro-average	 prec: 0.67507905, rec: 0.64507705, 

                                                                                

Epoch 22/30 - 26.02s - total training loss: 1402.4447 - avg training loss: 1.2555459 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.84s
Total validation loss: 493.1565	Avg validation loss: 1.7243
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 751	 101	 107	 0.8814554	 0.87529135	 0.8783626
I-CONTACT	 47	 17	 11	 0.734375	 0.8103448	 0.7704918
I-AGE	 2	 5	 1	 0.2857143	 0.6666667	 0.4
I-IDNUM	 1	 7	 4	 0.125	 0.2	 0.15384616
B-DATE	 664	 66	 59	 0.90958905	 0.9183956	 0.9139711
I-DATE	 92	 9	 18	 0.9108911	 0.8363636	 0.8720379
I-LOCATION	 401	 110	 355	 0.7847358	 0.5304233	 0.6329913
B-NAME	 847	 160	 147	 0.8411122	 0.85211265	 0.8465767
B-AGE	 60	 12	 53	 0.8333333	 0.53097343	 0.6486487
B-LOCATION	 1773	 256	 375	 0.8738295	 0.825419	 0.8489346
B-IDNUM	 3	 7	 23	 0.3	 0.115384616	 0.16666667
B-CONTACT	 33	 26	 33	 0.55932206	 0.5	 0.528
tp: 4674 fp: 776 fn: 1186 labels: 12
Macro-average	 prec: 0.6699465, rec: 0.63844794, f1:

                                                                                

Epoch 23/30 - 26.49s - total training loss: 1380.7888 - avg training loss: 1.2361583 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.89s
Total validation loss: 492.8770	Avg validation loss: 1.7233
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 722	 85	 136	 0.8946716	 0.8414918	 0.8672673
I-CONTACT	 48	 15	 10	 0.7619048	 0.82758623	 0.7933885
I-AGE	 2	 5	 1	 0.2857143	 0.6666667	 0.4
I-IDNUM	 4	 11	 1	 0.26666668	 0.8	 0.4
B-DATE	 670	 68	 53	 0.9078591	 0.92669433	 0.91718
I-DATE	 91	 3	 19	 0.9680851	 0.8272727	 0.8921569
I-LOCATION	 399	 82	 357	 0.82952183	 0.5277778	 0.6451092
B-NAME	 807	 138	 187	 0.85396826	 0.81187123	 0.83238786
B-AGE	 75	 20	 38	 0.7894737	 0.6637168	 0.7211538
B-LOCATION	 1716	 201	 432	 0.8951487	 0.79888266	 0.8442804
B-IDNUM	 6	 20	 20	 0.23076923	 0.23076923	 0.23076923
B-CONTACT	 34	 30	 32	 0.53125	 0.5151515	 0.52307695
tp: 4574 fp: 678 fn: 1286 labels: 12
Macro-average	 prec: 0.68458605, rec:

                                                                                

Epoch 24/30 - 26.08s - total training loss: 1335.1971 - avg training loss: 1.1953421 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.88s
Total validation loss: 503.0075	Avg validation loss: 1.7588
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 738	 89	 120	 0.8923821	 0.86013985	 0.8759644
I-CONTACT	 45	 8	 13	 0.8490566	 0.7758621	 0.8108108
I-AGE	 2	 7	 1	 0.22222222	 0.6666667	 0.33333334
I-IDNUM	 3	 6	 2	 0.33333334	 0.6	 0.42857143
B-DATE	 661	 54	 62	 0.92447555	 0.9142462	 0.91933244
I-DATE	 92	 13	 18	 0.8761905	 0.8363636	 0.8558139
I-LOCATION	 433	 147	 323	 0.74655175	 0.57275134	 0.6482037
B-NAME	 831	 146	 163	 0.85056293	 0.8360161	 0.8432268
B-AGE	 77	 38	 36	 0.6695652	 0.6814159	 0.6754386
B-LOCATION	 1746	 235	 402	 0.88137305	 0.81284916	 0.84572536
B-IDNUM	 4	 12	 22	 0.25	 0.15384616	 0.19047621
B-CONTACT	 35	 29	 31	 0.546875	 0.530303	 0.5384615
tp: 4667 fp: 784 fn: 1193 labels: 12
Macro-average	 prec: 0.67

                                                                                

Epoch 25/30 - 26.94s - total training loss: 1299.8882 - avg training loss: 1.1637316 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.87s
Total validation loss: 520.1693	Avg validation loss: 1.8188
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 720	 78	 138	 0.90225565	 0.83916086	 0.8695653
I-CONTACT	 47	 18	 11	 0.72307694	 0.8103448	 0.7642277
I-AGE	 1	 4	 2	 0.2	 0.33333334	 0.25
I-IDNUM	 1	 10	 4	 0.09090909	 0.2	 0.125
B-DATE	 669	 76	 54	 0.8979866	 0.9253112	 0.9114442
I-DATE	 91	 3	 19	 0.9680851	 0.8272727	 0.8921569
I-LOCATION	 400	 89	 356	 0.8179959	 0.52910054	 0.6425703
B-NAME	 814	 146	 180	 0.84791666	 0.81891346	 0.8331628
B-AGE	 71	 20	 42	 0.7802198	 0.6283186	 0.6960785
B-LOCATION	 1779	 281	 369	 0.8635922	 0.82821226	 0.84553236
B-IDNUM	 4	 9	 22	 0.30769232	 0.15384616	 0.20512821
B-CONTACT	 37	 38	 29	 0.49333334	 0.56060606	 0.5248228
tp: 4634 fp: 772 fn: 1226 labels: 12
Macro-average	 prec: 0.6577553, re

                                                                                

Epoch 26/30 - 26.06s - total training loss: 1283.1296 - avg training loss: 1.1487284 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.84s
Total validation loss: 516.2067	Avg validation loss: 1.8049
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 743	 107	 115	 0.8741177	 0.8659674	 0.8700234
I-CONTACT	 47	 17	 11	 0.734375	 0.8103448	 0.7704918
I-AGE	 3	 8	 0	 0.27272728	 1.0	 0.42857146
I-IDNUM	 0	 7	 5	 0.0	 0.0	 0.0
B-DATE	 666	 61	 57	 0.9160935	 0.92116183	 0.9186207
I-DATE	 94	 11	 16	 0.8952381	 0.8545455	 0.8744187
I-LOCATION	 473	 252	 283	 0.6524138	 0.6256614	 0.6387575
B-NAME	 827	 171	 167	 0.8286573	 0.831992	 0.8303213
B-AGE	 68	 22	 45	 0.75555557	 0.6017699	 0.6699507
B-LOCATION	 1828	 366	 320	 0.8331814	 0.8510242	 0.8420083
B-IDNUM	 4	 7	 22	 0.36363637	 0.15384616	 0.21621622
B-CONTACT	 36	 28	 30	 0.5625	 0.54545456	 0.5538462
tp: 4789 fp: 1057 fn: 1071 labels: 12
Macro-average	 prec: 0.640708, rec: 0.671814, 

                                                                                

Epoch 27/30 - 27.00s - total training loss: 1269.74 - avg training loss: 1.1367413 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.87s
Total validation loss: 514.8259	Avg validation loss: 1.8001
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 749	 122	 109	 0.8599311	 0.8729604	 0.8663967
I-CONTACT	 48	 25	 10	 0.65753424	 0.82758623	 0.7328244
I-AGE	 0	 1	 3	 0.0	 0.0	 0.0
I-IDNUM	 4	 9	 1	 0.30769232	 0.8	 0.44444445
B-DATE	 668	 63	 55	 0.9138167	 0.9239281	 0.91884464
I-DATE	 91	 6	 19	 0.9381443	 0.8272727	 0.87922704
I-LOCATION	 442	 174	 314	 0.71753246	 0.58465606	 0.6443149
B-NAME	 838	 172	 156	 0.829703	 0.84305835	 0.8363274
B-AGE	 56	 15	 57	 0.7887324	 0.49557522	 0.6086957
B-LOCATION	 1779	 293	 369	 0.8585907	 0.82821226	 0.84312797
B-IDNUM	 8	 15	 18	 0.3478261	 0.30769232	 0.32653064
B-CONTACT	 38	 44	 28	 0.46341464	 0.57575756	 0.5135135
tp: 4721 fp: 939 fn: 1139 labels: 12
Macro-average	 prec: 0.6402432, rec: 

                                                                                

Epoch 28/30 - 25.87s - total training loss: 1245.6982 - avg training loss: 1.1152178 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.92s
Total validation loss: 515.7788	Avg validation loss: 1.8034
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 758	 138	 100	 0.84598213	 0.8834499	 0.8643101
I-CONTACT	 52	 26	 6	 0.6666667	 0.8965517	 0.76470596
I-AGE	 1	 2	 2	 0.33333334	 0.33333334	 0.33333334
I-IDNUM	 4	 17	 1	 0.1904762	 0.8	 0.30769232
B-DATE	 676	 79	 47	 0.8953642	 0.9349931	 0.9147496
I-DATE	 94	 11	 16	 0.8952381	 0.8545455	 0.8744187
I-LOCATION	 404	 132	 352	 0.75373137	 0.5343915	 0.625387
B-NAME	 819	 161	 175	 0.8357143	 0.8239437	 0.8297872
B-AGE	 63	 14	 50	 0.8181818	 0.5575221	 0.6631579
B-LOCATION	 1763	 279	 385	 0.8633692	 0.8207635	 0.8415274
B-IDNUM	 12	 19	 14	 0.38709676	 0.46153846	 0.42105266
B-CONTACT	 40	 43	 26	 0.48192772	 0.6060606	 0.53691274
tp: 4686 fp: 921 fn: 1174 labels: 12
Macro-average	 prec

                                                                                

Epoch 29/30 - 26.33s - total training loss: 1222.2452 - avg training loss: 1.0942214 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.82s
Total validation loss: 524.0858	Avg validation loss: 1.8325
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 746	 117	 112	 0.86442643	 0.86946386	 0.8669379
I-CONTACT	 49	 30	 9	 0.62025315	 0.8448276	 0.71532845
I-AGE	 0	 2	 3	 0.0	 0.0	 0.0
I-IDNUM	 0	 4	 5	 0.0	 0.0	 0.0
B-DATE	 668	 55	 55	 0.9239281	 0.9239281	 0.9239281
I-DATE	 91	 9	 19	 0.91	 0.8272727	 0.8666666
I-LOCATION	 432	 167	 324	 0.721202	 0.5714286	 0.6376384
B-NAME	 822	 160	 172	 0.8370672	 0.82696176	 0.8319838
B-AGE	 66	 21	 47	 0.7586207	 0.5840708	 0.66
B-LOCATION	 1743	 260	 405	 0.87019473	 0.8114525	 0.8397976
B-IDNUM	 5	 23	 21	 0.17857143	 0.1923077	 0.1851852
B-CONTACT	 32	 44	 34	 0.42105263	 0.4848485	 0.45070425
tp: 4654 fp: 892 fn: 1206 labels: 12
Macro-average	 prec: 0.5921097, rec: 0.57804686, f1: 0.5849938
Mi

                                                                                

Epoch 30/30 - 25.80s - total training loss: 1198.5831 - avg training loss: 1.0730377 - batches: 1117
Quality on validation dataset (20.0%), validation examples = 2238
time to finish evaluation: 1.83s
Total validation loss: 529.5698	Avg validation loss: 1.8516
label	 tp	 fp	 fn	 prec	 rec	 f1
I-NAME	 751	 111	 107	 0.8712297	 0.87529135	 0.8732558
I-CONTACT	 43	 20	 15	 0.6825397	 0.7413793	 0.7107439
I-AGE	 1	 6	 2	 0.14285715	 0.33333334	 0.2
I-IDNUM	 0	 4	 5	 0.0	 0.0	 0.0
B-DATE	 670	 60	 53	 0.91780823	 0.92669433	 0.9222299
I-DATE	 89	 10	 21	 0.8989899	 0.8090909	 0.85167474
I-LOCATION	 436	 189	 320	 0.6976	 0.5767196	 0.63142645
B-NAME	 841	 167	 153	 0.8343254	 0.8460765	 0.84015983
B-AGE	 75	 26	 38	 0.7425743	 0.6637168	 0.7009346
B-LOCATION	 1796	 337	 352	 0.84200656	 0.8361266	 0.8390563
B-IDNUM	 3	 11	 23	 0.21428572	 0.115384616	 0.14999999
B-CONTACT	 31	 44	 35	 0.41333333	 0.46969697	 0.4397163
tp: 4736 fp: 985 fn: 1124 labels: 12
Macro-average	 prec: 0.6047959, rec: 

In [146]:
# Display the training class distribution (label -> count) reported by the
# last stage of `ner_model`.
ner_model.stages[-1].getTrainingClassDistribution()

{'I-NAME': 4341, 'I-CONTACT': 205, 'I-AGE': 36, 'I-IDNUM': 63, 'B-DATE': 3513, 'I-DATE': 490, 'I-LOCATION': 3780, 'B-NAME': 5133, 'B-AGE': 589, 'B-LOCATION': 10578, 'B-IDNUM': 160, 'O': 136507, 'B-CONTACT': 311}

In [150]:
# Persist only the last stage of `ner_model` under models/new_NER_model,
# overwriting whatever was saved at that path before. The same path is loaded
# again later with MedicalNerModel.load.
ner_model.stages[-1].write().overwrite().save('models/new_NER_model')

In [None]:
import os


def latest_log_file(log_dir="ner_logs"):
    """Return the path of the most recently modified regular file in ``log_dir``.

    ``os.listdir`` returns entries in arbitrary order, so taking element ``[0]``
    may show a stale log from an earlier run. Selecting by modification time
    makes the choice deterministic.

    Args:
        log_dir: Directory that holds the training log files.

    Returns:
        The path (``log_dir`` joined with the file name) of the newest regular
        file, or ``None`` when the directory does not exist or contains no
        regular files.
    """
    if not os.path.isdir(log_dir):
        return None
    paths = (os.path.join(log_dir, entry) for entry in os.listdir(log_dir))
    # Sub-directories are skipped: only regular files can be read as logs.
    regular_files = [path for path in paths if os.path.isfile(path)]
    return max(regular_files, key=os.path.getmtime, default=None)


log_path = latest_log_file("ner_logs")
# `log_file` keeps its original meaning: the bare file name inside ner_logs.
log_file = os.path.basename(log_path) if log_path else None

if log_path is None:
    print("No training log found in ner_logs")
else:
    with open(log_path) as f:
        print(f.read())

## Evaluate your model

In [147]:
# Apply only the last stage of `ner_model` to `test_df` and cache the result,
# because the cells below reuse `pred_df` several times.
# NOTE(review): assumes `test_df` already carries the input columns that stage
# needs (the preview below shows token/embeddings columns) - confirm.
pred_df = ner_model.stages[-1].transform(test_df).cache()

In [148]:
# Preview the first rows of the prediction DataFrame.
pred_df.show()

                                                                                

+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|doc_id|                text|            document|            splitter|               token|          embeddings|           ner_label|                 ner|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     X|" Bankruptcy is a...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 0, ",...|[{word_embeddings...|[{named_entity, 0...|[{named_entity, 0...|
|     X|" I have my own c...|[{document, 0, 57...|[{document, 0, 58...|[{token, 0, 0, ",...|[{word_embeddings...|[{named_entity, 0...|[{named_entity, 0...|
|     X|" Indian fisherme...|[{document, 0, 12...|[{document, 0, 12...|[{token, 0, 0, ",...|[{word_embeddings...|[{named_entity, 0...|[{named_entity, 0...|
|     X|" The first half ...|[{document, 0, 20...|[{document, 0,

In [151]:
from pyspark.sql import functions as F

# Flatten the per-document annotation arrays into one pandas row per token so
# the ground-truth tag and the predicted tag can be compared side by side.
label_annotations = pred_df.ner_label
zipped_annotations = F.arrays_zip(
    label_annotations.metadata,
    label_annotations.begin,
    label_annotations.end,
    label_annotations.result,
    pred_df.ner.result,
)
exploded_df = pred_df.select(F.explode(zipped_annotations).alias("cols"))

# The zipped struct fields are addressed by position ('0'..'4'), in the same
# order as the arrays passed to arrays_zip above.
token_columns = [
    F.expr("cols['0']['word']").alias("token"),
    F.expr("cols['1']").alias("begin"),
    F.expr("cols['2']").alias("end"),
    F.expr("cols['3']").alias("gtruth"),
    F.expr("cols['4']").alias("prediction"),
]
pred_token_df = exploded_df.select(*token_columns).toPandas()

pred_token_df

Unnamed: 0,token,begin,end,gtruth,prediction
0,"""",0,0,O,O
1,Bankruptcy,2,11,B-NAME,B-NAME
2,is,13,14,O,O
3,always,16,21,O,O
4,possible,23,30,O,O
...,...,...,...,...,...
40252,Wednesday,21,29,B-DATE,B-DATE
40253,(,31,31,O,O
40254,prefix,33,38,O,O
40255,number,40,45,O,O


In [153]:
from sparknlp_jsl.eval import NerDLMetrics
import pyspark.sql.functions as F

# Evaluate predictions against the gold labels in "full_chunk" mode.
# NOTE(review): see the NerDLMetrics docs for the exact matching rule of each mode.
evaler = NerDLMetrics(mode="full_chunk")

# One row per entity with tp / fp / fn / total / precision / recall / f1.
# Cached because it is queried three times below.
eval_result = evaler.computeMetricsFromDF(pred_df.select("ner_label","ner"),
                                          prediction_col="ner",
                                          label_col="ner_label",
                                          drop_o = True, case_sensitive = True).cache()

# Sorted by entity so the table order is stable between runs and matches the
# partial_chunk_per_token table.
eval_result.withColumn("precision", F.round(eval_result["precision"],4))\
           .withColumn("recall", F.round(eval_result["recall"],4))\
           .withColumn("f1", F.round(eval_result["f1"],4)).sort("entity").show(100)

# DataFrame.show() prints the table itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
eval_result.selectExpr("avg(f1) as macro").show()

# "micro" here is each entity's F1 weighted by its `total` column, i.e. a
# support-weighted average of the per-entity F1 scores.
eval_result.selectExpr("sum(f1*total) as sumprod","sum(total) as sumtotal").selectExpr("sumprod/sumtotal as micro").show()

+--------+------+-----+-----+------+---------+------+------+
|  entity|    tp|   fp|   fn| total|precision|recall|    f1|
+--------+------+-----+-----+------+---------+------+------+
| CONTACT|  42.0| 30.0| 29.0|  71.0|   0.5833|0.5915|0.5874|
|    NAME|1149.0|143.0|162.0|1311.0|   0.8893|0.8764|0.8828|
|    DATE| 809.0| 29.0| 68.0| 877.0|   0.9654|0.9225|0.9434|
|   IDNUM|  18.0|  8.0| 20.0|  38.0|   0.6923|0.4737|0.5625|
|LOCATION|2260.0|310.0|325.0|2585.0|   0.8794|0.8743|0.8768|
|     AGE| 133.0| 32.0| 23.0| 156.0|   0.8061|0.8526|0.8287|
+--------+------+-----+-----+------+---------+------+------+

+-----------------+
|            macro|
+-----------------+
|0.780276564367539|
+-----------------+

None
+------------------+
|             micro|
+------------------+
|0.8820389851678987|
+------------------+

None


In [154]:
# Evaluate predictions against the gold labels in "partial_chunk_per_token" mode.
# NOTE(review): see the NerDLMetrics docs for the exact matching rule of each mode.
evaler = NerDLMetrics(mode="partial_chunk_per_token")

# One row per entity with tp / fp / fn / total / precision / recall / f1.
# Cached because it is queried several times below.
eval_result_partial = evaler.computeMetricsFromDF(pred_df.select("ner_label","ner"), prediction_col="ner", label_col="ner_label", drop_o = True, case_sensitive = True).cache()

eval_result_partial.withColumn("precision", F.round(eval_result_partial["precision"],4))\
           .withColumn("recall", F.round(eval_result_partial["recall"],4))\
           .withColumn("f1", F.round(eval_result_partial["f1"],4)).sort("entity").show(100)

# Unrounded pandas copy of the metrics table.
df_partial=eval_result_partial.toPandas()

print("partial_chunk_per_token")

# DataFrame.show() prints the table itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
eval_result_partial.selectExpr("avg(f1) as macro").show()

# "micro" here is each entity's F1 weighted by its `total` column, i.e. a
# support-weighted average of the per-entity F1 scores.
eval_result_partial.selectExpr("sum(f1*total) as sumprod","sum(total) as sumtotal").selectExpr("sumprod/sumtotal as micro").show()

+--------+------+-----+-----+------+---------+------+------+
|  entity|    tp|   fp|   fn| total|precision|recall|    f1|
+--------+------+-----+-----+------+---------+------+------+
|     AGE| 138.0| 31.0| 23.0| 161.0|   0.8166|0.8571|0.8364|
| CONTACT| 106.0| 36.0| 24.0| 130.0|   0.7465|0.8154|0.7794|
|    DATE| 934.0| 24.0| 66.0|1000.0|   0.9749| 0.934| 0.954|
|   IDNUM|  25.0| 11.0| 22.0|  47.0|   0.6944|0.5319|0.6024|
|LOCATION|3140.0|330.0|392.0|3532.0|   0.9049| 0.889|0.8969|
|    NAME|2181.0|171.0|185.0|2366.0|   0.9273|0.9218|0.9245|
+--------+------+-----+-----+------+---------+------+------+

partial_chunk_per_token
+------------------+
|             macro|
+------------------+
|0.8322751118663967|
+------------------+

None
+------------------+
|             micro|
+------------------+
|0.9084578982865754|
+------------------+

None


# Create New Pipeline

In [180]:
# Load the pipeline stored in the local `modified_pipeline` directory using
# `PretrainedPipeline.from_disk`.
# NOTE(review): presumably that directory was written by an earlier step of
# this workflow - confirm it exists before running this cell.
from sparknlp.pretrained import PretrainedPipeline

modified_pipeline = PretrainedPipeline.from_disk('modified_pipeline')

In [181]:
# List the stages of the loaded pipeline in order (the bare expression is rendered as the cell output).
modified_pipeline.model.stages

[DocumentAssembler_ae0f203deedd,
 InternalDocumentSplitter_cc36578ceda6,
 REGEX_TOKENIZER_2e85686aea12,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_1a8637089929,
 NER_CONVERTER_1aef7e9d2de5,
 MedicalNerModel_d92d47622e85,
 MedicalNerModel_32184c1db80b,
 MedicalNerModel_ada39ac0d359,
 NER_CONVERTER_a99db4e6a79d,
 NER_CONVERTER_4a9436714344,
 NER_CONVERTER_ea6433988e18,
 PretrainedZeroShotNER_5f30ab9002f1,
 NER_CONVERTER_c97040caf7b3,
 MedicalNerModel_b8b167ec3114,
 NER_CONVERTER_06db473f3215,
 ContextualEntityRuler_11ff6711ef6b,
 ChunkMergeModel_95d6827691bb,
 CONTEXTUAL-PARSER_bf2a6abaf5fa,
 CONTEXTUAL-PARSER_ff6bad379d91,
 CONTEXTUAL-PARSER_89341cae7221,
 CONTEXTUAL-PARSER_c6b9eded8d31,
 CONTEXTUAL-PARSER_9480c24bd9f8,
 CONTEXTUAL-PARSER_3886bce391c8,
 CONTEXTUAL-PARSER_0bb3fb75cd01,
 ENTITY_EXTRACTOR_6792f2f6e85a,
 ENTITY_EXTRACTOR_74ace4be4f73,
 CONTEXTUAL-PARSER_dfb32adc7555,
 REGEX_MATCHER_5003669d6422,
 CONTEXTUAL-PARSER_746a25662aa6,
 CONTEXTUAL-PARSER_079220479a3d,
 C

## New Stages

In [179]:
# NER model loaded from the local "models/new_NER_model" directory.
ner_deid_new = (
    MedicalNerModel.load("models/new_NER_model")
    .setInputCols(["splitter", "token", "embeddings"])
    .setOutputCol("ner_deid_new")
)

# Turns the token-level tags of `ner_deid_new` into chunks.
ner_deid_new_converter = (
    NerConverter()
    .setInputCols(["splitter", "token", "ner_deid_new"])
    .setOutputCol("ner_chunk_new")
)

# Pretrained "ner_deid_subentity_docwise" model from "clinical/models".
ner_deid = (
    MedicalNerModel.pretrained("ner_deid_subentity_docwise", "en", "clinical/models")
    .setInputCols(["splitter", "token", "embeddings"])
    .setOutputCol("ner_deid_subentity_docwise")
)

# Turns the token-level tags of `ner_deid` into chunks.
ner_deid_converter = (
    NerConverter()
    .setInputCols(["splitter", "token", "ner_deid_subentity_docwise"])
    .setOutputCol("ner_chunk_subentity_docwise")
)

# Merges the chunks of both models into a single "deid_merged_ner_chunk" column.
# The first input column carries the chunks of the newly trained model.
chunk_merge_ner = (
    ChunkMergeModel()
    .setInputCols("ner_chunk_new", "ner_chunk_subentity_docwise")
    .setOutputCol("deid_merged_ner_chunk")
    .setOrderingFeatures(["ChunkLength", "ChunkBegin"])
    .setMergeOverlapping(True)
    .setResetSentenceIndices(True)
)


ner_deid_subentity_docwise download started this may take some time.
[OK!]


## **Update Stages**

In [182]:
# Rebuild the stage list of the loaded pipeline:
#   * positions 0-3  (document assembler, splitter, tokenizer, word embeddings) are kept,
#   * positions 4-17 (the original NER models, converters, zero-shot NER, entity ruler
#     and the first chunk merger) are replaced by the five new stages,
#   * positions 18+  (contextual parsers, matchers, later mergers, ...) are kept.
#
# The slice positions refer to the pipeline exactly as it was loaded from disk, so
# the replacement must be applied only once. Re-running this cell on an already
# updated list would slice the shorter list and silently drop later stages; the
# identity check below turns such a re-run into a no-op.
current_stages = modified_pipeline.model.stages
new_stages = [
    ner_deid_new,
    ner_deid_new_converter,
    ner_deid,
    ner_deid_converter,
    chunk_merge_ner,
]

if not any(stage is chunk_merge_ner for stage in current_stages):
    modified_pipeline.model.stages = (
        current_stages[:4]
        + new_stages
        + current_stages[18:]
    )

In [183]:
# Show the stage list again to check the result of the replacement.
modified_pipeline.model.stages

[DocumentAssembler_ae0f203deedd,
 InternalDocumentSplitter_cc36578ceda6,
 REGEX_TOKENIZER_2e85686aea12,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_f3c446a6c387,
 NerConverter_e2443d1f19ec,
 MedicalNerModel_32184c1db80b,
 NerConverter_0e94e14fb143,
 ChunkMergeModel_15094e842f96,
 CONTEXTUAL-PARSER_bf2a6abaf5fa,
 CONTEXTUAL-PARSER_ff6bad379d91,
 CONTEXTUAL-PARSER_89341cae7221,
 CONTEXTUAL-PARSER_c6b9eded8d31,
 CONTEXTUAL-PARSER_9480c24bd9f8,
 CONTEXTUAL-PARSER_3886bce391c8,
 CONTEXTUAL-PARSER_0bb3fb75cd01,
 ENTITY_EXTRACTOR_6792f2f6e85a,
 ENTITY_EXTRACTOR_74ace4be4f73,
 CONTEXTUAL-PARSER_dfb32adc7555,
 REGEX_MATCHER_5003669d6422,
 CONTEXTUAL-PARSER_746a25662aa6,
 CONTEXTUAL-PARSER_079220479a3d,
 CONTEXTUAL-PARSER_f8b8f9aafb9f,
 CONTEXTUAL-PARSER_7f824493eafc,
 REGEX_MATCHER_26934077fe57,
 REGEX_MATCHER_5fe3de8b5a4e,
 CONTEXTUAL-PARSER_1543b24b4890,
 CONTEXTUAL-PARSER_980d396bec6c,
 MERGE_ddff59e8b14a,
 ChunkMergeModel_50feb5f97568,
 ContextualEntityRuler_08eeaa89c938,
 ChunkMe

In [185]:
# Run the edited pipeline once on a one-row frame holding an empty string.
empty_df = spark.createDataFrame([[""]]).toDF("text")
empty_result = modified_pipeline.transform(empty_df)

# Persist the edited pipeline model to "new_pipeline", replacing any existing copy.
pipeline_writer = modified_pipeline.model.write().overwrite()
pipeline_writer.save("new_pipeline")

In [11]:
# Load the pipeline stored under the path 'new_pipeline' using the `from_disk` method.
from sparknlp.pretrained import PretrainedPipeline

new_pipeline = PretrainedPipeline.from_disk('new_pipeline')

## Sample Result

In [10]:
# Wrap the sample `text` (defined outside this cell) in a one-row frame with a "text" column.
samples_df = spark.createDataFrame([[text]]).toDF("text")

# Annotate the sample with the reloaded pipeline and cache the result,
# since the cells below read it more than once.
annotated_samples = new_pipeline.transform(samples_df)
result = annotated_samples.cache()

In [12]:
# Zip the parallel arrays of the `ner_chunk` annotations so that each element
# carries (result, begin, end, metadata) for one chunk.
zipped_chunks = F.arrays_zip(
    result.ner_chunk.result,
    result.ner_chunk.begin,
    result.ner_chunk.end,
    result.ner_chunk.metadata,
)

# One row per chunk; the zipped fields are addressed by position ('0'..'3').
chunk_rows = result.select(F.explode(zipped_chunks).alias("cols"))

chunk_table = chunk_rows.select(
    F.expr("cols['0']").alias("chunk"),
    F.expr("cols['1']").alias("begin"),
    F.expr("cols['2']").alias("end"),
    F.expr("cols['3']['entity']").alias("ner_label"),
    F.expr("cols['3']['confidence']").alias("confidence"),
)
chunk_table.show(50, truncate=False)



+----------------------------------+-----+----+---------+----------+
|chunk                             |begin|end |ner_label|confidence|
+----------------------------------+-----+----+---------+----------+
|John Lee                          |22   |29  |NAME     |0.83455   |
|7789201                           |37   |43  |IDNUM    |0.72      |
|2025-05-12                        |75   |84  |DATE     |NULL      |
|#RD23-4897                        |101  |110 |IDNUM    |0.50      |
|Smith                             |232  |236 |NAME     |0.6658    |
|Carter                            |243  |248 |NAME     |0.9997    |
|2025-05-12                        |275  |284 |DATE     |NULL      |
|Fan Gabriel                       |313  |323 |NAME     |0.7408    |
|90210                             |325  |329 |IDNUM    |0.3806    |
|New York                          |373  |380 |LOCATION |NULL      |
|Williams                          |391  |398 |LOCATION |0.3922    |
|NYC                              

                                                                                

In [19]:
# Let pandas render up to 1000 characters per cell so the texts are not cut short.
pd.set_option("display.max_colwidth", 1000)

# Collect the original text next to its masked and obfuscated versions.
comparison_exprs = [
    "text",
    "mask_entity.result as masked_result",
    "obfuscated.result as obfuscated_result",
]
result_df = result.selectExpr(*comparison_exprs).toPandas()
result_df

Unnamed: 0,text,masked_result,obfuscated_result
0,"\n(NOTE) Patient Name: John Lee. MR#: 7789201 Location: LERE Date Reported: 2025-05-12 16:30\nSpecimen #RD23-4897 Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A\nElectronically Signed Out By Dr. Smith, Dr. Carter, CT(ASCP) Date Reported: 2025-05-12 16:30\nGeneral Hospital Dr. Fan Gabriel 90210 CPT Code(s) A: 88305\n\nGeneral Hospital in New York City Dr. Williams, NYC, NY\n(212) 555-7890 Patient Name: John Lee Accession #: GH-556672\nPatient ID #: 7789201 Collected: 2025-05-10 Address:\n123 Main Street, FALL RIVER\nNIAGARA FALLS, NY 14304\nReceived: 2025-05-10 Reported: 2025-05-12\nSoc. Sec. #: XXX-XX-1234 DOB/Age/Sex: 1973 (Age: 52) M\nPhysician(s): Dr. Jameson. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.\nThe following special studies were performed at Barstow Heights Christus Southeast, NY – St Elizabeth; New York City.\n· Chromosome analysis cytogene...","[\n(NOTE) Patient Name: <NAME>. MR#: <IDNUM> Location: LERE Date Reported: <DATE> 16:30\nSpecimen <IDNUM> Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A\nElectronically Signed Out By Dr. <NAME>, Dr. <NAME>, CT(ASCP) Date Reported: <DATE> 16:30\nGeneral Hospital Dr. <NAME> <IDNUM> CPT Code(s) A: 88305\n\nGeneral Hospital in <LOCATION> City Dr. <LOCATION>, <LOCATION>, <LOCATION>\n<CONTACT> Patient Name: <NAME> Accession #: <IDNUM>\nPatient ID #: <IDNUM> Collected: <DATE> Address:\n<LOCATION>, FALL <LOCATION>, <LOCATION> <LOCATION>\nReceived: <DATE> Reported: <DATE>\nSoc. Sec. #: XXX-XX-1234 DOB/Age/Sex: <DATE> (Age: <AGE>) <NAME>): Dr. <NAME>. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.\nThe following special studies were performed at <LOCATION>, <LOCATION> – <LOCATION>; <LOCATION> City.\n· Chromosome analysis cytogenetics. 
(ADDENDUM REPORT TO FOLLOW.)\n· ...","[\n(NOTE) Patient Name: Gillie Allan. MR#: 0074518 Location: LERE Date Reported: 2025-06-29 16:30\nSpecimen #SA52-9740 Clinical History: None Given. CLINICAL INFORMATION: Date of Last Menstrual Period: N/A\nElectronically Signed Out By Dr. Wanna, Dr. Malvin, CT(ASCP) Date Reported: 2025-06-29 16:30\nGeneral Hospital Dr. Marcelo Danes 41581 CPT Code(s) A: 88305\n\nGeneral Hospital in 2000 Boise Ave City Dr. 2601 Fox Run Parkway, 427 GUY PARK AVE, 16100 SOUTH FREEWAY\n(585) 666-0741 Patient Name: Gillie Allan Accession #: PU-663305\nPatient ID #: 0074518 Collected: 2025-06-27 Address:\n3255 Independence Street, FALL 401 BICENTENNIAL WAY, 16100 SOUTH FREEWAY 59 KOCH AVE\nReceived: 2025-06-27 Reported: 2025-06-29\nSoc. Sec. #: XXX-XX-1234 DOB/Age/Sex: 1974 (Age: 44) TERETHA Sol): Dr. Marchelle. Peripheral sequestration, i.e. splenomegaly or hepatomegaly should be excluded to be sure if peripheral sequestration is not present.\nThe following special studies were performed at 103 North S..."
