In [None]:
import json
import os

from google.colab import files

# Ask for the license file only when it is not already in the working
# directory, so re-running this cell does not prompt for another upload.
if 'spark_jsl.json' not in os.listdir():
    uploaded = files.upload()
    if not uploaded:
        # A cancelled upload yields an empty mapping; fail with a clear message
        # instead of an IndexError on the lookup below.
        raise RuntimeError("No license file was uploaded; expected the Spark NLP JSL license JSON.")
    # Rename the first uploaded file to the name the rest of the notebook expects.
    os.rename(list(uploaded.keys())[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Expose every license key both as a notebook-level variable and as an
# environment variable. globals() is used instead of locals(): at cell level
# they are the same mapping, but only globals() is meant to be written to.
globals().update(license_keys)
os.environ.update(license_keys)

In [None]:

# Installing pyspark (pinned) and the public spark-nlp release.
# $PUBLIC_VERSION is expected to come from the license keys exported by the
# license-loading cell — TODO confirm the key is present in the license file.
! pip install --upgrade -q pyspark==3.5.0  spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare from the private index.
# $JSL_VERSION and $SECRET are likewise expected to come from the license keys.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [3]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp_jsl.pipeline_tracer import PipelineTracer
from sparknlp_jsl.pipeline_output_parser import PipelineOutputParser

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

import pandas as pd

# Display settings for the notebook: no limit on the number of columns shown,
# no wrapping of wide frames across lines, no limit on column text width.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False
pd.options.display.max_colwidth = None

import string
import numpy as np


# Spark configuration passed to the licensed session start-up.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the Spark session using the secret read from the license file.
spark = sparknlp_jsl.start(secret=license_keys["SECRET"], params=params)

# Report the library versions that were actually loaded.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: shows the session summary.
spark

Spark NLP Version : 6.0.2
Spark NLP_JSL Version : 6.0.2


## Pipeline Start

In [4]:
# Shared front-end stages reused by every parser pipeline in this notebook:
# raw "text" column -> "document" -> "splitter" chunks -> "token".
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

splitter = (
    InternalDocumentSplitter()
    .setInputCols("document")
    .setOutputCol("splitter")
    .setSplitMode("recursive")
    # Raw string: "\s" is not a valid Python escape in a normal string literal.
    .setSplitPatterns([r"\s+"])  # Token base
    .setPatternsAreRegex(True)
    .setChunkSize(512)    # 512 char length
    .setChunkOverlap(50)
    .setEnableSentenceIncrement(True)  # Like sentenceDetector
)

tokenizer = (
    Tokenizer()
    .setInputCols("splitter")
    .setOutputCol("token")
)

In [5]:
# Synthetic sample text mixing many identifier styles (dates, SSN, DLN, VIN,
# IP addresses, account and routing numbers). It is used below to preview how
# the splitter chunks a longer document. The trailing backslashes inside the
# literal join the following physical line without inserting a newline.
full_example = """
A . Record date: 2093-01-13, date: 2093-01-13, DATE: 2093-01-13, David Hale, M.D. IP: 203.120.223.13. ID: 1231511863, Des Moines AL 50129-4444, The driver's license no: A334455B. the SSN:324598674 and e-mail: hale@gmail.com. Name : Hendrickson, Ora MR. # 719435 Date : 01/13/93. PCP : Oliveira, 25 years-old. Record date : 2079-11-09, Cocke County Baptist Hospital. 0295 Keats Street. Phone (302) 786-5227.
Mine is SSN#332255677, The other is ssN: 333-44-6666. the rest ssn:  212-33-4444. his is sSN : 345-33-5666, HER is ssn 332233445.
me again ssn 223344556, Their SSN: 234-44-3333. MY dln# D08954796. your DLN : AB773955A. \
Social Security no: 445-66-4432. Patient's VIN : 1HGBH41JXMN109286. Molina Cortes, Alvaro, MD. MRN: 1482926 from GE Healthcare.
His personal IP: 2001:db8:85a3:8d3:1319:8a2e:370:7348. the patient's ssns: 333224444. my account number is 123456789. your routing 44334456.
account#3344556677. his bank no: 334455667788. Patient's Vehicle Identifiers: 1HGBH41JXMN109286. my VIN# 1dddd41JXMN109286. our vehicle id no: 1HGBH41JXMN109286.
his aba is 3445-6543-2. sample bankrouting: 23443245. \
"""

In [6]:
# Preview the chunks the splitter produces for the sample text: fit the three
# shared stages on an empty frame, transform the example, and show one chunk
# per row.
nlp = Pipeline(stages=[documentAssembler, splitter, tokenizer])
(
    nlp.fit(spark.createDataFrame([['']]).toDF("text"))
    .transform(spark.createDataFrame([[full_example]]).toDF("text"))
    .selectExpr("explode(splitter) as s")
    .show(n=50, truncate=False)
)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|s                                                                                                                                                                                                                                                                                                                                                                                         

In [7]:
def write_rule(file_path, rules):
    """Write context-anchored regex rules to ``file_path``, one rule per line.

    Each output line has the form ``<regex>~<label>``. For every label the
    function emits:

    * one rule per prefix distance: the label's pattern preceded by a
      case-insensitive look-behind for any of the prefixes, first directly
      adjacent, then separated by exactly ``1 .. contextLength - 1``
      non-alphanumeric characters;
    * one rule per suffix distance: the label's pattern followed by a
      case-insensitive look-ahead for any of the suffixes, with the same
      distances.

    Prefix and suffix strings are joined with ``|`` and inserted verbatim, so
    they are interpreted as regex fragments (they are not escaped).

    Args:
        file_path: Path of the rule file to create (overwritten if it exists).
        rules: Mapping ``label -> spec`` where ``spec`` is a dict with
            ``'rule'`` (the base regex, required when any context is given),
            optional ``'prefix'`` / ``'suffix'`` lists of context strings, and
            optional ``'contextLength'`` (defaults to 1, i.e. only the
            directly-adjacent rule is written).

    Raises:
        KeyError: If a spec has a prefix or suffix but no ``'rule'`` entry.

    Notes:
        Compared with the previous version: a missing ``'prefix'`` or
        ``'suffix'`` key is treated as "no context of that kind" instead of
        raising or being hidden by a bare ``except``, and the adjacent suffix
        rule now places the look-ahead *after* the pattern, consistent with
        the gapped suffix rules.
    """
    prefix_adjacent = "(?i)(?<=(({})))"
    prefix_gapped = "(?i)(?<=((({})[^a-z0-9]{})))"
    suffix_adjacent = "(?i)(?=(({})))"
    suffix_gapped = "(?i)(?=(([^a-z0-9]{}({}))))"

    with open(file_path, 'w') as f:
        for label, spec in rules.items():
            prefixes = spec.get('prefix') or []
            suffixes = spec.get('suffix') or []
            if not prefixes and not suffixes:
                # Nothing to anchor on: no rule is emitted for this label.
                continue

            pattern = spec['rule']
            context_length = spec.get('contextLength', 1)
            # Regex quantifiers "{1}", "{2}", ... for the allowed gap widths.
            gaps = ['{' + str(width) + '}' for width in range(1, context_length)]

            lines = []
            if prefixes:
                alternatives = "|".join(prefixes)
                lines.append(prefix_adjacent.format(alternatives) + pattern)
                lines.extend(prefix_gapped.format(alternatives, gap) + pattern for gap in gaps)
            if suffixes:
                alternatives = "|".join(suffixes)
                # The look-ahead must follow the pattern; putting it in front
                # would require the match itself to start with the suffix.
                lines.append(pattern + suffix_adjacent.format(alternatives))
                lines.extend(pattern + suffix_gapped.format(gap, alternatives) for gap in gaps)

            for line in lines:
                f.write(f"{line}~{label}\n")

## ICD10_CODE

In [8]:
# ICD-10 rule for the contextual parser. The dict is written to icd10.json,
# which the parser loads through setJsonPath below.
icd = {
    "entity": "ICD10_CODE",
    "ruleScope": "sentence",
    # Raw string: "\d" is not a valid Python escape in a normal string literal.
    # The pattern itself is unchanged.
    # NOTE(review): the "." inside the optional group is unescaped, so it accepts
    # any character rather than only a literal dot — confirm this is intended.
    "regex": r"^[A-Z]\d{2}(.[A-Z0-9]{1,4})?$",
    "matchScope": "token"
}

with open('icd10.json', 'w') as f:
    json.dump(icd, f)

icd_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_icd10")
    .setJsonPath("icd10.json")
    .setCaseSensitive(True)
    .setPrefixAndSuffixMatch(False)
)

icd_parser_pipeline = Pipeline(
    stages=[
        documentAssembler,
        splitter,
        tokenizer,
        icd_parser
])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")
icd_parser_model = icd_parser_pipeline.fit(empty_data)
# Persist only the fitted parser (the last pipeline stage).
icd_parser_model.stages[-1].write().overwrite().save("parser/icd10_parser")

icd_lp = LightPipeline(icd_parser_model)

# Quick check on a mix of well-formed and malformed code candidates.
example_text = """  Patient hat einen Infekt der oberen Atemweg
(J41) F33 A444 (T41.2) (K41.456) (X41.4X5S)
(m41.54)  (XJ41.4X56) (C414.46XS) (414.46XS) (414 46XS) (41K.46XS)"""
res = icd_lp.fullAnnotate(example_text)
print([a.result for a in res[0]['entity_icd10']])

['J41', 'F33', 'T41.2', 'K41.456', 'X41.4X5S']


## SSN

In [9]:
# SSN rule for the contextual parser. The dict is written to ssn.json, which
# the parser loads through setJsonPath below.
ssn = {
    "entity": "SSN",
    "ruleScope": "document",
    # Raw string: "\d" is not a valid Python escape in a normal string literal.
    # The pattern itself is unchanged (".?" allows one optional separator of any kind).
    "regex": r"\d{3}.?\d{2}.?\d{4}",
    "matchScope": "sub-token",
    "prefix": ["social", "security", "ss#", "ssn","(SSN)",
              "ssid", "ss #", "ssn #", "SSA Number","SSN",
               "Soc Sec","SSNS", "SSN#" ],

    "contextLength": 50
}

with open('ssn.json', 'w') as f:
    json.dump(ssn, f)

ssn_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_ssn")
    .setJsonPath("ssn.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

ssn_parser_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    tokenizer,
    ssn_parser
])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")

ssn_parser_model = ssn_parser_pipeline.fit(empty_data)
# Persist only the fitted parser (the last pipeline stage).
ssn_parser_model.stages[-1].write().overwrite().save("parser/ssn_parser")

txt = """San Diego, CA, USA. Email: medunites@firsthospital.com
Patient John Davies, 62 y.o. ssn: 023-92-7136 was discharged after 12 hours of monitoring without any signs of internal damage.
TSICU Admission 65332 on 24/06/2019 by ambulance VIN 4Y1SL65848Z411439"""

# Last expression of the cell: the SSN chunks found in the sample text.
LightPipeline(ssn_parser_model).annotate(txt)["entity_ssn"]

['023-92-7136']

In [10]:
# Sample rows for the SSN parser. Each row holds the input sentence followed by
# what appear to be the entity label and the expected match; only the first
# element is given a column name ("text"), which is the column the pipeline reads.
test= [
    ['Please provide your Social Security Number SSN as part of the application process: 123-45-6789.', 'SSN', '123-45-6789'],
    ['To verify your identity, we require your SSN: anc 987-65-4321.', 'SSN', '987-65-4321'],
    ['For security reasons, enter your SSN in the format 123.45.6789.', 'SSN', '123.45.6789'],
    ['Your SSN should be submitted as follows: 123 45 6789.', 'SSN', '123 45 6789'],
    ['Ensure you input your SSN correctly: 456-78-9123.', 'SSN', '456-78-9123'],
    ['For the background check, please use the following SSN: 789-12-3456.', 'SSN', '789-12-3456'],
    ['Your Social Security Number is needed: 321-54-9876.', 'SSN', '321-54-9876'],
    ['SSN must be provided in the following format: 654-32-1987.', 'SSN', '654-32-1987'],
    ['Please confirm your SSN: 159-75-2468.', 'SSN', '159-75-2468'],
    ['For identity verification, input your SSN as 753.14.2689.', 'SSN', '753.14.2689'],
    ['Enter your SSN here: 147-25-3698.', 'SSN', '147-25-3698'],
    ['We require your SSN for processing: 258-69-7410.', 'SSN', '258-69-7410'],
    ['Provide your Social Security Number: 369.87.4521.', 'SSN', '369.87.4521'],
    ['Submit your SSN in this format: 123 45 6789.', 'SSN', '123 45 6789'],
    ['Ensure your SSN is correct: 654-32-1987.', 'SSN', '654-32-1987'],
    ['To complete your registration, enter your SSN: 789-12-3456.', 'SSN', '789-12-3456'],
    ['Your SSN should be written as follows: 321-54-9876.', 'SSN', '321-54-9876'],
    ['For security purposes, use the SSN format: 456 78 9123.', 'SSN', '456 78 9123'],
    ['Verify your identity with your SSN: 987.65.4321.', 'SSN', '987.65.4321'],
    ['Your Social Security Number (SSN) must be provided: 123-45-6789.', 'SSN', '123-45-6789']
]

# Run the fitted SSN pipeline over the samples and show tokens, the document
# text and the extracted SSN chunks side by side.
(
    ssn_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_ssn.result")
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+-------------+
|result                                                                                                            |result                                                                                           |result       |
+------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+-------------+
|[Please, provide, your, Social, Security, Number, SSN, as, part, of, the, application, process, :, 123-45-6789, .]|[Please provide your Social Security Number SSN as part of the application process: 123-45-6789.]|[123-45-6789]|
|[To, verify, your, identity, ,, we, require, your, SSN, :, anc, 987-65-4321, .]    

## ACCOUNT

In [11]:
# ACCOUNT rule for the contextual parser. The dict is written to account.json,
# which the parser loads through setJsonPath below.
account = {
    "entity": "ACCOUNT",
    "ruleScope": "document",
    # Raw string: "\d" is not a valid Python escape in a normal string literal.
    # The pattern itself is unchanged.
    "regex": r"\d{6,17}|\d{3}.?\d{4}.?\d",
    "matchScope": "sub-token",
    "prefix": ["check", "checking", "account", "acct", "routing","acc"
              , "save", "saving", "debit", "bank", "aba", "aba routing",
              "abarouting", "association", "bankrouting","routing number"],
    "contextLength": 50
}

with open('account.json', 'w') as f:
    json.dump(account, f)

account_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_account")
    .setJsonPath("account.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

account_parser_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    tokenizer,
    account_parser
])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")

account_parser_model = account_parser_pipeline.fit(empty_data)

# Persist only the fitted parser (the last pipeline stage).
account_parser_model.stages[-1].write().overwrite().save("parser/account_parser")

txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13. account: 1234567890120 route number: 123567
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, SSN #333-44-6666, Driver's license no:A334455B.
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com."""

# Last expression of the cell: the account chunks found in the sample text.
LightPipeline(account_parser_model).annotate(txt)["entity_account"]

['1234567890120', '123567']

In [12]:
# ACCOUNT_NUMBER
# Sample rows for the ACCOUNT parser. Each row holds the input sentence followed
# by what appear to be the entity label and the expected match; only the first
# element is given a column name ("text"), which is the column the pipeline reads.
test=[
    ['Please use acct# 1234567890 for all future transactions.', 'ACCOUNT_NUMBER', '1234567890'],
    ['Your new account number is 987654321. Make sure to update your records.', 'ACCOUNT_NUMBER', '987654321'],
    ['The transfer was made to Acct. 1111-2222-3333.', 'ACCOUNT_NUMBER', '1111-2222-3333'],
    ['For payment, use account no. 0000 1234 5678 9012.', 'ACCOUNT_NUMBER', '0000 1234 5678 9012'],
    ['Account #9876-543210-987 is currently inactive.', 'ACCOUNT_NUMBER', '9876-543210-987'],
    ['The payment was credited to account 1234.5678.9101.1121.', 'ACCOUNT_NUMBER', '1234.5678.9101.1121'],
    ['Please deposit the funds into account number 12345-67890.', 'ACCOUNT_NUMBER', '12345-67890'],
    ['Account No. 12345678 is linked to your checking account.', 'ACCOUNT_NUMBER', '12345678'],
    ['Your account 111-222-333333-44 has been updated.', 'ACCOUNT_NUMBER', '111-222-333333-44'],
    ['The account 5678-123-45678901 is now closed.', 'ACCOUNT_NUMBER', '5678-123-45678901'],
    ['Please use acc. no. 87654321 for future deposits.', 'ACCOUNT_NUMBER', '87654321'],
    ['Your account number 1122-3344-5566-7788 has been approved.', 'ACCOUNT_NUMBER', '1122-3344-5566-7788'],
    ['For this transaction, use account. number 99999999.', 'ACCOUNT_NUMBER', '99999999'],
    ['Funds have been transferred to account number 8888.9999.0000.', 'ACCOUNT_NUMBER', '8888.9999.0000'],
    ['Account number 123-4567-8910-1112 is currently suspended.', 'ACCOUNT_NUMBER', '123-4567-8910-1112'],
    ['Please update your records with account number 9876 5432 1098.', 'ACCOUNT_NUMBER', '9876 5432 1098'],
    ['For billing inquiries, use account #123456789012.', 'ACCOUNT_NUMBER', '123456789012'],
    ['The funds were transferred to Account. 123456-7890.', 'ACCOUNT_NUMBER', '123456-7890'],
    ['Account number 7654-321098-765 is being reviewed.', 'ACCOUNT_NUMBER', '7654-321098-765'],
    ['Please confirm your account number 101112-131415.', 'ACCOUNT_NUMBER', '101112-131415'],
    ['Account No: 1234.5678.9012.3456.', 'ACCOUNT_NUMBER', '1234.5678.9012.3456'],
    ['Account 123456789-0 has been successfully updated.', 'ACCOUNT_NUMBER', '123456789-0'],
    ['Please check your account 12345678-90 for the latest transaction.', 'ACCOUNT_NUMBER', '12345678-90'],
    ['The account number 0987 6543 2109 8765 is incorrect.', 'ACCOUNT_NUMBER', '0987 6543 2109 8765'],
    ['Verify your account number 54321-09876 before proceeding.', 'ACCOUNT_NUMBER', '54321-09876'],
    ['The account 1111-2222-3333-4444 is no longer active.', 'ACCOUNT_NUMBER', '1111-2222-3333-4444'],
    ['Account #0987654321 has been successfully closed.', 'ACCOUNT_NUMBER', '0987654321'],
    ['For any questions, refer to account 987-654-3210.', 'ACCOUNT_NUMBER', '987-654-3210'],
    ['Your account number 1234-567-89012345 has been updated.', 'ACCOUNT_NUMBER', '1234-567-89012345'],
    ['Funds were deposited to account number 12345678-90.', 'ACCOUNT_NUMBER', '12345678-90'],
]

# Run the fitted ACCOUNT pipeline over the samples and show tokens, the
# document text and the extracted account chunks side by side.
(
    account_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_account.result")
    .show(n=50, truncate=False)
)

+----------------------------------------------------------------------------------------+-------------------------------------------------------------------------+----------------+
|result                                                                                  |result                                                                   |result          |
+----------------------------------------------------------------------------------------+-------------------------------------------------------------------------+----------------+
|[Please, use, acct#, 1234567890, for, all, future, transactions, .]                     |[Please use acct# 1234567890 for all future transactions.]               |[1234567890]    |
|[Your, new, account, number, is, 987654321, ., Make, sure, to, update, your, records, .]|[Your new account number is 987654321. Make sure to update your records.]|[987654321]     |
|[The, transfer, was, made, to, Acct, ., 1111-2222-3333, .]                              |

## DLN

In [13]:
# DLN (driver's license number) rule for the contextual parser. The dict is
# written to dln.json, which the parser loads through setJsonPath below.
# Example of a target value: AB773955A
dln = {
  "entity": "DLN",
  "ruleScope": "sentence",
  # Raw string: "\S" and "\d" are not valid Python escapes in a normal string
  # literal. The pattern itself is unchanged.
  "regex": r"\S+\d\S+",    # UPDATE REGEX
  "matchScope": "token",
  "prefix": ['DL', 'DLS', 'CDL', 'CDLS',
             'CDL#', 'CDLS#', 'LIC',
             'DLN', "DRIVER", "Driving", "Drive", "Driver's",
             "Drivers'", "Drivers",
             "DriverLic","DriverLicense"
],

"contextLength": 30

}


with open('dln.json', 'w') as f:
    json.dump(dln, f)

dln_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_dln")
    .setJsonPath("dln.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

dln_parser_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    tokenizer,
    dln_parser
])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")

dln_parser_model = dln_parser_pipeline.fit(empty_data)
# Persist only the fitted parser (the last pipeline stage).
dln_parser_model.stages[-1].write().overwrite().save("parser/dln_parser")


txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13.
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, SSN #333-44-6666, Driver's license no: A334455B. Driver's license# 12345678. MY DL# B324567 CDL bs34df45
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com."""

# Last expression of the cell: the DLN chunks found in the sample text.
LightPipeline(dln_parser_model).annotate(txt)["entity_dln"]

['A334455B', '12345678', 'B324567', 'bs34df45', '302', '786-5227']

In [14]:
# DRIVER_LICENSE
# Sample rows for the DLN parser. Each row holds the input sentence followed by
# what appear to be the entity label and the expected match; only the first
# element is given a column name ("text"), which is the column the pipeline reads.
# The last five rows repeat the five rows before them.
test=[
      ["Please present your driver's license number AB1234567 at the counter.", 'DRIVER_LICENSE', 'AB1234567'],
      ["Your dl# 9876543210 has been renewed.", 'DRIVER_LICENSE', '9876543210'],
      ["Driver's license number B123-456-789 is required for this transaction.", 'DRIVER_LICENSE', 'B123-456-789'],
      ["Kindly update your records with driver's license #C123 456 7890.", 'DRIVER_LICENSE', 'C123 456 7890'],
      ["The DriverLicense D1234-5678-9012 was issued in California.", 'DRIVER_LICENSE', 'D1234-5678-9012'],
      ["Please verify LIC# 123456789 before processing.", 'DRIVER_LICENSE', '123456789'],
      ["Your new driver's license number is E123-4567-8901.", 'DRIVER_LICENSE', 'E123-4567-8901'],
      ["Driver's license 1122-3344-5566 is currently valid.", 'DRIVER_LICENSE', '1122-3344-5566'],
      ["Please update your driver's license to F98765432.", 'DRIVER_LICENSE', 'F98765432'],
      ["Your driver's license number 765-432-1098 is ready for pickup.", 'DRIVER_LICENSE', '765-432-1098'],
      ["Driver's license number ABC-12-34-56 is not valid.", 'DRIVER_LICENSE', 'ABC-12-34-56'],
      ["The driver's license 1234 5678 901 was rejected.", 'DRIVER_LICENSE', '1234 5678 901'],
      ["Driver's license number XYZ12345678901 is too long.", 'DRIVER_LICENSE', 'XYZ12345678901'],
      ["The format AB-1234567890 for a driver's license is incorrect.", 'DRIVER_LICENSE', 'AB-1234567890'],
      ["Your driver's license number 12345-678 is incomplete.", 'DRIVER_LICENSE', '12345-678'],
      ["Driver's license number ABC-12-34-56 is not valid.", 'DRIVER_LICENSE', 'ABC-12-34-56'],
      ["The driver's license 1234 5678 901 was rejected.", 'DRIVER_LICENSE', '1234 5678 901'],
      ["Driver's license number XYZ12345678901 is too long.", 'DRIVER_LICENSE', 'XYZ12345678901'],
      ["The format AB-1234567890 for a driver's license is incorrect.", 'DRIVER_LICENSE', 'AB-1234567890'],
      ["Your driver's license number 12345-678 is incomplete.", 'DRIVER_LICENSE', '12345-678'],

]

# Run the fitted DLN pipeline over the samples and show tokens, the document
# text and the extracted DLN chunks side by side.
(
    dln_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_dln.result")
    .show(truncate=False)
)

+----------------------------------------------------------------------------------+------------------------------------------------------------------------+------------------+
|result                                                                            |result                                                                  |result            |
+----------------------------------------------------------------------------------+------------------------------------------------------------------------+------------------+
|[Please, present, your, driver's, license, number, AB1234567, at, the, counter, .]|[Please present your driver's license number AB1234567 at the counter.] |[AB1234567]       |
|[Your, dl#, 9876543210, has, been, renewed, .]                                    |[Your dl# 9876543210 has been renewed.]                                 |[9876543210]      |
|[Driver's, license, number, B123-456-789, is, required, for, this, transaction, .]|[Driver's license number B123-4

## PLATE

UPDATE RULES

In [15]:
# PLATE (license plate) rule for the contextual parser. The dict is written to
# plate.json, which the parser loads through setJsonPath below.
plate = {
  "entity": "PLATE",
  "ruleScope": "sentence",
  # Raw string: "\S" and "\d" are not valid Python escapes in a normal string
  # literal. The pattern itself is unchanged: the look-ahead limits the match
  # to 5-12 characters and the body requires a digit surrounded by
  # non-whitespace characters.
  "regex": r"(?=.{5,12}$)\S+\d\S+",  # UPDATE REGEX
  "matchScope": "token",
  "prefix": ["Plate", "Plates", "Plate#", "Lpn", "Lpn#", "Plate number"],
  "contextLength": 15,
  "completeMatchRegex": "true"
}

with open('plate.json', 'w') as f:
    json.dump(plate, f)

plate_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_plate")
    .setJsonPath("plate.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

plate_parser_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    tokenizer,
    plate_parser
])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")

plate_parser_model = plate_parser_pipeline.fit(empty_data)
# Persist only the fitted parser (the last pipeline stage).
plate_parser_model.stages[-1].write().overwrite().save("parser/plate_parser")


txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13.
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, SSN #333-44-6666, Driver's license no:A334455B, plates 34NLP34. LPN# 25ASD25
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com."""

# Last expression of the cell: the plate chunks found in the sample text.
LightPipeline(plate_parser_model).annotate(txt)["entity_plate"]

['34NLP34', '25ASD25']

In [16]:
# PLATE
# Sample rows for the PLATE parser. Each row holds the input sentence followed
# by what appear to be the entity label and the expected match; only the first
# element is given a column name ("text"), which is the column the pipeline reads.
test=[
      ["Please enter your vehicle's plate number 34-NLP-34 into the system.", 'PLATE', '34 NLP 34'],
      ["The car with plate number XYZ 9876 was reported stolen.", 'PLATE', 'XYZ 9876'],
      ["Vehicle plate number AB-123-CD has been registered.", 'PLATE', 'AB-123-CD'],
      ["Your new license plate number is 123-ABC.", 'PLATE', '123-ABC'],
      ["The truck's plate number 1AB234 is valid for another year.", 'PLATE', '1AB234'],
      ["Please verify the plate number DE 45 6789 before proceeding.", 'PLATE', 'DE 45 6789'],
      ["Your license plate number WXYZ-1234 is ready for pickup.", 'PLATE', 'WXYZ-1234'],
      ["The vehicle with plate number 123-DEF-456 was fined.", 'PLATE', '123-DEF-456'],
      ["Check the registration status of plate number AB 123 CD.", 'PLATE', 'AB 123 CD'],
      ["The car's plate number GHI-1234 has expired.", 'PLATE', 'GHI-1234'],
      ["Please enter the plate number 1234567.", 'PLATE', '1234567'],  # Missing letters in US format
      ["The vehicle with plate number ABC-12 was towed.", 'PLATE', 'ABC-12'],  # Incorrect length
      ["Verify plate number AB123.", 'PLATE', 'AB123'],  # Too short
      ["The car's plate number XYZ 9876 1234 is not recognized.", 'PLATE', 'XYZ 9876 1234'],  # Too long
      ["Please confirm the plate number 16AC.", 'PLATE', '16AC'],  # Incorrect format
      ["The plate number XYABC-123456-ABCEDFD is invalid.", 'PLATE', 'XYABC-123456-ABCEDFD'],  # Incorrect format for Europe
]

# Run the fitted PLATE pipeline over the samples and show tokens, the document
# text and the extracted plate chunks side by side.
(
    plate_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_plate.result")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------+---------------------------------------------------------------------+-------------+
|result                                                                          |result                                                               |result       |
+--------------------------------------------------------------------------------+---------------------------------------------------------------------+-------------+
|[Please, enter, your, vehicle's, plate, number, 34-NLP-34, into, the, system, .]|[Please enter your vehicle's plate number 34-NLP-34 into the system.]|[34-NLP-34]  |
|[The, car, with, plate, number, XYZ, 9876, was, reported, stolen, .]            |[The car with plate number XYZ 9876 was reported stolen.]            |[]           |
|[Vehicle, plate, number, AB-123-CD, has, been, registered, .]                   |[Vehicle plate number AB-123-CD has been registered.]                |[AB-123-CD]  

## VIN

In [17]:
# VIN rule for the contextual parser. The dict is written to vin_cp.json, which
# the parser loads through setJsonPath below.
vin_cp = {
  "entity": "VIN",
  "ruleScope": "sentence",
  "matchScope":"token",
  # Raw string: "\d" is not a valid Python escape in a normal string literal.
  # The pattern itself is unchanged: 17 characters drawn from digits and the
  # letters A-Z except I, O and Q.
  "regex": r"([A-HJ-NPR-Z0-9]{17})|([A-HJ-NPR-Z\d]{3}[A-HJ-NPR-Z\d]{5}[A-HJ-NPR-Z\d]{9})",
  "prefix":['VIN', "vehicle", 'VIN#', "veh no"],
  # Increased from 15 to 35 — presumably so that longer lead-ins such as
  # "vehicle identifier number:" still fall inside the context window.
  "contextLength": 35,
}

with open('vin_cp.json', 'w') as f:
    json.dump(vin_cp, f)

vin_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_vin_code")
    .setJsonPath("vin_cp.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

vin_parser_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    tokenizer,
    vin_parser
  ])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")
vin_parser_model = vin_parser_pipeline.fit(empty_data)
# Persist only the fitted parser (the last pipeline stage).
vin_parser_model.stages[-1].write().overwrite().save("parser/vin_parser")


vin_lp = LightPipeline(vin_parser_model)

txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13.
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, VIN 4Y1SL65848Z411439, VIN 1HGCM82633A123456 - VIN JH4KA7560MC012345 - VIN 5YJSA1E14HF123456
SSN #333-44-6666, Driver's license no:A334455B, plate 34NLP34.
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com."""

# Last expression of the cell: the VIN chunks found in the sample text.
# Reuses vin_lp instead of building a second LightPipeline from the same model.
vin_lp.annotate(txt)["entity_vin_code"]

['1HGBH41JXMN109286',
 '4Y1SL65848Z411439',
 '1HGCM82633A123456',
 'JH4KA7560MC012345',
 '5YJSA1E14HF123456']

In [18]:
# Sample rows for the VIN parser. Each row holds the input sentence followed by
# what appear to be the entity label and the expected match; only the first
# element is given a column name ("text"), which is the column the pipeline reads.
test=[
      ["The vehicle with VIN 1HGCM82633A004352 has been recalled.", 'VIN', '1HGCM82633A004352'],
      ["Please check the VIN 1FTFW1EF1EKD21287 before purchase.", 'VIN', '1FTFW1EF1EKD21287'],
      ["The VIN JHMFA16586S000000 was found in our database.", 'VIN', 'JHMFA16586S000000'],
      ["Your vehicle's VIN is WDBRF61JX5F718236.", 'VIN', 'WDBRF61JX5F718236'],
      ["Please verify the VIN 2GCEK19T4Y1108956 for accuracy.", 'VIN', '2GCEK19T4Y1108956'],
      ["The VIN WP0ZZZ99ZTS392124 is registered under your name.", 'VIN', 'WP0ZZZ99ZTS392124'],
      ["The vehicle identifier number WP0ZZZ99ZTS392124 is registered under your name.", 'VIN', 'WP0ZZZ99ZTS392124'],
      ["Make sure to provide the VIN 4Y1SL65848Z411439.", 'VIN', '4Y1SL65848Z411439'],
      ["The car with VIN JT3AC12R8R1009871 has a clean title.", 'VIN', 'JT3AC12R8R1009871'],
      ["Your new vehicle's VIN is VF1BG0E0662484848.", 'VIN', 'VF1BG0E0662484848'],
      ["Please update your records with VIN WVGZZZ5NZ6W000001.", 'VIN', 'WVGZZZ5NZ6W000001'],
      ["The VIN 1HGCM82633A00435 is incomplete.", 'VIN', '1HGCM82633A00435'],  # Only 16 characters
      ["VIN WP0ZZZ99ZTS3921248 is too long.", 'VIN', 'WP0ZZZ99ZTS3921248'],  # 18 characters
      ["VIN 1FTF1EF1EKD21287 has a missing character.", 'VIN', '1FTF1EF1EKD21287'],  # Only 16 characters
      ["The VIN JHMFA16586S0000O is invalid due to the letter 'O'.", 'VIN', 'JHMFA16586S0000O'],  # 'O' is not allowed
      ["VIN 2GCEK19T4Y110895G contains an invalid character 'G'.", 'VIN', '2GCEK19T4Y110895G'],  # Invalid character
  ]

# Run the fitted VIN pipeline over the samples and show tokens, the document
# text and the extracted VIN chunks side by side.
(
    vin_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_vin_code.result")
    .show(truncate=False)
)


+-------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------+
|result                                                                                     |result                                                                          |result              |
+-------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------+
|[The, vehicle, with, VIN, 1HGCM82633A004352, has, been, recalled, .]                       |[The vehicle with VIN 1HGCM82633A004352 has been recalled.]                     |[1HGCM82633A004352] |
|[Please, check, the, VIN, 1FTFW1EF1EKD21287, before, purchase, .]                          |[Please check the VIN 1FTFW1EF1EKD21287 before purchase.]                       |[1FTFW1EF1EKD21287] |
|[The, VIN, JHMFA165

## LICENSE

In [19]:
# LICENSE rule for the contextual parser. The dict is written to license.json,
# which the parser loads through setJsonPath below.
license = {
  "entity": "LICENSE",
  "ruleScope": "sentence",
  # Raw string: "\S" and "\d" are not valid Python escapes in a normal string
  # literal. The pattern itself is unchanged.
  # Original author's note: "Search regex, ChunkMerger set the last".
  "regex": r"\S+\d\S+",
  "matchScope": "token",
  "prefix": ["License", "Lic", "Licence", "Lic#", "License#", "Licence#", "Certificate", "Cert", "Certificate#", "Cert#"],
  "suffix": ["License", "Lic", "Licence", "Lic#", "License#", "Licence#", "Certificate", "Cert", "Certificate#", "Cert#"],
  "contextLength": 15  # Why 15? The longest context word, "Certificate#", is 12 characters
}

with open('license.json', 'w') as f:
    json.dump(license, f)

license_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_license")
    .setJsonPath("license.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

license_parser_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    tokenizer,
    license_parser
])

# The pipeline is fitted on a single empty row; the rules come from the JSON file.
empty_data = spark.createDataFrame([[""]]).toDF("text")

license_parser_model = license_parser_pipeline.fit(empty_data)
# Persist only the fitted parser (the last pipeline stage).
license_parser_model.stages[-1].write().overwrite().save("parser/license_parser")

txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13.
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, VIN 4Y1SL65848Z411439, VIN 1HGCM82633A123456 - VIN JH4KA7560MC012345 - VIN 5YJSA1E14HF123456
SSN #333-44-6666, Driver's license no: A334455B, plate 34NLP34. Lic: 12345As. Cert: 12345As
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com."""

# Last expression of the cell: the license chunks found in the sample text.
LightPipeline(license_parser_model).annotate(txt)["entity_license"]

['#333-44-6666', 'A334455B', '34NLP34', '12345As', '12345As']

In [20]:
# Driver's-licence probe sentences. Each row is [text, label, expected value];
# the label is identical for every row, so only (text, expected) pairs are spelled out.
test = [
    [sentence, 'DRIVER_LICENSE', expected]
    for sentence, expected in (
        ("Please present your driver's license number AB1234567 at the counter.", 'AB1234567'),
        ("Your driver's license 9876543210 has been renewed.", '9876543210'),
        ("Driver's license number B123-456-789 is required for this transaction.", 'B123-456-789'),
        ("Kindly update your records with driver's license #C123 456 7890.", 'C123 456 7890'),
        ("The driver's license D1234-5678-9012 was issued in California.", 'D1234-5678-9012'),
        ("Please verify driver's license number 123456789 before processing.", '123456789'),
        ("Your new driver's license number is E123-4567-8901.", 'E123-4567-8901'),
        ("Driver's license 1122-3344-5566 is currently valid.", '1122-3344-5566'),
        ("Please update your driver's license to F98765432.", 'F98765432'),
        ("Your driver's license number 765-432-1098 is ready for pickup.", '765-432-1098'),
        # The remaining sentences describe their own value as malformed.
        ("The license number 1234 is too short and invalid.", '1234'),
        ("Driver's license 98765432101234567890 is too long and invalid.", '98765432101234567890'),
        ("The driver's license number A123456 is missing required digits.", 'A123456'),
        ("Driver's license 1234-56-789 lacks the proper formatting.", '1234-56-789'),
        ("License number ZXY-987654321 is not a valid format.", 'ZXY-987654321'),
    )
]

# Run the licence parser over the probe sentences and show tokens, text and matches.
# Only the first column is named ("text"); the label and expected-value columns
# are carried along but not selected.
(
    license_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_license.result")
    .show(truncate=False)
)


+----------------------------------------------------------------------------------+------------------------------------------------------------------------+----------------------+
|result                                                                            |result                                                                  |result                |
+----------------------------------------------------------------------------------+------------------------------------------------------------------------+----------------------+
|[Please, present, your, driver's, license, number, AB1234567, at, the, counter, .]|[Please present your driver's license number AB1234567 at the counter.] |[AB1234567]           |
|[Your, driver's, license, 9876543210, has, been, renewed, .]                      |[Your driver's license 9876543210 has been renewed.]                    |[9876543210]          |
|[Driver's, license, number, B123-456-789, is, required, for, this, transaction, .]|[Driver's l

## AGE

In [21]:
# old_regex_rule:"(\b(\d{1,2})(\s?-?1\/2)?\b)|(\b(1\d{2})(\s?-?1\/2)?\b)",

In [22]:
# Rule definition for the AGE contextual parser. It is serialised to age.json,
# which the parser stage loads through setJsonPath.
age = {
  "entity": "AGE",
  "ruleScope": "document",
  "matchScope":"sub-token",
  # Raw string: "\d" and "\s" are regex escapes, not Python string escapes.
  # Shape: 1-3 digits, an optional " 1/2" / "-1/2" part, then one of the listed
  # unit phrases ("year old", "months-old", "weeks of age", ..., or bare "old").
  "regex": r"(\d{1,3})([\s-]+1/2)?\s*(-years-old|years-old|years old|-year-old|year-old|year old|-months-old|months-old|months old|-month-old|month-old|month old|-weeks-old|weeks-old|weeks old|-week-old|week-old|week old|-days-old|days-old|days old|-day-old|day-old|day old|years of age|months of age|weeks of age|days of age|old)",
  "contextLength": 20,
  # NOTE(review): presumably a match is dropped when "ago" occurs within
  # exceptionDistance characters of it - confirm against the ContextualParser docs.
  "contextException": ["ago"],
  "exceptionDistance": 15
}

with open('age.json', 'w') as f:
    json.dump(age, f)

# Contextual parser stage configured from age.json; output column is "entity_age".
age_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_age")
    .setJsonPath("age.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(False)
)

In [23]:
# Assemble, fit on a single empty row, and persist only the fitted parser stage.
age_parser_pipeline = Pipeline(
    stages=[documentAssembler, splitter, tokenizer, age_parser]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

age_parser_model = age_parser_pipeline.fit(empty_data)
age_parser_model.stages[-1].write().overwrite().save("parser/age_parser")

In [24]:
# Sample note for a smoke check of the AGE parser: one "60-year-old" mention,
# plus "28 year old", "5 months old" and two "... years ago" phrases.
txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13.
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, VIN 4Y1SL65848Z411439, VIN 1HGCM82633A123456 - VIN JH4KA7560MC012345 - VIN 5YJSA1E14HF123456
SSN #333-44-6666, Driver's license no: A334455B, plate 34NLP34. Lic: 12345As. Cert: 12345As
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com.

A 28 year old female with a history of gestational diabetes mellitus diagnosed 8 years ago.
3 years ago, he reported an episode of HTG-induced pancreatitis . 5 months old boy with repeated concussions.
"""

# A single string is passed in; index 0 below selects its result.
annotations = LightPipeline(age_parser_model).fullAnnotate(txt)

# Bare expression: the cell displays the AGE annotations for the sample note.
annotations[0]["entity_age"]

[Annotation(chunk, 119, 129, 60-year-old, {'field': 'AGE', 'tokenIndex': '29', 'confidence': '0.50', 'ner_source': 'entity_age', 'normalized': '', 'sentence': '0'}, []),
 Annotation(chunk, 501, 511, 28 year old, {'field': 'AGE', 'tokenIndex': '95', 'confidence': '0.50', 'ner_source': 'entity_age', 'normalized': '', 'sentence': '0'}, []),
 Annotation(chunk, 501, 511, 28 year old, {'field': 'AGE', 'tokenIndex': '103', 'confidence': '0.50', 'ner_source': 'entity_age', 'normalized': '', 'sentence': '0'}, []),
 Annotation(chunk, 501, 511, 28 year old, {'field': 'AGE', 'tokenIndex': '5', 'confidence': '0.50', 'ner_source': 'entity_age', 'normalized': '', 'sentence': '1'}, []),
 Annotation(chunk, 501, 511, 28 year old, {'field': 'AGE', 'tokenIndex': '13', 'confidence': '0.50', 'ner_source': 'entity_age', 'normalized': '', 'sentence': '1'}, []),
 Annotation(chunk, 657, 668, 5 months old, {'field': 'AGE', 'tokenIndex': '41', 'confidence': '0.50', 'ner_source': 'entity_age', 'normalized': '', 's

In [25]:
# Longer passage with many age mentions in different spellings
# ("1 year old", "5-year-old", "10 years old", "one-year-old", ...).
txt = """At just 1 year old, a child is beginning to 70 year old, taking their first steps and saying their first words.
In contrast, a 5-year-old is likely running around with boundless energy, attending kindergarten, and learning to read and write.
By the time someone is 10 years old, they may have developed hobbies like playing a musical instrument or participating in team sports.
A 15-year-old, navigating the challenges of adolescence, often dreams about the future while balancing school and friendships.
Many 20-year-olds are stepping into adulthood, pursuing higher education or entering the workforce, while 25-year-olds often reflect
on the first quarter of their lives, setting long-term goals. For a 30-year-old, milestones like starting a family or advancing in
their career may define this stage. A 40-year-old might focus on stability and family, while a 50-year-old could be planning for
retirement or embracing new hobbies. At 60 years old, many celebrate a lifetime of achievements and embrace a slower pace of life.
Similarly, a 70-year-old might enjoy spoiling grandchildren or traveling to places they’ve always dreamed of. On the other hand,
objects like a 100-year-old oak tree stand as a testament to resilience and time. Whether describing a one-year-old baby, a
30-year-old adult, or a 90-year-old senior, each year-old or years-old stage tells a unique story of growth, change, and experience.
"""

# One output row per AGE annotation found in the passage.
(
    age_parser_model
    .transform(spark.createDataFrame([[txt]], ['text']))
    .selectExpr("explode(entity_age) result")
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                              |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|{chunk, 8, 17, 1 year old, {field -> AGE, tokenIndex -> 2, confidence -> 0.50, ner_source -> entity_age, normalized -> , sentence -> 0}, []}        |
|{chunk, 44, 54, 70 year old, {field -> AGE, tokenIndex -> 11, confidence -> 0.50, ner_source -> entity_age, normalized -> , sentence -> 0}, []}     |
|{chunk, 127, 136, 5-year-old, {field -> AGE, tokenIndex -> 29, confidence -> 0.50, ner_source -> entity_age, normalized -> , sentence -> 0}, []}    |
|{chunk, 265, 276, 10 years old, {field -> AGE, tokenIndex -> 53, confidence -> 0.50, ner_sour

In [26]:
# Age probe sentences. Each row is [text, label, expected value]; the label is
# identical for every row, so only (text, expected) pairs are spelled out.
test = [
    [sentence, 'AGE', expected]
    for sentence, expected in (
        ("The user is 25 years old.", '25'),
        ("She just turned 30 last week.", '30'),
        ("My grandfather is 85 years old.", '85'),
        ("The child is 7 years old.", '7'),
        ("He is 18, just old enough to vote.", '18'),
        ("At 45, she decided to start a new career.", '45'),
        ("The athlete is 22 and in top form.", '22'),
        ("He reached the age of 100 last month.", '100'),
        ("She is 16 and can't wait to drive.", '16'),
        ("At 65, he decided to retire.", '65'),
        ('The applicant must be at least 18 years old to apply for this position.', '18'),
        ('Participants aged 25 and older are eligible for the program.', '25'),
        ('Children under the age of 12 are not allowed in the facility.', '12'),
        ('She celebrated her 30th birthday last week.', '30'),
        ('You need to be 21 years old to enter this club.', '21'),
        ('Our senior members are all over the age of 65.', '65'),
        ('He retired at the age of 60.', '60'),
        ('Only individuals between 18 and 35 years old can participate.', '35'),
        ('Children aged 5 to 10 can join the junior program.', '5 to 10'),
        ('The minimum age requirement for this course is 16.', '16'),
        ('At 45 years old, she decided to pursue a new career.', '45'),
        ('Babies must be at least 6 months old to be enrolled.', '6'),
        ('The event is open to everyone over the age of 50.', '50'),
        ('He joined the army at the age of 18.', '18'),
        ('Students aged 14 and above can attend the seminar.', '14'),
        ('Her daughter turned 8 last month.', '8'),
        ('Participants must be at least 30 years of age.', '30'),
        ('Children under 7 years old need to be accompanied by an adult.', '7'),
        ('He was just 25 when he became a CEO.', '25'),
        ('To qualify for the senior discount, you must be 65 or older.', '65'),
        # The remaining sentences describe their own value as invalid or unrealistic.
        ("The age -5 is not valid.", '-5'),
        ("Age 200 is unrealistic and invalid.", '200'),
        ("The input 'twenty' is not a valid age.", 'twenty'),
        ("Age 0 is invalid; the minimum age should be 1.", '0'),
        ("999 years old is not a realistic age.", '999'),
    )
]

# Run the age parser over the probe sentences and show tokens, text and matches.
# Only the first column is named ("text"); the other two columns are not selected.
(
    age_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_age.result")
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------+-------------------------------------------------------------------------+--------------+
|result                                                                                  |result                                                                   |result        |
+----------------------------------------------------------------------------------------+-------------------------------------------------------------------------+--------------+
|[The, user, is, 25, years, old, .]                                                      |[The user is 25 years old.]                                              |[25 years old]|
|[She, just, turned, 30, last, week, .]                                                  |[She just turned 30 last week.]                                          |[]            |
|[My, grandfather, is, 85, years, old, .]                                                |[My grandf

## DATE

In [27]:
# NOTE: this whole cell is one triple-quoted string literal, not executable code.
# Nothing in it is assigned to `date`; the cell only echoes the string back as
# its output. It looks like an earlier version of the date rules kept for
# reference (presumably superseded by the `date = ...` cell that follows).
"""date = "(?i)((?:(?<!\:)(?<!\:\d)(?<!\d\d)[0-9]{1,4}(?:st|nd|rd|th)?\s(?:of\s)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s(?<!\:)(?<!\:\d)[0-3]?\d{1,3}(?:st|nd|rd|th)?)(?:[,\.\!\?;:\-\(\)\[\]\{\}\s]*\s?\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4})|(\d{4}[-\./]\d{2}[-\./]\d{2})|(?<=on\s|in\s)(\d{1,2}[-/]\d{2})(?!\d|[-/])|(\d{1,2}(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\d{4})|(\d{1,2}-[A-Za-z]{3}-\d{4})|(\d{2}-\d{2}-\d{4})# DATE\n"
date += "(?<!\d)(3[01]|[12][0-9]|0?[1-9])\/(0?[1-9]|1[1-2])\/\d{4}(?!\d)# DATE\n"
#date += "(?<!\d)(0?[1-9]|1[1-2])\/(3[01]|[12][0-9]|0?[1-9])\/\d{4}(?!\d)# DATE\n" Deleted because of duplicate
date += "(?:3[01]|[12][0-9]|0?[1-9])([-/.])(0?[1-9]|1[1-2])\\1\d{4}# DATE\n"
date += "(?:0?[1-9]|1[1-2])([\-\/.])(3[01]|[12][0-9]|0?[1-9])\\1\d{4}# DATE\n"
date += "\d{4}([\-\/.])(0?[1-9]|1[1-2])\\1(3[01]|[12][0-9]|0?[1-9])# DATE\n"
date += "\d{2}/\d{4}# DATE"""

'date = "(?i)((?:(?<!\\:)(?<!\\:\\d)(?<!\\d\\d)[0-9]{1,4}(?:st|nd|rd|th)?\\s(?:of\\s)?(?:jan\\.?|january|feb\\.?|february|mar\\.?|march|apr\\.?|april|may|jun\\.?|june|jul\\.?|july|aug\\.?|august|sep\\.?|september|oct\\.?|october|nov\\.?|november|dec\\.?|december)|(?:jan\\.?|january|feb\\.?|february|mar\\.?|march|apr\\.?|april|may|jun\\.?|june|jul\\.?|july|aug\\.?|august|sep\\.?|september|oct\\.?|october|nov\\.?|november|dec\\.?|december)\\s(?<!\\:)(?<!\\:\\d)[0-3]?\\d{1,3}(?:st|nd|rd|th)?)(?:[,\\.\\!\\?;:\\-\\(\\)\\[\\]\\{\\}\\s]*\\s?\\d{4})?|[0-3]?\\d[-\\./][0-3]?\\d[-\\./]\\d{2,4})|(\\d{4}[-\\./]\\d{2}[-\\./]\\d{2})|(?<=on\\s|in\\s)(\\d{1,2}[-/]\\d{2})(?!\\d|[-/])|(\\d{1,2}(?:jan\\.?|january|feb\\.?|february|mar\\.?|march|apr\\.?|april|may|jun\\.?|june|jul\\.?|july|aug\\.?|august|sep\\.?|september|oct\\.?|october|nov\\.?|november|dec\\.?|december)\\d{4})|(\\d{1,2}-[A-Za-z]{3}-\\d{4})|(\\d{2}-\\d{2}-\\d{4})# DATE\n"\ndate += "(?<!\\d)(3[01]|[12][0-9]|0?[1-9])\\/(0?[1-9]|1[1-2])\\/\\d{

In [28]:
# DATE rules for RegexMatcherInternal: one "<regex># DATE" entry per line
# ("#" separates the pattern from its identifier; lines are joined with "\n").
#
# Each pattern is a raw string so regex escapes (\d, \s, \., \1, ...) are passed
# through literally instead of relying on Python's deprecated handling of
# unrecognised string escapes.
#
# Fix: the numeric-month alternative in rules 2-5 was `1[1-2]`, which accepts
# 11 and 12 but not 10, so e.g. "2024-10-5" was not matched by any rule.
# It is now `1[0-2]`.
date = (
    # 1) Month-name dates ("15 January 2024", "January 15th, 2024"), generic
    #    d/m/y with -, . or / separators, yyyy-mm-dd, "on|in dd/mm",
    #    "15jan2024", "20-Dec-2024" and dd-mm-yyyy.
    r"(?i)(((?:(?<!\:)(?<!\:\d)(?<!\d\d)(?<!\w\d))(((?:19|20)\d{2})|(\d{2})|(\d{1}))(?:st|nd|rd|th)?\s(?:of\s)?(?:january|jan\.?|february|feb\.?|march|mar\.?|april|apr\.?|may|june|jun\.?|july|jul\.?|august|aug\.?|september|sep\.?|october|oct\.?|november|nov\.?|december|dec\.?)(?!\w)|(?:january|jan\.?|february|feb\.?|march|mar\.?|april|apr\.?|may|june|jun\.?|july|jul\.?|august|aug\.?|september|sep\.?|october|oct\.?|november|nov\.?|december|dec\.?)\s(?<!\:)(?<!\:\d)[0-3]?\d{1,3}(?!\d)(?:st|nd|rd|th)?)(?:[,\.\!\?;:\-\(\)\[\]\{\}\s]*\s?(?:19|20)\d{2})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4})|((?:19|20)\d{2}[-\./](?:0[1-9]|1[0-9]|2[0-9]|3[01])[-\./](?:0[1-9]|1[0-9]|2[0-9]|3[01]))|(?<=on\s|in\s)(\d{1,2}[-/](?:0[1-9]|1[0-9]|2[0-9]|3[01]))(?!\d|[-/])|(\d{1,2}(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)(?:19|20)\d{2})|(\d{1,2}-[A-Za-z]{3}-(?:19|20)\d{2})|((?:0[1-9]|1[0-9]|2[0-9]|3[01])-(?:0[1-9]|1[0-9]|2[0-9]|3[01])-(?:19|20)\d{2})"
    "# DATE\n"
    # 2) dd/mm/yyyy, not embedded in a longer digit run.
    r"(?<!\d)(3[01]|[12][0-9]|0?[1-9])\/(0?[1-9]|1[0-2])\/(?:19|20)\d{2}(?!\d)"
    "# DATE\n"
    # 3) dd<sep>mm<sep>yyyy; \1 forces the same separator twice.
    r"(?:3[01]|[12][0-9]|0?[1-9])([-/.])(0?[1-9]|1[0-2])\1(?:19|20)\d{2}"
    "# DATE\n"
    # 4) mm<sep>dd<sep>yyyy with a repeated separator.
    r"(?:0?[1-9]|1[0-2])([\-\/.])(3[01]|[12][0-9]|0?[1-9])\1(?:19|20)\d{2}"
    "# DATE\n"
    # 5) yyyy<sep>mm<sep>dd with a repeated separator.
    r"(?:19|20)\d{2}([\-\/.])(0?[1-9]|1[0-2])\1(3[01]|[12][0-9]|0?[1-9])"
    "# DATE\n"
    # 6) Two digits (01-31) followed by "/yyyy".
    r"(?:0[1-9]|1[0-9]|2[0-9]|3[01])/(?:19|20)\d{2}"
    "# DATE"
)

In [29]:
# Persist the rule lines; the matcher reads them back using "#" as the delimiter.
with open('date.txt', 'w') as f:
    f.write(date)

# Regex matcher over the "splitter" column; MATCH_ALL strategy, output in "entity_date".
date_parser = (
    RegexMatcherInternal()
    .setExternalRules("date.txt", "#")
    .setInputCols(["splitter"])
    .setOutputCol("entity_date")
    .setStrategy("MATCH_ALL")
)

# No tokenizer stage here: the matcher consumes the splitter output directly.
date_parser_pipeline = Pipeline(
    stages=[documentAssembler, splitter, date_parser]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
date_parser_model = date_parser_pipeline.fit(empty_data)
date_parser_model.stages[-1].write().overwrite().save("regex_matcher/date_regex_matcher")

date_lp = LightPipeline(date_parser_model)

In [30]:
# Sample note for a smoke check of the date matcher; it contains two dates
# ("2093-01-13" and "01/13/93") among other numeric identifiers.
txt = """Name : Hendrickson, Ora, Record date: 2093-01-13, # 719435.
Dr. John Green, ID: 1231511863, IP 203.120.223.13.
He is a 60-year-old male was admitted to the Day Hospital for cystectomy on 01/13/93.
Patient's VIN : 1HGBH41JXMN109286, VIN 4Y1SL65848Z411439, VIN 1HGCM82633A123456 - VIN JH4KA7560MC012345 - VIN 5YJSA1E14HF123456
SSN #333-44-6666, Driver's license no: A334455B, plate 34NLP34. Lic: 12345As. Cert: 12345As
Phone (302) 786-5227, 0295 Keats Street, San Francisco, E-MAIL: smith@gmail.com."""

# Bare expression: the cell displays the matched date strings.
date_lp.annotate(txt)["entity_date"]

['2093-01-13', '01/13/93']

In [31]:
# Date probe sentences. Each row is [text, label, expected value]; the label is
# identical for every row, so only (text, expected) pairs are spelled out.
test = [
    [sentence, 'DATE', expected]
    for sentence, expected in (
        ('The project kickoff meeting is scheduled for 01/15/2024. Please make sure to attend.', '01/15/2024'),
        ('Our next team outing is on 15 January 2024. Mark your calendars!', '15 January 2024'),
        ('The deadline for the submission of the final report is 2024-01-15.', '2024-01-15'),
        ('Join us for the annual conference from January 15th to January 17th, 2024.', 'January 15th'),
        ('The application period opens on 1st Feb 2024 and closes on 15th March 2024.', '1st Feb 2024'),
        ('Our office will be closed on 02/14/2024 for a public holiday.', '02/14/2024'),
        ('The new policy will take effect starting from February 20, 2024.', 'February 20, 2024'),
        ('The training workshop is scheduled for the week of March 1-7, 2024.', 'March 1-7, 2024'),
        ('Please submit your expense reports by 3/15/24.', '3/15/24'),
        ('The system upgrade is planned for 24th March 2024.', '24th March 2024'),
        ('Our quarterly review meeting will be held on 04-05-2024.', '04-05-2024'),
        ('The next phase of the project begins on 5th April, 2024.', '5th April, 2024'),
        ('Please note that the final exams are scheduled for May 10th - 12th, 2024.', 'May 10th - 12th, 2024'),
        ('Our summer break starts on 06/01/2024 and ends on 07/01/2024.', '06/01/2024'),
        ('The product launch event is set for June 15, 2024.', 'June 15, 2024'),
        ('We will be celebrating our anniversary on 20th July 2024.', '20th July 2024'),
        ('The next board meeting is on 2024-08-25.', '2024-08-25'),
        ('Our fiscal year ends on 09/30/2024.', '09/30/2024'),
        ('Please join us for the year-end party on December 31, 2024.', 'December 31, 2024'),
        ('The holiday season begins on 12/24/2024 and extends until 01/01/2025.', '12/24/2024'),
        ('The seminar will be held on the 10th of October, 2024.', '10th of October, 2024'),
        ('The next release cycle starts on 2024.11.11.', '2024.11.11'),
        ('The charity event is scheduled for November 25th, 2024.', 'November 25th, 2024'),
        ('Please note that maintenance is scheduled for 12-01-2024.', '12-01-2024'),
        ('We will begin our winter session on December 5th, 2024.', 'December 5th, 2024'),
        ('Our next town hall meeting is on 13.01.2024.', '13.01.2024'),
        ('All submissions are due by 2024/02/10.', '2024/02/10'),
        ('The winter break starts on 20-Dec-2024 and ends on 05-Jan-2025.', '20-Dec-2024'),
        ('Our annual retreat is planned for January 8th, 2024.', 'January 8th, 2024'),
        ('The graduation ceremony will be held on 30 June, 2024.', '30 June, 2024'),
        ('Our next quarterly financial review will be on 2024.09.30.', '2024.09.30'),
        ("The project kickoff meeting is scheduled for 01/15/2024.", '01/15/2024'),
        ("Our next team outing is on 15 January 2024.", '15 January 2024'),
        ("The deadline for the submission is 2024-01-15.", '2024-01-15'),
        ("The conference will take place from March 10th to March 12th, 2024.", 'March 10th'),
        ("The event is scheduled for 25th December 2024.", '25th December 2024'),
        ("Please submit your reports by 3/15/24.", '3/15/24'),
        ("The meeting is on February 20, 2024.", 'February 20, 2024'),
        ("Our annual review is planned for 2024.11.11.", '2024.11.11'),
        ("The training will be held on 5th April, 2024.", '5th April, 2024'),
        ("The system upgrade is set for 09/30/2024.", '09/30/2024'),
        # The remaining sentences describe their own date as invalid.
        ("The meeting is scheduled for 13/32/2024, which is not a valid date.", '13/32/2024'),
        ("Please submit your report by 02/30/2024, an impossible date.", '02/30/2024'),
        ("The event will take place on 2024-00-01, which is invalid.", '2024-00-01'),
        ("The start date is 2024/13/01, which does not exist.", '2024/13/01'),
        ("He mentioned the date 31st February 2024, which is not a real date.", '31st February 2024'),
        ("The project was completed on 2024-11-31, which is incorrect.", '2024-11-31'),
        ("She said the deadline was 00/15/2024, an invalid date.", '00/15/2024'),
        # NOTE(review): 2024 is a leap year, so 2024/02/29 is actually a real date.
        ("The contract starts on 2024/02/29, a non-existent date for this year.", '2024/02/29'),
        ("The report was dated 2024-04-31, but April only has 30 days.", '2024-04-31'),
        ("The event is scheduled for 2024/15/01, which is not possible.", '2024/15/01'),
    )
]

# Run the date matcher over the probe sentences and show text and matches.
# Only the first column is named ("text"); the other two columns are not selected.
(
    date_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("document.result", "entity_date.result")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------+----------------------------------+
|result                                                                                |result                            |
+--------------------------------------------------------------------------------------+----------------------------------+
|[The project kickoff meeting is scheduled for 01/15/2024. Please make sure to attend.]|[01/15/2024]                      |
|[Our next team outing is on 15 January 2024. Mark your calendars!]                    |[15 January 2024]                 |
|[The deadline for the submission of the final report is 2024-01-15.]                  |[2024-01-15]                      |
|[Join us for the annual conference from January 15th to January 17th, 2024.]          |[January 15th, January 17th, 2024]|
|[The application period opens on 1st Feb 2024 and closes on 15th March 2024.]         |[1st Feb 2024, 15th March 2024]   |
|[Our of

## PHONE-1

In [32]:
# TODO: this rule still needs improvement (original note: "NEED TO IMPROVE").
# Rule definition for the first PHONE contextual parser. It is serialised to
# phone.json, which the parser stage loads through setJsonPath.
phone = {
  "entity": "PHONE",
  "ruleScope": "document",
  "matchScope":"sub-token",
  # Raw string: \d, \s, \(, \+, \. and \- are regex escapes, not Python string
  # escapes. Two alternatives, both for 3-3-4 digit numbers whose first group
  # starts with 1-9 and which are not embedded in a longer digit run:
  #   1) optional "(+1)", "+1", "1", "001" prefixes, optional parentheses,
  #      and a single space/dot/hyphen between groups;
  #   2) a mandatory leading "1" (optionally "+1") with optional ".", "-" and
  #      space between groups.
  "regex": r"(?<!\d)(\(\+1\)\s?)?(\+1\s?)?(1\s?)?(001\s?)?\(?[1-9]\d{2}\)?[\s.-]?\d{3}[\s.-]?\d{4}(?!\d)|(?<!\d)\+?1\.?\-?\s?\(?[1-9]\d{2}\)?\.?\-?\s?\d{3}\.?\-?\s?\d{4}(?!\d)",

  # Context keywords listed as prefixes for this rule.
  "prefix": ["Call", "Phone", "Telefon", "Telefon/Phone","dial",
            "Cell/Mobile", "Cell", "Tel", "Telephone number",
            "Phone/Mobile", "T", "Tel.", "Tlf.", "Fax",
            "Contact","office number", "free number","contact us","reach us"
             ],

  "contextLength": 60,
  "completeMatchRegex": "true"
}

with open('phone.json', 'w') as f:
    json.dump(phone, f)

# Contextual parser stage configured from phone.json; output column is "entity_phone".
phone_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_phone")
    .setJsonPath("phone.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

phone_parser_pipeline = Pipeline(
    stages=[documentAssembler, splitter, tokenizer, phone_parser]
)

# Fit on a single empty row, then persist only the fitted parser stage.
empty_data = spark.createDataFrame([[""]]).toDF("text")

phone_parser_model = phone_parser_pipeline.fit(empty_data)
phone_parser_model.stages[-1].write().overwrite().save("parser/phone_parser")

phone_lp = LightPipeline(phone_parser_model)

# Sample note: three phone/fax numbers next to dates, IDs, SSNs and a ZIP code.
text = '''Record date :2093-01-13, David Hale, M.D. IP 203.120.223.13.
ID: 1231511863, The driver's license no:A334455B and e-mail: hale@gmail.com .
PCP : Oliveira, 25 years-old, Jake 5 year old, Record date : 2079-11-09.
Cocke County Baptist Hospital , 0295 Keats Street, 12345, TX 55555-4444. Phone: (818) 342-7353 Fax No.: (818) 342-7354, SSN# 332255677, The other is ssN: 333-44-6666.
Phone: (818) 342-7353.\nEmail: medunites@firsthospital.com'''

phone_lp.annotate(text)['entity_phone']

['(818) 342-7353', '(818) 342-7354', '(818) 342-7353']

In [33]:
# Phone probe sentences. Each row is [text, label, expected value]; the label is
# identical for every row, so only (text, expected) pairs are spelled out.
test = [
    [sentence, 'PHONE', expected]
    for sentence, expected in (
        ('If you have any questions, please contact our support fax number (555) 123-4567.', '(555) 123-4567'),
        ('For reservations, call our hotline at 1-800-987-6543.', '1-800-987-6543'),
        ('Our customer service is available 24/7 at +1-234-567-8901.', '+1-234-567-8901'),
        ('You can reach me on my cell at 555.234.5678 during office hours.', '555.234.5678'),
        ('For immediate assistance, dial 123-456-7890.', '123-456-7890'),
        ('To inquire about our services, call (123)456-7890.', '(123)456-7890'),
        ('Our office number is +44 20 7946 0958.', '+44 20 7946 0958'),
        ('Reach us at our toll-free number: 800-123-4567.', '800-123-4567'),
        ('Please contact John at 555-1234 for further information.', '555-1234'),
        ('For more details, call 123.456.7890.', '123.456.7890'),
        ('In case of emergency, contact us at +49 89 636-48018.', '+49 89 636-48018'),
        ('You can also fax us at (555) 987-6543.', '(555) 987-6543'),
        ('For support, call 1.800.234.5678.', '1.800.234.5678'),
        ('Call our main line at +1 (123) 456-7890 for inquiries.', '+1 (123) 456-7890'),
        ('For appointments, please call (555) 123-4567 ext. 89.', '(555) 123-4567 ext. 89'),
        ('Our contact number is +33 1 23 45 67 89.', '+33 1 23 45 67 89'),
        ('To speak with a representative, dial 123-4567.', '123-4567'),
        ('For general questions, call our help desk at 800.123.4567.', '800.123.4567'),
        ('Contact the HR department at (123) 456-7890 for job inquiries.', '(123) 456-7890'),
        ('To book an appointment, call our office at 555 234 5678.', '555 234 5678'),
        ('For sales inquiries, reach us at 1-800-555-1234.', '1-800-555-1234'),
        ('If you have any issues, please call our hotline at 987-654-3210.', '987-654-3210'),
        ('For quick support, dial our customer service at 555.789.1234.', '555.789.1234'),
        ('You can reach our New York office at +1 212-555-6789.', '+1 212-555-6789'),
        ('For immediate assistance, please call (123) 555-6789.', '(123) 555-6789'),
        ('Our helpline is available at 1 (800) 555-6789.', '1 (800) 555-6789'),
        ('Contact our office at +61 3 1234 5678 for international inquiries.', '+61 3 1234 5678'),
        ('Please call our main desk at 123.456.7890 for more details.', '123.456.7890'),
        ('For urgent matters, dial (555) 987-6543.', '(555) 987-6543'),
        ('To reach our support team, call +44 (0)20 1234 5678.', '+44 (0)20 1234 5678'),
        ('You can contact us at 800-987-6543 for further assistance.', '800-987-6543'),
        ("Please contact us at +1-202-555-0173 for further information.", '+1-202-555-0173'),
        ("Call our UK office at +44 20 7946 0958.", '+44 20 7946 0958'),
        ("You can reach me at 212-555-1234 during office hours.", '212-555-1234'),
        ("For inquiries, dial +49 89 636 48018.", '+49 89 636 48018'),
        ("My new number is (415) 555-2671.", '(415) 555-2671'),
        ("Please use 030 12345678 to contact our Berlin office.", '030 12345678'),
        ("You can reach our support team at +33 1 44 94 56 78.", '+33 1 44 94 56 78'),
        ("Call +39 02 12345678 for more details.", '+39 02 12345678'),
        ("Our customer service is available at 0800 123 456.", '0800 123 456'),
        ("Dial +353 1 234 5678 to connect with our Dublin branch.", '+353 1 234 5678'),
        ("Contact us at 12345 for further assistance.", '12345'),  # Too short
        ("Reach out at +1-800-555-5555-5555.", '+1-800-555-5555-5555'),  # Too many digits
        ("My number is 5551234567890.", '5551234567890'),  # Missing separators, too long
        ("Call me at +44 123 4567 8901 234.", '+44 123 4567 8901 234'),  # Too many digits
        ("Dial (123) 456-789 for quick support.", '(123) 456-789'),  # Too short
        ("My contact number is 555-123456789012.", '555-123456789012'),  # Too long
        ("Please use 020-ABCDEFGH.", '020-ABCDEFGH'),  # Letters instead of numbers
        ("Reach us at +49-89-123.", '+49-89-123'),  # Too short
        ("Call us at 5555-555-5555-555.", '5555-555-5555-555'),  # Incorrect grouping
        ("Contact +33 01 23 45 67 890.", '+33 01 23 45 67 890'),  # Too long
    )
]

# Run the phone parser over the probe sentences and show tokens, text and matches.
# Only the first column is named ("text"); the other two columns are not selected.
(
    phone_parser_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("token.result", "document.result", "entity_phone.result")
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+-------------------+
|result                                                                                                |result                                                                            |result             |
+------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+-------------------+
|[If, you, have, any, questions, ,, please, contact, our, support, fax, number, (, 555, ), 123-4567, .]|[If you have any questions, please contact our support fax number (555) 123-4567.]|[(555) 123-4567]   |
|[For, reservations, ,, call, our, hotline, at, 1-800-987-6543, .]                                     |[For reservations, call our hotline at 1-800-987-6543.]         

## PHONE-2

In [34]:
# Rule definition for the second PHONE contextual parser (extra formats and
# document-specific prefixes). It is serialised to phone_extra.json, which the
# parser stage loads through setJsonPath.
phone_extra = {
  "entity": "PHONE",
  "ruleScope": "document",
  "matchScope":"sub-token",
  # Raw string: \d, \+ and \. are regex escapes, not Python string escapes.
  # Accepted shapes: dddd-dddd, ddd-ddd-dddd, "+" followed by 11 digits,
  # and ddd.ddd.dddd.
  "regex": r"(\d{4}-\d{4})|(\d{3}-\d{3}-\d{4})|(\+\d{11})|(\d{3}\.\d{3}\.\d{4})",

  # Context keywords listed as prefixes for this rule.
  "prefix": ["Content Version",
             "PROBLEMS OR CONCERNS",
             "Questions",
             "Fax :",
             "Provider Number :",
             "MAIN"
             ],

  "contextLength": 45,
  "completeMatchRegex": "true"
}

with open('phone_extra.json', 'w') as f:
    json.dump(phone_extra, f)

# Second contextual parser stage, configured from phone_extra.json;
# output column is "entity_phone2".
phone_parser2 = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_phone2")
    .setJsonPath("phone_extra.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

phone_parser_pipeline2 = Pipeline(
    stages=[documentAssembler, splitter, tokenizer, phone_parser2]
)

# Fit on a single empty row, then persist only the fitted parser stage.
empty_data = spark.createDataFrame([[""]]).toDF("text")

phone_parser_model2 = phone_parser_pipeline2.fit(empty_data)
phone_parser_model2.stages[-1].write().overwrite().save("parser/phone_parser2")

# NOTE(review): this rebinds `phone_lp`, which the PHONE-1 cell created for
# `phone_parser_model`; any later use of `phone_lp` now runs this second model.
# Confirm that is intended, or give this pipeline its own name.
phone_lp = LightPipeline(phone_parser_model2)

# Sample text with two "Fax :" numbers in the extra formats.
text = '''Record date :2093-01-13, David Hale, M.D. IP 203.120.2234. Fax : 203.120.2234
Fax : 4342-7353.\nEmail: medunites@firsthospital.com'''

phone_lp.annotate(text)['entity_phone2']

['203.120.2234', '4342-7353']

## MEDICALRECORD

In [35]:
# Contextual-parser rule for MEDICALRECORD entities. It is written to
# med_cp.json so that ContextualParserApproach can read it via setJsonPath.
med_cp = {
    "entity": "MEDICALRECORD",
    "ruleScope": "sentence",
    "matchScope": "token",
    # Raw string: "\d" is not a valid Python string escape, so a plain literal
    # triggers an invalid-escape warning on recent Python versions.
    # The pattern accepts a value made of 5 to 7 digits and nothing else.
    "regex": r"(^\d{5,7}$)",
    # Context phrases supplied as the rule's "prefix" list.
    "prefix": [
        'ICU Admission', 'MR', 'MR#', 'MR.', 'MRN', 'MRN#',
        'TSICU Admission', 'Medical Record', "Medical Record Number",
    ],
    "contextLength": 25,
}

with open('med_cp.json', 'w') as f:
    json.dump(med_cp, f)

# Contextual parser driven by the rule stored in med_cp.json.
med_parser = (
    ContextualParserApproach()
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_medicalrecord")
    .setJsonPath("med_cp.json")
    .setCaseSensitive(False)
    .setPrefixAndSuffixMatch(False)
    .setShortestContextMatch(False)
    .setOptionalContextRules(False)
    .setCompleteContextMatch(True)
)

med_parser_pipeline = Pipeline(
    stages=[documentAssembler, splitter, tokenizer, med_parser]
)

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")
med_parser_model = med_parser_pipeline.fit(empty_data)

# Persist only the last stage (the fitted parser); it is loaded again from
# "parser/med_parser" when the full de-identification pipeline is built.
med_parser_model.stages[-1].write().overwrite().save("parser/med_parser")

med_lp = LightPipeline(med_parser_model)

# Smoke test with a 5-digit and a 7-digit record number.
text = """Month DD, YYYY
XYZ
RE: ABC
MEDICAL RECORD#: 12332
MRN: 1233567
Dear Dr. XYZ:

I saw ABC back in Neuro-Oncology Clinic today."""

res = med_lp.annotate(text)
res['entity_medicalrecord']

['12332', '1233567']

## EMAIL

In [36]:
# Pretrained regex matcher for e-mail addresses, applied to splitter chunks.
# The chain is parenthesised: the original ended with a stray "\" continuation
# in front of a blank line, which would silently glue any statement added on
# the following line onto this expression.
email_regex_matcher = (
    RegexMatcherInternalModel.pretrained("email_matcher", "en", "clinical/models")
    .setInputCols(["splitter"])
    .setOutputCol("entity_email")
)

email_regex_matcher_pipeline = Pipeline(stages=[
    documentAssembler,
    splitter,
    email_regex_matcher
    ])

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")
email_regex_matcher_model = email_regex_matcher_pipeline.fit(empty_data)


email_lp = LightPipeline(email_regex_matcher_model)

# Smoke test: addresses with and without a leading "mail:" / "e-mail:" label.
text = """ID: 1231511863, The driver's license no:A334455B, the SSN:324598674 and jadjada_adald19@msku.edu.tr, mail: afakfl_lakf19@yahoo.com, e-mail: hale@gmail.com .
EMAIL: afakfl_lakf19@yahoo.com, E-mail: hale@gmail.com .
"""

email_lp.annotate(text)['entity_email']

email_matcher download started this may take some time.
[OK!]


['jadjada_adald19@msku.edu.tr',
 'afakfl_lakf19@yahoo.com',
 'hale@gmail.com',
 'afakfl_lakf19@yahoo.com',
 'hale@gmail.com']

## URL


In [37]:
# Pretrained regex matcher for URLs, applied to splitter chunks.
url_regex_matcher = (
    RegexMatcherInternalModel.pretrained("url_matcher", "en", "clinical/models")
    .setInputCols(["splitter"])
    .setOutputCol("entity_url")
)

url_regex_matcher_pipeline = Pipeline(
    stages=[documentAssembler, splitter, url_regex_matcher]
)

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")
url_regex_matcher_model = url_regex_matcher_pipeline.fit(empty_data)

url_lp = LightPipeline(url_regex_matcher_model)

# Smoke test covering bare "www." hosts plus http, https and ftp schemes.
text = """Name: ID: 1231511863, Driver's License No: A334455B, SSN: 324-59-8674. E-mail: hale@gmail.com.
        For more details, visit our website at www.johnsnowlabs.com or check out http://example.com for general info.
        For secure access, go to https://secure.example.com. File transfers can be done via ftp://files.example.com.
"""

url_lp.annotate(text)['entity_url']

url_matcher download started this may take some time.
[OK!]


['www.johnsnowlabs.com',
 'http://example.com',
 'https://secure.example.com',
 'ftp://files.example.com']

## IP

In [38]:
# Pretrained regex matcher for IP addresses, applied to splitter chunks.
ip_regex_matcher = (
    RegexMatcherInternalModel.pretrained("ip_matcher", "en", "clinical/models")
    .setInputCols(["splitter"])
    .setOutputCol("entity_ip")
)

ip_regex_matcher_pipeline = Pipeline(
    stages=[documentAssembler, splitter, ip_regex_matcher]
)

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")
ip_regex_matcher_model = ip_regex_matcher_pipeline.fit(empty_data)

ip_lp = LightPipeline(ip_regex_matcher_model)

# Smoke test: addresses embedded in URLs and a bare address in running text.
text = """Name: ID: 1231511863, Driver's License No: A334455B, SSN: 324-59-8674. E-mail: hale@gmail.com.
        Access the router at http://192.168.0.1 for configuration. Please connect to 10.0.0.1 to access the database..
        Visit http://198.51.100.42 for more information. File transfers can be done via ftp://files.example.com.
"""

ip_lp.annotate(text)['entity_ip']

ip_matcher download started this may take some time.
[OK!]


['192.168.0.1', '10.0.0.1', '198.51.100.42']

## ZIP

In [39]:
# Pretrained contextual parser for ZIP codes.
# NOTE(review): this instance reads "document", whereas the other parsers in
# this notebook (and the zip_parser built for the full pipeline) read
# "splitter" -- confirm the difference is intentional.
zip_parser = (
    ContextualParserModel.pretrained("zip_parser", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("entity_zip_code")
)

zip_parser download started this may take some time.
[OK!]


In [40]:

# Despite the "regex_matcher" in its name, this pipeline runs the contextual
# zip_parser as its final stage.
zip_regex_matcher_pipeline = Pipeline(
    stages=[documentAssembler, splitter, tokenizer, zip_parser]
)

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")
zip_regex_matcher_model = zip_regex_matcher_pipeline.fit(empty_data)

zip_lp = LightPipeline(zip_regex_matcher_model)

# Smoke test: 5-digit and ZIP+4 values after state codes, state names and
# arbitrary tokens.
text = """
Washington, DC 20004
Kenilworth, NJ 07033
Kenilworth, PENNSYLVANIA 07033
AL 123456!, TX 54321-4444, AL :55555-4444, JHBJHBJHB 12345-4444, MK 11111, TX 12345
"""
zip_lp.annotate(text)['entity_zip_code']


['20004', '07033', '07033', '54321-4444', '55555-4444', '12345']

In [41]:
test=[
    ["Please send the package to ZIP code AL 90210.", "ZIP", "90210"],
    ["The delivery address is in the 10001 ZIP code.", "ZIP", "10001"],
    ["Enter the postal code SW1A 1AA for London.", "ZIP", "SW1A 1AA"],
    ["The office is located in ZIP code 75008.", "ZIP", "75008"],
    ["Please use ZIP code 94105 for the San Francisco branch.", "ZIP", "94105"],
    ["Her address includes the ZIP code 10115.", "ZIP", "10115"],
    ["Send the mail to ZIP code 30301 in Atlanta.", "ZIP", "30301"],
    ["The warehouse is located in ZIP code 40000.", "ZIP", "40000"],
    ["Our Paris office uses the ZIP code 75007.", "ZIP", "75007"],
    ["Use postal code 12345-6789 for faster delivery.", "ZIP", "12345-6789"],
    ["The Vienna office has the ZIP code 1010.", "ZIP", "1010"],
    ["He lives in an area with the ZIP code 150-0001 in Tokyo.", "ZIP", "150-0001"],
    ["Please send the package to ZIP code 1234.", "ZIP", "1234"],  # Too short for most regions
    ["The delivery address is in the 123456 ZIP code.", "ZIP", "123456"],  # Too long for most regions
    ["Enter the postal code ABCDE for London.", "ZIP", "ABCDE"],  # Non-numeric characters in a numeric ZIP code region
    ["The office is located in ZIP code 99999-999.", "ZIP", "99999-999"],  # Incorrect format for extended US ZIP code
    ["Please use ZIP code 12345 67890 for the New York branch.", "ZIP", "12345 67890"],  # Incorrect format with space
    ["Her address includes the ZIP code 1234-567.", "ZIP", "1234-567"],  # Incorrect length and format
    ["Send the mail to ZIP code SW1A-1AA in London.", "ZIP", "SW1A-1AA"],  # Incorrect format with hyphen
    ["The warehouse is located in ZIP code 40000-12345.", "ZIP", "40000-12345"],  # Extended format not valid in Europe
    ["Our Paris office uses the ZIP code 7500.", "ZIP", "7500"],  # Too short for France
    ["Use postal code 123456789 for faster delivery.", "ZIP", "123456789"],  # Too long for any region
    ["The Vienna office has the ZIP code A-1010.", "ZIP", "A-1010"],  # Incorrect prefix for Vienna
    ["He lives in an area with the ZIP code 150-000123 in Tokyo.", "ZIP", "150-000123"],  # Too long for Japan

]

# Run the ZIP pipeline over the test sentences and show each document next to
# the ZIP codes found in it. Only the first field of every test row is named
# ("text"); the label and expected-value fields are not referenced here.
(
    zip_regex_matcher_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("document.result", "entity_zip_code.result")
    .show(truncate=False)
)

+----------------------------------------------------------+--------------+
|result                                                    |result        |
+----------------------------------------------------------+--------------+
|[Please send the package to ZIP code AL 90210.]           |[90210]       |
|[The delivery address is in the 10001 ZIP code.]          |[10001]       |
|[Enter the postal code SW1A 1AA for London.]              |[]            |
|[The office is located in ZIP code 75008.]                |[75008]       |
|[Please use ZIP code 94105 for the San Francisco branch.] |[94105]       |
|[Her address includes the ZIP code 10115.]                |[10115]       |
|[Send the mail to ZIP code 30301 in Atlanta.]             |[30301]       |
|[The warehouse is located in ZIP code 40000.]             |[40000]       |
|[Our Paris office uses the ZIP code 75007.]               |[75007]       |
|[Use postal code 12345-6789 for faster delivery.]         |[12345-6789]  |
|[The Vienna

## COUNTRY

In [42]:
# Pretrained text matcher for country names; overlapping matches are merged.
country_matcher = (
    TextMatcherInternalModel.pretrained("country_matcher", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_country")
    .setMergeOverlapping(True)
)

pipeline = Pipeline().setStages(
    [documentAssembler, splitter, tokenizer, country_matcher]
)

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")
country_code_model = pipeline.fit(empty_data)
# country_code_model.stages[-1].write().overwrite().save("matcher/country_matcher")

country_lp = LightPipeline(country_code_model)

country_matcher download started this may take some time.
[OK!]


In [43]:
# Sample mixing accented country names, the abbreviation "USA", a look-alike
# token ("USAID") and a clinical note.
text = """
United States, United Kingdom
Côte d’Ivoire
São Tomé & Príncipe
Åland Islands
This is Bahamas and the USAID in the USA here
Name: Johnson, Alice, Record date: 2093-03-22, MR: 846275.
Dr. Emily Brown, IP 192.168.1.1.
She is a 55-year-old female who was admitted to the Global Hospital for hip replacement on 03/22/93.
Patient's VIN: 2HGFA165X8H123456, SSN: 444-55-8888, Driver's license no: C789012D.
Phone: (212) 555-7890, 4321 Oak Street, New York, USA, E-MAIL: alice.johnson@example.com.
Patient has traveled to Japan, France, and Australia in the past year."""

data = spark.createDataFrame([[text]]).toDF("text")

# Fit the country pipeline on the sample itself, then annotate it.
results = pipeline.fit(data).transform(data)

# One output row per matched country chunk.
(
    results
    .selectExpr('explode(entity_country) as country')
    .select("country.result")
    .show(truncate=False)
)

+-------------------+
|result             |
+-------------------+
|United States      |
|United Kingdom     |
|Côte d’Ivoire      |
|São Tomé & Príncipe|
|Åland Islands      |
|Bahamas            |
|USA                |
|USA                |
|Japan              |
|France             |
|Australia          |
+-------------------+



## STATE

In [44]:
# Pretrained text matcher for state names, reading splitter chunks and tokens.
state_matcher = (
    TextMatcherInternalModel.pretrained("state_matcher", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_state")
)

state_matcher download started this may take some time.
[OK!]


In [45]:
# Pipeline wrapping the pretrained state matcher.
state_matcher_pipeline = Pipeline().setStages([
    documentAssembler,
    splitter,
    tokenizer,
    state_matcher,
])
# Backward-compatible alias for the original, misspelt variable name.
state_mathcer_pipeline = state_matcher_pipeline

# The pipeline is fitted on a one-row DataFrame holding an empty string.
data = spark.createDataFrame([[""]]).toDF("text")

state_model = state_matcher_pipeline.fit(data)

#state_model.stages[-1].write().overwrite().save("matcher/state_matcher")

In [46]:
# Wrap the fitted state pipeline in a LightPipeline for quick annotation of
# plain Python strings.
state_lp = LightPipeline(state_model)

# Sample containing full state names (California, Arizona, Maryland) as well as
# two-letter codes such as AL, TX, SD and KS inside address-like fragments.
text = """
California is known for its beautiful beaches and vibrant entertainment industry centered.
The Grand Canyon in Arizona is one of the most stunning natural landmarks in the world.


AL 123456!, TX 54321-4444, AL :55555-4444, JHBJHBJHB 12345-4444, MK 11111, TX 12345

'MD Connect Call 11:59pm 2/16/69 from Dr . Hale at Senior Care Clinic Queen Creek , SD regarding Terri Bird .',
 'Arroyo Grande , KS , 19741-6273',
 'Oroville , AL 89389 48423663',
 'Red Springs , WA 77286',
 'Lake Pocotopaug , ME 15424',
 'Queen Creek , SD 89544',
 'Goins is a 27 yo male with history of type I DM formally without regular medical care who was visiting family in Maryland and had sudden witnessed seizure activity in late August .',
 'Whitewater , NC 13662 10776605'
"""


# Cell output: the list of strings stored under the "entity_state" key.
state_lp.annotate(text)['entity_state']

['California', 'Arizona', 'Maryland']

In [47]:
test = [
    ["Please send the package to the address in CA 90210.", "CA", "90210"],
    ["Please send the package to the address in California 90210.", "California", "90210"],
    ["The delivery address is in New York, NY 10001.", "New York", "10001"],
    ["Enter the address for Texas 75008.", "Texas", "75008"],
    ["The office is located in Florida 94105.", "Florida", "94105"],
    ["Please use the address in Illinois 10115 for the shipment.", "Illinois", "10115"],
    ["Her address includes the city of Georgia 30301.", "Georgia", "30301"],
    ["Send the mail to the address in Oregon 40000.", "Oregon", "40000"],
    ["Our branch is located in Maine 75007.", "Maine", "75007"],
    ["Use the address in Arizona 12345-6789 for faster processing.", "Arizona", "12345-6789"],
    ["The office in Washington 1010 has all the documents.", "Washington", "1010"]
]

# Run the state pipeline over the test sentences and show each document next
# to the states found in it. Only the first field of every test row is named
# ("text"); the other two fields are not referenced here.
(
    state_model
    .transform(spark.createDataFrame(test, ['text']))
    .select("document.result", "entity_state.result")
    .show(truncate=False)
)

+--------------------------------------------------------------+------------+
|result                                                        |result      |
+--------------------------------------------------------------+------------+
|[Please send the package to the address in CA 90210.]         |[]          |
|[Please send the package to the address in California 90210.] |[California]|
|[The delivery address is in New York, NY 10001.]              |[New York]  |
|[Enter the address for Texas 75008.]                          |[Texas]     |
|[The office is located in Florida 94105.]                     |[Florida]   |
|[Please use the address in Illinois 10115 for the shipment.]  |[Illinois]  |
|[Her address includes the city of Georgia 30301.]             |[Georgia]   |
|[Send the mail to the address in Oregon 40000.]               |[Oregon]    |
|[Our branch is located in Maine 75007.]                       |[Maine]     |
|[Use the address in Arizona 12345-6789 for faster processing.]|

## CITY

In [48]:
# Pretrained text matcher for city names; overlapping matches are merged.
city_matcher = (
    TextMatcherInternalModel.pretrained("city_matcher", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_city")
    .setMergeOverlapping(True)
)

# Despite its name, this pipeline ends in the city text matcher, not a regex
# matcher.
regex_pipeline = Pipeline().setStages(
    [documentAssembler, splitter, tokenizer, city_matcher]
)

# Sample mixing state names, address-like fragments and a clinical note.
text = """California is known for its beautiful beaches and vibrant entertainment industry centered.
The Grand Canyon in Arizona is one of the most stunning natural landmarks in the world.

AL 123456!, TX 54321-4444, AL :55555-4444, JHBJHBJHB 12345-4444, MK 11111, TX 12345
'MD Connect Call 11:59pm 2/16/69 from Dr . Hale at Senior Care Clinic Queen Creek , SD regarding Terri Bird .',
 'Arroyo Grande , KS , 19741-6273',
 'Oroville , AL 89389 48423663',
 'Red Springs , WA 77286',
 'Lake Pocotopaug , ME 15424',
 'Queen Creek , SD 89544',
 'Goins is a 27 yo male with history of type I DM formally without regular medical care who was visiting family in Maryland and had sudden witnessed seizure activity in late August .',
 'Whitewater , NC 13662 10776605',

Name: Johnson, Alice, Record date: 2093-03-22, MR: 846275.
Dr. Emily Brown, IP 192.168.1.1.
She is a 55-year-old female who was admitted to the Global Hospital in Los Angeles for hip replacement on 03/22/93.
Patient's VIN: 2HGFA165X8H123456, SSN: 444-55-8888, Driver's license no: C789012D.
Phone: (212) 555-7890, 4321 Oak Street, New York City, USA, E-MAIL: alice.johnson@example.com.
Patient has traveled to Tokyo, Paris, and Sydney in the past year.
"""

data = spark.createDataFrame([[text]]).toDF("text")

# Fit the city pipeline on the sample itself, then annotate it.
result = regex_pipeline.fit(data).transform(data)

# One output row per matched city chunk.
(
    result
    .selectExpr("explode(entity_city) as entity_city")
    .selectExpr("entity_city.result")
    .show()
)

city_matcher download started this may take some time.
[OK!]
+-------------+
|       result|
+-------------+
|   California|
| Grand Canyon|
|  Queen Creek|
|Arroyo Grande|
|     Oroville|
|  Red Springs|
|  Queen Creek|
|   Whitewater|
|  Los Angeles|
|New York City|
|        Tokyo|
|        Paris|
|       Sydney|
+-------------+



# Download Models

## Full Pipeline with `setChunkPrecedence("field")`

In [49]:
# Clinical word embeddings consumed by every NER model defined below.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("embeddings")
)


#################### additional NER models ####################
# Each NER model is paired with a NerConverterInternal that turns its tags
# into chunks; the converters' blacklists drop the listed labels.
ner_clinical_large = (
    MedicalNerModel.pretrained("ner_clinical_large", "en", "clinical/models")
    .setInputCols(["splitter", "token", "embeddings"])
    .setOutputCol("ner_clinical_large")
)

ner_clinical_large_converter = (
    NerConverterInternal()
    .setInputCols(["splitter", "token", "ner_clinical_large"])
    .setOutputCol("ner_chunk_clinical_large")
)


ner_deid_generic_docwise_7Label = (
    MedicalNerModel.pretrained("ner_deid_generic_docwise", "en", "clinical/models")
    .setInputCols(["splitter", "token", "embeddings"])
    .setOutputCol("ner_deid_generic_docwise")
)

ner_deid_generic_docwise_7Label_converter = (
    NerConverterInternal()
    .setInputCols(["splitter", "token", "ner_deid_generic_docwise"])
    .setOutputCol("ner_chunk_generic_docwise")
    .setBlackList(["AGE", "NAME"])
)

ner_deid_docwise_subentity_21Label = (
    MedicalNerModel.pretrained("ner_deid_subentity_docwise", "en", "clinical/models")
    .setInputCols(["splitter", "token", "embeddings"])
    .setOutputCol("ner_deid_docwise_subentity")
)

ner_deid_docwise_subentity_21Label_converter = (
    NerConverterInternal()
    .setInputCols(["splitter", "token", "ner_deid_docwise_subentity"])
    .setOutputCol("ner_chunk_subentity_docwise")
    .setBlackList(["PHONE", "LOCATION", "AGE", "ORGANIZATION", 'CITY','COUNTRY','HOSPITAL', 'STATE', 'STREET', 'ZIP', 'IDNUM', 'BIOID','MEDICALRECORD', 'DATE'])
)

# NOTE(review): the variable is called "generic_docwise_merged_conll" but the
# model loaded is "ner_deid_subentity_augmented_docwise" -- confirm the name.
ner_deid_generic_docwise_merged_conll = (
    MedicalNerModel.pretrained("ner_deid_subentity_augmented_docwise", "en", "clinical/models")
    .setInputCols(["splitter", "token", "embeddings"])
    .setOutputCol("ner_deid_generic_docwise_merged_conll")
)

ner_deid_generic_docwise_merged_conll_converter = (
    NerConverterInternal()
    .setInputCols(["splitter", "token", "ner_deid_generic_docwise_merged_conll"])
    .setOutputCol("ner_chunk_merged_docwise")
    .setBlackList(["AGE", "ORGANIZATION", "PHONE", "DATE"])
)



# Merge the chunks of the four NER converters into "deid_merged_ner_chunk",
# dropping the blacklisted labels.
# The chain is parenthesised: the original ended with a stray "\" continuation
# in front of blank lines, which would silently glue any statement added on
# the following line onto this expression.
chunk_merge_ner = (
    ChunkMergeModel()
    .setInputCols(
        "ner_chunk_merged_docwise",
        "ner_chunk_generic_docwise",
        "ner_chunk_subentity_docwise",
        "ner_chunk_clinical_large",
    )
    .setOutputCol("deid_merged_ner_chunk")
    .setOrderingFeatures(["ChunkPrecedence", "ChunkBegin"])
    .setMergeOverlapping(True)
    .setSelectionStrategy("Sequential")
    .setBlackList(['USERNAME', 'DEVICE', 'EMAIL', 'PROFESSION'])
    .setResetSentenceIndices(True)
)




embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical_large download started this may take some time.
[OK!]
ner_deid_generic_docwise download started this may take some time.
[OK!]
ner_deid_subentity_docwise download started this may take some time.
[OK!]
ner_deid_subentity_augmented_docwise download started this may take some time.
[OK!]


In [50]:
# Entity labels of the sub-entity NER model with the tag prefix removed.
# split("-", 1) cuts at the first hyphen only, so a hyphenated label such as
# "B-LOCATION-OTHER" yields "LOCATION-OTHER" rather than a truncated "LOCATION".
sorted({
    label.split("-", 1)[1]
    for label in ner_deid_docwise_subentity_21Label.getClasses()
    if label != "O"
})

['AGE',
 'BIOID',
 'CITY',
 'COUNTRY',
 'DATE',
 'DEVICE',
 'DOCTOR',
 'EMAIL',
 'FAX',
 'HEALTHPLAN',
 'HOSPITAL',
 'IDNUM',
 'LOCATION',
 'MEDICALRECORD',
 'ORGANIZATION',
 'PATIENT',
 'PHONE',
 'PROFESSION',
 'STATE',
 'STREET',
 'URL',
 'USERNAME',
 'ZIP']

In [51]:
# Reload the rule-based annotators saved to disk by the earlier sections of
# this notebook and wire each one to the splitter/token columns.
icd10_parser = (
    ContextualParserModel.load("parser/icd10_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_icd10")
)

ssn_parser = (
    ContextualParserModel.load("parser/ssn_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_ssn")
)

account_parser = (
    ContextualParserModel.load("parser/account_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_account")
)

dln_parser = (
    ContextualParserModel.load("parser/dln_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_dln")
)

plate_parser = (
    ContextualParserModel.load("parser/plate_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_plate")
)

vin_parser = (
    ContextualParserModel.load("parser/vin_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_vin")
)

license_parser = (
    ContextualParserModel.load("parser/license_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_license")
)

age_parser = (
    ContextualParserModel.load("parser/age_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_age")
)

# The date annotator is a saved regex matcher, so it reads only "splitter".
date_parser = (
    RegexMatcherInternalModel.load("regex_matcher/date_regex_matcher")
    .setInputCols(["splitter"])
    .setOutputCol("entity_date")
)

phone_parser = (
    ContextualParserModel.load("parser/phone_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_phone")
)

phone_parser2 = (
    ContextualParserModel.load("parser/phone_parser2")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_phone2")
)

medicalrecord_parser = (
    ContextualParserModel.load("parser/med_parser")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_medicalrecord")
)

##################################################

# Pretrained matchers used by the full pipeline. These re-bind the names used
# in the per-entity demo cells, with the output columns the mergers expect.
country_matcher = (
    TextMatcherInternalModel.pretrained("country_matcher", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_country")
    .setCaseSensitive(False)
)

state_matcher = (
    TextMatcherInternalModel.pretrained("state_matcher", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_state")
)

# Disabled: the city matcher is not part of the full pipeline. Kept as a
# comment instead of a bare string literal, which is an executed statement.
# city_matcher = TextMatcherInternalModel.pretrained("city_matcher","en","clinical/models") \
#     .setInputCols(["document", "token"])\
#     .setOutputCol("entity_city")\
#     .setMergeOverlapping(True)

zip_parser = (
    ContextualParserModel.pretrained("zip_parser", "en", "clinical/models")
    .setInputCols(["splitter", "token"])
    .setOutputCol("entity_zip")
)

email_regex_matcher = (
    RegexMatcherInternalModel.pretrained("email_matcher", "en", "clinical/models")
    .setInputCols(["splitter"])
    .setOutputCol("entity_email")
)

# Disabled: the URL matcher is not part of the full pipeline.
# url_regex_matcher = RegexMatcherInternalModel.pretrained("url_matcher","en","clinical/models") \
#     .setInputCols(["splitter"]) \
#     .setOutputCol("entity_url")

ip_regex_matcher = (
    RegexMatcherInternalModel.pretrained("ip_matcher", "en", "clinical/models")
    .setInputCols(["splitter"])
    .setOutputCol("entity_ip_address")
)

#################################################################

# Merge every rule-based annotator output into "deid_merged_ner_rulebased".
# NOTE(review): the original comment asked to "prioritize these inputCols";
# presumably the order of the columns below is significant -- confirm.
chunk_merge_rulebase = (
    ChunkMergeApproach()
    .setInputCols(
        "entity_icd10", "entity_email", "entity_ip_address",
        "entity_age", "entity_medicalrecord", "entity_ssn", "entity_account",
        "entity_vin", "entity_date", "entity_phone", "entity_phone2",
        "entity_country", "entity_state",
        "entity_zip", "entity_plate", "entity_dln", "entity_license",
    )
    .setOutputCol("deid_merged_ner_rulebased")
    .setMergeOverlapping(True)
    .setResetSentenceIndices(True)
    .setOrderingFeatures(["ChunkLength", "ChunkBegin"])
)

# entity_city was removed; it used to be the last input of the merger.
# Final merge of the rule-based chunks with the NER chunks: blacklisted labels
# are dropped and the remaining labels are renamed through the replace
# dictionary.
chunk_merge_final = (
    ChunkMergeModel()
    .setInputCols("deid_merged_ner_rulebased", "deid_merged_ner_chunk")
    .setOutputCol("ner_chunk_raw")
    .setMergeOverlapping(True)
    .setChunkPrecedence("field")
    .setSelectionStrategy("Sequential")
    .setBlackList(["ICD10_CODE", "PROBLEM", "TEST", "TREATMENT", "URL", "DEVICE", "ORGANIZATION"])
    .setOrderingFeatures(["ChunkPrecedence", "ChunkBegin"])
    .setResetSentenceIndices(True)
    .setReplaceDict({
        "DOCTOR": "NAME",
        "PATIENT": "NAME",
        "LICENSE": "IDNUM",
        "PHONE": "CONTACT",
        "EMAIL": "CONTACT",
        "SSN": "IDNUM",
        "VIN": "IDNUM",
        "DLN": "IDNUM",
        "IP": "IDNUM",
        "ACCOUNT": "IDNUM",
        "PLATE": "IDNUM",
        "City": "LOCATION",
        "LOCATION_OTHER": "LOCATION",
        "ID": "IDNUM",
        "MEDICALRECORD": "IDNUM",
        "CITY": "LOCATION",
        "STATE": "LOCATION",
        "ZIP": "LOCATION",
        "COUNTRY": "LOCATION",
        "STREET": "LOCATION",
        "HOSPITAL": "LOCATION",
        "LOCATION-OTHER": "LOCATION",
    })
)

country_matcher download started this may take some time.
[OK!]
state_matcher download started this may take some time.
[OK!]
zip_parser download started this may take some time.
[OK!]
email_matcher download started this may take some time.
[OK!]
ip_matcher download started this may take some time.
[OK!]


## Pipeline Fitting

In [52]:
# Rules passed to ContextualEntityRuler.setRules: one "exclude" rule per entity
# label, each with a 5-token scope window on both sides and a list of prefix
# patterns.
# Fix: two lists were missing a comma ('Vitals' 'STAT' and 'None' 'UPMC'), so
# Python silently concatenated the neighbours into 'VitalsSTAT' and 'NoneUPMC'
# and none of the four intended patterns was present.
rules = [
    {
        "entity": "NAME",
        "scopeWindow": [5, 5],
        "scopeWindowLevel": "token",
        "prefixPatterns": [
            'Signed', 'Unsigned', 'DISCHARGE ORDERS', 'Electronically Signed', 'Name:Page', 'Vitals',
            'STAT', 'TR', 'yu', 'jt', 'ti', 'yz', 'OBSTETRICS SERVICE', 'Allowing', 'Denied', 'hair', 'DICTATING', 'Acknowledgments Created',
            'Discharging', 'Records Coversheet', 'bmot', 'si', 'kxs', '/STAT', 'Purple Cap', 'BestPractice', 'Advisories', 'Acknowledgments',
            'CRNED', 'LITA', 'FORD', 'Astronomer', 'AZADLPMB',
        ],
        "replaceEntity": "NAME",
        "mode": "exclude",
    },
    {
        "entity": "LOCATION",
        "scopeWindow": [5, 5],
        "scopeWindowLevel": "token",
        "prefixPatterns": [
            'Portuguese', 'Spanish', 'BRIEF RESUME OF HOSPITAL', 'RESUME OF HOSPITAL', "CSU", "group home",
            'None\nHOSPITAL', '5:22 PM Visit', 'CRNP', 'eMAR', 'EclipseTool.co.uk', 'GAVE PULMONARY PARTNERS', 'MD', 'None',
            'UPMC', 'DuBois', 'HH', 'BestPractice', 'CAN', 'POD', 'OHILT', 'outside', 'Closest', 'Sent\n\n',
            'DME Supplier\n\nPROVIDE AUTO-CAPABLE MACHINE MASK AND SUPPLIES LINK ME TO HIS', 'eHealth', 'Other',
            'Other 84-87\n\neHealth Technologies\n\n',
        ],
        "replaceEntity": "LOCATION",
        "mode": "exclude",
    },
    {
        "entity": "CONTACT",
        "scopeWindow": [5, 5],
        "scopeWindowLevel": "token",
        "prefixPatterns": ["74R-484", "EK243", "YO52", "NL67", "AI19", "1996[45"],
        "replaceEntity": "CONTACT",
        "mode": "exclude",
    },
    {
        "entity": "DATE",
        "scopeWindow": [5, 5],
        "scopeWindowLevel": "token",
        "prefixPatterns": [
            "winter", "summer", "fall of", 'New Years Eve', '70/30', 'fall', 'Martin Luther King Day',
            'decrescendo', 'Date', '£', 'two', 'three', 'of',
        ],
        "replaceEntity": "DATE",
        "mode": "exclude",
    },
    {
        "entity": "IDNUM",
        "scopeWindow": [5, 5],
        "scopeWindowLevel": "token",
        "prefixPatterns": ['pacemaker', 'CPAP'],
        "replaceEntity": "IDNUM",
        "mode": "exclude",
    },
]

# Apply the context rules to the merged chunks; empty chunks are dropped.
contextual_entity_ruler_2 = (
    ContextualEntityRuler()
    .setInputCols("splitter", "token", "ner_chunk_raw")
    .setOutputCol("ner_chunk_processed")
    .setRules(rules)
    .setCaseSensitive(False)
    .setDropEmptyChunks(True)
    .setAllowPunctuationInBetween(True)
)

# Single-input merge that publishes the processed chunks as "ner_chunk" with
# sentence indices reset.
chunk_merge_final2 = (
    ChunkMergeModel()
    .setInputCols("ner_chunk_processed")
    .setOutputCol("ner_chunk")
    .setResetSentenceIndices(True)
)

In [53]:
# De-identification stage in "obfuscate" mode with a fixed seed (10); date
# obfuscation is enabled.
light_deidentification_obfuscation = (
    LightDeIdentification()
    .setInputCols(["ner_chunk", "splitter"])
    .setOutputCol("obfuscated")
    .setMode("obfuscate")
    .setLanguage("en")
    .setSeed(10)
    .setObfuscateDate(True)
)

In [54]:
# Stages are grouped by role; their concatenation keeps the original order.
# url_regex_matcher and city_matcher are deliberately left out.
ner_stages = [
    word_embeddings,
    ner_clinical_large,
    ner_clinical_large_converter,
    ner_deid_generic_docwise_7Label,
    ner_deid_docwise_subentity_21Label,
    ner_deid_generic_docwise_merged_conll,
    ner_deid_generic_docwise_7Label_converter,
    ner_deid_docwise_subentity_21Label_converter,
    ner_deid_generic_docwise_merged_conll_converter,
    chunk_merge_ner,
]

rule_based_stages = [
    icd10_parser,
    ssn_parser,
    account_parser,
    dln_parser,
    plate_parser,
    vin_parser,
    license_parser,
    country_matcher,
    state_matcher,
    age_parser,
    date_parser,
    phone_parser,
    phone_parser2,
    zip_parser,
    medicalrecord_parser,
    email_regex_matcher,
    ip_regex_matcher,
    chunk_merge_rulebase,
]

merge_and_deid_stages = [
    chunk_merge_final,
    contextual_entity_ruler_2,
    chunk_merge_final2,
    light_deidentification_obfuscation,
]

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        splitter,
        tokenizer,
        *ner_stages,
        *rule_based_stages,
        *merge_and_deid_stages,
    ]
)

# The pipeline is fitted on a one-row DataFrame holding an empty string.
empty_data = spark.createDataFrame([[""]]).toDF("text")

deid_model = nlpPipeline.fit(empty_data)
deid_light_model = LightPipeline(deid_model)

In [55]:
deid_model.stages

[DocumentAssembler_c894bfe37040,
 InternalDocumentSplitter_7b75c0c4c61e,
 REGEX_TOKENIZER_3c0b8372a95d,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_1a8637089929,
 NER_CONVERTER_f21d5acd9d84,
 MedicalNerModel_d92d47622e85,
 MedicalNerModel_32184c1db80b,
 MedicalNerModel_ada39ac0d359,
 NER_CONVERTER_6d806299dfa3,
 NER_CONVERTER_e92019a50579,
 NER_CONVERTER_1b9aa26d6a3b,
 ChunkMergeModel_c79a8115c6ef,
 CONTEXTUAL-PARSER_8c0f3622eb6c,
 CONTEXTUAL-PARSER_a6d805aafa96,
 CONTEXTUAL-PARSER_32e8ed56ab65,
 CONTEXTUAL-PARSER_9f2a36afd1b1,
 CONTEXTUAL-PARSER_d5ae75ea9644,
 CONTEXTUAL-PARSER_88045c307ef1,
 CONTEXTUAL-PARSER_f74f81c59361,
 ENTITY_EXTRACTOR_6792f2f6e85a,
 ENTITY_EXTRACTOR_74ace4be4f73,
 CONTEXTUAL-PARSER_93fd39669b2f,
 REGEX_MATCHER_41d477e5731e,
 CONTEXTUAL-PARSER_e34d098e093b,
 CONTEXTUAL-PARSER_0c34f578e042,
 CONTEXTUAL-PARSER_f8b8f9aafb9f,
 CONTEXTUAL-PARSER_d644bc7272a0,
 REGEX_MATCHER_26934077fe57,
 REGEX_MATCHER_5fe3de8b5a4e,
 MERGE_3fbc85992544,
 ChunkMergeModel_b31

In [63]:
import pandas as pd

# Show full cell contents in the result table. The original set the option to
# 100 and immediately overwrote it with 0 (an undocumented "no limit"
# sentinel); None is the documented value for unlimited column width.
pd.set_option("display.max_colwidth", None)

def get_result(_lres):
    """Tabulate original text next to its obfuscated counterpart.

    Parameters
    ----------
    _lres : mapping
        Must provide the keys "splitter" and "obfuscated", each a sequence of
        objects exposing a ``.result`` attribute. The two sequences are paired
        positionally; if their lengths differ, the surplus items of the longer
        one are ignored (``zip`` semantics).

    Returns
    -------
    pandas.DataFrame
        Columns "Sentence" and "Obfuscated", one row per pair.
    """
    # Materialise the pairs once so both columns are built from the same rows.
    pairs = list(zip(_lres["splitter"], _lres["obfuscated"]))

    return pd.DataFrame({
        "Sentence": [original.result for original, _ in pairs],
        "Obfuscated": [obfuscated.result for _, obfuscated in pairs],
    })

# Save and Load Pipe

In [57]:
MODEL_NAME = "clinical_deidentification_docwise_benchmark_light"

In [58]:
deid_model.write().overwrite().save(f'models/{MODEL_NAME}')

In [59]:
from sparknlp.pretrained import PretrainedPipeline

deid_pipeline2 = PretrainedPipeline.from_disk(f"models/{MODEL_NAME}")

In [60]:
deid_res = deid_pipeline2.fullAnnotate("""Dr. John Lee, from Royal Medical Clinic in Chicago,  attended to the patient on 11/05/2024. The patient’s medical record number is 56467890.The patient, Emma Wilson, is 50 years old,  her Contact number: 444-456-7890 .""")[0]


In [64]:
get_result(deid_res)

Unnamed: 0,Sentence,Obfuscated
0,"Dr. John Lee, from Royal Medical Clinic in Chicago, attended to the patient on 11/05/2024. The patient’s medical record number is 56467890.The patient, Emma Wilson, is 50 years old, her Contact number: 444-456-7890 .","Dr. Rennis Pucker, from 150 Medical Plaza in 1700 E 38th St, attended to the patient on 07/06/2024. The patient’s medical record number is 07374185.Wgp patient, Quinn Dines, is 43 years old, her Contact number: 333-307-4185 ."
