In [1]:
import pandas as pd
import numpy as np

import re
import spacy
import pickle
import scispacy
from spacy.language import Language
from spacy.tokens import Span, Doc
from spacy.matcher import PhraseMatcher
from scispacy.linking import EntityLinker
from negspacy.negation import Negex
from negspacy.termsets import termset
from spacy.util import filter_spans

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.corpus import stopwords

import warnings
# Suppress all Python warnings from this point on
warnings.filterwarnings('ignore')

# Show up to 100 characters per cell when pandas displays text columns
pd.options.display.max_colwidth = 100

In [2]:
# Dataset to process: base name used to build the input and output CSV paths
# Alternative dataset (disabled):
# filename = "rmh_1217_test"
filename = "rmh_1819"

### Load normalised triage notes

In [4]:
# Read the cleaned notes for the selected dataset
cleaned_csv_path = "../../data/" + filename + "_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

# Report (rows, columns), then preview the first rows as the cell output
print(df.shape)
df.head()

(159172, 12)


Unnamed: 0,patient_id,uid,age,gender,arrival_mode,arrival_date,year,text,text_clean,length,SH,SI
0,8118059.0,142671,64,female,self/community/pt,2018-01-01 00:02:00,2018,Was moving a chair/picking up a chair - has torn off half the nail on the 3rd finger on the R) h...,was moving a chair/picking up a chair - has torn off half the nail on the 3rd finger on the righ...,188,0,0
1,4043373.0,664964,26,male,self/community/pt,2018-01-01 00:04:00,2018,"Pt accidentally drank cater clean food grade sanitiser at work, states did not swallow any but ...","pt accidentally drank cater clean food grade sanitiser at work, states did not swallow any but f...",164,0,0
2,4058076.0,192808,25,male,road ambulance,2018-01-01 00:05:00,2018,"ETOH this pm. fallen of bike 10kph hit occiput. nil LOC, nil pain. refusing Tx with AV. denies L...","etoh this pm. fallen of bike 10kph hit occiput. nil loc, nil pain. refusing tx with av. denies l...",173,0,0
3,4058077.0,721017,20,male,road ambulance,2018-01-01 00:11:00,2018,in vic pol lock up for drunk. being agressive. ? fell over in cells abhrasions to face and occip...,in vic pol lock up for drunk. being agressive. ? fell over in cells abhrasions to face and occip...,198,0,0
4,4058078.0,800741,30,female,self/community/pt,2018-01-01 00:12:00,2018,"2/52 intermittant abdo pain, more so in the evening. Denies other sx, haemodynamically stable, l...","2/52 intermittant abdo pain, more so in the evening. denies other sx, haemodynamically stable, l...",128,0,0


### Define NLP pipeline

In [5]:
@Language.component("custom_ner")
def custom_ner(doc):
    """Mark each retained token of ``doc`` as a single-token CONCEPT entity.

    A token is skipped when it is a stop word, punctuation, number-like, or
    the literal "+"; every other token becomes its own entity span. Whatever
    entities ``doc`` carried before are overwritten.

    Args:
        doc: The spaCy ``Doc`` handed to this pipeline component.

    Returns:
        The same ``doc`` with ``doc.ents`` replaced by the CONCEPT spans.
    """
    doc.ents = [
        Span(doc, tok.i, tok.i + 1, label="CONCEPT")
        for tok in doc
        if not (tok.is_stop or tok.is_punct or tok.like_num or tok.text == "+")
    ]
    return doc

In [6]:
# Build the pipeline from the scispaCy large model with its own 'ner' component disabled
nlp = spacy.load("en_core_sci_lg", disable=['ner'])

# Register the rule-based "custom_ner" component as the final pipeline step
nlp.add_pipe("custom_ner", last=True)

# Print the resulting component order
pipeline_summary = "NLP pipeline: tokenizer + {}".format(nlp.pipe_names)
print(pipeline_summary)

NLP pipeline: tokenizer + ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'custom_ner']


In [7]:
# Run the pipeline over every note with nlp.pipe, which processes the texts in
# batches rather than invoking nlp() once per row as Series.apply(nlp) does.
# Docs come back in input order, so the list lines up with df's rows.
df['doc'] = list(nlp.pipe(df.text_clean))

# Space-joined text of every entity on each doc (the spans set by custom_ner)
df['concepts'] = df.doc.apply(lambda x: " ".join([ent.text for ent in x.ents]))

# Doc objects are left out of the export; all other columns plus `concepts` are saved
df.drop(columns='doc').to_csv("../../data/" + filename + "_prepared_cnpt.csv", index=False)