In [2]:
import pandas as pd
import numpy as np

import re
import spacy
import pickle
import scispacy
from spacy.language import Language
from spacy.tokens import Span, Doc
from spacy.matcher import PhraseMatcher
from scispacy.linking import EntityLinker
from negspacy.negation import Negex
from negspacy.termsets import termset
from spacy.util import filter_spans

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.corpus import stopwords

import warnings
# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

# Show up to 100 characters per cell when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 100)

In [3]:
# Dataset to process: base name that the load cell combines with
# "../data/" and "_cleaned.csv" to locate the input file.
# filename = "rmh_1217_test"
filename = "rmh_1219"

### Load cleaned data

In [4]:
# Build the path of the cleaned CSV for the selected dataset, then load it.
cleaned_path = "".join(["../data/", filename, "_cleaned.csv"])
df = pd.read_csv(cleaned_path)
print(df.shape)
df.head()

(555455, 12)


Unnamed: 0,patient_id,uid,age,sex,arrival_mode,arrival_date,year,text,length,SH,SI,text_clean
0,1029335.0,240891,64,female,other,2012-01-08 00:35:00,2012,"SOB for 5/7, been to GP given prednisolone, coughing taken inhalers with minimal relief, speakin...",140,0,0,"sob for 5/7 , been to gp given prednisolone , coughing taken inhalers with minimal relief , spea..."
1,2073046.0,696853,31,male,other,2012-01-08 00:41:00,2012,"pt has lac down right forehead, to eyebrow, will require stitches and ADT, denies loc wound abou...",107,0,0,"pt has lac down right forehead , to eyebrow , will require stitches and adt , denies loc wound a..."
2,2073047.0,988598,19,male,road ambulance,2012-01-08 00:52:00,2012,"pt expect MBA, trapped for 45mins, #right femur, had 40mg morphine, GCS 15",74,0,0,"pt expect mba , trapped for 45 mins , fracture right femur , had 40 mg morphine , gcs 15"
3,1349154.0,941235,51,male,other,2012-01-08 01:11:00,2012,L) sided flank pain same as previous renal colic pain unimproved with analgesia for the past 1/5...,169,0,0,left sided flank pain same as previous renal colic pain unimproved with analgesia for the past 1...
4,1367452.0,900875,25,female,other,2012-01-08 01:23:00,2012,generalised abdo pain and associated headache for 1 year worse tonight. Pt states that she had ...,196,0,0,generalised abdo pain and associated headache for 1 year worse tonight . pt states that she had ...


### Tokenize and Filter

In [5]:
@Language.component("custom_ner")
def custom_ner(doc):
    """Replace ``doc.ents`` with one single-token "ENTITY" span per kept token.

    A token is kept unless it is a stop word, punctuation, number-like, or
    the literal "+" text. The same ``doc`` is returned.
    """
    def is_kept(token):
        skipped = (
            token.is_stop
            or token.is_punct
            or token.like_num
            or token.text == "+"
        )
        return not skipped

    doc.ents = [
        Span(doc, token.i, token.i + 1, label="ENTITY")
        for token in doc
        if is_kept(token)
    ]
    return doc


@Language.component("bigram_detector")
def bigram_detector(doc):
    """Merge each phrase found by the global ``matcher`` into a single token.

    Matches are passed through ``filter_spans`` first, and the spans it
    returns are merged in place. The same ``doc`` is returned.
    """
    candidates = []
    for _match_id, begin, stop in matcher(doc):
        candidates.append(Span(doc, begin, stop))
    selected = filter_spans(candidates)

    with doc.retokenize() as retok:
        for phrase in selected:
            retok.merge(phrase)
    return doc


def get_canonical_name(span):
    """Return an identifier-style canonical concept name for a linked span.

    The first candidate in ``span._.kb_ents`` is looked up in the global
    ``linker`` knowledge base; its canonical name is lower-cased and every
    non-word character is replaced with "_". When the span has no
    candidates, the span text is returned unchanged.

    NOTE(review): ``linker`` must exist at module level before the linked
    branch runs; no cell in view assigns it (presumably
    ``nlp.get_pipe("scispacy_linker")`` -- confirm).
    """
    if not span._.kb_ents:
        return span.text
    # Element [0] of the first candidate is the key into cui_to_entity.
    best_cui = span._.kb_ents[0][0]
    concept = linker.kb.cui_to_entity[best_cui].canonical_name.lower()
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    return re.sub(r"\W", "_", concept)
    
    
def format_merged_tokens(span):
    """Return the span text with every whitespace character replaced by "_"."""
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s", "_", span.text)


def apply_transformation(span, transform=""):
    """Return the requested representation of ``span``.

    ``transform="linked"`` yields ``span._.linked``, ``transform="merged"``
    yields ``span._.merged``; any other value yields the plain span text.
    """
    if transform in ("linked", "merged"):
        # The extension attribute carries the same name as the transform.
        return getattr(span._, transform)
    return span.text
    

def add_negation(span, transform=""):
    """Return the transformed span, prefixed with "neg_" when it is negated.

    Negation is read from ``span._.negex``; the text comes from
    ``span._.transformed(transform)``.
    """
    prefix = "neg_" if span._.negex else ""
    return prefix + span._.transformed(transform)

    
def prepare_tokens(doc, negation=False, transform=""):
    """Join the entities of ``doc`` into one space-separated string.

    Each entity is rendered with ``ent._.negated(transform)`` when
    ``negation`` is truthy, otherwise with ``ent._.transformed(transform)``.
    """
    def render(ent):
        if negation:
            return ent._.negated(transform)
        return ent._.transformed(transform)

    return " ".join(map(render, doc.ents))

### Define bigrams

In [None]:
# Load the pickled bigrams referenced by the (commented-out) pattern setup.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open('../data/most_common_bigrams.txt', 'rb') as bigram_file:
    most_common_bigrams = pickle.load(bigram_file)

In [6]:
# Load scispacy model with its own NER component switched off
nlp = spacy.load("en_core_sci_lg", disable=['ner'])

# Custom NER 
nlp.add_pipe("custom_ner", last=True)

# # Define bigrams
# bigram_patterns = list(nlp.pipe(most_common_bigrams))
# matcher = PhraseMatcher(nlp.vocab)
# matcher.add("BIGRAM", None, *bigram_patterns)

# # Bigram detector
# nlp.add_pipe("bigram_detector", last=True)

# # Entity linker
# nlp.add_pipe("scispacy_linker", config={'linker_name': 'mesh', 'threshold': 0.9}, last=True)

# Modify negex termsets.
# Work on a copy of the term lists: presumably get_patterns() hands back
# shared library state, so extending it in place would append the same terms
# again every time this cell is re-run.
ts = {name: list(terms) for name, terms in termset('en_clinical').get_patterns().items()}
ts['preceding_negations'].extend(["nil", "non"])
ts['termination'].extend([",", ";", ":", "obviously"])

# Negation detector: pass the customised termset explicitly, otherwise negex
# is configured without any reference to `ts`.
nlp.add_pipe("negex", config={'ent_types': ['ENTITY'], 'neg_termset': ts})

# Set attributes: getters/methods used to render entities as text
Span.set_extension("linked", getter=get_canonical_name, force=True)
Span.set_extension("merged", getter=format_merged_tokens, force=True)
Span.set_extension("transformed", method=apply_transformation, force=True)
Span.set_extension("negated", method=add_negation, force=True)
Doc.set_extension("entities", method=prepare_tokens, force=True)

print("NLP pipeline: tokenizer + {}".format(nlp.pipe_names))

NLP pipeline: tokenizer + ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'custom_ner', 'negex']


In [7]:
%%time
df['doc'] = df.text_clean.apply(nlp)
df['entities'] = df.doc.apply(lambda x: x._.entities())
df.drop(columns='doc').to_csv("../data/" + filename + "_prepared_ents.csv", index=False)

CPU times: user 53min 7s, sys: 9.66 s, total: 53min 17s
Wall time: 53min 21s


**Entities and negated entities**

In [None]:
%%time
with nlp.disable_pipes(['bigram_detector', 'scispacy_linker']):
    df['doc'] = df.text_clean.apply(nlp)

df['entities'] = df.doc.apply(lambda x: x._.entities())
df['neg_entities'] = df.doc.apply(lambda x: x._.entities(negation=True))

df.drop(columns='doc').to_csv("../data/" + filename + "prepared_ents.csv", index=False)

**Merged entities and negated merged entities**

In [None]:
%%time
with nlp.disable_pipes(["EntityLinker"]):
    df['doc'] = df.text_clean.apply(nlp)

In [None]:
# Entity strings built with the "merged" transform, without and with the
# negation prefix.
df['merged_entities'] = df['doc'].map(
    lambda parsed: parsed._.entities(transform="merged")
)
df['neg_merged_entities'] = df['doc'].map(
    lambda parsed: parsed._.entities(negation=True, transform="merged")
)

In [None]:
# Save the frame without the Doc column.
# NOTE(review): this writes to "./data/" under a fixed "rmh_" name, while the
# load cell reads from "../data/" + filename -- confirm the intended location.
df.drop(columns="doc").to_csv("./data/rmh_prepared_merged.csv", index=False)

**Linked entities and negated linked entities**

In [None]:
%%time
with nlp.disable_pipes(["bigram_detector"]):
    df['doc'] = df.text_clean.apply(nlp)

In [None]:
# Entity strings built with the "linked" transform, without and with the
# negation prefix.
df['linked_entities'] = df['doc'].map(
    lambda parsed: parsed._.entities(transform="linked")
)
df['neg_linked_entities'] = df['doc'].map(
    lambda parsed: parsed._.entities(negation=True, transform="linked")
)

In [None]:
# Save the frame without the Doc column.
# NOTE(review): this writes to "./data/" under a fixed "rmh_" name, while the
# load cell reads from "../data/" + filename -- confirm the intended location.
df.drop(columns="doc").to_csv("./data/rmh_prepared_linked.csv", index=False)

### Merge datasets

In [None]:
import pandas as pd

In [None]:
# First part file of the linked-entity output; display its shape.
df1 = pd.read_csv("./data/rmh_prepared_linked_1.csv")
df1.shape

In [None]:
# Second part file of the linked-entity output; display its shape.
df2 = pd.read_csv("./data/rmh_prepared_linked_2.csv")
df2.shape

In [None]:
# Third part file of the linked-entity output; display its shape.
df3 = pd.read_csv("./data/rmh_prepared_linked_3.csv")
df3.shape

In [None]:
# Stack the three part frames row-wise and display the combined shape.
linked_parts = [df1, df2, df3]
df = pd.concat(linked_parts, axis=0)
df.shape

In [None]:
# Write the combined linked-entity frame to a single CSV (row index omitted).
df.to_csv("./data/rmh_prepared_linked.csv", index=False)

In [None]:
# Reload the plain-entity output (reuses the name df1); display its shape.
df1 = pd.read_csv("./data/rmh_prepared_ents.csv")
df1.shape

In [None]:
# Reload the merged-entity output (reuses the name df2); display its shape.
df2 = pd.read_csv("./data/rmh_prepared_merged.csv")
df2.shape

In [None]:
# Attach the two merged-entity columns to the entity frame, matching rows by
# index label.
merged_cols = df2[["merged_entities", "neg_merged_entities"]]
df = pd.merge(df1, merged_cols, left_index=True, right_index=True)

In [None]:
# Reload the combined linked-entity output (reuses the name df3); display its shape.
df3 = pd.read_csv("./data/rmh_prepared_linked.csv")
df3.shape

In [None]:
# Attach the two linked-entity columns as well, matching rows by index label.
linked_cols = df3[["linked_entities", "neg_linked_entities"]]
df = pd.merge(df, linked_cols, left_index=True, right_index=True)

In [None]:
# Write the fully merged frame to a single CSV (row index omitted).
df.to_csv("./data/rmh_prepared.csv", index=False)

### Check the results

In [None]:
# Pick the parsed document stored at index label 3 and display it.
# NOTE(review): `df_` is not assigned by any cell in view (only `df` is), so
# this raises NameError on a fresh kernel -- confirm which frame was meant.
doc = df_.loc[3, "doc"]
doc

In [None]:
def umls_entity(ent):
    """Return the knowledge-base entry for the span's first linked concept.

    The lookup goes through the global ``linker``. When ``ent._.kb_ents`` is
    empty, the span text is returned instead.
    """
    if not ent._.kb_ents:
        return ent.text
    # kb_ents holds several candidates; the lookup key is element [0] of one
    # candidate. Indexing with the whole candidate list cannot work.
    top_cui = ent._.kb_ents[0][0]
    return linker.kb.cui_to_entity[top_cui]

In [None]:
# Run the pipeline on the value stored at row 10000 and list each token with
# its part-of-speech tag.
# NOTE(review): the 'doc' column is filled with pipeline output elsewhere in
# this notebook; passing it to nlp() again may have been meant to use
# 'text_clean' -- confirm.
doc = nlp(df.loc[10000, 'doc'])
print(doc, "\n")
for tok in doc:
    print(tok, tok.pos_)

In [None]:
# Show each entity's text next to its negation flag.
for entity in doc.ents:
    print(entity.text, entity._.negex)

In [None]:
# For every entity with linked candidates, print how many there are and then
# the knowledge-base entry of each candidate.
for entity in doc.ents:
    candidates = entity._.kb_ents
    if not candidates:
        continue
    print("\nEntity: \"{}\", number of linked concepts: {}".format(entity, len(candidates)))
    for candidate in candidates:
        print("\n", linker.kb.cui_to_entity[candidate[0]])