In [1]:
import pandas as pd
import numpy as np

import re
import spacy
import pickle
import scispacy
from spacy.language import Language
from spacy.tokens import Span, Doc
from spacy.matcher import PhraseMatcher
from scispacy.linking import EntityLinker
from negspacy.negation import Negex
from negspacy.termsets import termset
from spacy.util import filter_spans

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.corpus import stopwords

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Let pandas print up to 100 characters of a text column before truncating it.
pd.options.display.max_colwidth = 100

# Load data

In [2]:
# Read the cleaned notes, print the frame's dimensions and preview the first rows.
df = pd.read_csv("../data/epic2020_cleaned.csv")
print(df.shape)
df.head()

(57681, 4)


Unnamed: 0,timestamp,text,length,text_clean
0,2020-01-01 00:15:00,"argument with friend, threatened to jump off balcony. had voices in her head for 2/12. nil visua...",112,"argument with friend, threatened to jump off balcony. had voices in her head for 2/12. nil visua..."
1,2020-01-01 00:20:00,mech fall with swelling to L) hand and dec ROM. ETOH intake.,60,mech fall with swelling to left hand and dec rom. etoh intake.
2,2020-01-01 00:33:00,"Left lower dental pain since last year, seeking analgsia until able to attend dental hospital in...",142,"left lower dental pain since last year, seeking analgesia until able to attend dental hospital i..."
3,2020-01-01 00:34:00,"ETOH, scuffle with HS ? LOC, lac approx 2cm above R eyebrow will require sutures. GCS 15, full p...",145,"etoh, scuffle with hs ? loc, lac approx 2cm above r eyebrow will require sutures. gcs 15, full p..."
4,2020-01-01 00:36:00,"mech fall landed L) hip. headstrike onto wall. pain to L) hip, rotation/shortening.",84,"mech fall landed left hip. headstrike onto wall. pain to left hip, rotation/shortening."


### Tokenize and filter

In [3]:
@Language.component("custom_ner")
def custom_ner(doc):
    """Mark every content token of ``doc`` as a one-token "ENTITY" span.

    Stop words, punctuation, number-like tokens and the literal "+" are
    skipped. The resulting spans replace ``doc.ents``.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` overwritten.
    """
    def is_noise(token):
        # A token is dropped as soon as any one of these holds.
        return token.is_stop or token.is_punct or token.like_num or token.text == "+"

    doc.ents = [
        Span(doc, token.i, token.i + 1, label="ENTITY")
        for token in doc
        if not is_noise(token)
    ]
    return doc

In [4]:
# Load the scispacy model without its built-in 'ner' component.
nlp = spacy.load("en_core_sci_lg", disable=['ner'])

# Append the custom_ner component registered above as the last pipeline step.
nlp.add_pipe("custom_ner", last=True)

print("NLP pipeline: tokenizer + {}".format(nlp.pipe_names))

# Start from negspacy's 'en_clinical' termset and extend it in place:
# "nil" / "non" count as preceding negations, and punctuation plus "obviously"
# are added to the termination terms.
# NOTE(review): in the visible cells `ts` is only consumed by word_count (as extra
# stop words); no negex component is configured with it - confirm that is intended.
ts = termset('en_clinical').get_patterns()
ts['preceding_negations'].extend(["nil", "non"])
ts['termination'].extend([",", ";", ":", "obviously"])

NLP pipeline: tokenizer + ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'custom_ner']


In [5]:
%%time
# Run the pipeline on every cleaned note and keep the resulting Doc objects.
# The recorded run of this cell took roughly five minutes.
df['doc'] = df.text_clean.apply(nlp)

CPU times: user 5min 14s, sys: 819 ms, total: 5min 15s
Wall time: 5min 15s


### Create a list of common bigrams

In [None]:
def word_count(data):
    """Count bigrams of whitespace-delimited tokens over a collection of texts.

    English NLTK stop words and the preceding-negation terms held in the
    module-level ``ts`` termset are removed before bigrams are formed.

    Args:
        data: Iterable of strings, one per document.

    Returns:
        collections.Counter mapping each bigram (two tokens joined by a space)
        to its total number of occurrences across ``data``.
    """
    vectorizer = CountVectorizer(stop_words=stopwords.words('english') + ts['preceding_negations'],
                                 ngram_range=(2,2),
                                 token_pattern=r'\S+')
    vectors = vectorizer.fit_transform(data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = list(vectorizer.get_feature_names())

    # Column sums give the corpus-wide count of each bigram; flatten to 1-D.
    counts = np.asarray(vectors.sum(axis=0)).ravel()

    return Counter(dict(zip(vocab, counts)))

In [None]:
# Rebuild each note as the space-joined text of its entities, then count bigrams
# over those strings; the cell displays how many distinct bigrams were found.
bigrams = word_count(df.doc.apply(lambda x: " ".join([ent.text for ent in x.ents])))
len(bigrams)

In [None]:
# Bigram frequencies as an integer array. The builtin `int` replaces the `np.int`
# alias, which was deprecated in NumPy 1.20 and removed in 1.24.
bigram_counts = np.fromiter(bigrams.values(), dtype=int)

# Keep only bigrams whose count is strictly above the 99th percentile.
cut_off = np.quantile(bigram_counts, 0.99)
print("Cut-off:", cut_off)
n_bigrams = int((bigram_counts > cut_off).sum())
print("%d most common bigrams" % n_bigrams)
most_common_bigrams = [item[0] for item in bigrams.most_common(n_bigrams)]

In [None]:
# Persist the selected bigrams as a pickle (despite the .txt extension).
# The file goes to ../data/, which is where the load cell further down reads it from.
with open('../data/most_common_bigrams.txt', 'wb') as f:
    pickle.dump(most_common_bigrams, f)

### The rest of the NLP pipeline

In [6]:
# Load the list of frequent bigrams. The file is a pickle despite its .txt
# extension, so only load it from a trusted location.
with open ('../data/most_common_bigrams.txt', 'rb') as f:
    most_common_bigrams = pickle.load(f)

In [7]:
def bigram_detector(doc):
    """Merge every matched bigram in ``doc`` into a single token.

    Matches come from the module-level ``matcher``; overlapping matches are
    reduced with ``filter_spans`` before the merge.

    Args:
        doc: The spaCy ``Doc`` to retokenize in place.

    Returns:
        The same ``doc`` after merging.
    """
    candidates = []
    for _match_id, start, end in matcher(doc):
        candidates.append(Span(doc, start, end))

    # Overlapping spans cannot be merged in the same retokenize pass.
    non_overlapping = filter_spans(candidates)

    with doc.retokenize() as retokenizer:
        for bigram in non_overlapping:
            retokenizer.merge(bigram)
    return doc

In [8]:
def get_canonical_name(span):
    """Return a normalised concept name for ``span``, or its text if unlinked.

    When the span has linked candidates in ``span._.kb_ents``, the first
    candidate's identifier is looked up in the module-level ``linker``
    knowledge base. Its canonical name is lower-cased and every non-word
    character is replaced by an underscore.

    Args:
        span: Object exposing ``_.kb_ents`` (sequence of candidates whose
            first item is the concept identifier) and ``text``.

    Returns:
        str: The normalised canonical name, or ``span.text`` when there are
        no candidates.
    """
    if span._.kb_ents:
        concept = linker.kb.cui_to_entity[span._.kb_ents[0][0]].canonical_name.lower()
        # Raw string: "\W" in a plain literal is an invalid escape sequence.
        return re.sub(r"\W", "_", concept)
    else:
        return span.text
    
def format_merged_tokens(span):
    """Return ``span.text`` with every whitespace character replaced by "_".

    Args:
        span: Object exposing a ``text`` string.

    Returns:
        str: The text with each whitespace character turned into an underscore.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s", "_", span.text)

def apply_transformation(span, transform=""):
    """Pick the representation of ``span`` selected by ``transform``.

    Args:
        span: Object exposing ``text`` and the ``_.linked`` / ``_.merged``
            attributes.
        transform: "linked" or "merged"; any other value selects the plain text.

    Returns:
        ``span._.linked``, ``span._.merged`` or ``span.text`` accordingly.
    """
    # Only the attribute that is actually requested gets evaluated.
    if transform == "merged":
        return span._.merged
    return span._.linked if transform == "linked" else span.text

def add_negation(span, transform=""):
    """Render ``span`` via ``span._.transformed`` and prefix "neg_" if negated.

    Args:
        span: Object exposing ``_.negex`` and the ``_.transformed`` method.
        transform: Passed through to ``span._.transformed``.

    Returns:
        str: The transformed text, preceded by "neg_" when ``span._.negex``
        is truthy (the prefix is the string repeated ``negex`` times).
    """
    prefix = span._.negex * "neg_"
    rendered = span._.transformed(transform)
    return prefix + rendered
    
def prepare_tokens(doc, negation=False, transform=""):
    """Join the rendered entities of ``doc`` into one space-separated string.

    Args:
        doc: Object exposing ``ents``, an iterable of entities.
        negation: When truthy, each entity is rendered with ``_.negated``;
            otherwise with ``_.transformed``.
        transform: Passed through to the chosen rendering method.

    Returns:
        str: The rendered entities separated by single spaces.
    """
    rendered = []
    for entity in doc.ents:
        render = entity._.negated if negation else entity._.transformed
        rendered.append(render(transform))
    return " ".join(rendered)

In [9]:
# Turn every frequent bigram into a Doc and register them all under one
# "BIGRAM" key of a PhraseMatcher; bigram_detector reads this `matcher`.
# NOTE(review): the `None` second argument looks like the older positional
# add(key, on_match, *docs) call form - confirm against the installed spaCy.
bigram_patterns = list(nlp.pipe(most_common_bigrams))
matcher = PhraseMatcher(nlp.vocab)
matcher.add("BIGRAM", None, *bigram_patterns)

# NOTE(review): the three components below are currently commented out, yet
# later cells disable "bigram_detector" / "EntityLinker" by name and read
# `span._.negex`, `span._.kb_ents` and `linker` - re-enable them before
# running those cells.

# Bigram detector
# nlp.add_pipe(bigram_detector, last=True)

# Entity linker
# linker = EntityLinker(name="mesh", threshold=0.9)
# nlp.add_pipe(linker, last=True)

# Negation detector
# nlp.add_pipe("negex", config={'ent_types': ['ENTITY'],
#                               'neg_termset':{
#             "preceding_negations": ["not"],
#             "following_negations":["declined"],
#             "termination": ["but","however"]
#         }
#     }
#     )

print("NLP pipeline: tokenizer + {}".format(nlp.pipe_names))

# Custom attributes (force=True lets the cell be re-run without errors):
#   span._.linked            -> get_canonical_name(span)    (computed on access)
#   span._.merged            -> format_merged_tokens(span)  (computed on access)
#   span._.transformed(mode) -> apply_transformation(span, mode)
#   span._.negated(mode)     -> add_negation(span, mode)
#   doc._.entities(...)      -> prepare_tokens(doc, ...)
Span.set_extension("linked", getter=get_canonical_name, force=True)
Span.set_extension("merged", getter=format_merged_tokens, force=True)
Span.set_extension("transformed", method=apply_transformation, force=True)
Span.set_extension("negated", method=add_negation, force=True)
Doc.set_extension("entities", method=prepare_tokens, force=True)

NLP pipeline: tokenizer + ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'custom_ner']


In [10]:
# Space-joined entity texts per note (defaults: no negation prefix, no transform).
df['entities'] = df.doc.apply(lambda x: x._.entities())

**Entities and negated entities**

In [None]:
%%time
# Re-parse the notes with bigram merging and entity linking switched off.
# NOTE(review): the pipeline printed earlier lists neither "bigram_detector" nor
# "EntityLinker" - confirm both names are registered before running this cell.
with nlp.disable_pipes(["bigram_detector", "EntityLinker"]):
    df['doc'] = df.text_clean.apply(nlp)

In [None]:
# Plain entity strings, then the same with "neg_" prefixed to negated entities.
# NOTE(review): negation=True reads span._.negex, which presumably requires a
# negex component in the pipeline - verify it has been added.
df['entities'] = df.doc.apply(lambda x: x._.entities())
df['neg_entities'] = df.doc.apply(lambda x: x._.entities(negation=True))

In [11]:
# Write everything except the Doc column to CSV.
# NOTE(review): the merge section later reads "./data/rmh_prepared_ents.csv",
# a different directory and file name - confirm which one is intended.
df.drop(columns="doc").to_csv("../data/epic2020_prepared_ents.csv", index=False)

**Merged entities and negated merged entities**

In [None]:
%%time
# Re-parse the notes with entity linking switched off (bigram merging stays on).
# NOTE(review): "EntityLinker" is not in the pipeline printed earlier - confirm
# the component name before running this cell.
with nlp.disable_pipes(["EntityLinker"]):
    df['doc'] = df.text_clean.apply(nlp)

In [None]:
# Entities rendered with the "merged" transform (whitespace -> "_"), without
# and with the "neg_" prefix on negated entities.
df['merged_entities'] = df.doc.apply(lambda x: x._.entities(transform="merged"))
df['neg_merged_entities'] = df.doc.apply(lambda x: x._.entities(negation=True, transform="merged"))

In [None]:
# Write everything except the Doc column to the "merged" CSV.
df.drop(columns="doc").to_csv("./data/rmh_prepared_merged.csv", index=False)

**Linked entities and negated linked entities**

In [None]:
%%time
# Re-parse the notes with bigram merging switched off (entity linking stays on).
# NOTE(review): "bigram_detector" is not in the pipeline printed earlier -
# confirm the component name before running this cell.
with nlp.disable_pipes(["bigram_detector"]):
    df['doc'] = df.text_clean.apply(nlp)

In [None]:
# Entities rendered with the "linked" transform (normalised canonical concept
# name when a link exists), without and with the "neg_" prefix.
df['linked_entities'] = df.doc.apply(lambda x: x._.entities(transform="linked"))
df['neg_linked_entities'] = df.doc.apply(lambda x: x._.entities(negation=True, transform="linked"))

In [None]:
# Write everything except the Doc column to the "linked" CSV.
df.drop(columns="doc").to_csv("./data/rmh_prepared_linked.csv", index=False)

### Merge datasets

In [None]:
import pandas as pd

In [None]:
# First of three partial "linked" files; display its dimensions.
df1 = pd.read_csv("./data/rmh_prepared_linked_1.csv")
df1.shape

In [None]:
# Second partial "linked" file; display its dimensions.
df2 = pd.read_csv("./data/rmh_prepared_linked_2.csv")
df2.shape

In [None]:
# Third partial "linked" file; display its dimensions.
df3 = pd.read_csv("./data/rmh_prepared_linked_3.csv")
df3.shape

In [None]:
# Stack the three parts row-wise. Each part keeps its own index labels, so the
# combined index is not unique until the frame is written and re-read.
df = pd.concat([df1,df2, df3], axis=0)
df.shape

In [None]:
# Write the combined frame; this overwrites any existing rmh_prepared_linked.csv.
df.to_csv("./data/rmh_prepared_linked.csv", index=False)

In [None]:
# Base frame for the final merge (reuses the name df1); display its dimensions.
df1 = pd.read_csv("./data/rmh_prepared_ents.csv")
df1.shape

In [None]:
# Frame holding the merged-entity columns (reuses the name df2).
df2 = pd.read_csv("./data/rmh_prepared_merged.csv")
df2.shape

In [None]:
# Attach the two merged-entity columns by row index.
# NOTE(review): assumes both files list the same notes in the same order - verify.
df = df1.merge(df2[["merged_entities", "neg_merged_entities"]], left_index=True, right_index=True)

In [None]:
# Frame holding the linked-entity columns (reuses the name df3).
df3 = pd.read_csv("./data/rmh_prepared_linked.csv")
df3.shape

In [None]:
# Attach the two linked-entity columns by row index.
# NOTE(review): assumes the same row order as the base frame - verify.
df = df.merge(df3[["linked_entities", "neg_linked_entities"]], left_index=True, right_index=True)

In [None]:
# Write the fully merged frame without the index column.
df.to_csv("./data/rmh_prepared.csv", index=False)

### Check the results

In [None]:
# Display the parsed Doc stored for the row labelled 3.
# NOTE(review): `df_` is not defined in any visible cell - presumably a frame
# that still carries the 'doc' column; confirm where it comes from.
doc = df_.loc[3, "doc"]
doc

In [None]:
def umls_entity(ent):
    """Return the knowledge-base entry of the first concept linked to ``ent``.

    Args:
        ent: Object exposing ``_.kb_ents`` (sequence of candidates whose first
            item is the concept identifier) and ``text``.

    Returns:
        The entry stored in the module-level ``linker.kb.cui_to_entity`` for
        the first candidate, or ``ent.text`` when there are no candidates.
    """
    if ent._.kb_ents:
        # kb_ents is a list of candidates; the lookup key is the first item of
        # a candidate, not the list itself (a list is unhashable).
        best_cui = ent._.kb_ents[0][0]
        return linker.kb.cui_to_entity[best_cui]
    else:
        return ent.text

In [None]:
# Parse the value stored for row 10000 and print each token with its
# part-of-speech tag.
# NOTE(review): earlier cells store pipeline output in the 'doc' column and drop
# it before saving - confirm `df` still has this column and that passing its
# value to nlp() is intended.
doc = nlp(df.loc[10000, 'doc'])
print(doc, "\n")
for token in doc:
    print(token, token.pos_)

In [None]:
# Print each entity of the inspected doc next to its negation flag.
for ent in doc.ents:
    print(ent.text, ent._.negex)

In [None]:
# For every entity with at least one linked concept, print how many candidates
# it has and then the knowledge-base entry of each candidate.
for ent in doc.ents:
    if ent._.kb_ents:
        print("\nEntity: \"{}\", number of linked concepts: {}".format(ent, len(ent._.kb_ents)))
# Disabled debugging output:
#     print(ent.text, ent._.negex)
#     print(canonical_name(ent), "\n")
        # concept[0] is the key into cui_to_entity.
        for concept in ent._.kb_ents:
            print("\n", linker.kb.cui_to_entity[concept[0]])

# Significant bigrams

In [None]:
from nlp_utils import get_vectorizer

In [None]:
# Read the training split and display its dimensions.
df_train = pd.read_csv("./data/rmh_train.csv")
df_train.shape

In [None]:
# Name of the df_train column that is vectorized below.
text = "entities"
vectorizer_mode = "select features"
# Settings handed to get_vectorizer: word-level bigrams only.
# NOTE(review): 'mode' / 'thresh' presumably keep features whose p-value is
# below 0.0001 - confirm against nlp_utils.get_vectorizer.
params = {'analyzer' : "word",
          'ngram_range' : (2,2),
          'use_idf' : True,
          'mode' : "select by pvalue",
          'thresh' : 0.0001}

vectorizer = get_vectorizer(vectorizer_mode, params)

In [None]:
# Target values taken from the SH column of the training frame.
y_train = df_train.SH.values

In [None]:
# Fit the vectorizer on the column named by `text`, supervised by y_train.
vectorizer.fit(df_train[text], y_train)

In [None]:
# Largest p-value in the fitted vectorizer's feature table.
# NOTE(review): presumably a sanity check against the configured threshold - confirm.
vectorizer.df_features.p_value.max()