In [1]:
import pandas as pd
import numpy as np

import re
import spacy
import pickle
import scispacy
from spacy.language import Language
from spacy.tokens import Span, Doc
from negspacy.termsets import termset

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.corpus import stopwords

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# Show up to 100 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 100)

In [2]:
# Base name of the dataset for this run; the input and output file paths
# under ../data/ are built from it further down.
filename = "rmh_1217_test"

In [3]:
# Read the cleaned dataset for this run, report its dimensions, and preview
# the first rows as the cell output.
cleaned_csv_path = "../data/" + filename + "_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
print(df.shape)
df.head()

(79240, 12)


Unnamed: 0,patient_id,uid,age,sex,arrival_mode,arrival_date,year,text,length,SH,SI,text_clean
0,1127885.0,299836,45,male,self/community/pt,2017-05-25 10:30:00,2017,L flank pain - seen here yesterday and had USS - now presents requesting results.,81,0,0,l flank pain - seen here yesterday and had uss - now presents requesting results .
1,2095728.0,315549,27,female,other,2012-08-16 21:50:00,2012,"Abdo pain 1/52, worsening this pm, RIF, vomiting, no diarrhoea, nil urinary sx. Taking regular ...",179,0,0,"abdo pain 1/52 , worsening this pm , rif , vomiting , no diarrhoea , nil urinary sx . taking reg..."
2,1009473.0,482578,45,male,other,2012-06-15 20:06:00,2012,Painful R) elbow post fall from motorbike. Movement decreasing with time. Denies headstrike or L...,159,0,0,painful right elbow post fall from motorbike . movement decreasing with time . denies headstrike...
3,4010717.0,612892,75,female,self/community/pt,2016-12-26 11:11:00,2016,Episode dizzy nausea and blurred vision. Sx resolved hgowever pt feeling tired. Phx nil,87,0,0,episode dizzy nausea and blurred vision . sx resolved hgowever pt feeling tired . phx nil
4,2073600.0,372128,25,female,other,2012-01-14 14:50:00,2012,Dulled sensation and feeling of cool down L arm and L leg post workout at gym this am. Full equa...,137,0,0,dulled sensation and feeling of cool down l arm and l leg post workout at gym this am . full equ...


In [4]:
@Language.component("custom_ner")
def custom_ner(doc):
    """Replace ``doc.ents`` with one single-token "ENTITY" span per kept token.

    A token is kept unless it is a stop word, punctuation, number-like, or the
    literal "+". Any entities previously set on ``doc`` are overwritten.

    Args:
        doc: The spaCy ``Doc`` passed along the pipeline.

    Returns:
        The same ``doc`` object, with ``doc.ents`` reassigned.
    """
    doc.ents = [
        Span(doc, tok.i, tok.i + 1, label="ENTITY")
        for tok in doc
        if not (tok.is_stop or tok.is_punct or tok.like_num or tok.text == "+")
    ]
    return doc

In [5]:
# Load the scispacy model with its built-in annotators switched off; entity
# spans are produced by the custom component registered above instead.
# NOTE(review): 'tok2vec' is not in the disabled list and stays in the pipeline
# (see the printed pipe names), yet custom_ner only reads lexical token
# attributes — disabling it too would likely speed things up; confirm nothing
# downstream needs its output.
disabled_components = ['tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']
nlp = spacy.load("en_core_sci_lg", disable=disabled_components)

# Append the custom entity tagger as the final pipeline step.
nlp.add_pipe("custom_ner", last=True)

print("NLP pipeline: tokenizer + {}".format(nlp.pipe_names))

NLP pipeline: tokenizer + ['tok2vec', 'custom_ner']


In [6]:
%%time
df['doc'] = df.text_clean.apply(nlp)

CPU times: user 3min 34s, sys: 1.11 s, total: 3min 35s
Wall time: 3min 36s


In [7]:
# Start from negspacy's clinical English term set and add project-specific
# negation cues and scope terminators.
# NOTE(review): if get_patterns() hands back lists shared at module level,
# re-running this cell would append the same terms again — confirm.
ts = termset('en_clinical').get_patterns()
ts['preceding_negations'] += ["nil", "non"]
ts['termination'] += [",", ";", ":", "obviously"]

In [10]:
def word_count(data):
    """Count how often each bigram occurs across a collection of texts.

    Texts are split on whitespace (``token_pattern=r'\\S+'``). NLTK's English
    stop words and the negex preceding-negation terms from the module-level
    ``ts`` are passed to the vectorizer as stop words, and only bigrams
    (``ngram_range=(2, 2)``) are counted.

    Args:
        data: Iterable of strings, one per document.

    Returns:
        collections.Counter mapping each bigram string to its total number of
        occurrences over all documents.
    """
    vectorizer = CountVectorizer(stop_words=stopwords.words('english') + ts['preceding_negations'], 
                                 ngram_range=(2,2), 
                                 token_pattern=r'\S+')
    vectors = vectorizer.fit_transform(data)

    # Column totals of the document-term matrix, ordered by feature index.
    counts = vectors.sum(axis=0).A1

    # vocabulary_ maps term -> feature index, and the dict's iteration order is
    # not guaranteed to follow those indices. Look every term up through its
    # stored index rather than zipping the keys against `counts`, which would
    # attach counts to the wrong bigrams.
    return Counter({term: counts[index] for term, index in vectorizer.vocabulary_.items()})

In [11]:
# Rebuild each note as a space-separated string of its entity tokens, count the
# bigrams over those strings, and show how many distinct bigrams were found.
entity_texts = df.doc.apply(lambda doc: " ".join(ent.text for ent in doc.ents))
bigrams = word_count(entity_texts)
len(bigrams)

370683

In [12]:
# All bigram counts as one integer array. `np.int` was merely an alias of the
# builtin `int` and no longer exists in current NumPy releases, so the builtin
# is used directly; the array is built once and reused below.
bigram_counts = np.fromiter(bigrams.values(), dtype=int)

# Keep only the bigrams whose count lies strictly above the 99th percentile.
cut_off = np.quantile(bigram_counts, 0.99)
print("Cut-off:", cut_off)
n_bigrams = int((bigram_counts > cut_off).sum())
print("%d most common bigrams" % n_bigrams)
most_common_bigrams = [item[0] for item in bigrams.most_common(n_bigrams)]

Cut-off: 32.0
3597 most common bigrams


In [None]:
# Persist the selected bigrams as a pickle (binary content despite the .txt
# extension).
# NOTE(review): unlike the "_cleaned.csv" input name, there is no "_" between
# the dataset name and "most_common_bigrams" — confirm the readers of this
# file expect that exact name before changing it.
bigrams_path = "../data/" + filename + "most_common_bigrams.txt"
with open(bigrams_path, 'wb') as out_file:
    pickle.dump(most_common_bigrams, out_file)