In [2]:
import numpy as np
import pandas as pd
import re
import pickle
import joblib

import spacy
from custom_tokenizer import combined_rule_tokenizer
from spellchecker import SpellChecker
from spacy.language import Language
from spacy.tokens import Span
from utils import evaluate_model

# Pretty plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply the seaborn "ticks" theme first, then override the matplotlib defaults
# (figure size and font sizes) in a single update.
sns.set_style('ticks')
plt.rcParams.update({
    'figure.figsize': (7, 5),
    'axes.titlesize': 22,
    'axes.labelsize': 20,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

In [3]:
# Compiled once: any character, then one separator (- / + _ , ? .), then any character.
# Raw string so the backslashes reach the regex engine without invalid-escape warnings.
_COMPOUND_SEPARATOR = re.compile(r".[-/\+_,\?\.].")


def is_compound_token(string, known_words=None):
    """Tell whether a token should be split on an embedded separator.

    A token qualifies when it contains one of ``- / + _ , ? .`` with at least
    one character on each side and is not itself a known vocabulary entry.

    Args:
        string: Token text to inspect.
        known_words: Optional container of known tokens. When omitted, the
            module-level ``vocab`` is consulted, as before.

    Returns:
        bool: True if the token contains an embedded separator and is not in
        the vocabulary, otherwise False.
    """
    if _COMPOUND_SEPARATOR.search(string) is None:
        # No embedded separator: the vocabulary is not consulted at all.
        return False
    lexicon = vocab if known_words is None else known_words
    return string not in lexicon

def retokenize(doc):
    """Split compound tokens on embedded separators and re-join the text.

    Args:
        doc: Iterable of tokens exposing ``.text`` and ``.like_num``.

    Returns:
        str: Token texts joined by single spaces. Compound tokens (as decided
        by ``is_compound_token``) are replaced by their parts, with the
        separators kept as tokens of their own.
    """
    pieces = []
    for token in doc:
        text = token.text
        # Number-like tokens are kept whole even if they contain a separator.
        if not token.like_num and is_compound_token(text):
            # The capturing group keeps the separators in the result. Adjacent or
            # trailing separators make re.split emit empty strings, which would
            # turn into doubled spaces after the join, so they are dropped.
            pieces.extend(part for part in re.split(r'([-/\+_,\?\.])', text) if part)
        else:
            pieces.append(text)

    return ' '.join(pieces)

def spelling_correction(doc):
    """Replace known misspellings in a whitespace-separated string.

    Each token found as a key in the module-level ``misspelled`` mapping is
    swapped for element ``[1]`` of its entry; other tokens pass through
    unchanged. The result is re-joined with single spaces.
    """
    fixed = []
    for word in doc.split():
        if word in misspelled:
            fixed.append(misspelled[word][1])
        else:
            fixed.append(word)
    return ' '.join(fixed)

def slang_to_generic(doc):
    """Map slang names to generic names in a whitespace-separated string.

    Tokens that are keys of the module-level ``slang_names`` dictionary are
    replaced by their mapped value; all other tokens are kept. The result is
    re-joined with single spaces.
    """
    words = doc.split()
    for position, word in enumerate(words):
        if word in slang_names:
            words[position] = slang_names[word]
    return ' '.join(words)

@Language.component("custom_ner")
def custom_ner(doc):
    """Pipeline component that tags content tokens as single-token entities.

    Every token that is not a stop word, not punctuation, not number-like and
    not the literal "+" becomes a one-token span labelled "CONCEPT". The spans
    replace ``doc.ents`` and the same doc is returned.
    """
    concept_spans = []
    for token in doc:
        # Skip anything that should not count as a concept.
        if token.is_stop or token.is_punct or token.like_num or token.text == "+":
            continue
        concept_spans.append(Span(doc, token.i, token.i + 1, label="CONCEPT"))
    doc.ents = concept_spans
    return doc

In [4]:
# Arguments
# Dataset identifiers that are interpolated into file paths further down.
# train_data selects the learned vocabulary, the misspelling dictionary and the
# pretrained classifier file; test_data selects the normalised notes CSV and the
# results filename.
train_data = "rmh_1217_train"
test_data = "rmh_2012_2017_test"

In [14]:
if __name__ == '__main__':
    # Load preprocessed triage notes
    df = pd.read_csv("../../data/" + test_data + "_normalised.csv")

    ###TEMPORARY
    # The loaded file names the text column 'preprocessed_triage_note'; the rest of
    # this pipeline works on 'text_clean'.
    df = df.rename(columns={'preprocessed_triage_note': 'text_clean'})
    # Assign the filled column back: a chained `df.col.fillna(..., inplace=True)`
    # acts on a temporary and is not guaranteed to modify `df`.
    df["text_clean"] = df["text_clean"].fillna("")

    # NOTE(review): only the first 1000 rows are processed and evaluated below.
    # This looks like a leftover debugging limit -- confirm before reporting results.
    # `.copy()` makes the subset an independent frame so the column assignments
    # below do not write into a slice of the original.
    df = df.head(1000).copy()

    # Load scispacy model for tokenization
    nlp = spacy.load("en_core_sci_sm", disable=['tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner'])
    nlp.tokenizer = combined_rule_tokenizer(nlp)

    # Apply tokenizer
    df["text_clean"] = df["text_clean"].apply(nlp)

    # Load previously learned custom vocabulary (word frequency list)
    # SECURITY: pickle executes arbitrary code while loading; only open files you trust.
    with open("../../data/spelling_correction/" + train_data + "_vocab.txt", 'rb') as f:
        vocab = pickle.load(f)

    # Initialise spellchecker with a custom vocab
    spell = SpellChecker(language=None)
    spell.word_frequency.load_words(vocab)

    # Apply re-tokenizer (reads the `vocab` global loaded above)
    df["text_clean"] = df["text_clean"].apply(retokenize)

    # Define regex pattern to split leading full stop
    # (raw string: "\s" and "\." are not valid Python string escapes)
    pattern = re.compile(r"\s\.([a-z]{2,})")

    # Apply regex
    df["text_clean"] = df["text_clean"].apply(lambda x: pattern.sub(r" . \1", x))

    # Load previously learned dictionary of misspellings
    # SECURITY: same pickle caveat as for the vocabulary file.
    with open("../../data/spelling_correction/" + train_data + "_misspelled_dict.txt", 'rb') as f:
        misspelled = pickle.load(f)

    # Apply spelling correction (reads the `misspelled` global loaded above)
    df["text_clean"] = df["text_clean"].apply(spelling_correction)

    # Load medication names
    df_drugs = pd.read_csv("../../data/spelling_correction/medication_names.csv")

    df_drugs["slang"] = df_drugs["slang"].str.strip().str.lower()
    df_drugs["generic_name"] = df_drugs["generic_name"].str.strip().str.lower()
    # Drop rows missing either side of the mapping: a NaN generic name would be
    # substituted into the text and make the later ' '.join(...) raise TypeError.
    df_drugs = df_drugs.dropna(subset=["slang", "generic_name"])

    # Create a dictionary to convert slang to generic names
    slang_names = dict(zip(df_drugs["slang"], df_drugs["generic_name"]))

    # Apply slang replacement (reads the `slang_names` global built above)
    df["text_clean"] = df["text_clean"].apply(slang_to_generic)

    # Load Scispacy model
    nlp = spacy.load("en_core_sci_sm", disable=['ner'])

    # Add custom NER
    nlp.add_pipe("custom_ner", last=True)

    # Apply NLP pipeline to extract concepts
    df['doc'] = df["text_clean"].apply(nlp)
    df['concepts'] = df['doc'].apply(lambda x: " ".join(ent.text for ent in x.ents))

    # Load pretrained classifier
    # SECURITY: joblib.load unpickles the file; only load models from a trusted source.
    # NOTE(review): the recorded run ended with
    # "ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'", presumably
    # raised here: the saved object references a pandas module that the installed
    # pandas does not provide. Load it in the environment it was saved with, or
    # re-export the model under the current library versions.
    path = "../../models/pretrained_pipe_" + train_data + ".sav"
    pipe = joblib.load(path)
    # Decision threshold handed to evaluate_model.
    # NOTE(review): presumably tuned on the training data -- confirm its origin.
    thresh = 0.372

    # Make predictions
    y_proba = pipe.predict_proba(df['concepts'].fillna(""))

    # Convert predicted probabilities to class labels and evaluate results
    class_names = ("Controls", "Self-harm")
    df['y_pred'] = evaluate_model(df['SH'].values, y_proba,
                                  class_names, "2012-2017 test",
                                  thresh=thresh, digits=3,
                                  save_figures=False, filename="../../results/" + test_data)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'

In [22]:
# Debug cell: try to unpickle the saved classifier with plain pickle.
# The result gets its own name; binding it to `vocab` would overwrite the word
# list that is_compound_token() consults.
# NOTE(review): the ".sav 3" suffix looks like a duplicated-file name -- confirm the path.
# SECURITY: pickle executes arbitrary code while loading; only open files you trust.
with open("../../models/pretrained_pipe_" + train_data + ".sav 3", 'rb') as f:
    pipe_from_pickle = pickle.load(f)

  vocab = pickle.load(f)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'