In [2]:
import scispacy
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
import matplotlib
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import RandomOverSampler
# English stop words from the NLTK corpus; passed to the TfidfVectorizer when the pipeline is built.
stop_words = set(stopwords.words('english'))

In [268]:
# Known issues
# - The same value can appear in both the body-part and protocol fields (e.g. "Chest/Abdomen/Pelvis", "None").

In [11]:
def explode_protocol_summary(text):
    """Split a ``protocolsummary`` HTML snippet into its four labelled fields.

    Every ``<b>...</b>`` tag is treated as a field name and the text between its
    closing tag and the next ``<b>`` as that field's value. The last field has no
    following ``<b>``, so its value is taken from the text after the final
    ``</b>``, keeping only the part before the first ``|``.

    Args:
        text: Raw summary string.

    Returns:
        Tuple ``(protocol, body_part, oral_contrast, iv_contrast)``. Values have
        empty ``|``-separated segments removed. ``protocol`` is ``"Standard"``
        when the summary has no ``Protocol:`` field; the other three default to
        ``""``. A field whose value cannot be recovered keeps its default.
    """
    items = re.findall("<b>(.*?)</b>", text)
    values = re.findall("</b>(.*?)<b>", text)

    # The trailing field is not followed by "<b>", so recover it separately.
    if len(values) != len(items):
        tail = re.findall("(?s:.*)</b>(.*?)$", text)
        if tail:  # no match e.g. when the tail contains an embedded newline
            values.append(tail[-1].split("|")[0])

    # Drop empty segments such as the one produced by a trailing "|".
    processed_values = [
        "|".join(segment for segment in raw.split("|") if segment)
        for raw in values
    ]

    if "Protocol:" not in items:
        items.insert(0, "Protocol:")
        processed_values.insert(0, "Standard")

    fields = {
        "Protocol:": "Standard",
        "Body Part:": "",
        "Oral Contrast:": "",
        "IV Contrast:": "",
    }
    # zip() stops at the shorter list, so a field without a recovered value
    # keeps its default instead of raising IndexError.
    for item, value in zip(items, processed_values):
        if item in fields:
            fields[item] = value

    return (
        fields["Protocol:"],
        fields["Body Part:"],
        fields["Oral Contrast:"],
        fields["IV Contrast:"],
    )

def _extract_field(text, field, default, note_patterns=(r'Please(.*?)$',)):
    """Return the raw ``|``-separated value of one ``<b>field:</b>`` entry.

    Args:
        text: Raw ``protocolsummary`` string.
        field: Field name without the trailing colon, e.g. ``"Body Part"``.
        default: Value returned when the field is missing or effectively empty.
        note_patterns: Regexes whose matches (free-text notes running to the end
            of the value) are removed, applied in order.

    Returns:
        The field value with trailing notes removed, or ``default``.
    """
    marker = re.escape(field) + ":</b>"
    matches = re.findall(marker + "(.*?)<b>", text)
    if matches:
        value = matches[0]
    else:
        # Last field in the summary: there is no following "<b>" to stop at.
        matches = re.findall(marker + "(.*?)$", text)
        if not matches:
            return default
        value = matches[0] + "|"

    for pattern in note_patterns:
        value = re.sub(pattern, '', value.strip())

    # A value made only of separators/whitespace ("", "|", "||", "| ") carries
    # no label, so fall back to the default.
    if not any(segment.strip() for segment in value.split("|")):
        return default
    return value


def extract_protocol(text):
    """Extract the raw ``Protocol:`` value; ``"Standard"`` when missing or empty.

    Trailing ``Please ...`` and ``MD to ...`` notes are stripped from the value.
    """
    return _extract_field(
        text, "Protocol", "Standard",
        note_patterns=(r'Please(.*?)$', r'MD to(.*?)$'),
    )


def extract_iv_contrast(text):
    """Extract the raw ``IV Contrast:`` value; ``"None"`` when missing or empty."""
    return _extract_field(text, "IV Contrast", "None")


def extract_body_part(text):
    """Extract the raw ``Body Part:`` value; ``"None"`` when missing or empty."""
    return _extract_field(text, "Body Part", "None")


def extract_oral_contrast(text):
    """Extract the raw ``Oral Contrast:`` value; ``"None"`` when missing or empty."""
    return _extract_field(text, "Oral Contrast", "None")

def create_labels(arr, drop_empty=False):
    """Collect the distinct ``|``-separated labels found in an iterable of strings.

    Args:
        arr: Iterable of raw field values such as ``"Brain|Neck|"``.
        drop_empty: When True, the empty label produced by leading/trailing or
            doubled ``|`` separators is removed. Defaults to False, which keeps
            it (the historical behaviour: the previous emptiness check compared
            the ``str.strip`` method itself to ``""`` and so never filtered).

    Returns:
        Sorted list of unique, whitespace-stripped labels. Because the list is
        sorted, an empty label, when present, is always the first element.
    """
    labels = {token.strip() for entry in arr for token in entry.split("|")}
    if drop_empty:
        labels.discard("")
    return sorted(labels)
        

def combine_features(text):
    """Build the model's text input from one record.

    Joins the ``StudyDesc``, ``procedurecode`` and ``reasonofstudy`` entries of
    ``text`` (a row or other mapping) with single spaces, in that order.
    """
    study_description = text["StudyDesc"]
    procedure_code = text["procedurecode"]
    reason_of_study = text["reasonofstudy"]
    return study_description + " " + procedure_code + " " + reason_of_study

In [12]:
# Load the export and keep only complete rows of the four columns used below.
filter_rows = ["reasonofstudy", "StudyDesc", "procedurecode", "protocolsummary"]
data = pd.read_csv("data/Past5YCT2.csv", delimiter=",")[filter_rows].dropna()

# Split each protocol summary into its four cleaned fields, one column per field.
(
    data["Protocol"],
    data["Body Part"],
    data["Oral Contrast"],
    data["IV Contrast"],
) = zip(*data["protocolsummary"].apply(explode_protocol_summary))
data = data.dropna()

# Values that show up in the protocol field but are not protocols.
not_protocols = ["Dual-Energy Candidate"]

# Raw "|"-separated field values (the label sources) plus the combined text feature X.
data = data.assign(
    raw_protocols=data["protocolsummary"].map(extract_protocol),
    raw_iv_contrast=data["protocolsummary"].map(extract_iv_contrast),
    raw_body_part=data["protocolsummary"].map(extract_body_part),
    raw_oral_contrast=data["protocolsummary"].map(extract_oral_contrast),
)
data["X"] = data.apply(combine_features, axis="columns")

In [13]:
# Preview the combined text feature (study description + procedure code + reason for study).
data["X"]

1         CT UPPER ABD AND PELVIS W RADCT001 Reason: his...
2         CT HEAD WO RAD1023 Reason: ICH  History: fall ...
3         CT P HEAD WO RADP1024 Reason: view for hemorrh...
4         CT UPPER ABD AND PELVIS W RADCT001 Reason: His...
5         CT P CHEST W RADP1018 Reason: Synovial sarcoma...
                                ...                        
310304    CT CHEST WO RAD1019 Reason: assess for mets  H...
310305    CT UPPER ABD AND PELVIS W RADCT001 Reason: s/p...
310306    CT CHEST ABDOMEN PELVIS W RADCT022 Reason: Fol...
310307    CT PE CHEST W RAD1102 Reason: PE as cause of p...
310308    CT CHEST ABDOMEN PELVIS WO RADCT023 Reason: Pa...
Name: X, Length: 249166, dtype: object

In [14]:
# Build the label vocabulary of each field from its raw "|"-separated values.
# Empty labels (left behind by trailing "|" separators) are filtered by value
# rather than by position, and every vocabulary is sorted so the label/column
# order is identical on every run.
protocol_labels = sorted(
    label
    for label in create_labels(set(data["raw_protocols"]))
    # Also drop known non-protocol values; absent ones are simply ignored.
    if label and label not in not_protocols
)
body_part_labels = sorted(
    label for label in create_labels(set(data["raw_body_part"])) if label
)
oral_contrast_labels = sorted(
    label for label in create_labels(set(data["raw_oral_contrast"])) if label
)
iv_contrast_labels = sorted(
    label for label in create_labels(set(data["raw_iv_contrast"])) if label
)

In [8]:
# protocols = tuple(open("data/protocolsList.txt", 'r'))
# body_part = tuple(open("data/bodyPartList.txt", 'r'))
# oral_contrast = tuple(open("data/oralContrastList.txt", 'r'))
# iv_contrast = tuple(open("data/ivContrastList", 'r'))

# protocols_labels = [p.split(",")[0].strip()[1:-1] for p in protocols]
# body_part_labels = [p.split(",")[0].strip()[1:-1] for p in body_part]
# oral_contrast_labels = [p.split(",")[0].strip()[1:-1] for p in oral_contrast]
# iv_contrast_labels = [p.split(",")[0].strip()[1:-1] for p in iv_contrast]

# labels = list(set(protocols + body_part + oral_contrast + iv_contrast))

In [15]:
# TF-IDF text features feeding one random forest per label (one-vs-rest).
pipeline = Pipeline([
                # scikit-learn documents stop_words as 'english', a list or None,
                # so hand it a (sorted, hence stable) list instead of a set.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                # Fixed seed so the fitted forests, and the reported score, are reproducible.
                ('clf', OneVsRestClassifier(RandomForestClassifier(warm_start=True, random_state=42))),
            ])
pipeline

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all'...
                 OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                        

In [16]:
def is_label_present(text, label):
    """Tell whether ``label`` is one of the ``|``-separated entries of ``text``.

    Entries are compared after stripping surrounding whitespace; the match is
    exact (no substring matching).
    """
    return label in [entry.strip() for entry in text.split("|")]

def all_labels(text, label_set, default="Standard"):
    """Return the labels from ``label_set`` that occur in a ``|``-separated value.

    Args:
        text: Raw field value such as ``"Brain| Neck |"``.
        label_set: Iterable of candidate labels.
        default: Label returned on its own when no candidate matches.

    Returns:
        List of the matching labels, without duplicates, in ``label_set`` order;
        ``[default]`` when nothing matches.
    """
    # Split once instead of once per candidate label.
    tokens = {token.strip() for token in text.split("|")}
    # dict.fromkeys de-duplicates while preserving the order of label_set.
    found = list(dict.fromkeys(label for label in label_set if label in tokens))
    return found if found else [default]

# for label in labels:
#     data[label] = data["protocolsummary"].apply(lambda x: is_label_present(x, label))

# Turn each raw field value into the list of known labels it contains.
# NOTE: all_labels falls back to ["Standard"] when nothing matches, for every field.
data = data.assign(
    protocol_labels=data["raw_protocols"].apply(all_labels, label_set=protocol_labels),
    body_part_labels=data["raw_body_part"].apply(all_labels, label_set=body_part_labels),
    oral_contrast_labels=data["raw_oral_contrast"].apply(all_labels, label_set=oral_contrast_labels),
    iv_contrast_label=data["raw_iv_contrast"].apply(all_labels, label_set=iv_contrast_labels),
)

# Shuffled 67/33 split with a fixed seed.
train, test = train_test_split(data, test_size=0.33, random_state=42, shuffle=True)


In [29]:
# One indicator column per protocol label, in protocol_labels order.
lb = preprocessing.MultiLabelBinarizer(classes=protocol_labels)
y_train = lb.fit_transform(train["protocol_labels"])
# Only transform the held-out split: the binarizer must not be re-fitted on test data.
y_test = lb.transform(test["protocol_labels"])

In [28]:
# Inspect the per-row label lists of the training split (the binarizer's input).
np.array(train["protocol_labels"])

array([list(['Brain']), list(['Brain']), list(['Standard']), ...,
       list(['Standard']), list(['Standard']), list(['CTA Brain/Neck'])],
      dtype=object)

In [30]:
# for i, label in enumerate(body_part_labels):
#     pipeline.fit(train["reasonofstudy"], train[label])
#     prediction = pipeline.predict(test["reasonofstudy"])
#     if i == 1:
#         print(len(list(prediction)))
#         break
#     print(f"Processing", label, " - Accuracy", accuracy_score(test[label], prediction))


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

In [30]:
# Fit on the training text, then predict the label indicators for the held-out text.
pipeline.fit(train["X"].tolist(), y_train)
predicted = pipeline.predict(test["X"])









In [31]:
# NOTE(review): y_test is a multilabel indicator matrix, so per scikit-learn's docs this is
# subset accuracy (a row counts only if every label matches) — confirm that is the metric wanted.
accuracy_score(y_test, predicted)

0.8348434174521131

In [77]:
# Count how many rows carry each protocol label.
d = {}
for line in data["protocol_labels"]:
    for k in line:
        d[k] = d.get(k, 0) + 1

# Inverse-frequency weight per label (total rows / rows carrying the label), in
# protocol_labels order. Previously the ratio was computed and discarded, leaving
# `weights` empty. A label that never occurs gets 0.0 instead of raising KeyError.
weights = [
    data.shape[0] / d[label] if d.get(label) else 0.0
    for label in protocol_labels
]
    

In [36]:
# Decode the predicted indicator row of test sample 175 back into label names.
lb.inverse_transform(predicted[175].reshape(1, -1))

[('Standard Contrast Enhanced Chest/Abdomen',)]

In [37]:
# Decode the true indicator row of test sample 175 for comparison with the prediction.
lb.inverse_transform(y_test[175].reshape(1, -1))

[('Standard Contrast Enhanced Chest/Abdomen',)]

In [40]:
# Size of the training split as (rows, columns).
train.shape

(166941, 17)