In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [3]:
np.random.seed(100)

In [4]:
df = pd.read_csv("KEC_SAC_radiology_data_for_CS_8.3.2022.csv", header=0, names=["study_id", "label", "mr_report"])

In [5]:
df.head()

Unnamed: 0,study_id,label,mr_report
0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,3,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE...
2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
3,5,0,MR CERVICAL SPINE Reason for Exam: HAS HX O...
4,6,0,MRI lumbar spine Comparison: No prior ...


In [6]:
# dropiing few negative outcomes to maintain balance
remove_n = 136
drop_indices = np.random.choice(df[(df.label == 0)].index, remove_n, replace=False)
print(drop_indices)
df2 = df.drop(drop_indices)
df2.reset_index(inplace=True)
df2.shape

[368 203   1  27 151   7  67 296 120  72  18  61 329  93 352 209 243 132
 344 255 119 164 337  88 303 274 247  49 356 184 258 238 275 229 193 141
 159 331 224 148 291 227 200 262 239 264 307 150 283 195  15  78 191 237
 363 325 144 277 235 103 189 306 340  23 261  10 280  59 179  74 328 210
  19 297  42  66 215 154 302 330 231  41 312  33 241 216 315 202 113  68
 350 300 319 318 100  32 259 339  57  26 305 263 295  30 130 171 349   3
 206 167 341 260  52  51 111 114 268 310 234 198 147  70 192 115 250 270
  95 133  38 162 348  98 246 240  45 102]


(235, 4)

In [7]:
df2[:300]

Unnamed: 0,index,study_id,label,mr_report
0,0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
2,4,6,0,MRI lumbar spine Comparison: No prior ...
3,5,7,0,MRI CERVICAL SPINE HISTORY: Sensory changes w...
4,6,8,0,"Reason for Exam: LOW BACK PAIN, PAIN AND NUMBN..."
...,...,...,...,...
230,365,491,0,Technique: MRI lumbar spine routine protocol w...
231,366,493,0,Reason for Exam: LONG STANDING H/O LOWER BACK ...
232,367,494,0,"TECHNIQUE: Sagittal T2, sagittal T1, axial T2,..."
233,369,498,0,Reason for Exam: CERVICAL RADICULOPATHY. PAIN ...


# Data pre-processing

In [8]:
# Removing blank rows if any.
df['mr_report'].dropna(inplace=True)

In [9]:
# Changing all the text to lower case.
df2['mr_report'] = [entry.lower() for entry in df2['mr_report']]

In [10]:
# Tokenization : Each entry in the mri_report will be broken into set of words
df2['mr_report']= [word_tokenize(entry) for entry in df2['mr_report']]

In [11]:
# Removing Stop words, Non-Numeric and perfom Word Stemming/Lemmenting

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV



for index,entry in enumerate(df2['mr_report']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df2.loc[index,'text_final'] = str(Final_words)

In [12]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df2['text_final'],df2['label'],test_size=0.2)

In [13]:
print(Train_X[:10])

170    ['mr', 'lumbar', 'spine', 'reason', 'exam', 'o...
110    ['ct', 'lumbar', 'spine', 'imaging', 'obtain',...
148    ['history', 'pathology', 'low', 'back', 'pain'...
29     ['mr', 'lumbar', 'spine', 'location', 'exam', ...
135    ['procedure', 'date', 'procedure', 'name', 'mr...
199    ['history', 'right', 'arm', 'pain', 'weakness'...
53     ['mri', 'lumbar', 'spine', 'history', 'increas...
106    ['technique', 'sagittal', 'imaging', 'cervical...
172    ['mr', 'lumbar', 'spine', 'reason', 'exam', 'w...
224    ['technique', 'routine', 'non', 'enhance', 'pr...
Name: text_final, dtype: object


In [14]:
print(Train_Y[:10])

170    1
110    0
148    0
29     0
135    0
199    1
53     1
106    1
172    0
224    0
Name: label, dtype: int64


In [15]:
# Term Frequency: This summarizes how often a given word appears within a document
# Inverse Document Frequency: This down scales words that appear a lot across documents.

# fit the TG-IDF model on the whole mri_report. 
# This will help TF-IDF build a vocabulary of words which it has learned from the MRI data 
# and it will assign a unique integer number to each of these words.


Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Train_X)

# Finally we will transform Train_X and Test_X to vectorized Train_X_Tfidf and Test_X_Tfidf
# Each row will be a list of unique integer number and its associated importance calculated by TF-IDF.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [16]:
#  the vocabulary that it has learned from the mri_report
print(Tfidf_vect.vocabulary_)

{'mr': 959, 'lumbar': 870, 'spine': 1471, 'reason': 1269, 'exam': 525, 'ongoing': 1057, 'radicular': 1255, 'sx': 1554, 'symptom': 1558, 'left': 816, 'foot': 599, 'leg': 818, 'surgical': 1544, 'target': 1568, 'intervention': 759, 'history': 676, 'radiate': 1250, 'technique': 1573, 'sagittal': 1366, 'axial': 144, 'sequence': 1402, 'acquire': 15, 'comparison': 261, 'january': 784, 'finding': 579, 'vertebral': 1702, 'body': 167, 'normal': 1026, 'alignment': 48, 'height': 655, 'maintain': 879, 'marrow': 894, 'signal': 1430, 'conus': 313, 'end': 499, 'significant': 1432, 'abnormality': 3, 'spinal': 1470, 'canal': 197, 'neural': 999, 'foramen': 600, 'widely': 1734, 'patent': 1116, 'minor': 942, 'loss': 864, 'disc': 437, 'desiccation': 399, 'posterior': 1173, 'protrusion': 1229, 'impinge': 708, 'anterior': 74, 'aspect': 117, 'thecal': 1585, 'sac': 1357, 'ap': 86, 'diameter': 424, 'mildly': 936, 'narrow': 985, 'stenosis': 1498, 'mild': 934, 'narrowing': 987, 'right': 1342, 'lateral': 807, 'rece

In [17]:
# Printing the vectorized data
print(Train_X_Tfidf)

  (0, 1734)	0.09788374301413733
  (0, 1702)	0.03990885537914796
  (0, 1607)	0.038989228077125654
  (0, 1585)	0.05376729930405794
  (0, 1573)	0.02029496450403266
  (0, 1568)	0.15934057653846403
  (0, 1558)	0.10147808536257155
  (0, 1554)	0.13302721860158848
  (0, 1544)	0.09788374301413733
  (0, 1515)	0.03739218041168359
  (0, 1498)	0.09333859960186729
  (0, 1471)	0.04789385280558253
  (0, 1470)	0.07675963776907553
  (0, 1458)	0.03985971454945999
  (0, 1451)	0.15943885819783996
  (0, 1450)	0.041273179065230606
  (0, 1433)	0.050739042681285776
  (0, 1432)	0.12600761391920332
  (0, 1430)	0.018770649603581994
  (0, 1428)	0.03595511912918879
  (0, 1402)	0.03464888908734105
  (0, 1366)	0.023936352512184837
  (0, 1357)	0.054133129202132944
  (0, 1348)	0.15044547083232296
  (0, 1342)	0.06882503976629768
  :	:
  (187, 298)	0.1433616925627286
  (187, 297)	0.061447943184314555
  (187, 274)	0.09437397601838389
  (187, 270)	0.06544110461574688
  (187, 261)	0.024610383912388734
  (187, 228)	0.0519438

# Using Support Vector Machine


In [18]:
# fit the training dataset on the SVM
SVM = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predicting the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("F1-Score -> ",f1_score(predictions_SVM, Test_Y))
print("Precision -> ",precision_score(predictions_SVM, Test_Y))
print("Recall -> ", recall_score(predictions_SVM, Test_Y))

SVM Accuracy Score ->  68.08510638297872
F1-Score ->  0.2105263157894737
Precision ->  0.25
Recall ->  0.18181818181818182


In [19]:
c_range = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

c_scores = []
f_scores = []

# we will loop through reasonable values of c
for c in c_range:
    
    # fit the training dataset on the SVM
    SVM = svm.SVC(C=c, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf,Train_Y)
    # predicting the labels on validation dataset
    predictions_SVM = SVM.predict(Test_X_Tfidf)
    
    accuracy = accuracy_score(predictions_SVM, Test_Y)*100
    f1 = f1_score(predictions_SVM, Test_Y)
    
    c_scores.append(accuracy)
    f_scores.append(f1)
    
    print("SVM Accuracy Score and F1-Score when c=", c, "->", accuracy_score(predictions_SVM, Test_Y)*100, "and", f1_score(predictions_SVM, Test_Y))



SVM Accuracy Score and F1-Score when c= 1 -> 80.85106382978722 and 0.0
SVM Accuracy Score and F1-Score when c= 2 -> 72.3404255319149 and 0.23529411764705882
SVM Accuracy Score and F1-Score when c= 3 -> 70.2127659574468 and 0.22222222222222224
SVM Accuracy Score and F1-Score when c= 4 -> 70.2127659574468 and 0.22222222222222224
SVM Accuracy Score and F1-Score when c= 5 -> 68.08510638297872 and 0.2105263157894737
SVM Accuracy Score and F1-Score when c= 6 -> 68.08510638297872 and 0.2105263157894737
SVM Accuracy Score and F1-Score when c= 7 -> 68.08510638297872 and 0.2105263157894737
SVM Accuracy Score and F1-Score when c= 8 -> 68.08510638297872 and 0.2105263157894737
SVM Accuracy Score and F1-Score when c= 9 -> 68.08510638297872 and 0.2105263157894737
SVM Accuracy Score and F1-Score when c= 10 -> 68.08510638297872 and 0.2105263157894737


# Using Logistic Regression

In [20]:
# fit the training dataset on the LR
lr = LogisticRegressionCV(cv=5, class_weight = 'balanced', random_state=0)
lr.fit(Train_X_Tfidf,Train_Y)

# predicting the labels on validation dataset
predictions_LR = lr.predict(Test_X_Tfidf)

print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)
print("F1-Score -> ",f1_score(predictions_LR, Test_Y))
print("Precision -> ",precision_score(predictions_SVM, Test_Y))
print("Recall -> ", recall_score(predictions_SVM, Test_Y))

LR Accuracy Score ->  65.95744680851064
F1-Score ->  0.2
Precision ->  0.25
Recall ->  0.18181818181818182


# Using Voting Ensembles

In [21]:
clf1 = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', random_state=0)
clf2 = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('svm', clf2)], voting='hard')
eclf1 = eclf1.fit(Train_X_Tfidf,Train_Y)
predictions_eclf1 = eclf1.predict(Test_X_Tfidf)
print("Hard voting Accuracy Score -> ",accuracy_score(predictions_eclf1, Test_Y)*100)
print("Hard voting F1-Score -> ",f1_score(predictions_eclf1, Test_Y))
print("Precision -> ",precision_score(predictions_SVM, Test_Y))
print("Recall -> ", recall_score(predictions_SVM, Test_Y))



Hard voting Accuracy Score ->  70.2127659574468
Hard voting F1-Score ->  0.22222222222222224
Precision ->  0.25
Recall ->  0.18181818181818182


# Keywords

In [22]:
KEYWORDS = ['urgent', 'urgently', 'urgency', 'prudent',
            'advanced', 'serious', 'seriously', 'critical', 'critically',
            'crucial', 'dire', 'emergency',
            'surgery', 'surgical', 'consultation', 'advised',
            'tumor', 'tumour']
i = 0
for text in Test_X:
    contains_keyword = 0
    count = 0
    for word in KEYWORDS:
        if word in text:
            count += 1

    if count >= 3:
        contains_keyword = 1
    if predictions_eclf1[i] == 0:
        predictions_eclf1[i] = contains_keyword
    i += 1

print("Keywords Accuracy Score -> ",accuracy_score(predictions_eclf1, Test_Y)*100)
print("Keywords voting F1-Score -> ",f1_score(predictions_eclf1, Test_Y))
print("Precision -> ",precision_score(predictions_SVM, Test_Y))
print("Recall -> ", recall_score(predictions_SVM, Test_Y))


Keywords Accuracy Score ->  70.2127659574468
Keywords voting F1-Score ->  0.22222222222222224
Precision ->  0.25
Recall ->  0.18181818181818182
