In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn import model_selection, naive_bayes, svm

In [2]:
# Loading dataset
df = pd.read_csv("KEC_SAC_radiology_data_for_CS_8.3.2022.csv", header=0, names=["study_id", "label", "mr_report"])
df['mr_report'] = df['mr_report'].astype(str)

df.head()

Unnamed: 0,study_id,label,mr_report
0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GETTING WORSE WITH TIME. AFFECTING THE PATIENT'S LIF...
1,3,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE DISC DISEASE AND MULTILEVEL DISC BULGING GRADUAL ...
2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIVELY WORSENING RT SIDED LOWER BACK AND RT LEG BURNI...
3,5,0,"MR CERVICAL SPINE Reason for Exam: HAS HX OF MVA, SEVERE NECK AND BACK PAIN. NUMBNESS, WEAK A..."
4,6,0,MRI lumbar spine Comparison: No prior Technique: Routine MRI of the lumbar spine FIN...


In [3]:
# dropiing few negative outcomes to maintain balance
remove_n = 136
drop_indices = np.random.choice(df[(df.label == 0)].index, remove_n, replace=False)
print(drop_indices)
df2 = df.drop(drop_indices)
df2.reset_index(inplace=True)
df2.shape

[322 119 116  52 145 171  98 120  63 262 209 151  67 352 189 206 210 368
 367  70  77 236 154 224 291  41   8 296 133 230 188 105  78 180 305   6
 326  54 191 257 306 277 235 332 162  21 108 271 348 103 218 122 251 270
 204 260  68 212 223 174 114 208 165  57 175 315 126  26 355 284 255 345
 181 319 318 290  15 200 203 341  32 282 250 360 193 144 366 241  12 274
 287 256 273  42 166 293  59 330  18 365 192 358  49 324  81  89 253 179
 297  24 351  11 139  65 215 240 317 254 364 280 158  13  95  22  55 353
 323 178 169  46 343  94  19 276 337  75]


(235, 4)

In [4]:
# Cleaning data using the built in cleaner in gensim
df2['text_clean'] = df2['mr_report'].apply(lambda x: gensim.utils.simple_preprocess(x))
df2.head()

Unnamed: 0,index,study_id,label,mr_report,text_clean
0,0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GETTING WORSE WITH TIME. AFFECTING THE PATIENT'S LIF...,"[reason, for, exam, chronic, lower, back, pain, getting, worse, with, time, affecting, the, pati..."
1,1,3,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE DISC DISEASE AND MULTILEVEL DISC BULGING GRADUAL ...,"[reason, for, exam, known, multilevel, degenerative, disc, disease, and, multilevel, disc, bulgi..."
2,2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIVELY WORSENING RT SIDED LOWER BACK AND RT LEG BURNI...,"[mr, lumbar, spine, reason, for, exam, progressively, worsening, rt, sided, lower, back, and, rt..."
3,3,5,0,"MR CERVICAL SPINE Reason for Exam: HAS HX OF MVA, SEVERE NECK AND BACK PAIN. NUMBNESS, WEAK A...","[mr, cervical, spine, reason, for, exam, has, hx, of, mva, severe, neck, and, back, pain, numbne..."
4,4,6,0,MRI lumbar spine Comparison: No prior Technique: Routine MRI of the lumbar spine FIN...,"[mri, lumbar, spine, comparison, no, prior, technique, routine, mri, of, the, lumbar, spine, fin..."


In [5]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (df2['text_clean'], df2['label'] , test_size=0.2)

In [6]:
# Training the word2vec model
w2v_model = gensim.models.Word2Vec(X_train, vector_size=100, window=5, min_count=2)

In [7]:
# Representing all of the words that the Word2Vec model learned a vector for.
w2v_model.wv.index_to_key

['the',
 'of',
 'is',
 'and',
 'disc',
 'at',
 'no',
 'with',
 'mild',
 'stenosis',
 'there',
 'spinal',
 'narrowing',
 'left',
 'canal',
 'foraminal',
 'to',
 'right',
 'moderate',
 'degenerative',
 'facet',
 'are',
 'spine',
 'neural',
 'nerve',
 'lumbar',
 'on',
 'normal',
 'significant',
 'in',
 'or',
 'posterior',
 'central',
 'changes',
 'level',
 'root',
 'bilateral',
 'lateral',
 'cord',
 'bulge',
 'severe',
 'signal',
 'vertebral',
 'protrusion',
 'findings',
 'this',
 'cervical',
 'sac',
 'thecal',
 'joint',
 'osteophyte',
 'recess',
 'small',
 'as',
 'for',
 'roots',
 'impingement',
 'impression',
 'height',
 'foramen',
 'complex',
 'from',
 'body',
 'seen',
 'noted',
 'mri',
 'exiting',
 'broad',
 'patent',
 'within',
 'based',
 'pain',
 'bilaterally',
 'foramina',
 'conus',
 'change',
 'joints',
 'hypertrophy',
 'sagittal',
 'compression',
 'space',
 'loss',
 'axial',
 'technique',
 'bulging',
 'present',
 'maintained',
 'mm',
 'endplate',
 'arthropathy',
 'multilevel',
 '

In [8]:
w2v_model.wv.most_similar('spine')

[('lumbar', 0.9951924681663513),
 ('mri', 0.9945905804634094),
 ('axial', 0.9888128638267517),
 ('routine', 0.9886494278907776),
 ('protocol', 0.9878730773925781),
 ('sagittal', 0.987753689289093),
 ('performed', 0.9871012568473816),
 ('reason', 0.9863398671150208),
 ('findings', 0.9855054616928101),
 ('comparison', 0.9853112697601318)]

In [9]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence

words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


In [10]:
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

218 218
151 148
316 301
316 306
257 252
248 246
349 349
192 191
141 137
440 433
325 321
238 238
295 292
277 274
341 339
454 452
102 101
297 292
127 123
522 520
186 186
194 192
109 109
270 268
236 228
492 490
406 404
504 502
495 489
516 508
187 186
353 345
228 218
571 566
320 317
120 119
109 108
414 409
326 324
254 254
234 233
157 155
269 269
756 744
199 197
175 174
389 387
214 212
305 302
827 812
115 113
434 434
481 470
622 593
233 230
245 242
590 589
235 229
193 188
244 238
213 208
292 284
431 431
190 187
394 390
160 156
302 299
421 415
257 256
560 558
253 250
146 142
213 208
573 571
234 234
298 294
136 135
201 201
155 151
309 304
459 452
328 325
172 169
350 350
314 313
199 199
130 129
357 322
327 325
197 197
97 95
332 325
300 295
402 386
292 288
588 586
424 423
273 271
338 336
197 197
338 338
413 410
390 383
348 343
194 188
299 293
221 220
147 145
177 173
151 148
317 315
287 279
435 426
219 217
181 173
479 472
382 382
426 424
201 194
173 169
618 618
567 563
138 130
150 150
272 268
25

In [19]:
X_test_vect[0][1]

array([-0.40076643,  0.36740884,  0.37754995,  0.09438734,  0.4270553 ,
       -0.4195938 ,  0.35371464,  1.1218278 ,  0.09371022, -0.26987255,
       -0.5473552 , -0.45671287, -0.15781069,  0.42611578,  0.00638119,
       -0.30295205, -0.18307954, -0.6500213 , -0.28712854, -0.6097589 ,
       -0.05473719,  0.3896054 , -0.19432881,  0.11734878,  0.2799899 ,
        0.5256908 ,  0.11829483, -0.83323896, -0.5843147 ,  0.31870443,
        0.55235904,  0.06268162, -0.10053503, -0.12104149, -0.16314286,
        1.0744277 ,  0.32484534, -0.6761159 ,  0.06479732, -1.1284413 ,
       -0.14002538, -0.37034106, -0.272294  , -0.38191637,  0.5559744 ,
        0.05579705, -0.05843733, -0.06351724,  0.57364744, -0.2585903 ,
        0.3021634 , -0.514015  , -0.3094624 ,  0.45295736, -0.5717651 ,
        0.44756117, -0.89476544, -0.3234784 , -0.6406612 , -0.03802901,
        0.19231614,  0.33472145, -0.332658  ,  0.02001887, -0.28413296,
        0.33599687,  0.08871414,  0.33790743, -0.6128048 ,  0.52

In [11]:
# Computing sentence vectors by averaging the word vectors for the words contained in the sentence

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [20]:
X_test_vect_avg[0]

array([-0.12540548,  0.23528606, -0.13865426, -0.02869947,  0.39377782,
       -0.4535137 ,  0.20123076,  0.87140507, -0.43345404, -0.231745  ,
       -0.04901423, -0.6385936 , -0.17245853,  0.2790352 ,  0.53380215,
       -0.25936255, -0.00225338, -0.2935026 ,  0.27925855, -0.9589953 ,
        0.13241999, -0.02839183,  0.44407603, -0.04201308, -0.1324732 ,
        0.18829915, -0.0956796 , -0.2964589 , -0.29269853,  0.18265955,
        0.48867837, -0.14686705,  0.42845324, -0.4045663 , -0.4415801 ,
        0.2853043 , -0.04032714, -0.07733487, -0.19834559, -0.65666443,
        0.16215542, -0.35727227, -0.02570017,  0.07526274,  0.35960555,
       -0.12090905, -0.4292245 , -0.1585075 ,  0.23565921,  0.33073324,
        0.09976795, -0.07728504,  0.33200172, -0.11747357,  0.09883501,
       -0.01752951,  0.0637873 ,  0.2119473 ,  0.00691046, -0.07408798,
        0.08666965, -0.1319542 ,  0.26746327, -0.11835349, -0.55809724,
        0.30675432,  0.16850522,  0.62761486, -0.6138752 ,  0.49

In [12]:
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

218 100
151 100
316 100
316 100
257 100
248 100
349 100
192 100
141 100
440 100
325 100
238 100
295 100
277 100
341 100
454 100
102 100
297 100
127 100
522 100
186 100
194 100
109 100
270 100
236 100
492 100
406 100
504 100
495 100
516 100
187 100
353 100
228 100
571 100
320 100
120 100
109 100
414 100
326 100
254 100
234 100
157 100
269 100
756 100
199 100
175 100
389 100
214 100
305 100
827 100
115 100
434 100
481 100
622 100
233 100
245 100
590 100
235 100
193 100
244 100
213 100
292 100
431 100
190 100
394 100
160 100
302 100
421 100
257 100
560 100
253 100
146 100
213 100
573 100
234 100
298 100
136 100
201 100
155 100
309 100
459 100
328 100
172 100
350 100
314 100
199 100
130 100
357 100
327 100
197 100
97 100
332 100
300 100
402 100
292 100
588 100
424 100
273 100
338 100
197 100
338 100
413 100
390 100
348 100
194 100
299 100
221 100
147 100
177 100
151 100
317 100
287 100
435 100
219 100
181 100
479 100
382 100
426 100
201 100
173 100
618 100
567 100
138 100
150 100
272 100
2

In [13]:
# fit the training dataset on the LR
lr = LogisticRegressionCV(cv=5, max_iter = 10000, class_weight = 'balanced', random_state=0)
lr.fit(X_train_vect_avg, y_train.values.ravel())

# predicting the labels on validation dataset
predictions_LR = lr.predict(X_test_vect_avg)

print("LR Accuracy Score -> ",accuracy_score(predictions_LR, y_test)*100)
print("F1-Score -> ",f1_score(predictions_LR, y_test))

LR Accuracy Score ->  72.3404255319149
F1-Score ->  0.5185185185185185


In [14]:
# Precision and Recall

precision = precision_score(y_test, predictions_LR)
recall = recall_score(y_test, predictions_LR)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((predictions_LR==y_test).sum()/len(predictions_LR), 3)))

Precision: 0.438 / Recall: 0.636 / Accuracy: 0.723


In [15]:
# fit the training dataset on the SVM
SVM = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_vect_avg, y_train.values.ravel())

# predicting the labels on validation dataset
predictions_SVM = SVM.predict(X_test_vect_avg)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)
print("F1-Score -> ",f1_score(predictions_SVM, y_test))

SVM Accuracy Score ->  76.59574468085107
F1-Score ->  0.0


In [16]:
# Precision and Recall

precision = precision_score(y_test, predictions_SVM)
recall = recall_score(y_test, predictions_SVM)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((predictions_SVM==y_test).sum()/len(predictions_SVM), 3)))

Precision: 0.0 / Recall: 0.0 / Accuracy: 0.766


  _warn_prf(average, modifier, msg_start, len(result))


# Using Voting Ensembles

In [17]:
from sklearn.ensemble import VotingClassifier
clf1 = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', random_state=0)
clf2 = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('svm', clf2)], voting='hard')
eclf1 = eclf1.fit(X_train_vect_avg, y_train.values.ravel())
predictions_eclf1 = eclf1.predict(X_test_vect_avg)
print("Hard voting Accuracy Score -> ",accuracy_score(predictions_eclf1, y_test)*100)
print("Hard voting F1-Score -> ",f1_score(predictions_eclf1, y_test))


Hard voting Accuracy Score ->  76.59574468085107
Hard voting F1-Score ->  0.0
