In [1]:
pip install -U gensim

Note: you may need to restart the kernel to use updated packages.


In [37]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn import model_selection, naive_bayes, svm

In [38]:
# Loading dataset
df = pd.read_csv(r"D:\E\Baikka\Project\KEC SAC radiology data for CS 8.3.2022.csv",encoding='utf-8')
df['mr_report'] = df['mr_report'].astype(str)

df.head()

Unnamed: 0,Outcome,mr_report
0,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GETTING WORSE WITH TIME. AFFECTING THE PATIENT'S LIF...
1,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE DISC DISEASE AND MULTILEVEL DISC BULGING GRADUAL ...
2,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIVELY WORSENING RT SIDED LOWER BACK AND RT LEG BURNI...
3,0,"MR CERVICAL SPINE Reason for Exam: HAS HX OF MVA, SEVERE NECK AND BACK PAIN. NUMBNESS, WEAK A..."
4,0,MRI lumbar spine Comparison: No prior Technique: Routine MRI of the lumbar spine FIN...


In [1]:
# dropiing few negative outcomes to maintain balance
remove_n = 136
drop_indices = np.random.choice(df[(df.Outcome == 0)].index, remove_n, replace=False)
print(drop_indices)
df2 = df.drop(drop_indices)
df2.reset_index(inplace=True)
df2.shape

NameError: name 'np' is not defined

In [40]:
# Cleaning data using the built in cleaner in gensim
df2['text_clean'] = df2['mr_report'].apply(lambda x: gensim.utils.simple_preprocess(x))
df2.head()

Unnamed: 0,index,Outcome,mr_report,text_clean
0,0,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GETTING WORSE WITH TIME. AFFECTING THE PATIENT'S LIF...,"[reason, for, exam, chronic, lower, back, pain, getting, worse, with, time, affecting, the, pati..."
1,1,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE DISC DISEASE AND MULTILEVEL DISC BULGING GRADUAL ...,"[reason, for, exam, known, multilevel, degenerative, disc, disease, and, multilevel, disc, bulgi..."
2,2,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIVELY WORSENING RT SIDED LOWER BACK AND RT LEG BURNI...,"[mr, lumbar, spine, reason, for, exam, progressively, worsening, rt, sided, lower, back, and, rt..."
3,3,0,"MR CERVICAL SPINE Reason for Exam: HAS HX OF MVA, SEVERE NECK AND BACK PAIN. NUMBNESS, WEAK A...","[mr, cervical, spine, reason, for, exam, has, hx, of, mva, severe, neck, and, back, pain, numbne..."
4,4,0,MRI lumbar spine Comparison: No prior Technique: Routine MRI of the lumbar spine FIN...,"[mri, lumbar, spine, comparison, no, prior, technique, routine, mri, of, the, lumbar, spine, fin..."


In [41]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (df2['text_clean'], df2['Outcome'] , test_size=0.2)

In [42]:
# Training the word2vec model
w2v_model = gensim.models.Word2Vec(X_train, vector_size=100, window=5, min_count=2)

In [43]:
# Representing all of the words that the Word2Vec model learned a vector for.
w2v_model.wv.index_to_key

['the',
 'is',
 'of',
 'and',
 'disc',
 'at',
 'no',
 'mild',
 'with',
 'there',
 'stenosis',
 'spinal',
 'narrowing',
 'left',
 'right',
 'moderate',
 'canal',
 'to',
 'degenerative',
 'foraminal',
 'facet',
 'are',
 'spine',
 'neural',
 'on',
 'lumbar',
 'in',
 'nerve',
 'normal',
 'or',
 'significant',
 'changes',
 'posterior',
 'central',
 'bilateral',
 'level',
 'cord',
 'bulge',
 'root',
 'signal',
 'severe',
 'cervical',
 'vertebral',
 'findings',
 'protrusion',
 'joint',
 'this',
 'lateral',
 'sac',
 'thecal',
 'osteophyte',
 'patent',
 'for',
 'foramen',
 'impression',
 'present',
 'as',
 'complex',
 'change',
 'small',
 'recess',
 'from',
 'body',
 'height',
 'seen',
 'space',
 'bilaterally',
 'foramina',
 'endplate',
 'hypertrophy',
 'broad',
 'within',
 'mri',
 'compression',
 'pain',
 'roots',
 'based',
 'exiting',
 'noted',
 'conus',
 'sagittal',
 'which',
 'uncovertebral',
 'arthropathy',
 'intervertebral',
 'loss',
 'impingement',
 'axial',
 'technique',
 'bulging',
 'm

In [44]:
w2v_model.wv.most_similar('spine')

[('lumbar', 0.9966387748718262),
 ('mri', 0.988824725151062),
 ('routine', 0.9884322881698608),
 ('comparison', 0.9846733212471008),
 ('entire', 0.9839710593223572),
 ('findings', 0.9803573489189148),
 ('technique', 0.9793612957000732),
 ('sequences', 0.9787678122520447),
 ('protocol', 0.9783787131309509),
 ('mr', 0.9781882762908936)]

In [45]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence

words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


In [46]:
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

197 197
218 218
359 355
313 313
704 699
392 388
383 375
292 287
321 321
351 349
447 447
194 193
140 137
299 293
1037 1034
298 296
243 243
353 341
194 193
130 130
614 609
185 181
236 235
216 214
228 220
282 272
332 324
186 186
431 414
223 217
166 163
292 281
389 388
184 183
426 422
573 572
273 272
287 279
239 233
155 154
160 158
269 266
492 481
425 416
85 85
221 221
427 425
305 302
274 272
726 704
353 349
317 316
308 302
361 360
320 318
109 109
298 294
120 119
303 294
219 216
254 254
291 287
327 325
406 402
421 414
144 142
170 165
347 346
190 188
302 301
143 139
257 256
802 781
394 388
181 172
289 288
273 271
172 168
176 169
213 210
202 201
177 177
157 155
394 394
481 471
210 209
242 237
188 183
76 74
654 646
137 134
289 286
253 251
157 152
294 283
234 233
309 306
207 207
414 411
146 144
357 356
235 233
328 323
148 144
205 202
100 99
337 335
618 617
177 176
508 493
238 236
275 273
141 137
297 292
382 382
301 300
504 504
347 343
127 122
424 421
259 253
205 205
355 354
147 145
366 364
201

In [47]:
# Computing sentence vectors by averaging the word vectors for the words contained in the sentence

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [48]:
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

197 100
218 100
359 100
313 100
704 100
392 100
383 100
292 100
321 100
351 100
447 100
194 100
140 100
299 100
1037 100
298 100
243 100
353 100
194 100
130 100
614 100
185 100
236 100
216 100
228 100
282 100
332 100
186 100
431 100
223 100
166 100
292 100
389 100
184 100
426 100
573 100
273 100
287 100
239 100
155 100
160 100
269 100
492 100
425 100
85 100
221 100
427 100
305 100
274 100
726 100
353 100
317 100
308 100
361 100
320 100
109 100
298 100
120 100
303 100
219 100
254 100
291 100
327 100
406 100
421 100
144 100
170 100
347 100
190 100
302 100
143 100
257 100
802 100
394 100
181 100
289 100
273 100
172 100
176 100
213 100
202 100
177 100
157 100
394 100
481 100
210 100
242 100
188 100
76 100
654 100
137 100
289 100
253 100
157 100
294 100
234 100
309 100
207 100
414 100
146 100
357 100
235 100
328 100
148 100
205 100
100 100
337 100
618 100
177 100
508 100
238 100
275 100
141 100
297 100
382 100
301 100
504 100
347 100
127 100
424 100
259 100
205 100
355 100
147 100
366 100
2

In [49]:
# fit the training dataset on the LR
lr = LogisticRegressionCV(cv=5, max_iter = 10000, class_weight = 'balanced', random_state=0)
lr.fit(X_train_vect_avg, y_train.values.ravel())

# predicting the labels on validation dataset
predictions_LR = lr.predict(X_test_vect_avg)

print("LR Accuracy Score -> ",accuracy_score(predictions_LR, y_test)*100)
print("F1-Score -> ",f1_score(predictions_LR, y_test))

LR Accuracy Score ->  53.191489361702125
F1-Score ->  0.38888888888888895


In [50]:
# Precision and Recall

precision = precision_score(y_test, predictions_LR)
recall = recall_score(y_test, predictions_LR)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((predictions_LR==y_test).sum()/len(predictions_LR), 3)))

Precision: 0.292 / Recall: 0.583 / Accuracy: 0.532


In [51]:
# fit the training dataset on the SVM
SVM = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_vect_avg, y_train.values.ravel())

# predicting the labels on validation dataset
predictions_SVM = SVM.predict(X_test_vect_avg)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)
print("F1-Score -> ",f1_score(predictions_SVM, y_test))

SVM Accuracy Score ->  74.46808510638297
F1-Score ->  0.0


In [52]:
# Precision and Recall

precision = precision_score(y_test, predictions_SVM)
recall = recall_score(y_test, predictions_SVM)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((predictions_SVM==y_test).sum()/len(predictions_SVM), 3)))

Precision: 0.0 / Recall: 0.0 / Accuracy: 0.745


  _warn_prf(average, modifier, msg_start, len(result))


# Using Voting Ensembles

In [53]:
from sklearn.ensemble import VotingClassifier
clf1 = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', random_state=0)
clf2 = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('svm', clf2)], voting='hard')
eclf1 = eclf1.fit(X_train_vect_avg, y_train.values.ravel())
predictions_eclf1 = eclf1.predict(X_test_vect_avg)
print("Hard voting Accuracy Score -> ",accuracy_score(predictions_eclf1, y_test)*100)
print("Hard voting F1-Score -> ",f1_score(predictions_eclf1, y_test))


Hard voting Accuracy Score ->  74.46808510638297
Hard voting F1-Score ->  0.0
