In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
np.random.seed(200)

In [7]:
df = pd.read_csv(r"D:\E\Baikka\Project\KEC SAC radiology data for CS 8.3.2022.csv",encoding='utf-8')
df['mr_report'] = df['mr_report'].astype(str)

In [8]:
df.head()

Unnamed: 0,Outcome,mr_report
0,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE...
2,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
3,0,MR CERVICAL SPINE Reason for Exam: HAS HX O...
4,0,MRI lumbar spine Comparison: No prior ...


In [9]:
# dropiing few negative outcomes to maintain balance
remove_n = 136
drop_indices = np.random.choice(df[(df.Outcome == 0)].index, remove_n, replace=False)
print(drop_indices)
df2 = df.drop(drop_indices)
df2.reset_index(inplace=True)
df2.shape

[ 65 111 364   4 351  93 269 221  70  32  77 268  42 270 353 171 207 310
 107 180  36  51 195 263 275 218 247 177 196 251 201 331 144 222 203 326
  12 299 210 311 294  22 126 369 161 153 298   8 264 175 276  86 137 273
 357  35 109 328 228  54 301 266 101 365  41 162 323 119 355 288 333 238
  67 143 199  11 146   9 200  20  18 360 349 178 363 337 283 184   5  75
 117 103  19 176  47 344  40 234 255 148 361 235 253 188 280 106  61  78
 345  33 114 306 319  82 116 295  23  95 286 123 145 343 212 183 132 340
 133 265  88  59  37 170 118  31 166 305]


(235, 3)

In [10]:
df2[:300]

Unnamed: 0,index,Outcome,mr_report
0,0,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,1,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE...
2,2,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
3,3,0,MR CERVICAL SPINE Reason for Exam: HAS HX O...
4,6,0,"Reason for Exam: LOW BACK PAIN, PAIN AND NUMBN..."
...,...,...,...
230,362,0,FINDINGS: L1/2: Remote compression fracture o...
231,366,0,Reason for Exam: LONG STANDING H/O LOWER BACK ...
232,367,0,"TECHNIQUE: Sagittal T2, sagittal T1, axial T2,..."
233,368,0,HISTORY: FELL BACKWARDS IN AUG2019 AND PAIN AN...


# Data pre-processing

In [11]:
# Removing blank rows if any.
df['mr_report'].dropna(inplace=True)

In [12]:
# Changing all the text to lower case.
df2['mr_report'] = [entry.lower() for entry in df2['mr_report']]

In [13]:
# Tokenization : Each entry in the mri_report will be broken into set of words
df2['mr_report']= [word_tokenize(entry) for entry in df2['mr_report']]

In [14]:
# Removing Stop words, Non-Numeric and perfom Word Stemming/Lemmenting

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV



for index,entry in enumerate(df2['mr_report']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df2.loc[index,'text_final'] = str(Final_words)

In [15]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df2['text_final'],df2['Outcome'],test_size=0.2)

In [16]:
print(Train_X[:10])

227    ['technique', 'multiplanar', 'multisequence', ...
56     ['mr', 'thoracic', 'spine', 'reason', 'exam', ...
118    ['clinical', 'history', 'progressive', 'back',...
98     ['mr', 'lumbar', 'spine', 'reason', 'exam', 'm...
39     ['mr', 'cervical', 'spine', 'reason', 'exam', ...
199    ['narrative', 'impression', 'mr', 'lumbar', 's...
20     ['procedure', 'name', 'mr', 'brachial', 'plexu...
154    ['mr', 'cervical', 'spine', 'reason', 'exam', ...
129    ['narrative', 'impression', 'mr', 'lumbar', 's...
122    ['narrative', 'impression', 'mr', 'lumbar', 's...
Name: text_final, dtype: object


In [17]:
print(Train_Y[:10])

227    0
56     0
118    0
98     1
39     1
199    1
20     0
154    0
129    0
122    0
Name: Outcome, dtype: int64


In [18]:
# Term Frequency: This summarizes how often a given word appears within a document
# Inverse Document Frequency: This down scales words that appear a lot across documents.

# fit the TG-IDF model on the whole mri_report. 
# This will help TF-IDF build a vocabulary of words which it has learned from the MRI data 
# and it will assign a unique integer number to each of these words.


Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df2['text_final'])

# Finally we will transform Train_X and Test_X to vectorized Train_X_Tfidf and Test_X_Tfidf
# Each row will be a list of unique integer number and its associated importance calculated by TF-IDF.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [19]:
#  the vocabulary that it has learned from the mri_report
print(Tfidf_vect.vocabulary_)

{'reason': 1451, 'exam': 611, 'chronic': 274, 'lower': 1005, 'back': 164, 'pain': 1257, 'get': 732, 'bad': 169, 'time': 1809, 'affect': 38, 'patient': 1288, 'life': 967, 'quality': 1420, 'show': 1625, 'spondylolisthesis': 1683, 'severe': 1612, 'degenerative': 440, 'change': 266, 'mri': 1113, 'lumbar': 1007, 'spine': 1673, 'technique': 1779, 'routine': 1545, 'finding': 670, 'defect': 430, 'par': 1261, 'interarticularis': 873, 'bilaterally': 188, 'grade': 740, 'spondylolytic': 1685, 'identify': 815, 'approximately': 108, 'mm': 1097, 'anterolisthesis': 83, 'demonstrate': 447, 'alignment': 51, 'posterior': 1344, 'margin': 1027, 'rest': 1512, 'vertebra': 1910, 'maintain': 1015, 'modic': 1101, 'endplate': 582, 'present': 1365, 'level': 961, 'focal': 690, 'lesion': 956, 'hyperintensity': 801, 'arise': 116, 'within': 1946, 'vertebral': 1911, 'body': 195, 'likely': 977, 'represent': 1499, 'small': 1659, 'hemangioma': 767, 'osseous': 1236, 'destructive': 476, 'compression': 318, 'fracture': 705,

In [20]:
# Printing the vectorized data
print(Train_X_Tfidf)

  (0, 1947)	0.05457257036770288
  (0, 1946)	0.040718628905094055
  (0, 1939)	0.055804794303892374
  (0, 1920)	0.05302852729266345
  (0, 1911)	0.06023903014402536
  (0, 1883)	0.08891451443169272
  (0, 1871)	0.12914646746262604
  (0, 1860)	0.04687178499351783
  (0, 1838)	0.07652328184367437
  (0, 1815)	0.05802242373037838
  (0, 1798)	0.08405344569540316
  (0, 1793)	0.04093486003612818
  (0, 1779)	0.03011951507201268
  (0, 1754)	0.15523038684160306
  (0, 1750)	0.07353488293174823
  (0, 1703)	0.25936200212769256
  (0, 1693)	0.09312765262217139
  (0, 1673)	0.049139471757392895
  (0, 1672)	0.21272407695954249
  (0, 1666)	0.040292335102522524
  (0, 1661)	0.05946677956726508
  (0, 1631)	0.12543730316859192
  (0, 1629)	0.028441861164525382
  (0, 1612)	0.11034672036240811
  (0, 1584)	0.07008502956907274
  :	:
  (187, 464)	0.10667589763542691
  (187, 440)	0.06922377439721841
  (187, 434)	0.061567984998674025
  (187, 409)	0.10667589763542691
  (187, 406)	0.11473770279277655
  (187, 379)	0.09651922

# Using Support Vector Machine


In [22]:
# fit the training dataset on the SVM
SVM = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predicting the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("F1-Score -> ",f1_score(predictions_SVM, Test_Y))

SVM Accuracy Score ->  78.72340425531915
F1-Score ->  0.5833333333333334


In [23]:
c_range = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

c_scores = []
f_scores = []

# we will loop through reasonable values of c
for c in c_range:
    
    # fit the training dataset on the SVM
    SVM = svm.SVC(C=c, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf,Train_Y)
    # predicting the labels on validation dataset
    predictions_SVM = SVM.predict(Test_X_Tfidf)
    
    accuracy = accuracy_score(predictions_SVM, Test_Y)*100
    f1 = f1_score(predictions_SVM, Test_Y)
    
    c_scores.append(accuracy)
    f_scores.append(f1)
    
    print("SVM Accuracy Score and F1-Score when c=", c, "->", accuracy_score(predictions_SVM, Test_Y)*100, "and", f1_score(predictions_SVM, Test_Y))



SVM Accuracy Score and F1-Score when c= 1 -> 76.59574468085107 and 0.2666666666666667
SVM Accuracy Score and F1-Score when c= 2 -> 78.72340425531915 and 0.4444444444444444
SVM Accuracy Score and F1-Score when c= 3 -> 80.85106382978722 and 0.5263157894736842
SVM Accuracy Score and F1-Score when c= 4 -> 85.1063829787234 and 0.631578947368421
SVM Accuracy Score and F1-Score when c= 5 -> 82.97872340425532 and 0.6
SVM Accuracy Score and F1-Score when c= 6 -> 78.72340425531915 and 0.5454545454545455
SVM Accuracy Score and F1-Score when c= 7 -> 76.59574468085107 and 0.5217391304347826
SVM Accuracy Score and F1-Score when c= 8 -> 78.72340425531915 and 0.5833333333333334
SVM Accuracy Score and F1-Score when c= 9 -> 78.72340425531915 and 0.5833333333333334
SVM Accuracy Score and F1-Score when c= 10 -> 78.72340425531915 and 0.5833333333333334


# Using Logistic Regression

In [24]:
# fit the training dataset on the LR
lr = LogisticRegressionCV(cv=5, class_weight = 'balanced', random_state=0)
lr.fit(Train_X_Tfidf,Train_Y)

# predicting the labels on validation dataset
predictions_LR = lr.predict(Test_X_Tfidf)

print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)
print("F1-Score -> ",f1_score(predictions_LR, Test_Y))

LR Accuracy Score ->  78.72340425531915
F1-Score ->  0.6428571428571429


# Using Voting Ensembles

In [25]:
clf1 = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', random_state=0)
clf2 = svm.SVC(C=9, kernel='linear', degree=3, gamma='auto')

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('svm', clf2)], voting='hard')
eclf1 = eclf1.fit(Train_X_Tfidf,Train_Y)
predictions_eclf1 = eclf1.predict(Test_X_Tfidf)
print("Hard voting Accuracy Score -> ",accuracy_score(predictions_eclf1, Test_Y)*100)
print("Hard voting F1-Score -> ",f1_score(predictions_eclf1, Test_Y))



Hard voting Accuracy Score ->  82.97872340425532
Hard voting F1-Score ->  0.6363636363636364
