In [273]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [274]:
random_state = 42
np.random.seed(random_state)

In [275]:
data_path = "KEC_SAC_radiology_data_for_CS_8.3.2022.csv"
data = pd.read_csv(data_path, header=0, names=["study_id", "label", "text"])
data.head()

Unnamed: 0,study_id,label,text
0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,3,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE...
2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
3,5,0,MR CERVICAL SPINE Reason for Exam: HAS HX O...
4,6,0,MRI lumbar spine Comparison: No prior ...


In [276]:
max_features = 1000

# Preprocess
* Remove numbers (such as years and dates) but keep alphanumerics (t1, t5, etc)

In [277]:
# def replace(string):
#     return re.sub(r'\b[\d]+\b', '',string)

# data['text'] = data['text'].apply(replace)

# Bag of Words Unigrams
* Lowercase all words (distinction between uppercase and lowercase is not important)
* Remove words that occur in more 90% of training texts (hopefully removes stop words such as 'a' and 'the' that don't provide much meaning)
* Remove words that occur in less than 1% training texts (easy way to remove mispelled words)

In [278]:
k = 5
fold = 0

accuracy = 0
f1 = 0
precision = 0
recall = 0

folds = KFold(n_splits=k, random_state=random_state, shuffle=True)

for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train = data['text'][train_index], data['label'][train_index]
    X_test, Y_test = data['text'][test_index], data['label'][test_index]
    
    vectorizer = CountVectorizer(lowercase=True, max_df=0.90, min_df=0.01)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    
    lr = LogisticRegression(penalty='l1', solver='liblinear', C=0.16, random_state=random_state, class_weight='balanced')
    SVM = SVC(C=9, kernel='linear', degree=3, gamma='auto')
    
    ensemble = VotingClassifier(estimators=[('lr', lr), ('svm', SVM)], voting='hard')
    ensemble.fit(X_train, Y_train)
    
    Y_pred = ensemble.predict(X_test)
    
    accuracy += accuracy_score(Y_pred, Y_test)/k
    f1 += f1_score(Y_pred, Y_test)/k
    precision += precision_score(Y_pred, Y_test)/k
    recall += recall_score(Y_pred, Y_test)/k
    
    print(f"Fold {fold}")
    print(f"Accuracy -> {accuracy_score(Y_pred, Y_test):0.4f}")
    print(f"F1-Score -> {f1_score(Y_pred, Y_test):0.4f}")
    print(f"Precision -> {precision_score(Y_pred, Y_test):0.4f}")
    print(f"Recall -> {recall_score(Y_pred, Y_test):0.4f}")


Fold 1
Accuracy -> 0.8400
F1-Score -> 0.2500
Precision -> 0.2500
Recall -> 0.2500
Fold 2
Accuracy -> 0.8514
F1-Score -> 0.1538
Precision -> 0.1429
Recall -> 0.1667
Fold 3
Accuracy -> 0.7973
F1-Score -> 0.4000
Precision -> 0.2941
Recall -> 0.6250
Fold 4
Accuracy -> 0.7838
F1-Score -> 0.3333
Precision -> 0.2857
Recall -> 0.4000
Fold 5
Accuracy -> 0.7703
F1-Score -> 0.2609
Precision -> 0.2308
Recall -> 0.3000


In [279]:
print(f"Accuracy -> {accuracy:0.4f}")
print(f"F1-Score -> {f1:0.4f}")
print(f"Precision -> {precision:0.4f}")
print(f"Recall -> {recall:0.4f}")

Accuracy -> 0.8085
F1-Score -> 0.2796
Precision -> 0.2407
Recall -> 0.3483


# TF-IDF Unigrams
* Lowercase all words (distinction between uppercase and lowercase is not important)
* Remove words that occur in more 90% of training texts (hopefully removes stop words such as 'a' and 'the' that don't provide much meaning)
* Remove words that occur in less than 1% training texts (easy way to remove mispelled words)

In [280]:
k = 5
fold = 0

accuracy = 0
f1 = 0
precision = 0
recall = 0

folds = KFold(n_splits=k, random_state=random_state, shuffle=True)

for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train = data['text'][train_index], data['label'][train_index]
    X_test, Y_test = data['text'][test_index], data['label'][test_index]
    
    vectorizer = TfidfVectorizer(lowercase=True, max_df=0.90, min_df=0.01)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    
    lr = LogisticRegression(random_state=random_state, class_weight='balanced')
    SVM = SVC(C=9, kernel='linear', degree=3, gamma='auto')
    
    ensemble = VotingClassifier(estimators=[('lr', lr), ('svm', SVM)], voting='hard')
    ensemble.fit(X_train, Y_train)
    
    Y_pred = ensemble.predict(X_test)
    
    accuracy += accuracy_score(Y_pred, Y_test)/k
    f1 += f1_score(Y_pred, Y_test)/k
    precision += precision_score(Y_pred, Y_test)/k
    recall += recall_score(Y_pred, Y_test)/k
    
    print(f"Fold {fold}")
    print(f"Accuracy -> {accuracy_score(Y_pred, Y_test):0.4f}")
    print(f"F1-Score -> {f1_score(Y_pred, Y_test):0.4f}")
    print(f"Precision -> {precision_score(Y_pred, Y_test):0.4f}")
    print(f"Recall -> {recall_score(Y_pred, Y_test):0.4f}")


Fold 1
Accuracy -> 0.8800
F1-Score -> 0.3077
Precision -> 0.2500
Recall -> 0.4000
Fold 2
Accuracy -> 0.8649
F1-Score -> 0.2857
Precision -> 0.2857
Recall -> 0.2857
Fold 3
Accuracy -> 0.7838
F1-Score -> 0.2727
Precision -> 0.1765
Recall -> 0.6000
Fold 4
Accuracy -> 0.8108
F1-Score -> 0.3636
Precision -> 0.2857
Recall -> 0.5000
Fold 5
Accuracy -> 0.8108
F1-Score -> 0.3636
Precision -> 0.3077
Recall -> 0.4444


In [281]:
print(f"Accuracy -> {accuracy:0.4f}")
print(f"F1-Score -> {f1:0.4f}")
print(f"Precision -> {precision:0.4f}")
print(f"Recall -> {recall:0.4f}")

Accuracy -> 0.8301
F1-Score -> 0.3187
Precision -> 0.2611
Recall -> 0.4460


# Bag of Words Unigrams + Bigrams
* Lowercase all words (distinction between uppercase and lowercase is not important)
* Remove words that occur in more 90% of training texts (hopefully removes stop words such as 'a' and 'the' that don't provide much meaning)
* Remove words that occur in less than 1% training texts (easy way to remove mispelled words)

In [282]:
k = 5
fold = 0

accuracy = 0
f1 = 0
precision = 0
recall = 0

folds = KFold(n_splits=k, random_state=random_state, shuffle=True)

for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train = data['text'][train_index], data['label'][train_index]
    X_test, Y_test = data['text'][test_index], data['label'][test_index]
    
    vectorizer = CountVectorizer(ngram_range = [1, 2], lowercase=True, max_df=0.90, min_df=0.01)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    
    lr = LogisticRegression(penalty='l1', solver='liblinear', C=0.16, random_state=random_state, class_weight='balanced')
    SVM = SVC(C=9, kernel='linear', degree=3, gamma='auto')
    
    ensemble = VotingClassifier(estimators=[('lr', lr), ('svm', SVM)], voting='hard')
    ensemble.fit(X_train, Y_train)
    
    Y_pred = ensemble.predict(X_test)
    
    accuracy += accuracy_score(Y_pred, Y_test)/k
    f1 += f1_score(Y_pred, Y_test)/k
    precision += precision_score(Y_pred, Y_test)/k
    recall += recall_score(Y_pred, Y_test)/k
    
    print(f"Fold {fold}")
    print(f"Accuracy -> {accuracy_score(Y_pred, Y_test):0.4f}")
    print(f"F1-Score -> {f1_score(Y_pred, Y_test):0.4f}")
    print(f"Precision -> {precision_score(Y_pred, Y_test):0.4f}")
    print(f"Recall -> {recall_score(Y_pred, Y_test):0.4f}")


Fold 1
Accuracy -> 0.8400
F1-Score -> 0.2500
Precision -> 0.2500
Recall -> 0.2500
Fold 2
Accuracy -> 0.8649
F1-Score -> 0.0000
Precision -> 0.0000
Recall -> 0.0000
Fold 3
Accuracy -> 0.8108
F1-Score -> 0.3636
Precision -> 0.2353
Recall -> 0.8000
Fold 4
Accuracy -> 0.7838
F1-Score -> 0.2000
Precision -> 0.1429
Recall -> 0.3333
Fold 5
Accuracy -> 0.8243
F1-Score -> 0.2353
Precision -> 0.1538
Recall -> 0.5000


In [283]:
print(f"Accuracy -> {accuracy:0.4f}")
print(f"F1-Score -> {f1:0.4f}")
print(f"Precision -> {precision:0.4f}")
print(f"Recall -> {recall:0.4f}")

Accuracy -> 0.8248
F1-Score -> 0.2098
Precision -> 0.1564
Recall -> 0.3767


# TF-IDF Unigrams + Bigrams
* Lowercase all words (distinction between uppercase and lowercase is not important)
* Remove words that occur in more 90% of training texts (hopefully removes stop words such as 'a' and 'the' that don't provide much meaning)
* Remove words that occur in less than 1% training texts (easy way to remove mispelled words)

In [284]:
k = 5
fold = 0

accuracy = 0
f1 = 0
precision = 0
recall = 0

folds = KFold(n_splits=k, random_state=random_state, shuffle=True)

for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train = data['text'][train_index], data['label'][train_index]
    X_test, Y_test = data['text'][test_index], data['label'][test_index]
    
    vectorizer = TfidfVectorizer(ngram_range=[1,2], lowercase=True, max_df=0.90, min_df=0.01)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    
    lr = LogisticRegression(random_state=random_state, class_weight='balanced')
    SVM = SVC(C=9, kernel='linear', degree=3, gamma='auto')
    
    ensemble = VotingClassifier(estimators=[('lr', lr), ('svm', SVM)], voting='hard')
    ensemble.fit(X_train, Y_train)
    
    Y_pred = ensemble.predict(X_test)
    
    accuracy += accuracy_score(Y_pred, Y_test)/k
    f1 += f1_score(Y_pred, Y_test)/k
    precision += precision_score(Y_pred, Y_test)/k
    recall += recall_score(Y_pred, Y_test)/k
    
    print(f"Fold {fold}")
    print(f"Accuracy -> {accuracy_score(Y_pred, Y_test):0.4f}")
    print(f"F1-Score -> {f1_score(Y_pred, Y_test):0.4f}")
    print(f"Precision -> {precision_score(Y_pred, Y_test):0.4f}")
    print(f"Recall -> {recall_score(Y_pred, Y_test):0.4f}")


Fold 1
Accuracy -> 0.8667
F1-Score -> 0.2857
Precision -> 0.2500
Recall -> 0.3333
Fold 2
Accuracy -> 0.8649
F1-Score -> 0.1667
Precision -> 0.1429
Recall -> 0.2000
Fold 3
Accuracy -> 0.7973
F1-Score -> 0.2857
Precision -> 0.1765
Recall -> 0.7500
Fold 4
Accuracy -> 0.7973
F1-Score -> 0.1176
Precision -> 0.0714
Recall -> 0.3333
Fold 5
Accuracy -> 0.8108
F1-Score -> 0.1250
Precision -> 0.0769
Recall -> 0.3333


In [285]:
print(f"Accuracy -> {accuracy:0.4f}")
print(f"F1-Score -> {f1:0.4f}")
print(f"Precision -> {precision:0.4f}")
print(f"Recall -> {recall:0.4f}")

Accuracy -> 0.8274
F1-Score -> 0.1961
Precision -> 0.1435
Recall -> 0.3900
