In [9]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [10]:
random_state = 42
np.random.seed(random_state)

In [11]:
data_path = "KEC_SAC_radiology_data_for_CS_8.3.2022.csv"
data = pd.read_csv(data_path, header=0, names=["study_id", "label", "text"])
data.head()

Unnamed: 0,study_id,label,text
0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,3,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE...
2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
3,5,0,MR CERVICAL SPINE Reason for Exam: HAS HX O...
4,6,0,MRI lumbar spine Comparison: No prior ...


# Baseline Unigrams
* Lowercase all words (distinction between uppercase and lowercase is not important)
* Remove words that occur in more 90% of training texts (hopefully removes stop words such as 'a' and 'the' that don't provide much meaning)
* Remove words that occur in less than 1% training texts (easy way to remove mispelled words)

In [12]:
k = 5
fold = 0

accuracy = 0
f1 = 0
precision = 0
recall = 0

folds = KFold(n_splits=k, random_state=random_state, shuffle=True)

for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train = data['text'][train_index], data['label'][train_index]
    X_test, Y_test = data['text'][test_index], data['label'][test_index]
    
    vectorizer = TfidfVectorizer(lowercase=True, max_df=0.90, min_df=0.01)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    
    dum = DummyClassifier(strategy='stratified', random_state=random_state)
    dum = dum.fit(X_train, Y_train)
    
    Y_pred = dum.predict(X_test)
    
    accuracy += accuracy_score(Y_pred, Y_test)/k
    f1 += f1_score(Y_pred, Y_test)/k
    precision += precision_score(Y_pred, Y_test)/k
    recall += recall_score(Y_pred, Y_test)/k
    
    print(f"Fold {fold}")
    print(f"Accuracy -> {accuracy_score(Y_pred, Y_test):0.4f}")
    print(f"F1-Score -> {f1_score(Y_pred, Y_test):0.4f}")
    print(f"Precision -> {precision_score(Y_pred, Y_test):0.4f}")
    print(f"Recall -> {recall_score(Y_pred, Y_test):0.4f}")


Fold 1
Accuracy -> 0.7200
F1-Score -> 0.0000
Precision -> 0.0000
Recall -> 0.0000
Fold 2
Accuracy -> 0.8378
F1-Score -> 0.4000
Precision -> 0.5714
Recall -> 0.3077
Fold 3
Accuracy -> 0.7027
F1-Score -> 0.2143
Precision -> 0.1765
Recall -> 0.2727
Fold 4
Accuracy -> 0.6892
F1-Score -> 0.0800
Precision -> 0.0714
Recall -> 0.0909
Fold 5
Accuracy -> 0.6757
F1-Score -> 0.0000
Precision -> 0.0000
Recall -> 0.0000


In [13]:
print(f"Accuracy -> {accuracy:0.4f}")
print(f"F1-Score -> {f1:0.4f}")
print(f"Precision -> {precision:0.4f}")
print(f"Recall -> {recall:0.4f}")

Accuracy -> 0.7251
F1-Score -> 0.1389
Precision -> 0.1639
Recall -> 0.1343
