In [197]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [198]:
random_state = 42
np.random.seed(random_state)

In [199]:
data_path = "KEC_SAC_radiology_data_for_CS_8.3.2022.csv"
data = pd.read_csv(data_path, header=0, names=["study_id", "label", "text"])
data.head()

Unnamed: 0,study_id,label,text
0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,3,0,Reason for Exam: KNOWN MULTILEVEL DEGENERATIVE...
2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRESSIV...
3,5,0,MR CERVICAL SPINE Reason for Exam: HAS HX O...
4,6,0,MRI lumbar spine Comparison: No prior ...


In [200]:
k = 5
fold = 0

accuracy = 0
f1 = 0
precision = 0
recall = 0

folds = KFold(n_splits=k, random_state=random_state, shuffle=True)

for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train = data['text'][train_index], data['label'][train_index]
    X_test, Y_test = data['text'][test_index], data['label'][test_index]
    
    vectorizer = CountVectorizer(max_features = 500, lowercase=True, max_df=0.90, min_df=0.01)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    # lr = LogisticRegression(penalty='l1', solver='liblinear', C=0.16, random_state=random_state, class_weight='balanced')
    # lr.fit(X_train, Y_train)
    # Y_pred_lr = lr.predict(X_test)
    
    
    # SVM = SVC(C=9, kernel='linear', degree=3, gamma='auto')
    # SVM.fit(X_train, Y_train)
    # Y_pred_SVM = SVM.predict(X_test)
    
    ANN = MLPClassifier(solver='lbfgs', hidden_layer_sizes = (50,), learning_rate_init = 0.00003, random_state=random_state)
    ANN.fit(X_train, Y_train)
    Y_pred = ANN.predict(X_test)
    
    accuracy += accuracy_score(Y_pred, Y_test)/k
    f1 += f1_score(Y_pred, Y_test)/k
    precision += precision_score(Y_pred, Y_test)/k
    recall += recall_score(Y_pred, Y_test)/k
    
    print(f"Fold {fold}")
    print(f"Accuracy -> {accuracy_score(Y_pred, Y_test):0.4f}")
    print(f"F1-Score -> {f1_score(Y_pred, Y_test):0.4f}")
    print(f"Precision -> {precision_score(Y_pred, Y_test):0.4f}")
    print(f"Recall -> {recall_score(Y_pred, Y_test):0.4f}")


Fold 1
Accuracy -> 0.7333
F1-Score -> 0.1667
Precision -> 0.2500
Recall -> 0.1250
Fold 2
Accuracy -> 0.8649
F1-Score -> 0.2857
Precision -> 0.2857
Recall -> 0.2857
Fold 3
Accuracy -> 0.8108
F1-Score -> 0.4167
Precision -> 0.2941
Recall -> 0.7143
Fold 4
Accuracy -> 0.7973
F1-Score -> 0.5161
Precision -> 0.5714
Recall -> 0.4706
Fold 5
Accuracy -> 0.7703
F1-Score -> 0.1905
Precision -> 0.1538
Recall -> 0.2500


In [201]:
print(f"Accuracy -> {accuracy:0.4f}")
print(f"F1-Score -> {f1:0.4f}")
print(f"Precision -> {precision:0.4f}")
print(f"Recall -> {recall:0.4f}")

Accuracy -> 0.7953
F1-Score -> 0.3151
Precision -> 0.3110
Recall -> 0.3691
