In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
from sklearn import preprocessing
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy import stats
from scipy.stats import linregress
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, concatenate
from tensorflow.keras.callbacks import EarlyStopping



In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the CSV files
# stored under /content/drive/MyDrive can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the three processed splits (train / validation / test) from the
# mounted Drive folder into pandas DataFrames.
train = pd.read_csv('/content/drive/MyDrive/trainProcessed.csv')
validate = pd.read_csv('/content/drive/MyDrive/validateProcessed.csv')
test = pd.read_csv('/content/drive/MyDrive/testProcessed.csv')

In [None]:
import ast
def convert_string_to_list(string):
    """Parse the text form of a Python list back into a real list.

    Args:
        string: Usually the text of a list literal, e.g. "[['A', 0.5]]".
            A value that is already a list is returned unchanged, so
            applying this function twice to the same column is harmless.

    Returns:
        The parsed list (a parsed tuple is converted to a list). An empty
        list is returned when the value cannot be parsed as a Python
        literal (malformed text, NaN, None, ...) or when it parses to
        something that is neither a list nor a tuple.
    """
    # Already parsed: ast.literal_eval rejects a list object with a
    # ValueError, which previously turned real data into [] on a re-run.
    if isinstance(string, list):
        return string

    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises SyntaxError (not ValueError) for text such as
        # "[1, 2", so every documented parse error is handled here.
        return []

    if isinstance(parsed, tuple):
        return list(parsed)
    # Scalars, dicts, etc. are not usable as a diagnosis list.
    return parsed if isinstance(parsed, list) else []

In [None]:
# Turn the stringified DIFFERENTIAL_DIAGNOSIS column of every split back
# into Python lists (same order as before: train, test, validate).
for split_df in (train, test, validate):
    split_df['DIFFERENTIAL_DIAGNOSIS'] = (
        split_df['DIFFERENTIAL_DIAGNOSIS'].apply(convert_string_to_list)
    )

In [None]:
# Sorted, de-duplicated pathology names seen in the training split; the
# position of a name in this list is used as its column index when encoding.
unique_diseases = sorted(set(train['PATHOLOGY']))

def data_pre(df, unique_diseases):
    """Split a processed frame into features and two encoded targets.

    Args:
        df: DataFrame containing the columns 'DIFFERENTIAL_DIAGNOSIS'
            (per row, an iterable of [disease_name, probability] pairs),
            'PATHOLOGY' and 'INITIAL_EVIDENCE', plus the feature columns.
        unique_diseases: Ordered sequence of disease names; position i
            becomes column i of both encoded targets.

    Returns:
        A tuple (X, y, pathology_encoded):
            X: df without the three columns named above.
            y: DataFrame (one column per disease) holding the probability
               of each listed differential diagnosis and 0 elsewhere.
               Pairs naming a disease outside unique_diseases are ignored.
            pathology_encoded: one-hot DataFrame of 'PATHOLOGY'; rows
               whose pathology is not in unique_diseases stay all-zero.
        y and pathology_encoded carry a fresh 0..n-1 index, not df's index.
    """
    X = df.drop(['DIFFERENTIAL_DIAGNOSIS', 'PATHOLOGY', 'INITIAL_EVIDENCE'], axis=1)

    # Name -> column lookup built once; this replaces a linear `in` plus
    # `.index()` scan for every single diagnosis. setdefault keeps the first
    # position of a repeated name, which is what list.index() returned.
    column_of = {}
    for position, name in enumerate(unique_diseases):
        column_of.setdefault(name, position)

    def _column(name):
        # Return the column of a known disease, or None for anything else.
        try:
            return column_of.get(name)
        except TypeError:
            # Unhashable values can never be a known disease name.
            return None

    n_rows, n_diseases = len(df), len(unique_diseases)

    # DIFFERENTIAL_DIAGNOSIS -> dense (rows x diseases) probability matrix
    y_matrix = np.zeros((n_rows, n_diseases))
    for row, diagnosis_list in enumerate(df['DIFFERENTIAL_DIAGNOSIS']):
        for diagnosis in diagnosis_list:
            # Tuples are accepted as well as lists: a literal such as
            # "('name', 0.3)" parses to a tuple and is just as valid a pair.
            if isinstance(diagnosis, (list, tuple)) and len(diagnosis) == 2:
                disease_name, probability = diagnosis
                col = _column(disease_name)
                if col is not None:
                    y_matrix[row, col] = probability
    y = pd.DataFrame(y_matrix, columns=unique_diseases)

    # PATHOLOGY -> one-hot matrix
    pathology_matrix = np.zeros((n_rows, n_diseases))
    for row, disease in enumerate(df['PATHOLOGY']):
        col = _column(disease)
        if col is not None:
            pathology_matrix[row, col] = 1
    pathology_encoded = pd.DataFrame(pathology_matrix, columns=unique_diseases)

    return X, y, pathology_encoded



# Encode every split with the same disease ordering (derived from the
# training split) so that target columns line up across train/validate/test.
X_train, y_train, pathology_encoded_train = data_pre(train, unique_diseases)
X_validate, y_validate, pathology_encoded_validate = data_pre(validate, unique_diseases)
X_test, y_test, pathology_encoded_test = data_pre(test, unique_diseases)


In [None]:
# GTPA@1: share of samples whose highest-scoring predicted class equals the
# highest-valued class of the reference vector.
class GTPAat1(tf.keras.metrics.Metric):
    """Streaming top-1 agreement between predictions and reference vectors.

    For every sample the arg-max of ``y_pred`` is compared with the arg-max
    of ``y_true``; the metric is the fraction of samples where they match,
    accumulated over all ``update_state`` calls since the last reset.

    Note: the value is the ground-truth-pathology accuracy only when
    ``y_true`` is a one-hot pathology encoding. When the metric is fed a
    probability vector instead, it measures agreement with that vector's
    most probable entry.
    """

    def __init__(self, name='gtpa_at_1', **kwargs):
        super().__init__(name=name, **kwargs)
        # Running counters so the metric can be accumulated batch by batch.
        self.correct_predictions = self.add_weight(name='cp', initializer='zeros')
        self.total_samples = self.add_weight(name='ts', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the counters.

        ``sample_weight`` is accepted for API compatibility but ignored:
        every sample counts equally.
        """
        top1_pred = tf.argmax(y_pred, axis=1)
        true_labels = tf.argmax(y_true, axis=1)
        matches = tf.cast(tf.equal(top1_pred, true_labels), tf.float32)
        self.correct_predictions.assign_add(tf.reduce_sum(matches))
        self.total_samples.assign_add(tf.cast(tf.size(true_labels), tf.float32))

    def result(self):
        """Return matches / samples seen, or 0 when nothing has been seen yet."""
        # divide_no_nan yields 0 instead of NaN for a zero denominator.
        return tf.math.divide_no_nan(self.correct_predictions, self.total_samples)

    def reset_state(self):
        """Zero both counters (the reset hook name used by current Keras)."""
        self.correct_predictions.assign(0.0)
        self.total_samples.assign(0.0)

    def reset_states(self):
        """Deprecated spelling kept for existing callers; same as reset_state()."""
        self.reset_state()


In [None]:
from tensorflow.keras.layers import LeakyReLU

# Feed-forward network: two hidden Dense blocks (1024 and 512 units) with
# LeakyReLU and dropout, then one sigmoid output per disease column of y_train.
model = Sequential([
    Dense(1024, input_dim=X_train.shape[1]),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.3),
    Dense(512),
    LeakyReLU(alpha=0.1),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='sigmoid'),
])

optimizer = Adam(learning_rate=5e-5)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy', GTPAat1()],
)


In [None]:
# Stop when val_loss has not improved for 20 epochs and restore the weights
# of the best epoch; 200 is therefore only an upper bound on epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=128,
    validation_data=(X_validate, y_validate),
    callbacks=[early_stopping],
)

Epoch 1/200

  m.reset_state()


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

In [None]:

# Stand-alone metric instance, separate from the one passed to model.compile().
gtpa_at1_metric=GTPAat1()

# Validation set: score predictions against the one-hot PATHOLOGY encoding
# (not against the differential-diagnosis targets used for fitting).
y_pred_validate = model.predict(X_validate, batch_size=128)
gtpa_at1_metric.update_state(pathology_encoded_validate, y_pred_validate)
gtpa_at1_validate = gtpa_at1_metric.result().numpy()
print("GTPA@1 on Validation Set:", gtpa_at1_validate)


# Clear the counters so the test score does not include validation samples.
gtpa_at1_metric.reset_states()

# Test set: same procedure; y_pred_test is reused by the DDR/DDP cells.
y_pred_test = model.predict(X_test, batch_size=128)
gtpa_at1_metric.update_state(pathology_encoded_test, y_pred_test)
gtpa_at1_test = gtpa_at1_metric.result().numpy()
print("GTPA@1 on Test Set:", gtpa_at1_test)


GTPA@1 on Validation Set: 0.7367646
GTPA@1 on Test Set: 0.7373429


In [None]:
def calculate_DDR(y_true, y_pred, threshold=0.01):
    """Mean per-sample recall of the differential diagnosis (DDR).

    A disease counts as present in a row when its value is strictly greater
    than ``threshold``. Each row scores
    (diseases present in both) / (diseases present in y_true);
    rows with no disease present in y_true score 0.

    Args:
        y_true: 2-D array-like (samples x diseases) of reference values.
        y_pred: 2-D array-like of the same shape with predicted values.
        threshold: Cut-off applied to both inputs.

    Returns:
        The mean of the row scores; 0.0 when there are no rows.
    """
    # np.asarray makes row-wise reduction work for DataFrames and nested
    # lists too; positional `y[i]` indexing on a DataFrame selects a column.
    true_mask = np.asarray(y_true) > threshold
    pred_mask = np.asarray(y_pred) > threshold
    if true_mask.shape[0] == 0:
        return 0.0

    hits = (true_mask & pred_mask).sum(axis=1)
    n_true = true_mask.sum(axis=1)
    # Divide only where the denominator is non-zero; the remaining rows keep
    # the 0 they were initialised with.
    per_row = np.divide(
        hits, n_true, out=np.zeros(hits.shape, dtype=float), where=n_true > 0
    )
    return per_row.mean()


def calculate_DDP(y_true, y_pred, threshold=0.01):
    """Mean per-sample precision of the differential diagnosis (DDP).

    A disease counts as present in a row when its value is strictly greater
    than ``threshold``. Each row scores
    (diseases present in both) / (diseases present in y_pred);
    rows with no disease present in y_pred score 0.

    Args:
        y_true: 2-D array-like (samples x diseases) of reference values.
        y_pred: 2-D array-like of the same shape with predicted values.
        threshold: Cut-off applied to both inputs.

    Returns:
        The mean of the row scores; 0.0 when there are no rows.
    """
    # np.asarray makes row-wise reduction work for DataFrames and nested
    # lists too; positional `y[i]` indexing on a DataFrame selects a column.
    true_mask = np.asarray(y_true) > threshold
    pred_mask = np.asarray(y_pred) > threshold
    if pred_mask.shape[0] == 0:
        return 0.0

    hits = (true_mask & pred_mask).sum(axis=1)
    n_pred = pred_mask.sum(axis=1)
    # Divide only where the denominator is non-zero; the remaining rows keep
    # the 0 they were initialised with.
    per_row = np.divide(
        hits, n_pred, out=np.zeros(hits.shape, dtype=float), where=n_pred > 0
    )
    return per_row.mean()




In [None]:
def calculate_F1(DDR, DDP):
    """Combine DDR and DDP into their harmonic mean (F1).

    Returns 0 when both inputs are exactly zero, which would otherwise
    make the denominator zero; in every other case returns
    2 * DDR * DDP / (DDR + DDP).
    """
    both_zero = not (DDR != 0 or DDP != 0)
    return 0 if both_zero else 2 * (DDR * DDP) / (DDR + DDP)


In [None]:
# Differential-diagnosis targets as NumPy arrays for the metric helpers.
# y_test_np is prepared here as well and is consumed by the test-set cell.
y_validate_np = y_validate.values
y_test_np = y_test.values

# Validation set: recall (DDR), precision (DDP) and their F1, using the
# helpers' default threshold and the predictions from the GTPA@1 cell.
DDR_validate = calculate_DDR(y_validate_np, y_pred_validate)
DDP_validate = calculate_DDP(y_validate_np, y_pred_validate)
F1_validate = calculate_F1(DDR_validate, DDP_validate)
print("DDR on Validation Set:", DDR_validate)
print("DDP on Validation Set:", DDP_validate)
print("F1 on Validation Set:", F1_validate)

DDR on Validation Set: 0.9965352038700647
DDP on Validation Set: 0.906254342272449
F1 on Validation Set: 0.9492530138873967


In [None]:
# Test set: same three metrics; y_test_np and y_pred_test are defined in
# earlier cells, so those cells must have been run first.
DDR_test = calculate_DDR(y_test_np, y_pred_test)
DDP_test = calculate_DDP(y_test_np, y_pred_test)
F1_test = calculate_F1(DDR_test, DDP_test)
print("DDR on Test Set:", DDR_test)
print("DDP on Test Set:", DDP_test)
print("F1 on Test Set:", F1_test)

DDR on Test Set: 0.9965999921550472
DDP on Test Set: 0.9046718765537546
F1 on Test Set: 0.9484135329774365
