In [1]:
import pandas as pd 
import numpy as np 

df_train = pd.read_csv('clean_data/train.csv')
df_test = pd.read_csv('clean_data/test.csv')

target_cols = ['EC1', 'EC2']
num_cols = ['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
            'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
            'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
            'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
            'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
            'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
            'fr_COO', 'fr_COO2']
binary_cols = ['EC3', 'EC4', 'EC5', 'EC6']

x_train = df_train[num_cols].to_numpy()
y_train = df_train[target_cols].to_numpy()

x_test = df_test[num_cols].to_numpy()

In [2]:
from sklearn.model_selection import train_test_split

x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [5]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_auc_score

# Define the custom metric function
def auc_roc(y_true, y_pred):
    auc_roc_values = tf.py_function(roc_auc_score, (y_true, y_pred[:, 1]), tf.float32)
    return auc_roc_values

# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],),
                          kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(2, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy', auc_roc])  # Add the custom metric

# Train the model
model.fit(x_train, y_train, validation_data=(x_cv, y_cv), epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b4c2f044d0>

In [6]:
# Predict probabilities for the training set
y_pred_proba_train = model.predict(x_train)

# Calculate the AUC-ROC score for each category in training data
auc_scores_train = []
for i in range(2):
    y_true_train = (y_train == i).astype(int)
    auc_score_train = roc_auc_score(y_true_train, y_pred_proba_train[:, i])
    auc_scores_train.append(auc_score_train)

# Calculate the average AUC-ROC score for training data
avg_auc_score_train = sum(auc_scores_train) / len(auc_scores_train)

# Predict probabilities for the cross-validation set (as you already have in your code)
y_pred_proba_cv = model.predict(x_cv)

# Calculate the AUC-ROC score for each category in cross-validation data (as you already have in your code)
auc_scores_cv = []
for i in range(2):
    y_true_cv = (y_cv == i).astype(int)
    auc_score_cv = roc_auc_score(y_true_cv, y_pred_proba_cv[:, i])
    auc_scores_cv.append(auc_score_cv)

# Calculate the average AUC-ROC score for cross-validation data (as you already have in your code)
avg_auc_score_cv = sum(auc_scores_cv) / len(auc_scores_cv)

# Print the AUC-ROC scores for training and cross-validation data
print("AUC-ROC Score for Training Data:")
print(f"Category 1: {auc_scores_train[0]}")
print(f"Category 2: {auc_scores_train[1]}")
print(f"Average: {avg_auc_score_train}")

print("\nAUC-ROC Score for Cross-Validation Data:")
print(f"Category 1: {auc_scores_cv[0]}")
print(f"Category 2: {auc_scores_cv[1]}")
print(f"Average: {avg_auc_score_cv}")

AUC-ROC Score for Training Data:
Category 1: 0.6964839595887984
Category 2: 0.6964839595887984
Average: 0.6964839595887984

AUC-ROC Score for Cross-Validation Data:
Category 1: 0.7049662445323346
Category 2: 0.7049659716947362
Average: 0.7049661081135354


In [9]:
y_pred = model.predict(x_test)

y_pred_1, y_pred_2 = y_pred[:, 0], y_pred[:, 1]

ids = df_test['id']

# creating submission file
df_y_pred_1 = pd.DataFrame({'EC1': y_pred_1})

df_y_pred_2 = pd.DataFrame({'EC2': y_pred_2})

df_ids = pd.DataFrame({'id': ids})

result = pd.concat([df_ids, df_y_pred_1, df_y_pred_2], axis=1)

result.to_csv('submissions/submission_2_neural_net_2.csv', index=False)

