In [1]:
import pandas as pd 
import numpy as np 

df_train = pd.read_csv('clean_data/train.csv')
df_test = pd.read_csv('clean_data/test.csv')

target_cols = ['EC1', 'EC2']
num_cols = ['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
            'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
            'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
            'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
            'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
            'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
            'fr_COO', 'fr_COO2']
binary_cols = ['EC3', 'EC4', 'EC5', 'EC6']

x_train = df_train[num_cols].to_numpy()
y_train = df_train[target_cols].to_numpy()

x_test = df_test[num_cols].to_numpy()

In [2]:
from sklearn.model_selection import train_test_split

x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [3]:
import tensorflow as tf
from sklearn.metrics import roc_auc_score

# Define the model architecture
model_ec1 = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_ec2 = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with AUC-ROC as the metric
model_ec1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(curve='ROC')])

model_ec2.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(curve='ROC')])


# Train the model
model_ec1.fit(x_train, y_train[:, 0], epochs=10, batch_size=32, validation_data=(x_cv, y_cv[:, 0]))
print('\n', '='*150, '\n')
model_ec2.fit(x_train, y_train[:, 1], epochs=10, batch_size=32, validation_data=(x_cv, y_cv[:, 1]))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x297e6947ad0>

In [4]:
# Predict probabilities on the test set
cv_y_pred_prob_1 = model_ec1.predict(x_cv)
cv_y_pred_prob_2 = model_ec2.predict(x_cv)

train_y_pred_prob_1 = model_ec1.predict(x_train)
train_y_pred_prob_2 = model_ec2.predict(x_train)

# Calculate AUC-ROC score on the test set
train_auc_roc_1 = roc_auc_score(y_train[:, 0], train_y_pred_prob_1)
train_auc_roc_2 = roc_auc_score(y_train[:, 1], train_y_pred_prob_2)

cv_auc_roc_1 = roc_auc_score(y_cv[:, 0], cv_y_pred_prob_1)
cv_auc_roc_2 = roc_auc_score(y_cv[:, 1], cv_y_pred_prob_2)

print('\n', '='*150, '\n')

print("Train AUC-ROC score 1:", train_auc_roc_1)
print("Train AUC-ROC score 2:", train_auc_roc_2)

print("CV AUC-ROC score 1:", cv_auc_roc_1)
print("CV AUC-ROC score 2:", cv_auc_roc_2)

print("\nAvg AUC-ROC score: ", (cv_auc_roc_1 + cv_auc_roc_2)/2)



Train AUC-ROC score 1: 0.7278870604677388
Train AUC-ROC score 2: 0.6335723646826583
CV AUC-ROC score 1: 0.697597253769175
CV AUC-ROC score 2: 0.5888200557511737
Avg AUC-ROC score:  0.6432086547601743


In [5]:
y_pred_1, y_pred_2 = model_ec1.predict(x_test), model_ec2.predict(x_test)

ids = df_test['id']

# creating submission file
df_y_pred_1 = pd.DataFrame({'EC1': y_pred_1})

df_y_pred_2 = pd.DataFrame({'EC2': y_pred_2})

df_ids = pd.DataFrame({'id': ids})

result = pd.concat([df_ids, df_y_pred_1, df_y_pred_2], axis=1)

result.to_csv('submissions/submission_2_neural_net_2.csv', index=False)



ValueError: Per-column arrays must each be 1-dimensional