In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [37]:
# Load data
train_data = pd.read_csv('/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch2/nph_vs_nc_train.csv')
test_data = pd.read_csv('/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch2/nph_vs_nc_test.csv')

In [38]:
# Assume 'Diagnosis' is the target column
X_train = train_data.drop(['Diagnosis', 'ID_1'], axis=1)
y_train = train_data['Diagnosis']
X_test = test_data.drop(['Diagnosis', 'ID_1'], axis=1)
y_test = test_data['Diagnosis']

In [39]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [40]:
# Build model
model = Sequential()
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))  # First hidden layer
model.add(Dense(64, activation='relu'))  # Second hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

In [41]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
model.fit(X_train_scaled, y_train, epochs=10, batch_size=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe44dca8650>

In [43]:
# Evaluate model
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=1)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 1.45769202709198, Test Accuracy: 0.7599999904632568


**Test 2**

In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [124]:
# Load data
train_data = pd.read_csv('/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch2/NC_vs_VaD_train.csv')
test_data = pd.read_csv('/home/aghasemi/CompBio481/datasets/processed_datasets/usable_datasets_branch2/NC_vs_VaD_test.csv')

In [125]:
# Prepare training data
X_train = train_data.drop(['Diagnosis', 'ID_1'], axis=1)
y_train = train_data['Diagnosis']
X_test = test_data.drop(['Diagnosis', 'ID_1'], axis=1)
y_test = test_data['Diagnosis']

In [126]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [127]:
# Define 5-fold stratified cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
loss_per_fold = []
accuracy_per_fold = []

In [128]:
for train_index, test_index in skf.split(X_train_scaled, y_train):
    # Split data
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Build model
    model = Sequential([
        Dense(128, input_dim=X_train_fold.shape[1], activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit model with early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train_fold, y_train_fold, epochs=50, batch_size=10, verbose=1, validation_data=(X_val_fold, y_val_fold), callbacks=[early_stopping])

    # Evaluate model on the validation fold
    scores = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    loss_per_fold.append(scores[0])
    accuracy_per_fold.append(scores[1] * 100)
    fold_no += 1

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Score for fold 1: loss of 0.4990762174129486; accuracy of 80.32786846160889%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Score for fold 2: loss of 0.689996063709259; accuracy of 73.77049326896667%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Score for fold 3: loss of 0.7192360162734985; accuracy of 80.32786846160889%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Score for fold 4: loss of 0.49869340658187866; accuracy of 83.33333134651184%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Score for fold 5: loss of 0.4521109163761139; accuracy of 86.66666746139526%


In [129]:
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(accuracy_per_fold)):
    print('------------------------------------------------------------------------')
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {accuracy_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(accuracy_per_fold)} (+- {np.std(accuracy_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.4990762174129486 - Accuracy: 80.32786846160889%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.689996063709259 - Accuracy: 73.77049326896667%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.7192360162734985 - Accuracy: 80.32786846160889%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.49869340658187866 - Accuracy: 83.33333134651184%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.4521109163761139 - Accuracy: 86.66666746139526%
------------------------------------------------------------------------
Average scores for all folds:
> Accuracy: 80.88524580001831 (+- 4.258145179357782)
> Loss: 0.5718225240707397
-----------------------------

In [130]:
# Final evaluation on the test data
final_scores = model.evaluate(X_test_scaled, y_test, verbose=1)
print(f'Final Test Loss: {final_scores[0]}, Final Test Accuracy: {final_scores[1]*100}%')

Final Test Loss: 0.9559434652328491, Final Test Accuracy: 72.36841917037964%


In [131]:
# Prepare the data in a DataFrame
data = {
    'Final Train Accuracy': [np.mean(accuracy_per_fold)],
    'Final Test Accuracy': [final_scores[1] * 100]
}
results_df = pd.DataFrame(data)

In [132]:
# Save the DataFrame to a CSV file
results_df.to_csv('NC_vs_VaD_accuracy_results.csv', index=False)