In [None]:
# Import libraries
%matplotlib inline
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import seaborn as sns
import re
from sklearn import preprocessing
#import keras
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
plt.style.use('seaborn')

In [None]:
# Import OTU data.
# The path is handed to pandas directly so that pandas opens AND closes the file;
# the earlier open(...) handle was never closed.
df = pd.read_csv('df_all.csv', skip_blank_lines=True)

# Add 1 to all counts and log10-transform.
# The +1 pseudocount keeps zero counts finite under the logarithm (log10(0 + 1) = 0).
# 'Otu8':'Otu819' is a label-based column slice, inclusive at both ends.
df_plus = df.copy()
df_plus.loc[:, 'Otu8':'Otu819'] = df.loc[:, 'Otu8':'Otu819'] + 1
df_log10 = df_plus.copy()
df_log10.loc[:, 'Otu8':'Otu819'] = np.log10(df_plus.loc[:, 'Otu8':'Otu819'])

# Grab original OTU labels
OTU_columns = df_plus.loc[:, 'Otu8':'Otu819'].columns

# Import clustering results (first CSV column becomes the row index)
cluster_tally = pd.read_csv('cluster_tally.csv', index_col=0)

In [1]:
# Representative OTUs picked by hierarchical clustering (column labels stored in a .npy file)
closest_OTUs = np.load("closest_OTUs.npy")

# log10 counts restricted to those OTUs, with SampleID placed as the first column
rep_OTUs = df_log10[closest_OTUs]
rep_OTUs.insert(0, 'SampleID', df_log10['SampleID'].values)

# Standardize the OTU columns only (SampleID is left out of the scaler input).
# fit_transform returns a bare array, so the OTU labels are re-attached as column names.
scaler = preprocessing.StandardScaler()
rep_OTUs_scaled = pd.DataFrame(
    scaler.fit_transform(rep_OTUs.drop(columns=['SampleID'])),
    columns=rep_OTUs.columns[1:],
)

# Put SampleID back as the first column (positional: row order matches df_log10)
rep_OTUs_scaled.insert(0, 'SampleID', df_log10['SampleID'].values)

NameError: name 'np' is not defined

In [None]:
# Representative OTUs picked by Gaussian clustering (column labels stored in a .npy file)
closest_OTUs_Gaussian = np.load("closest_OTUs_Gaussian.npy")

# log10 counts restricted to those OTUs, with SampleID placed as the first column
rep_OTUs_Gaussian = df_log10[closest_OTUs_Gaussian]
rep_OTUs_Gaussian.insert(0, 'SampleID', df_log10['SampleID'].values)

# Standardize the OTU columns only (SampleID is left out of the scaler input).
# fit_transform returns a bare array, so the OTU labels are re-attached as column names.
scaler = preprocessing.StandardScaler()
rep_OTUs_Gaussian_scaled = pd.DataFrame(
    scaler.fit_transform(rep_OTUs_Gaussian.drop(columns=['SampleID'])),
    columns=rep_OTUs_Gaussian.columns[1:],
)

# Put SampleID back as the first column (positional: row order matches df_log10)
rep_OTUs_Gaussian_scaled.insert(0, 'SampleID', df_log10['SampleID'].values)

In [None]:
# Import representative OTUs from Dirichlet clustering.
# Only the parsed header is used: the fields of the first row become the OTU labels.
# The separator is a raw string so the regex `\s` is spelled explicitly instead of
# relying on Python passing an unrecognised string escape through unchanged.
closest_OTUs_Dirichlet = pd.read_csv('Dirichlet_OTUs.txt',sep=r'\s',engine='python').columns

# Grab log10 counts of representative OTUs
rep_OTUs_Dirichlet = df_log10[closest_OTUs_Dirichlet]

# Append the SampleID column to the left
rep_OTUs_Dirichlet.insert(loc=0, column='SampleID', value=df_log10.loc[:,'SampleID'].values)

# Standardize these representative OTU counts (SampleID is excluded from the scaler input)
rep_OTUs_Dirichlet_scaled = rep_OTUs_Dirichlet.copy()
rep_OTUs_Dirichlet_scaled = rep_OTUs_Dirichlet_scaled.drop(labels=['SampleID'],axis=1)

scaler = preprocessing.StandardScaler()
# fit_transform returns a bare array, so the OTU labels are re-attached afterwards
rep_OTUs_Dirichlet_scaled = pd.DataFrame(scaler.fit_transform(rep_OTUs_Dirichlet_scaled))
rep_OTUs_Dirichlet_scaled.columns = rep_OTUs_Dirichlet.columns[1:]

# Finally add back the SampleID column (positional: row order matches df_log10)
rep_OTUs_Dirichlet_scaled.insert(loc=0, column='SampleID', value=df_log10.loc[:,'SampleID'].values)

In [None]:
# Group the OTUs according to their clusters, then sum up the log10 counts.
# Each row of cluster_tally holds the OTU labels of one cluster; NaN entries are
# dropped before the labels are used as column names of df_log10.
# The sum is done once per cluster over whole columns instead of cell by cell,
# which gives numeric (float) columns rather than object-dtype ones.
# NOTE(review): assumes cluster_tally has a unique index — confirm.
cluster_sums = {}
for cluster_id, tally_row in cluster_tally.iterrows():
    members = tally_row.dropna().tolist()
    # skipna=False keeps the plain-addition semantics: a NaN count yields a NaN sum
    cluster_sums[cluster_id] = df_log10[members].sum(axis=1, skipna=False)

df_log10_sums = pd.DataFrame(cluster_sums, index=df_log10.index, columns=cluster_tally.index)

# Finally, append the SampleID column to the left
df_log10_sums.insert(loc=0, column='SampleID', value=df_log10.loc[:,'SampleID'].values)

In [None]:
## Merge the clustered OTU data with waterchem data
# Grab waterchem data (path passed directly so pandas closes the file itself;
# the earlier open(...) handle was never closed)
df_waterchem = pd.read_csv('df_waterchem.csv', skip_blank_lines=True)

# Append OTU data to the right.
# Left join on SampleID: every waterchem row is kept, and samples without OTU data get NaN.
df_everything = pd.merge(df_waterchem, rep_OTUs_scaled, how='left', on='SampleID')

In [None]:
# Sanity check: print the per-column mean and standard deviation of the merged table
# (axis=0 reduces over rows, giving one value per column)
print(f"Averages are: \n{np.mean(df_everything, axis=0)} \n")
print(f"Standard deviations are: \n{np.std(df_everything, axis=0)} \n")

## 1. MDA for benchmark case (water chem variables only)

In [None]:
# Features: every column except the sample identifier and NRR/NRF/SeRR/SeRF
# (SeRR is the target below; the other three are presumably alternative responses — confirm)
X = df_everything.drop(columns=['SampleID', 'NRR', 'NRF', 'SeRR', 'SeRF'])
# Target kept as a one-column DataFrame so a 'class' column can be added to it later
y = df_everything['SeRR'].to_frame()
n_feats = len(X.columns)

In [None]:
# What are the waterchem variables?
# Taken from X directly (label slice 'EBCT' through 'FBR3', inclusive). The previous
# version read X_train_water, which is only created by a later cell, so this cell
# failed on a fresh kernel. The column labels are the same either way.
waterchem_var = X.loc[:, 'EBCT':'FBR3'].columns

# How many waterchem variables are there?
num_waterchem_var = waterchem_var.shape[0]

print(num_waterchem_var)
print(waterchem_var)

Note that each model we train corresponds to a randomly selected training/testing split, as well as a random permutation of one variable. Therefore, we need to repeat this procedure $n_{trials}$ times (`n_MDA_runs` in the code below) and average the results to get a reasonable estimate of the MDA values.

### 1.1 Base-case model with no variables permuted

In [None]:
# Number of repeated MDA experiments (each one uses a fresh random train/test split)
n_MDA_runs = 10

# Accuracy buffer: one row per waterchem variable, one column per run
shuffled_accuracies = np.zeros((num_waterchem_var, n_MDA_runs))

In [None]:
# Base case: train on the UNPERMUTED waterchem variables and record the test accuracy
# of each run. Changes from the previous version of this cell:
#   * no variable is shuffled (the old code shuffled `permutate_var`, which is not
#     defined until a later cell and contradicts the "no variables permuted" base case);
#   * the output layer emits raw logits, because softmax_cross_entropy_with_logits_v2
#     applies softmax itself (the old code applied softmax twice);
#   * the default graph is reset for every model so it does not grow across runs.

# Specify desired test fraction:
test_frac = 0.4 # No hyperparameter selection, so no validation set

# Obtain categorical values of y: (class 0 if SeRR >= 0, class 1 if SeRR < 0)
y['class'] = np.where(y['SeRR']>=0, 0, 1)
y_class = y['class'].values
C = np.unique(y_class).shape[0] # Number of classes

y_onehot = np.eye(C)[y_class.astype(int)] # Convert categorical to one-hot-encoding
y_onehot = y_onehot.reshape(y_class.shape[0],C) # Reshape into dimensions (n_t rows by C columns)

# Specify hyperparameters (loop-invariant, so set once)
npl = 20 # Number of neurons per layer
n_hidden = 10 # Number of hidden layers
lrate = 0.01 # Learning rate for the Adam optimizer
epochs = 10 # Total number of iterations
spe = 20 # Steps per epoch: one step is one full-batch parameter update
actf = tf.nn.relu # Activation function
alpha = 0.1 # Magnitude of regularizer
reg = tf.contrib.layers.l2_regularizer(scale=alpha) # Regularizer function
# NOTE(review): the regularization losses created via kernel_regularizer are never
# added to cross_ent below, so alpha currently has no effect on training — confirm intent.

dimof_output = C # Dimension of y

model_num = 0

for i in range(0,n_MDA_runs):
    ### STEP 0: Randomly split data into training and testing.
    X_train, X_test, y_train, y_test = train_test_split(X,y_onehot,test_size=test_frac,shuffle=True)

    ## Now grab the waterchem data input subset:
    X_train_water = X_train.loc[:,'EBCT':'FBR3']
    X_test_water = X_test.loc[:,'EBCT':'FBR3']

    ### STEP 1: TRAIN ANN ON THE UNSHUFFLED DATASET, THEN EVALUATE ACCURACY:

    # Start from an empty graph so ops from earlier models are discarded
    tf.reset_default_graph()

    X_ANN = tf.placeholder(tf.float32,shape=[None,X_train_water.shape[1]])
    y_true_ANN = tf.placeholder(tf.float32,shape=[None,C])

    ## Specify neural net architecture

    # Use a dictionary setup to generalize number of layers
    hlayer = dict()
    hlayer[0] = tf.layers.dense(X_ANN,npl,activation=actf,kernel_regularizer=reg) # Specify first layer

    # Now specify layers 2 through (n_hidden), assuming the same activation function is used throughout
    for el in range(1,n_hidden):
        hlayer[el] = tf.layers.dense(hlayer[el-1],npl,activation=actf,kernel_regularizer=reg)

    # Linear output (logits) for the loss; softmax probabilities for prediction
    logits = tf.layers.dense(hlayer[n_hidden-1],dimof_output,activation=None)
    outlayer = tf.nn.softmax(logits)

    cross_ent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true_ANN,logits=logits))
    optimizer = tf.train.AdamOptimizer(lrate)
    train_ANN = optimizer.minimize(cross_ent)

    ## Train neural net for classification
    init2 = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init2)

        for epoch in range(epochs):
            for step in range(spe):
                sess.run(train_ANN,feed_dict={X_ANN:X_train_water,y_true_ANN:y_train})

        predictions = outlayer.eval(feed_dict={X_ANN:X_test_water})

    model_num += 1
    print("Model {}".format(model_num))

    # Evaluate accuracy on test set: a sample counts as correct when its rounded
    # class probabilities match the one-hot label in every position
    predictions_r = predictions.round(0)
    ANN_test_acc_waterchem = np.sum(np.all(np.equal(predictions_r,y_test),axis=1))/y_test.shape[0]

    # The base accuracy does not depend on a variable, so the scalar is broadcast
    # to every row of this run's column
    shuffled_accuracies[:,i] = ANN_test_acc_waterchem

In [None]:
# Display the accuracies (in %) of all experiments in a dataframe:
# one row per waterchem variable, one column per run (labelled 1..n_MDA_runs).
acc_df = pd.DataFrame(shuffled_accuracies[:, :n_MDA_runs] * 100,
                      columns=range(1, n_MDA_runs + 1))
acc_df.insert(0, 'Variable Permutated', waterchem_var)

# Finally, calculate average accuracy values across runs.
# Computed as one vectorised row-wise mean instead of a per-row .loc loop,
# so the column is numeric from the start.
acc_base_df = pd.DataFrame({'Variable': waterchem_var})
acc_base_df['Base Accuracy (%)'] = (
    acc_df.drop(columns=['Variable Permutated']).mean(axis=1).astype(float).round(1)
)
acc_base_df

### 1.2 MDA Experiments

In [None]:
# MDA experiments on the waterchem variables: for every run and every variable,
# permute that one variable (independently in the training and the test set),
# retrain the network on the permuted training set and record the test accuracy.
# Changes from the previous version of this cell:
#   * the output layer emits raw logits, because softmax_cross_entropy_with_logits_v2
#     applies softmax itself (the old code applied softmax twice);
#   * the default graph is reset for every model so it does not grow across models.
X = df_everything.drop(labels=['SampleID','NRR','NRF','SeRR','SeRF'],axis=1)
y = pd.DataFrame(df_everything['SeRR'])
n_feats = X.columns.shape[0]

# Specify desired test fraction:
test_frac = 0.4 # No hyperparameter selection, so no validation set

# Obtain categorical values of y: (class 0 if SeRR >= 0, class 1 if SeRR < 0)
y['class'] = np.where(y['SeRR']>=0, 0, 1)
y_class = y['class'].values
C = np.unique(y_class).shape[0] # Number of classes

y_onehot = np.eye(C)[y_class.astype(int)] # Convert categorical to one-hot-encoding
y_onehot = y_onehot.reshape(y_class.shape[0],C) # Reshape into dimensions (n_t rows by C columns)

# Specify hyperparameters (loop-invariant, so set once)
npl = 20 # Number of neurons per layer
n_hidden = 10 # Number of hidden layers
lrate = 0.01 # Learning rate for the Adam optimizer
epochs = 10 # Total number of iterations
spe = 20 # Steps per epoch: one step is one full-batch parameter update
actf = tf.nn.relu # Activation function
alpha = 0.1 # Magnitude of regularizer
reg = tf.contrib.layers.l2_regularizer(scale=alpha) # Regularizer function
# NOTE(review): the regularization losses created via kernel_regularizer are never
# added to cross_ent below, so alpha currently has no effect on training — confirm intent.

dimof_output = C # Dimension of y

model_num = 0

for i in range(0,n_MDA_runs):
    ### STEP 0: Randomly split data into training and testing.
    X_train, X_test, y_train, y_test = train_test_split(X,y_onehot,test_size=test_frac,shuffle=True)

    ## Now grab the waterchem data input subset:
    X_train_water = X_train.loc[:,'EBCT':'FBR3']
    X_test_water = X_test.loc[:,'EBCT':'FBR3']

    acc_array = []
    for j in range(0,num_waterchem_var):
        ### STEP 1: PERMUTATE THE SPECIFIC VARIABLE

        # Select one of these variables to randomly permutate
        permutate_var = waterchem_var[j]

        ## For TRAINING SET: shuffle this variable's values and put them back in a copy
        X_train_water_shuffled = X_train_water.copy()
        X_train_water_shuffled[permutate_var] = np.random.permutation(X_train_water.loc[:,permutate_var].values)

        ## For TESTING SET: same, with an independent permutation
        X_test_water_shuffled = X_test_water.copy()
        X_test_water_shuffled[permutate_var] = np.random.permutation(X_test_water.loc[:,permutate_var].values)

        ### STEP 2: RE-TRAIN ANN USING SHUFFLED DATASET, THEN EVALUATE ACCURACY:

        # Start from an empty graph so ops from earlier models are discarded
        tf.reset_default_graph()

        X_ANN = tf.placeholder(tf.float32,shape=[None,X_train_water_shuffled.shape[1]])
        y_true_ANN = tf.placeholder(tf.float32,shape=[None,C])

        ## Specify neural net architecture

        # Use a dictionary setup to generalize number of layers
        hlayer = dict()
        hlayer[0] = tf.layers.dense(X_ANN,npl,activation=actf,kernel_regularizer=reg) # Specify first layer

        # Now specify layers 2 through (n_hidden), assuming the same activation function is used throughout
        for el in range(1,n_hidden):
            hlayer[el] = tf.layers.dense(hlayer[el-1],npl,activation=actf,kernel_regularizer=reg)

        # Linear output (logits) for the loss; softmax probabilities for prediction
        logits = tf.layers.dense(hlayer[n_hidden-1],dimof_output,activation=None)
        outlayer = tf.nn.softmax(logits)

        cross_ent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true_ANN,logits=logits))
        optimizer = tf.train.AdamOptimizer(lrate)
        train_ANN = optimizer.minimize(cross_ent)

        ## Train neural net for classification
        init2 = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init2)

            for epoch in range(epochs):
                for step in range(spe):
                    sess.run(train_ANN,feed_dict={X_ANN:X_train_water_shuffled,y_true_ANN:y_train})

            predictions = outlayer.eval(feed_dict={X_ANN:X_test_water_shuffled})

        model_num += 1
        print("Model {}".format(model_num))

        # Evaluate accuracy on test set: a sample counts as correct when its rounded
        # class probabilities match the one-hot label in every position
        predictions_r = predictions.round(0)
        ANN_test_acc_waterchem_shuffled = np.sum(np.all(np.equal(predictions_r,y_test),axis=1))/y_test.shape[0]
        acc_array.append(ANN_test_acc_waterchem_shuffled)

    # Column i holds this run's accuracy for every permuted variable
    acc_array = np.asarray(acc_array)
    shuffled_accuracies[:,i] = acc_array

In [None]:
# Display the accuracies (in %) of all experiments in a dataframe:
# one row per permuted waterchem variable, one column per run (labelled 1..n_MDA_runs).
acc_df = pd.DataFrame(shuffled_accuracies[:, :n_MDA_runs] * 100,
                      columns=range(1, n_MDA_runs + 1))
acc_df.insert(0, 'Variable Permutated', waterchem_var)

# Finally, calculate average MDA values:
# mean accuracy with the variable permuted minus the base accuracy for the same row
# (negative values mean that permuting the variable lowered the accuracy).
# Computed as one vectorised expression instead of a per-row .loc loop;
# the subtraction aligns on the shared 0..n-1 row index.
acc_avg_df = pd.DataFrame({'Variable Permutated': waterchem_var})
acc_avg_df['Avg Acc Increase (%)'] = (
    acc_df.drop(columns=['Variable Permutated']).mean(axis=1)
    - acc_base_df['Base Accuracy (%)']
).astype(float).round(1)
acc_avg_df

In [None]:
# Select the waterchem variables whose permutation gave the smallest (most negative)
# average accuracy change, i.e. the most important ones according to MDA
num_waterchem_smallest = 4

# First num_waterchem_var rows of the MDA table (positional slice)
acc_avg_df_waterchem = acc_avg_df.iloc[:num_waterchem_var]

top_waterchem = acc_avg_df_waterchem.nsmallest(
    n=num_waterchem_smallest,
    columns='Avg Acc Increase (%)',
    keep='first',
)

display(top_waterchem)

## 2. MDA for water chem + representative OTUs from hierarchical clustering

In [None]:
## Merge the clustered OTU data with waterchem data
# Grab waterchem data (path passed directly so pandas closes the file itself;
# the earlier open(...) handle was never closed)
df_waterchem = pd.read_csv('df_waterchem.csv', skip_blank_lines=True)

# Append OTU data to the right.
# Left join on SampleID: every waterchem row is kept, and samples without OTU data get NaN.
df_everything_hierarch = pd.merge(df_waterchem, rep_OTUs_scaled, how='left', on='SampleID')

In [None]:
# Feature table for this section: waterchem variables + representative OTU variables
# (the sample identifier and the NRR/NRF/SeRR/SeRF columns are removed)
df_everything = df_everything_hierarch.drop(columns=['SampleID', 'NRR', 'NRF', 'SeRR', 'SeRF'])

# Labels of all remaining variables
everything_var = df_everything.columns

# Total number of variables (waterchem and OTU together)
num_everything_var = len(everything_var)

print(num_everything_var)
print(everything_var)

In [None]:
# Number of repeated MDA experiments (each one uses a fresh random train/test split)
n_MDA_runs = 10

# Accuracy buffer: one row per variable (waterchem + OTU), one column per run
shuffled_accuracies = np.zeros((num_everything_var, n_MDA_runs))

In [None]:
# Features are the already-trimmed table (same object, not a copy);
# the target is SeRR, taken from the untrimmed merge
X = df_everything
# Target kept as a one-column DataFrame so a 'class' column can be added to it later
y = df_everything_hierarch['SeRR'].to_frame()
n_feats = len(X.columns)

# Labels of the variables in this dataset
hierarch_var = X.columns

# Number of variables in this dataset
num_hierarch_var = len(hierarch_var)

print(num_hierarch_var)
print(hierarch_var)

In [None]:
# MDA experiments on waterchem + representative OTU variables: for every run and
# every variable, permute that one variable (independently in the training and the
# test set), retrain the network on the permuted training set and record the test accuracy.
# Changes from the previous version of this cell:
#   * the output layer emits raw logits, because softmax_cross_entropy_with_logits_v2
#     applies softmax itself (the old code applied softmax twice);
#   * the default graph is reset for every model so it does not grow across models.

# Specify desired test fraction:
test_frac = 0.4 # No hyperparameter selection, so no validation set

# Obtain categorical values of y: (class 0 if SeRR >= 0, class 1 if SeRR < 0)
y['class'] = np.where(y['SeRR']>=0, 0, 1)
y_class = y['class'].values
C = np.unique(y_class).shape[0] # Number of classes

y_onehot = np.eye(C)[y_class.astype(int)] # Convert categorical to one-hot-encoding
y_onehot = y_onehot.reshape(y_class.shape[0],C) # Reshape into dimensions (n_t rows by C columns)

# Specify hyperparameters (loop-invariant, so set once)
npl = 20 # Number of neurons per layer
n_hidden = 10 # Number of hidden layers
lrate = 0.01 # Learning rate for the Adam optimizer
epochs = 10 # Total number of iterations
spe = 20 # Steps per epoch: one step is one full-batch parameter update
actf = tf.nn.relu # Activation function
alpha = 0.1 # Magnitude of regularizer
reg = tf.contrib.layers.l2_regularizer(scale=alpha) # Regularizer function
# NOTE(review): the regularization losses created via kernel_regularizer are never
# added to cross_ent below, so alpha currently has no effect on training — confirm intent.

dimof_output = C # Dimension of y

model_num = 0

for i in range(0,n_MDA_runs):
    ### STEP 0: Randomly split data into training and testing.
    X_train, X_test, y_train, y_test = train_test_split(X,y_onehot,test_size=test_frac,shuffle=True)

    acc_array = []
    for j in range(0,num_hierarch_var):
        ### STEP 1: PERMUTATE THE SPECIFIC VARIABLE

        # Select one of these variables to randomly permutate
        permutate_var = hierarch_var[j]

        ## For TRAINING SET: shuffle this variable's values and put them back in a copy
        X_train_shuffled = X_train.copy()
        X_train_shuffled[permutate_var] = np.random.permutation(X_train.loc[:,permutate_var].values)

        ## For TESTING SET: same, with an independent permutation
        X_test_shuffled = X_test.copy()
        X_test_shuffled[permutate_var] = np.random.permutation(X_test.loc[:,permutate_var].values)

        ### STEP 2: RE-TRAIN ANN USING SHUFFLED DATASET, THEN EVALUATE ACCURACY:

        # Start from an empty graph so ops from earlier models are discarded
        tf.reset_default_graph()

        X_ANN = tf.placeholder(tf.float32,shape=[None,X_train_shuffled.shape[1]])
        y_true_ANN = tf.placeholder(tf.float32,shape=[None,C])

        ## Specify neural net architecture

        # Use a dictionary setup to generalize number of layers
        hlayer = dict()
        hlayer[0] = tf.layers.dense(X_ANN,npl,activation=actf,kernel_regularizer=reg) # Specify first layer

        # Now specify layers 2 through (n_hidden), assuming the same activation function is used throughout
        for el in range(1,n_hidden):
            hlayer[el] = tf.layers.dense(hlayer[el-1],npl,activation=actf,kernel_regularizer=reg)

        # Linear output (logits) for the loss; softmax probabilities for prediction
        logits = tf.layers.dense(hlayer[n_hidden-1],dimof_output,activation=None)
        outlayer = tf.nn.softmax(logits)

        cross_ent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true_ANN,logits=logits))
        optimizer = tf.train.AdamOptimizer(lrate)
        train_ANN = optimizer.minimize(cross_ent)

        ## Train neural net for classification
        init2 = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init2)

            for epoch in range(epochs):
                for step in range(spe):
                    sess.run(train_ANN,feed_dict={X_ANN:X_train_shuffled,y_true_ANN:y_train})

            predictions = outlayer.eval(feed_dict={X_ANN:X_test_shuffled})

        model_num += 1
        print("Model {}".format(model_num))

        # Evaluate accuracy on test set: a sample counts as correct when its rounded
        # class probabilities match the one-hot label in every position
        predictions_r = predictions.round(0)
        ANN_test_acc_shuffled = np.sum(np.all(np.equal(predictions_r,y_test),axis=1))/y_test.shape[0]
        acc_array.append(ANN_test_acc_shuffled)

    # Column i holds this run's accuracy for every permuted variable
    acc_array = np.asarray(acc_array)
    shuffled_accuracies[:,i] = acc_array

In [None]:
# Display the accuracies (in %) of all experiments in a dataframe:
# one row per permuted variable, one column per run (labelled 1..n_MDA_runs).
# Rows are labelled with hierarch_var (waterchem + OTU variables), which is what
# shuffled_accuracies was sized for in this section; the previous version used
# waterchem_var, whose length does not match once OTU columns are present.
# NOTE(review): the previous version also subtracted `ANN_test_acc_waterchem`, which is
# not defined anywhere in this notebook. The table therefore reports the plain
# accuracy under permutation, as its column name says. To obtain MDA values, train an
# unpermuted base model on these features and subtract its accuracy here.
acc_df = pd.DataFrame(shuffled_accuracies[:, :n_MDA_runs] * 100,
                      columns=range(1, n_MDA_runs + 1))
acc_df.insert(0, 'Variable Permutated', hierarch_var)

# Finally, calculate the average accuracy across runs for every permuted variable
# (one vectorised row-wise mean instead of a per-row .loc loop).
acc_avg_df = pd.DataFrame({'Variable Permutated': hierarch_var})
acc_avg_df['Average Accuracy (%)'] = (
    acc_df.drop(columns=['Variable Permutated']).mean(axis=1).astype(float).round(1)
)
acc_avg_df