In [7]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import joblib
import os

In [2]:
def mc_dropout_predictions_class(model, inputs, label_encoder, n_samples=100):
    """Collect decoded class predictions from repeated forward passes.

    The model is invoked ``n_samples`` times as ``model(inputs, training=True)``;
    passing ``training=True`` is what keeps dropout active at inference time
    (Monte Carlo dropout), so successive passes may disagree.

    Args:
        model: Callable accepting ``(inputs, training=True)`` whose return value
            exposes ``.numpy()``; the class with the highest score along axis 1
            is taken as the prediction.
        inputs: Batch handed to the model unchanged on every pass.
        label_encoder: Object whose ``inverse_transform`` turns class indices
            back into class labels.
        n_samples: Number of forward passes to run.

    Returns:
        The per-pass label arrays combined with ``np.stack``: one row per
        forward pass, one entry per input.
    """
    def _single_pass():
        # One stochastic forward pass -> winning class index -> class label.
        class_indices = model(inputs, training=True).numpy().argmax(axis=1)
        return label_encoder.inverse_transform(class_indices)

    return np.stack([_single_pass() for _ in range(n_samples)])


In [5]:
# Restore the two saved artifacts: the label encoder and the trained Keras model.
# NOTE(review): joblib.load unpickles the file, so only load encoder files
# that come from a trusted source.
le = joblib.load('label_encoder.pkl')
model_dl = load_model('trained_model.h5')

In [8]:
####################################################
# Import all CSV files under the Predict_data folder
####################################################

folder_path = "Predict_data"

# Collect one DataFrame per CSV file in the folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted:
# later cells slice the first rows of the combined table, and that slice must
# be the same on every run and on every machine.
dataframes = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# pd.concat raises a generic "No objects to concatenate" on an empty list;
# raise the same exception type with a message that names the folder instead.
if not dataframes:
    raise ValueError(f"No .csv files found in folder {folder_path!r}")

combined_data = pd.concat(dataframes, ignore_index=True)

# Persist the merged table and read it back, so that `data` is exactly what a
# later session would get from the file on disk.
file_path = 'combined_predict_data.csv'
combined_data.to_csv(file_path, index=False)
data = pd.read_csv(file_path)

In [32]:
# Normalise the raw sequence strings: every "|" and every "," becomes a space.
test_seqs = np.array(
    [raw_seq.replace("|", " ").replace(",", " ") for raw_seq in data["sequence"].values]
)

# Work on the first 100 sequences only, together with their matching IDs.
test_seqs2 = test_seqs[:100]
label = data["cgc_id"][:100]


In [15]:
# Run the MC-dropout sampler on the selected sequences with its default
# number of forward passes.
result = mc_dropout_predictions_class(
    model=model_dl,
    inputs=test_seqs2,
    label_encoder=le,
)

In [16]:
# Swap the axes of the stacked predictions (rows become columns).
t_result = result.transpose()

In [22]:
# Display the transposed predictions: after the transpose each row collects the
# labels predicted across the repeated forward passes for one input position.
t_result

array([['starch', 'starch', 'starch', ..., 'starch', 'starch', 'starch'],
       ['host glycan', 'host glycan', 'pectin', ..., 'pectin', 'pectin',
        'pectin'],
       ['pectin', 'pectin', 'pectin', ..., 'pectin', 'pectin', 'pectin'],
       ...,
       ['starch', 'pectin', 'pectin', ..., 'xylan', 'xylan', 'pectin'],
       ['pectin', 'pectin', 'pectin', ..., 'pectin', 'pectin', 'pectin'],
       ['pectin', 'pectin', 'pectin', ..., 'pectin', 'pectin', 'pectin']],
      dtype=object)

In [24]:
unique_numbers = np.unique(t_result)  # Distinct class labels found in the predictions (label strings, despite the variable name)
unique_numbers

array(['alginate', 'beta-glucan', 'cellulose', 'host glycan', 'pectin',
       'starch', 'xylan'], dtype=object)

In [34]:
# Distinct class labels that occur anywhere in the predictions; np.unique
# returns them sorted, which fixes the column order of the table below.
unique_numbers = np.unique(t_result)

# counts_by_row[i, j] = how many entries of row i equal the j-th label.
# Start from zeros and fill one label (column) at a time.
counts_by_row = np.zeros((t_result.shape[0], len(unique_numbers)), dtype=int)
for col, class_name in enumerate(unique_numbers):
    counts_by_row[:, col] = (t_result == class_name).sum(axis=1)

# Print the counts
print(counts_by_row)

# Build the output table: an ID column first, then one count column per label.
# NOTE(review): this rebinds `result` from the prediction array to a DataFrame,
# so re-running the earlier `t_result = result.T` cell afterwards would
# transpose this table instead of the predictions.
result = pd.DataFrame(data=counts_by_row, columns=unique_numbers)
result.insert(loc=0, column='ID', value=label)
result.to_csv('predictions_counts.csv', index=False)

[[  0   0   0   0   2  98   0]
 [  3   0   0  34  61   1   1]
 [  0   0   0   1  99   0   0]
 [  3   1   0  89   7   0   0]
 [  0   0   1   0  73   0  26]
 [  0   0   3   0  30   0  67]
 [  0   0  88   0   2   2   8]
 [  0   3   1  26  70   0   0]
 [ 34   0   0   2  64   0   0]
 [  0   0   4   0  95   0   1]
 [  0   0   0   0  99   0   1]
 [  0   0   0   0  98   0   2]
 [  0   0   0   6  93   0   1]
 [  1   0   1   0  98   0   0]
 [  2  87   0   2   9   0   0]
 [  0   0   0  25  75   0   0]
 [  0   0   0   0  52   0  48]
 [  0   0   0   0  82   7  11]
 [  0   0   9  14  65  12   0]
 [  0   0   0   0 100   0   0]
 [  3   0   0   0  73   1  23]
 [  0   0   0  39  61   0   0]
 [  3   0   0   2  94   0   1]
 [  0   0   0   0  11   0  89]
 [  0  41   2  40  13   0   4]
 [  0   0   7   0  43   0  50]
 [  0   5  72   6  17   0   0]
 [  0   0   0   0  27  71   2]
 [  0   0   0   0  80   0  20]
 [  0   0   0   0  92   0   8]
 [  0   0   0   1  99   0   0]
 [  2   0   0  47  39   9   3]
 [  1   