In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import joblib
import os

In [2]:
def mc_dropout_predictions_class(model, inputs, label_encoder, n_samples=100):
    """Run Monte Carlo dropout inference and return the decoded class of every pass.

    The model is called ``n_samples`` times with ``training=True`` so that
    dropout stays active at inference time. Each pass is reduced to class
    indices with ``argmax`` over axis 1 and mapped back to labels through
    ``label_encoder.inverse_transform``.

    Args:
        model: Callable invoked as ``model(inputs, training=True)`` whose result
            exposes ``.numpy()``. Presumably a Keras model returning per-class
            scores of shape (batch, n_classes) -- TODO confirm.
        inputs: Batch handed to the model unchanged on every pass.
        label_encoder: Object providing ``inverse_transform`` for the class
            indices produced by ``argmax``.
        n_samples: Number of stochastic forward passes; must be at least 1.

    Returns:
        numpy.ndarray: The per-pass decoded predictions stacked along a new
        first axis, i.e. one row per forward pass.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1 (there would be nothing
            to stack).
    """
    # Fail early with a readable message instead of letting np.stack([])
    # complain that it needs at least one array.
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")

    predictions = []
    for _ in range(n_samples):
        # training=True keeps dropout active, so every pass is a different sample
        class_indices = model(inputs, training=True).numpy().argmax(1)
        # Decode the integer class indices back to their original labels
        predictions.append(label_encoder.inverse_transform(class_indices))

    return np.stack(predictions)


In [3]:
# Load the saved Keras model and the label encoder used to decode its predictions.
# Both paths are relative, so they resolve against the current working directory.
model_dl = load_model('trained_model.h5')
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a label_encoder.pkl that comes from a trusted source.
le = joblib.load('label_encoder.pkl')

In [None]:
#####################################################
# Import every CSV file found under `folder_path`
#####################################################

folder_path = "cgc_input_reformat"
# List to hold one DataFrame per CSV file
dataframes = []
# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the combined row order (and any positional slice taken from it) reproducible.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# pd.concat() on an empty list fails with a generic message; name the folder instead.
if not dataframes:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Stack all files into one table with a fresh 0..n-1 index and persist it.
combined_data = pd.concat(dataframes, ignore_index=True)
combined_data.to_csv('combined_predict_data.csv', index=False)
# The data is loaded back from the file just written rather than reusing
# `combined_data`; presumably so later cells see exactly what was saved -- confirm.
file_path = 'combined_predict_data.csv'
data = pd.read_csv(file_path)

In [5]:
# Number of leading rows to run through the model; the sequences and their IDs
# are sliced with the same count so they cannot drift out of sync.
n_predict = 10

# Turn every "|" and "," separator in the raw sequence strings into a space.
test_seqs = np.array(
    [raw_seq.replace("|", " ").replace(",", " ") for raw_seq in data["sequence"].values]
)
test_seqs2 = test_seqs[:n_predict]
# .iloc makes the positional intent of the slice explicit.
label = data['cgc_id'].iloc[:n_predict]


In [6]:
# Run the Monte Carlo dropout passes (n_samples is left at its default of 100);
# the returned array has one row per forward pass.
result = mc_dropout_predictions_class(model_dl, test_seqs2, le)

In [7]:
# Transpose so that each row gathers the predictions of every pass for one input.
t_result = result.T

In [8]:
# Display the transposed predictions (the whole array is rendered as the cell output).
t_result

array([['pectin', 'starch', 'pectin', 'pectin', 'pectin', 'pectin',
        'pectin', 'pectin', 'pectin', 'pectin', 'pectin', 'pectin',
        'pectin', 'host glycan', 'host glycan', 'pectin', 'starch',
        'pectin', 'pectin', 'pectin', 'pectin', 'pectin', 'pectin',
        'starch', 'pectin', 'starch', 'pectin', 'host glycan',
        'host glycan', 'pectin', 'pectin', 'alginate', 'pectin',
        'pectin', 'pectin', 'pectin', 'host glycan', 'pectin', 'pectin',
        'pectin', 'pectin', 'pectin', 'alginate', 'pectin',
        'host glycan', 'host glycan', 'pectin', 'pectin', 'pectin',
        'pectin', 'pectin', 'pectin', 'pectin', 'pectin', 'pectin',
        'pectin', 'pectin', 'pectin', 'pectin', 'pectin', 'pectin',
        'host glycan', 'pectin', 'host glycan', 'pectin', 'pectin',
        'pectin', 'pectin', 'pectin', 'pectin', 'pectin', 'pectin',
        'pectin', 'pectin', 'pectin', 'host glycan', 'pectin', 'pectin',
        'host glycan', 'pectin', 'host glycan', 'pecti

In [9]:
# Distinct class labels that occur anywhere in the Monte Carlo predictions
unique_numbers = np.unique(t_result)

# One row per input and one column per distinct label, filled in below
counts_by_row = np.zeros((t_result.shape[0], len(unique_numbers)), dtype=int)

# For each input row, tally how many passes predicted each label
for row_idx, mc_row in enumerate(t_result):
    counts_by_row[row_idx] = [
        np.count_nonzero(mc_row == class_label) for class_label in unique_numbers
    ]

# Print the counts
print(counts_by_row)
# `result` is rebound here: from now on it is the per-label count table.
result = pd.DataFrame(data=counts_by_row, columns=unique_numbers)
# Put the identifiers in the first column, then write the table to disk.
result.insert(0, 'ID', label)
result.to_csv('predictions_counts.csv', index=False)

[[ 2  0  0 15 78  5  0]
 [10  3  0 49  6  1 31]
 [ 0  0 15 10 69  0  6]
 [42  7  0 25 26  0  0]
 [43 55  0  1  0  1  0]
 [ 0  0  2  1 97  0  0]
 [31  0  1  0 68  0  0]
 [ 2  0  0  0 98  0  0]
 [ 0  0  0  0  1  0 99]
 [ 0  0  0  0 32  0 68]]
