In [26]:
import pandas as pd
from joblib import load
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os


In [22]:
# Load the pre-encoded 4-mer test table; the first CSV column is used as the index.
# (The 3-mer table, 'encoded_data/test_3mers.csv', can be loaded the same way.)
test_4mers_df = pd.read_csv(
    'encoded_data/test_4mers.csv',
    index_col=[0],
)

In [23]:
# Labels come from the 'group' column.
y_test = test_4mers_df['group']
# Model input is every column except the trailing two
# (presumably a file-name column and 'group' — confirm against the CSV).
X_test = test_4mers_df.iloc[:, :-2]

In [32]:

# Path to the directory containing the serialized models
model_directory = "WGS_models/"

# One dict per evaluated model; turned into a DataFrame once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
performance_records = []

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the row order of the results table differ between runs/machines.
for model_file in sorted(os.listdir(model_directory)):
    model_path = os.path.join(model_directory, model_file)

    # Keep only regular, non-hidden files whose name marks them as 4-mer models
    # (skips e.g. '.ipynb_checkpoints' or '.DS_Store' entries).
    if (
        '_4mers' not in model_file
        or model_file.startswith('.')
        or not os.path.isfile(model_path)
    ):
        continue

    # NOTE: joblib.load is pickle-based and can execute code while loading;
    # only point model_directory at files you trust.
    model = load(model_path)
    model_name = model_file.split('.')[0]  # File name up to the first dot

    # Predicting using the loaded model
    y_pred = model.predict(X_test)

    # Calculating performance metrics (class-support-weighted averages)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    performance_records.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    })

# Passing columns= keeps the expected layout even when no model file matched.
performance_df = pd.DataFrame(
    performance_records,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [14]:
# Display the metrics table (one row per evaluated model).
# NOTE(review): the saved output under this cell lists model names without the
# '_4mers' suffix, so it was presumably produced by an earlier 3-mer run —
# confirm, and re-run or clear the cell.
performance_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,decision_tree,0.868376,0.867736,0.868376,0.868036
1,gaussian_naive_bayes,0.675524,0.677122,0.675524,0.676311
2,knn,0.902875,0.901278,0.902875,0.901179
3,logistic_regression,0.749495,0.748497,0.749495,0.679272
4,svm,0.841492,0.836774,0.841492,0.83283
5,random_forest,0.913598,0.912472,0.913598,0.91268
6,xgboost,0.835742,0.829885,0.835742,0.827745


In [25]:
# Display the metrics table (one row per evaluated model).
performance_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,svm_4mers,0.859363,0.855414,0.859363,0.854849
1,xgboost_4mers,0.845688,0.840809,0.845688,0.838835
2,knn_4mers,0.906915,0.90547,0.906915,0.905426
3,logistic_regression_4mers,0.743434,0.761869,0.743434,0.658056
4,random_forest_4mers,0.912354,0.911313,0.912354,0.911594
5,decision_tree_4mers,0.873038,0.873196,0.873038,0.873116
6,gaussian_naive_bayes_4mers,0.679409,0.680819,0.679409,0.680104


In [2]:
import argparse
import pandas as pd
from Bio import SeqIO
import os
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm

def encode_sequence(sequence, k):
    """Return a one-row DataFrame of relative k-mer frequencies for ``sequence``.

    Characters other than uppercase 'A', 'T', 'C', 'G' are removed before
    counting and the remaining characters are joined, so a counted window may
    span a position where characters were removed. Lowercase bases are removed
    as well.

    Parameters
    ----------
    sequence : iterable of single-character strings (e.g. a ``str``)
        The nucleotide sequence to encode.
    k : int
        Window length.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are all k-mers over 'ATCG', in
        ``itertools.product('ATCG', repeat=k)`` order. Each value is the
        k-mer's count divided by the total count; if nothing was counted
        (e.g. the cleaned sequence is shorter than ``k``) the row is all zeros.
    """
    vocabulary = [''.join(letters) for letters in product('ATCG', repeat=k)]
    tally = dict.fromkeys(vocabulary, 0)

    valid_bases = ('A', 'T', 'C', 'G')
    cleaned = ''.join(filter(lambda base: base in valid_bases, sequence))

    # Slide a window of width k over the cleaned sequence.
    window_count = len(cleaned) - k + 1
    for start in range(window_count):
        window = cleaned[start:start + k]
        if window in tally:
            tally[window] += 1

    counts_row = pd.DataFrame([tuple(tally.values())], columns=vocabulary)
    row_total = counts_row.sum(axis=1)
    # A zero total gives 0/0 = NaN for every column; those become 0.
    return counts_row.divide(row_total, axis=0).fillna(0)


def read_fasta_to_kmers(file_path, k_mers):
    """Encode every record of a FASTA file as one row of k-mer frequencies.

    Parameters
    ----------
    file_path : str
        Passed to ``Bio.SeqIO.parse`` with format ``"fasta"``; the same value
        is stored verbatim in the ``'filename_x'`` column of every row.
    k_mers : iterable of int
        The k values to encode. For each record, ``encode_sequence`` is called
        once per k and the resulting column blocks are placed side by side.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record (fresh 0..n-1 index): the k-mer frequency
        columns followed by ``'filename_x'``. Remaining NaNs are filled with 0.

    Raises
    ------
    ValueError
        If ``k_mers`` is empty or the file yields no FASTA records.
    """
    # Materialize once so a one-shot iterable is not exhausted after the first record.
    k_mers = list(k_mers)
    if not k_mers:
        raise ValueError("k_mers must contain at least one k value")

    dfs_per_sequence = []
    for record in SeqIO.parse(file_path, "fasta"):
        sequence = str(record.seq)
        encoded_sequence_dfs = [encode_sequence(sequence, k) for k in k_mers]
        merged_sequence_df = pd.concat(encoded_sequence_dfs, axis=1)
        merged_sequence_df['filename_x'] = file_path
        dfs_per_sequence.append(merged_sequence_df)

    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the file.
    if not dfs_per_sequence:
        raise ValueError(f"No FASTA records found in {file_path!r}")

    final_df = pd.concat(dfs_per_sequence, ignore_index=True)
    final_df = final_df.fillna(0)
    return final_df

In [3]:
# Encode every record of the pathogen MAG FASTA as 3-mer frequencies.
mag_patho_df = read_fasta_to_kmers(
    file_path='../MAG/MAG_Pathogen.fna',
    k_mers=[3],
)

In [4]:
# Label every row that came from the pathogen file with group 1.
mag_patho_df = mag_patho_df.assign(group=1)

In [None]:
# Encode every record of the non-pathogen MAG FASTA as 3-mer frequencies.
mag_non_patho_df = read_fasta_to_kmers(
    file_path='../MAG/MAG_NonPathogen.fna',
    k_mers=[3],
)

In [None]:
# Label every row that came from the non-pathogen file with group 0.
mag_non_patho_df = mag_non_patho_df.assign(group=0)

In [None]:
# Stack pathogen and non-pathogen rows into one evaluation table.
# ignore_index=True: both inputs carry their own 0..n-1 index, so keeping the
# original labels would leave duplicate index values in mag_df.
mag_df = pd.concat([mag_patho_df, mag_non_patho_df], ignore_index=True)

In [9]:
# Labels: 1 for rows from the pathogen file, 0 for rows from the non-pathogen file.
y_test = mag_df['group']
# Model input: every column except the trailing two, which are 'filename_x'
# and 'group' when mag_df is built by the cells above in order.
X_test = mag_df.iloc[:, :-2]

In [17]:
# Path to the directory containing the serialized models
model_directory = "WGS_models/"

# One dict per evaluated model; turned into a DataFrame once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
performance_records = []

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the row order of the results table differ between runs/machines.
for model_file in sorted(os.listdir(model_directory)):
    model_path = os.path.join(model_directory, model_file)

    # Keep only regular, non-hidden files whose name does NOT contain '_4mers'
    # (presumably the 3-mer models, matching the 3-mer features in X_test —
    # confirm). Without the extra checks, entries such as '.ipynb_checkpoints'
    # or '.DS_Store' would pass the name filter and break load().
    if (
        '_4mers' in model_file
        or model_file.startswith('.')
        or not os.path.isfile(model_path)
    ):
        continue

    # NOTE: joblib.load is pickle-based and can execute code while loading;
    # only point model_directory at files you trust.
    model = load(model_path)
    model_name = model_file.split('.')[0]  # File name up to the first dot

    # Predicting using the loaded model
    y_pred = model.predict(X_test)

    # Calculating performance metrics (class-support-weighted averages)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    performance_records.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    })

# Passing columns= keeps the expected layout even when no model file matched.
performance_df = pd.DataFrame(
    performance_records,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [18]:
# Display the metrics table for the MAG evaluation (one row per evaluated model).
performance_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,decision_tree,0.729483,0.814922,0.729483,0.751661
1,gaussian_naive_bayes,0.43769,0.736746,0.43769,0.461832
2,knn,0.782675,0.84893,0.782675,0.799237
3,logistic_regression,0.215805,0.831703,0.215805,0.078759
4,svm,0.586626,0.737981,0.586626,0.623654
5,random_forest,0.860182,0.886265,0.860182,0.867364
6,xgboost,0.471125,0.771562,0.471125,0.496371
