In [38]:
import pandas as pd
import numpy as np
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

from joblib import dump, load

In [39]:
# Read the tab-separated Vibrio training table and give the column that pandas
# reports as 'Unnamed: 56' (the species label, per the preview below) a real name.
df_vibrio_data = (
    pd.read_csv('../PyBact_1.0.1/vibrio_100_train_data.txt', sep='\t')
    .rename(columns={'Unnamed: 56': 'Bacteria'})
)
df_vibrio_data.head()

Unnamed: 0,Indole,MR,VP,Citrate,Urea,PD,AR,LD,OD,Motility,...,1%,6%,8%,10%,12%,Swarming,String-test,O129,PolymyxinB,Bacteria
0,1,1,0,1,0,0,0,1,1,1,...,1,0,1,0,0,0,1,1,0,V.cholerae
1,1,1,1,1,0,0,0,1,1,1,...,1,1,0,0,0,0,1,1,0,V.cholerae
2,1,1,1,1,0,0,0,1,1,1,...,1,1,0,0,0,0,1,1,0,V.cholerae
3,1,1,1,1,0,0,0,1,1,1,...,1,1,0,0,0,0,1,1,0,V.cholerae
4,1,1,0,1,0,0,0,1,1,1,...,1,1,0,0,0,0,1,1,1,V.cholerae


In [40]:
# Target is the species label; features are every remaining column.
# NOTE(review): the previewed frame also carries an 'Unnamed: 0' column that
# looks like a saved row index (0, 1, 2, ...). It is kept in X here, so the
# model can see it — confirm whether it should be dropped, and if so drop it
# consistently from the evaluation inputs as well.
y = df_vibrio_data['Bacteria']
X = df_vibrio_data.drop(columns=['Bacteria'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [41]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Fit on the training portion only; the validation rows stay unseen.
clf.fit(X_train, y_train)

In [42]:
# Predict species for the held-out validation rows.
y_pred = clf.predict(X_val)

# Fraction of validation rows whose predicted species matches the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [43]:
# Persist the fitted Vibrio classifier and read it back.
# Both calls share one path so the reloaded object is guaranteed to be the
# model that was just written (a differing filename would either fail on a
# clean run or silently pick up an unrelated, older file).
vibrio_model_path = 'rf_vib_100.pkl'

# Save the model
dump(clf, vibrio_model_path)

# Load the model (for future use)
# joblib files are pickles: only load files you created or otherwise trust.
rf_model_loaded = load(vibrio_model_path)

In [44]:
# Score the reloaded model on a separate evaluation set created by PyBact.
# The file is read the same way as the training table: tab-separated, with the
# 'Unnamed: 56' column renamed to 'Bacteria'.
df_vibrio_data_eval = (
    pd.read_csv('../PyBact_1.0.1/vibrio_10_eval_data.txt', sep='\t')
    .rename(columns={'Unnamed: 56': 'Bacteria'})
)

# Predict from every column except the label, then compare with the true labels.
predictions = rf_model_loaded.predict(
    df_vibrio_data_eval.drop(columns=['Bacteria'])
)
accuracy = accuracy_score(df_vibrio_data_eval['Bacteria'], predictions)
accuracy

1.0

### Repeat the same thing for enterobacteria

In [45]:
# Read the tab-separated enterobacteria training table and give the column that
# pandas reports as 'Unnamed: 48' (the species label, per the preview below) a real name.
df_enterobac_data = (
    pd.read_csv('../PyBact_1.0.1/enterobac_100_train_data.txt', sep='\t')
    .rename(columns={'Unnamed: 48': 'Bacteria'})
)
df_enterobac_data.head()

Unnamed: 0,Indole_production,Methyl_red,Voges-Proskauer,Citrate,Hydrogen_Sulfide_TSI,Urea_hydrolysis,Phenylanine_deaminase,Lysine_deaminase,Arginine_dihydrolase,Ornithine_decarboxylase,...,Acetate_utilization,Lipase_corn_oil,DNase_25_c,Nitrate_to_nitrite,Oxidase_Kovacs,ONPG_test,Yellow_pigment_25_c,D-Mannose_fermentation,Tyrosine_hydrolysis,Bacteria
0,0,1,0,1,0,1,0,1,0,1,...,0,0,0,1,0,1,0,1,0,A.dalhousiensis
1,0,1,0,1,0,1,0,1,0,1,...,0,0,0,1,0,1,0,1,0,A.dalhousiensis
2,0,1,0,1,0,1,0,1,0,1,...,1,0,0,1,0,1,0,1,0,A.dalhousiensis
3,0,1,0,1,0,1,0,1,0,1,...,0,0,0,1,0,1,0,1,0,A.dalhousiensis
4,0,1,0,1,0,1,0,1,0,1,...,0,0,0,1,0,1,0,1,0,A.dalhousiensis


In [46]:
# Target is the species label; features are every remaining column.
# NOTE(review): the previewed frame also carries an 'Unnamed: 0' column that
# looks like a saved row index (0, 1, 2, ...). It is kept in X here, so the
# model can see it — confirm whether it should be dropped, and if so drop it
# consistently from the evaluation inputs as well.
y = df_enterobac_data['Bacteria']
X = df_enterobac_data.drop(columns=['Bacteria'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [47]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
# This rebinds `clf`, replacing the classifier fitted earlier in the notebook.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Fit on the training portion only; the validation rows stay unseen.
clf.fit(X_train, y_train)

In [48]:
# Predict species for the held-out validation rows.
y_pred = clf.predict(X_val)

# Fraction of validation rows whose predicted species matches the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.94


In [49]:
# Persist the fitted enterobacteria classifier and read it back from the same file.
entero_model_path = 'rf_entero_100.pkl'

# Write the model to disk.
dump(clf, entero_model_path)

# Read it back (for future use); this rebinds `rf_model_loaded`.
rf_model_loaded = load(entero_model_path)

In [51]:
# Run the reloaded model on a separate evaluation set created by PyBact.
# The file is read the same way as the training table: tab-separated, with the
# 'Unnamed: 48' column renamed to 'Bacteria'.
df_enterobac_data_eval = (
    pd.read_csv('../PyBact_1.0.1/enterobac_10_eval_data.txt', sep='\t')
    .rename(columns={'Unnamed: 48': 'Bacteria'})
)

# Predict from every column except the label.
predictions = rf_model_loaded.predict(
    df_enterobac_data_eval.drop(columns=['Bacteria'])
)


In [None]:
# Score the evaluation-set predictions computed in the preceding cell against
# the true species labels (re-running predict here would only repeat that work).
accuracy = accuracy_score(df_enterobac_data_eval['Bacteria'], predictions)
accuracy

0.9342465753424658