In [3]:
import pandas as pd
import numpy as np
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

from joblib import dump, load
from pathlib import Path

In [4]:
entero_bac_100_data_path = Path('/Users/thomaslim/microbis/PyBact_1.0.1/enterobac_100_train_data.txt')
df_enterobac_data = pd.read_csv(entero_bac_100_data_path, sep='\t')
df_enterobac_data.rename({'Unnamed: 48': 'Target'}, axis=1, inplace=True)

In [3]:
DATA_8_PANEL_PATH = '/Users/thomaslim/microbis/bacterial_identifcation_models/enterobac_models/panels/panels_8.txt'
panels_8_dict = {}
with open(DATA_8_PANEL_PATH, 'r') as f:
    for line in f:
        name1, name2 = line.split(',', 1)
        panels_8_dict[name1] = name2.strip()
columns_8 = list(panels_8_dict.values())

In [5]:
DATA_20_PANEL_PATH = '/Users/thomaslim/microbis/bacterial_identifcation_models/enterobac_models/panels/panels_20.txt'
panels_20_dict = {}
with open(DATA_20_PANEL_PATH, 'r') as f:
    for line in f:
        name1, name2 = line.split(',', 1)
        panels_20_dict[name1] = name2.strip()
columns_20 = list(panels_20_dict.values())

In [5]:
X_8_panel= df_enterobac_data[columns_8]
y = df_enterobac_data['Target']
X_train, X_test, y_train, y_test = train_test_split(X_8_panel, y, test_size=0.2, random_state=42)

model_8_panel = RandomForestClassifier(n_estimators=100, random_state=42)
model_8_panel.fit(X_train, y_train)
y_pred = model_8_panel.predict(X_test)
y_pred_proba = model_8_panel.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.31404109589041096

In [8]:
# Save X_test and y_test together as the test set
X_test['Target'] = y_test
X_test.to_csv('rf_8_panel_enterobac_100_test_set.csv', index=False)

In [16]:
y_pred.tolist()

['C.gillenii',
 'C.warkmanii',
 'C.warkmanii',
 'E.aloacae',
 'K.ornithinolytica',
 'K.pneumoniae',
 'E.cancerogenus',
 'C.diversus_C.koseri',
 'S.rubidaea',
 'L.grimontii',
 'M.wisconsensis',
 'P.rustigianni',
 'Yersinia_ruckeri',
 'E.cancerogenus',
 'E.cancerogenus',
 'B.brennerae',
 'E.cancerogenus',
 'C.sedlakii',
 'E.pyrinus',
 'S.Gallinarum_Group_I',
 'Enteric_Group_64',
 'B.brennerae',
 'B.brennerae',
 'M.morganii_biogroup_1',
 'B.aquatica',
 'P.vulgeris',
 'P.fontium',
 'P.asymbiotica',
 'M.morganii_biogroup_1',
 'R.aquatilis',
 'S.Gallinarum_Group_I',
 'M.morganii_biogroup_1',
 'C.diversus_C.koseri',
 'E.cancerogenus',
 'Yersinia_ruckeri',
 'Y.pseudotuberculosis',
 'B.brennerae',
 'Yersinia_ruckeri',
 'S.entomoohila',
 'B.izardii',
 'B.brennerae',
 'S.Group_IV_strains',
 'C.diversus_C.koseri',
 'S.rubidaea',
 'C.warkmanii',
 'Yersinia_ruckeri',
 'Yersinia_ruckeri',
 'C.diversus_C.koseri',
 'E.dissolvens',
 'S.Choleresuis_Group_I',
 'K.georgiana',
 'B.izardii',
 'Yersinia_rucke

In [11]:
X_test

Unnamed: 0,Urea_hydrolysis,Lactose_fermentation,D-Glucose_acid,Citrate,Motility,Indole_production,Hydrogen_Sulfide_TSI,D-Glucose_Gas
169,0,1,1,0,0,0,1,1
9354,1,0,1,1,1,0,1,1
3013,1,0,1,1,1,0,1,1
1475,1,1,1,1,1,0,0,1
7420,1,1,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...
1369,0,0,1,1,1,0,0,1
740,0,0,1,0,1,0,0,1
3461,1,1,1,0,1,1,0,0
5183,1,0,1,1,0,0,0,1


In [9]:
model_8_panel.predict

{}

In [20]:
y_pred[0]

'C.gillenii'

In [12]:
probability_assignments = []
for probs in y_pred_proba:
    sorted_dict = dict(sorted(zip(model_8_panel.classes_, list(probs)), key=lambda item: item[1], reverse=True))
    probability_assignments.append(sorted_dict)
df_results = pd.DataFrame({
    'true': y_test,
    'predicted': y_pred,
    'probability': probability_assignments
})

In [18]:
df_results['predicted']

169            C.gillenii
9354          C.warkmanii
3013          C.warkmanii
1475            E.aloacae
7420    K.ornithinolytica
              ...        
1369       E.cancerogenus
740           B.brennerae
3461          C.Group_137
5183         E.dissolvens
1188           B.agrestic
Name: predicted, Length: 2920, dtype: object

In [22]:
# Save the model
dump(model_8_panel, 'rf_8_panel_enterobac_100.pkl')
# Load the model (for future use)
rf_model_loaded = load('rf_8_panel_enterobac_100.pkl')

In [23]:
rf_model_loaded.predict(X_test)

array(['C.gillenii', 'C.warkmanii', 'C.warkmanii', ..., 'C.Group_137',
       'E.dissolvens', 'B.agrestic'], dtype=object)

In [167]:
print(df_results[df_results['true'] != df_results['predicted']].iloc[2])
df_results[df_results['true'] != df_results['predicted']].iloc[2].probability

true                                                  C.freundii
predicted                                              E.aloacae
probability    {'E.aloacae': 0.3508532857320039, 'E.gergoviae...
Name: 1475, dtype: object


{'E.aloacae': 0.3508532857320039,
 'E.gergoviae': 0.27121067672838584,
 'A.dalhousiensis': 0.10949588636023057,
 'C.freundii': 0.06585199181369887,
 'C.youngae': 0.057301970404054554,
 'C.murliniae': 0.04973692030024635,
 'Serratia_fonticola': 0.04683246009869316,
 'E.hormaechei': 0.016386892635965312,
 'E.aerogenes': 0.015106204021126937,
 'S.rubidaea': 0.009437873626837805,
 'S.marcescens': 0.007785838278756599,
 'B.agrestic': 0.0,
 'B.aquatica': 0.0,
 'B.brennerae': 0.0,
 'B.ferragutiae': 0.0,
 'B.gaviniae': 0.0,
 'B.izardii': 0.0,
 'B.noackiae': 0.0,
 'B.warmboldiae': 0.0,
 'C.Group_137': 0.0,
 'C.amalonaticus': 0.0,
 'C.davisae': 0.0,
 'C.diversus_C.koseri': 0.0,
 'C.farmeri': 0.0,
 'C.gillenii': 0.0,
 'C.lapagei': 0.0,
 'C.neteri': 0.0,
 'C.rodentium': 0.0,
 'C.sedlakii': 0.0,
 'C.species_3': 0.0,
 'C.species_5': 0.0,
 'C.warkmanii': 0.0,
 'E.agglomerans_complex': 0.0,
 'E.americana': 0.0,
 'E.amnigenus_biogroup_1': 0.0,
 'E.amnigenus_biogroup_2': 0.0,
 'E.asburiae': 0.0,
 'E.bla

### 20 Panel

In [9]:
X_20_panel= df_enterobac_data[list(set(columns_20).intersection(df_enterobac_data.columns))]
y = df_enterobac_data['Target']
X_train, X_test, y_train, y_test = train_test_split(X_20_panel, y, test_size=0.2, random_state=42)

model_20_panel = RandomForestClassifier(n_estimators=100, random_state=42)
model_20_panel.fit(X_train, y_train)
y_pred = model_20_panel.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7945205479452054

In [10]:
# Save X_test and y_test together as the test set
X_test['Target'] = y_test
X_test.to_csv('rf_20_panel_enterobac_100_test_set.csv', index=False)

In [8]:
X_test

Unnamed: 0,Indole_production,myo-Inositol_fermentation,Gelatin_hydrolysis_22_c,D-Sorbitol_fermentation,Arginine_dihydrolase,L-Rhamnose_fermentation,Melibiose_fermentation,Citrate,Phenylanine_deaminase,Ornithine_decarboxylase,D-Glucose_acid,Lysine_deaminase,Urea_hydrolysis,Voges-Proskauer,Hydrogen_Sulfide_TSI,Sucrose_fermentation,D-Mannose_fermentation,ONPG_test,L-Arabinose_fermentation,Target
169,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,1,B.aquatica
9354,0,0,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,P.mirabilis
3013,0,0,0,1,1,1,0,1,0,0,1,0,1,0,1,0,1,1,1,C.warkmanii
1475,0,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,1,C.freundii
7420,1,1,0,1,0,1,1,1,0,0,1,1,1,1,0,1,1,1,1,K.plantocila
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369,0,0,0,1,0,0,1,1,0,1,1,0,0,1,0,1,1,1,0,C.species_5
740,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,1,1,1,B.noackiae
3461,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,1,1,1,C.Group_137
5183,0,0,0,0,1,1,0,1,0,1,1,0,1,1,0,1,1,1,1,E.hormaechei


In [170]:
probability_assignments = []
for probs in y_pred_proba:
    sorted_dict = dict(sorted(zip(model_8_panel.classes_, list(probs)), key=lambda item: item[1], reverse=True))
    probability_assignments.append(sorted_dict)
df_results = pd.DataFrame({
    'true': y_test,
    'predicted': y_pred,
    'probability': probability_assignments
})


In [171]:
df_results[df_results['true'] != df_results['predicted']]

Unnamed: 0,true,predicted,probability
7420,K.plantocila,K.oxytoca,"{'K.ornithinolytica': 0.4873866180017171, 'K.o..."
8650,M.morganii_subsp.sibonii,M.morganii_biogroup_1,"{'C.diversus_C.koseri': 0.32095787240513246, '..."
13422,Y.rohdei,Y.bercovieri,"{'Yersinia_ruckeri': 0.07841675216830658, 'K.r..."
14389,Enteric_Group_64,B.gaviniae,"{'Enteric_Group_64': 0.35627016487812446, 'B.g..."
10626,S.Paratyphi_A_Group_I,C.gillenii,"{'B.brennerae': 0.1276142238324297, 'S.Paratyp..."
...,...,...,...
718,B.noackiae,Enteric_Group_59,"{'E.cancerogenus': 0.10756657544505184, 'C.spe..."
3204,C.amalonaticus,C.sedlakii,"{'C.sedlakii': 0.41427152307112275, 'C.amalona..."
3488,C.Group_137,C.farmeri,"{'P.shigelloides': 0.527743256222928, 'C.Group..."
750,B.noackiae,Enteric_Group_59,"{'E.cancerogenus': 0.10756657544505184, 'C.spe..."


In [12]:
# Save the model
dump(model_20_panel, 'rf_20_panel_enterobac_100.pkl')
# Load the model (for future use)
rf_model_loaded = load('rf_20_panel_enterobac_100.pkl')

In [174]:
print(df_results[df_results['true'] != df_results['predicted']].iloc[0])
df_results[df_results['true'] != df_results['predicted']].iloc[0].probability

true                                                K.plantocila
predicted                                              K.oxytoca
probability    {'K.ornithinolytica': 0.4873866180017171, 'K.o...
Name: 7420, dtype: object


{'K.ornithinolytica': 0.4873866180017171,
 'K.oxytoca': 0.3782078694710493,
 'K.plantocila': 0.0890226125573437,
 'C.murliniae': 0.012298365781017595,
 'C.amalonaticus': 0.011264063501350619,
 'C.sedlakii': 0.01092188899681114,
 'Y.frederiksenii': 0.010898581690710568,
 'A.dalhousiensis': 0.0,
 'B.agrestic': 0.0,
 'B.aquatica': 0.0,
 'B.brennerae': 0.0,
 'B.ferragutiae': 0.0,
 'B.gaviniae': 0.0,
 'B.izardii': 0.0,
 'B.noackiae': 0.0,
 'B.warmboldiae': 0.0,
 'C.Group_137': 0.0,
 'C.davisae': 0.0,
 'C.diversus_C.koseri': 0.0,
 'C.farmeri': 0.0,
 'C.freundii': 0.0,
 'C.gillenii': 0.0,
 'C.lapagei': 0.0,
 'C.neteri': 0.0,
 'C.rodentium': 0.0,
 'C.species_3': 0.0,
 'C.species_5': 0.0,
 'C.warkmanii': 0.0,
 'C.youngae': 0.0,
 'E.aerogenes': 0.0,
 'E.agglomerans_complex': 0.0,
 'E.aloacae': 0.0,
 'E.americana': 0.0,
 'E.amnigenus_biogroup_1': 0.0,
 'E.amnigenus_biogroup_2': 0.0,
 'E.asburiae': 0.0,
 'E.blattae': 0.0,
 'E.cancerogenus': 0.0,
 'E.coil_invasive': 0.0,
 'E.coli': 0.0,
 'E.dissolv