Bacterial identification model 8 panel

In [4]:
import pandas as pd
from joblib import load
import pickle
import pandas as pd

In [1]:
# Deserialize the 8-panel model with joblib's `load`.
# NOTE(review): joblib/pickle files can execute code when loaded - only open trusted artifacts.
with open('./models/rf_8_panel_enterobac_100.pkl', 'rb') as model_file:
    model = load(model_file)

# Read the test-set CSV and split it into the label column ('Target') and the features.
test_set = pd.read_csv('./test_sets/rf_8_panel_enterobac_100_test_set.csv')
y_test = test_set['Target']
X_test = test_set.drop(columns='Target')

# Keep the first feature row and the feature names around for inspection
# (they are assigned here; this cell does not print them).
first_example = X_test.iloc[0]
features = X_test.columns

# Run the model on the first three rows and pair each prediction with its true label.
y_actual = y_test.head(3)
y_pred = model.predict(X_test.head(3))

for pred, actual in zip(y_pred, y_actual):
    print(f'Predicted: {pred}, Actual: {actual}')

Predicted: C.gillenii, Actual: B.aquatica
Predicted: C.warkmanii, Actual: P.mirabilis
Predicted: C.warkmanii, Actual: C.warkmanii


In [2]:
# Summarise each feature column: its dtype and the distinct values it takes.
for col, column_values in X_test.items():
    print(f'{col}: Type: {column_values.dtype}, Range: {column_values.unique()}')

Urea_hydrolysis: Type: int64, Range: [0 1]
Lactose_fermentation: Type: int64, Range: [1 0]
D-Glucose_acid: Type: int64, Range: [1 0]
Citrate: Type: int64, Range: [0 1]
Motility: Type: int64, Range: [0 1]
Indole_production: Type: int64, Range: [0 1]
Hydrogen_Sulfide_TSI: Type: int64, Range: [1 0]
D-Glucose_Gas: Type: int64, Range: [1 0]


In [3]:
# List every class label the loaded model can predict (its `classes_` attribute).
all_classes = model.classes_
print("All possible categories:", all_classes)

All possible categories: ['A.dalhousiensis' 'B.agrestic' 'B.aquatica' 'B.brennerae' 'B.ferragutiae'
 'B.gaviniae' 'B.izardii' 'B.noackiae' 'B.warmboldiae' 'C.Group_137'
 'C.amalonaticus' 'C.davisae' 'C.diversus_C.koseri' 'C.farmeri'
 'C.freundii' 'C.gillenii' 'C.lapagei' 'C.murliniae' 'C.neteri'
 'C.rodentium' 'C.sedlakii' 'C.species_3' 'C.species_5' 'C.warkmanii'
 'C.youngae' 'E.aerogenes' 'E.agglomerans_complex' 'E.aloacae'
 'E.americana' 'E.amnigenus_biogroup_1' 'E.amnigenus_biogroup_2'
 'E.asburiae' 'E.blattae' 'E.cancerogenus' 'E.coil_invasive' 'E.coli'
 'E.dissolvens' 'E.gergoviae' 'E.hormaechei' 'E.hoshinae' 'E.ictaluri'
 'E.nimipressuralis' 'E.pyrinus' 'E.tarda' 'E.tarda_biogroup_1'
 'E.taylorae_E.cancerogenus' 'E.vulneris' 'Enteric_Group_59'
 'Enteric_Group_60' 'Enteric_Group_63' 'Enteric_Group_64'
 'Enteric_Group_68' 'Enteric_Group_69' 'Esakazakii' 'Group_VI_strains'
 'H.alvei' 'H.alvei_biogroup_1' 'K.ascorbata' 'K.cryocrescens'
 'K.georgiana' 'K.intermedia' 'K.ornithinolytic

Bacterial identification model 20 panel

In [4]:
# Deserialize the 20-panel model with joblib's `load` (only a model is read here;
# no label encoder is unpacked from this file).
# NOTE(review): joblib/pickle files can execute code when loaded - only open trusted artifacts.
with open('./models/rf_20_panel_enterobac_100.pkl', 'rb') as model_file:
    model = load(model_file)

# Read the matching test set; 'Target' is the label column, everything else is a feature.
test_set = pd.read_csv('./test_sets/rf_20_panel_enterobac_100_test_set.csv')
y_test = test_set['Target']
X_test = test_set.drop(columns=['Target'])

# Predict the first three rows and print each prediction beside its true label.
y_actual = y_test.head(3)
y_pred = model.predict(X_test.head(3))

for pred, actual in zip(y_pred, y_actual):
    print(f'Predicted: {pred}, Actual: {actual}')

Predicted: B.aquatica, Actual: B.aquatica
Predicted: P.mirabilis, Actual: P.mirabilis
Predicted: C.warkmanii, Actual: C.warkmanii


In [5]:
# Report the dtype and the set of distinct values for every feature column.
for col, column_values in X_test.items():
    print(f'{col}: Type: {column_values.dtype}, Range: {column_values.unique()}')

Indole_production: Type: int64, Range: [0 1]
myo-Inositol_fermentation: Type: int64, Range: [0 1]
Gelatin_hydrolysis_22_c: Type: int64, Range: [0 1]
D-Sorbitol_fermentation: Type: int64, Range: [0 1]
Arginine_dihydrolase: Type: int64, Range: [0 1]
L-Rhamnose_fermentation: Type: int64, Range: [1 0]
Melibiose_fermentation: Type: int64, Range: [0 1]
Citrate: Type: int64, Range: [0 1]
Phenylanine_deaminase: Type: int64, Range: [0 1]
Ornithine_decarboxylase: Type: int64, Range: [0 1]
D-Glucose_acid: Type: int64, Range: [1 0]
Lysine_deaminase: Type: int64, Range: [0 1]
Urea_hydrolysis: Type: int64, Range: [0 1]
Voges-Proskauer: Type: int64, Range: [0 1]
Hydrogen_Sulfide_TSI: Type: int64, Range: [1 0]
Sucrose_fermentation: Type: int64, Range: [0 1]
D-Mannose_fermentation: Type: int64, Range: [0 1]
ONPG_test: Type: int64, Range: [1 0]
L-Arabinose_fermentation: Type: int64, Range: [1 0]


MIC Interpretation model

In [6]:
# Unpickle the saved bundle: a dict holding the model and its label encoder.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted files.
with open('./models/mic_i_classification_best.pkl', 'rb') as bundle_file:
    model_and_encoder = pickle.load(bundle_file)

label_encoder = model_and_encoder['label_encoder']
model = model_and_encoder['model']

# Read the test set.
test_set = pd.read_csv('./test_sets/MIC_Interpretation_test_set.csv')

# Cast every object-dtype column to pandas 'category' dtype in a single astype call.
# NOTE(review): the categories are derived from this CSV alone - presumably they must
# line up with the categories used when the model was trained; confirm.
categorical_columns = test_set.select_dtypes(include=['object']).columns
test_set = test_set.astype({col: 'category' for col in categorical_columns})

# Split into the label column and the feature columns.
y_test = test_set['MIC_Interpretation']
X_test = test_set.drop(columns=['MIC_Interpretation'])

# Take the first two rows; list indexers keep the results as a DataFrame / Series.
first_two = [0, 1]
test_example_x = X_test.iloc[first_two]
test_example_y = y_test.iloc[first_two]

# Predict, then map both the predicted and the stored labels back through the encoder.
y_pred = model.predict(test_example_x)
y_pred_decoded = label_encoder.inverse_transform(y_pred)
test_example_y_decoded = label_encoder.inverse_transform(test_example_y.values)

for predicted_label, actual_label in zip(y_pred_decoded, test_example_y_decoded):
    print(f"Predicted: {predicted_label}, Actual: {actual_label}")

  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Predicted: Resistant, Actual: Resistant
Predicted: Susceptible, Actual: Susceptible


In [7]:
# Display the feature rows currently held in test_example_x.
test_example_x

Unnamed: 0,Phenotype,Species,Family,Country,State,Gender,Age Group,Speciality,Source,In / Out Patient,Year,Antibiotic
0,MRSA,Staphylococcus aureus,Staphylococcus spp,Australia,,Male,85 and Over,Medicine General,Sputum,Inpatient,2013,Erythromycin
1,,Pseudomonas aeruginosa,Non-Enterobacterales,Brazil,,Female,19 to 64 Years,Medicine ICU,Peritoneal Fluid,,2022,Ceftazidime avibactam


In [8]:
# Report each feature column's dtype and how many distinct values it has.
for col, column_values in X_test.items():
    print(f'{col}: Type: {column_values.dtype}, {column_values.nunique()} unique values')

Phenotype: Type: category, 5 unique values
Species: Type: category, 361 unique values
Family: Type: category, 12 unique values
Country: Type: category, 83 unique values
State: Type: category, 46 unique values
Gender: Type: category, 2 unique values
Age Group: Type: category, 7 unique values
Speciality: Type: category, 12 unique values
Source: Type: category, 97 unique values
In / Out Patient: Type: category, 4 unique values
Year: Type: int64, 19 unique values
Antibiotic: Type: category, 42 unique values


MIC model

In [14]:
# Unpickle the saved bundle: a dict holding the model and its label encoder.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted files.
with open('./models/mic_classification_best.pkl', 'rb') as bundle_file:
    model_and_encoder = pickle.load(bundle_file)

label_encoder = model_and_encoder['label_encoder']
model = model_and_encoder['model']

# Read the test set.
test_set = pd.read_csv('./test_sets/MIC_test_set.csv')

# Cast every object-dtype column to pandas 'category' dtype in a single astype call.
# NOTE(review): the categories are derived from this CSV alone - presumably they must
# line up with the categories used when the model was trained; confirm.
categorical_columns = test_set.select_dtypes(include=['object']).columns
test_set = test_set.astype({col: 'category' for col in categorical_columns})

# Split into the label column ('MIC') and the feature columns.
y_test = test_set['MIC']
X_test = test_set.drop(columns=['MIC'])

# Take the first five rows of features and labels.
test_example_x = X_test.head(5)
test_example_y = y_test.head(5)

# Predict, then map both predicted and stored labels back through the encoder.
y_pred = model.predict(test_example_x)
y_pred_decoded = label_encoder.inverse_transform(y_pred)
test_example_y_decoded = label_encoder.inverse_transform(test_example_y)

print(f"Predicted: {y_pred_decoded}, Actual: {test_example_y_decoded}")

  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Predicted: ['>64' '0.12' '>8' '1' '1'], Actual: ['4' '>16' '2' '>4' '4']


In [13]:
# Report each feature column's dtype and how many distinct values it has.
for col, column_values in X_test.items():
    print(f'{col}: Type: {column_values.dtype}, {column_values.nunique()} unique values')

Phenotype: Type: category, 5 unique values
Species: Type: category, 365 unique values
Family: Type: category, 15 unique values
Country: Type: category, 83 unique values
State: Type: category, 46 unique values
Gender: Type: category, 2 unique values
Age Group: Type: category, 7 unique values
Speciality: Type: category, 12 unique values
Source: Type: category, 97 unique values
In / Out Patient: Type: category, 4 unique values
Year: Type: int64, 19 unique values
Antibiotic: Type: category, 49 unique values


Gene binary classification model

In [None]:
# Unpickle the saved bundle: a dict holding the model and its label encoder.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted files.
with open('./models/gene_bin_classification_best.pkl', 'rb') as bundle_file:
    model_and_encoder = pickle.load(bundle_file)

label_encoder = model_and_encoder['label_encoder']
model = model_and_encoder['model']

# Read the test set.
test_set = pd.read_csv('./test_sets/gene_bin_test_set.csv')

# Cast every object-dtype column to pandas 'category' dtype in a single astype call.
# NOTE(review): the categories are derived from this CSV alone - presumably they must
# line up with the categories used when the model was trained; confirm.
categorical_columns = test_set.select_dtypes(include=['object']).columns
test_set = test_set.astype({col: 'category' for col in categorical_columns})

# Split into the label column ('detected_binary') and the feature columns.
y_test = test_set['detected_binary']
X_test = test_set.drop(columns=['detected_binary'])

# predicted_categories: decoded labels the model has produced so far.
# inputs_for_categories: decoded label -> the first one-row DataFrame that produced it.
predicted_categories = set()
inputs_for_categories = {}

# The number of known classes does not change inside the loop, so read it once.
n_classes = len(label_encoder.classes_)

# Walk the test set one row at a time, printing every prediction, until each class
# known to the encoder has been predicted at least once (or the rows run out).
for i in range(len(X_test)):
    test_example_x = X_test.iloc[[i]]  # list indexer keeps a one-row DataFrame
    test_example_y = y_test.iloc[i]

    y_pred = model.predict(test_example_x)

    # Map the predicted and the stored label back through the encoder.
    y_pred_decoded = label_encoder.inverse_transform(y_pred)[0]
    test_example_y_decoded = label_encoder.inverse_transform([test_example_y])[0]

    print(f"Predicted: {y_pred_decoded}, Actual: {test_example_y_decoded}")
    print(f"Input data:\n{test_example_x}\n")

    if y_pred_decoded not in predicted_categories:
        # First time this label is predicted: remember it together with its input row.
        predicted_categories.add(y_pred_decoded)
        inputs_for_categories[y_pred_decoded] = test_example_x

        # The set only grows here, so this is the only place the stop condition can flip.
        if len(predicted_categories) == n_classes:
            break

# Recap: one stored input row per distinct predicted label.
print("\nInputs for each unique predicted category:")
for category in inputs_for_categories:
    print(f"\nCategory: {category}")
    print(inputs_for_categories[category])

In [16]:
# Report each feature column's dtype and how many distinct values it has.
for col, column_values in X_test.items():
    print(f'{col}: Type: {column_values.dtype}, {column_values.nunique()} unique values')

Phenotype: Type: category, 1 unique values
Species: Type: category, 40 unique values
Family: Type: category, 2 unique values
Country: Type: category, 70 unique values
State: Type: category, 36 unique values
Gender: Type: category, 2 unique values
Age Group: Type: category, 7 unique values
Speciality: Type: category, 12 unique values
Source: Type: category, 80 unique values
In / Out Patient: Type: category, 3 unique values
Year: Type: int64, 18 unique values
gene: Type: category, 23 unique values


Gene multi-class classification model

In [18]:
# Unpickle the saved bundle: a dict holding the model and its label encoder.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted files.
with open('./models/gene_mult_classification_best.pkl', 'rb') as bundle_file:
    model_and_encoder = pickle.load(bundle_file)

label_encoder = model_and_encoder['label_encoder']
model = model_and_encoder['model']

# Read the test set.
test_set = pd.read_csv('./test_sets/gene_mult_test_set.csv')

# Cast every object-dtype column to pandas 'category' dtype in a single astype call.
# NOTE(review): the categories are derived from this CSV alone - presumably they must
# line up with the categories used when the model was trained; confirm.
categorical_columns = test_set.select_dtypes(include=['object']).columns
test_set = test_set.astype({col: 'category' for col in categorical_columns})

# Split into the label column ('detected_variant') and the feature columns.
y_test = test_set['detected_variant']
X_test = test_set.drop(columns=['detected_variant'])

# Take the first four rows (positions 0-3); this selection is deterministic, not random.
first_four = [0, 1, 2, 3]
test_example_x = X_test.iloc[first_four]  # list indexer keeps a DataFrame
test_example_y = y_test.iloc[first_four]

# Predict, then map both predicted and stored labels back through the encoder.
y_pred = model.predict(test_example_x)
y_pred_decoded = label_encoder.inverse_transform(y_pred)
test_example_y_decoded = label_encoder.inverse_transform(test_example_y)

for predicted_label, actual_label in zip(y_pred_decoded, test_example_y_decoded):
    print(f"Predicted: {predicted_label}, Actual: {actual_label}")

  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


No example found for category '0' in the test set.
No example found for category 'ACC-1' in the test set.
No example found for category 'ACC-4' in the test set.
No example found for category 'ACT' in the test set.
No example found for category 'ACT-TYPE' in the test set.
No example found for category 'CMY' in the test set.
No example found for category 'CMY-13' in the test set.
No example found for category 'CMY-145' in the test set.
No example found for category 'CMY-16' in the test set.
No example found for category 'CMY-2' in the test set.
No example found for category 'CMY-2-TYPE' in the test set.
No example found for category 'CMY-4' in the test set.
No example found for category 'CMY-42' in the test set.
No example found for category 'CMY-6' in the test set.
No example found for category 'CMY-TYPE' in the test set.
No example found for category 'CTX-M-1' in the test set.
No example found for category 'CTX-M-1-TYPE' in the test set.
No example found for category 'CTX-M-12' in the 

In [20]:
# Report each feature column's dtype and how many distinct values it has.
for col, column_values in X_test.items():
    print(f'{col}: Type: {column_values.dtype}, {column_values.nunique()} unique values')

Phenotype: Type: category, 1 unique values
Species: Type: category, 40 unique values
Family: Type: category, 2 unique values
Country: Type: category, 70 unique values
State: Type: category, 36 unique values
Gender: Type: category, 2 unique values
Age Group: Type: category, 7 unique values
Speciality: Type: category, 12 unique values
Source: Type: category, 80 unique values
In / Out Patient: Type: category, 3 unique values
Year: Type: int64, 18 unique values
gene: Type: category, 23 unique values


In [21]:
# Display the feature rows currently held in test_example_x.
# NOTE(review): the saved output under this cell shows ten rows with non-sequential
# index labels, which does not match a selection of positions 0-3 - it looks like it
# was produced from a different (sampled) test_example_x; re-run to refresh.
test_example_x

Unnamed: 0,Phenotype,Species,Family,Country,State,Gender,Age Group,Speciality,Source,In / Out Patient,Year,gene
44658,ESBL,Escherichia coli,Enterobacteriaceae,Thailand,,Female,65 to 84 Years,Surgery ICU,Peritoneal Fluid,,2018,CMY1MOX
114043,ESBL,Escherichia coli,Enterobacteriaceae,Kuwait,,Female,19 to 64 Years,Medicine General,Skin: Other,Inpatient,2016,CTXM1
110760,ESBL,Klebsiella pneumoniae,Enterobacteriaceae,South Africa,,Male,Unknown,Medicine General,Sputum,None Given,2020,IMP
199397,,Pseudomonas aeruginosa,Non-Enterobacteriaceae,Russia,,Male,65 to 84 Years,Surgery General,Wound,Inpatient,2012,KPC
315253,ESBL,Escherichia coli,Enterobacteriaceae,Mexico,,Male,19 to 64 Years,Medicine ICU,Endotracheal aspirate,,2021,IMP
166811,ESBL,Klebsiella pneumoniae,Enterobacteriaceae,Portugal,,Male,65 to 84 Years,Medicine ICU,Sputum,Inpatient,2012,TEM
182423,ESBL,Escherichia coli,Enterobacteriaceae,Hungary,,Male,65 to 84 Years,Medicine General,Sputum,Inpatient,2016,ACC
259895,,Pseudomonas aeruginosa,Non-Enterobacteriaceae,Spain,,Male,19 to 64 Years,Medicine ICU,Respiratory: Other,,2019,SHV
272484,ESBL,Escherichia coli,Enterobacteriaceae,"Korea, South",,Female,65 to 84 Years,Medicine General,Urine,Inpatient,2016,VIM
23669,,Klebsiella pneumoniae,Enterobacteriaceae,Italy,,Female,19 to 64 Years,Medicine ICU,Endotracheal aspirate,,2017,FOX


In [20]:
# Draw a reproducible random sample of test rows and compare the model's predictions
# with the stored labels.
N_EXAMPLES = 40   # number of rows to sample (capped at the size of the test set)
SAMPLE_SEED = 42  # fixed seed so a fresh kernel run selects the same rows

test_example_x = X_test.sample(min(N_EXAMPLES, len(X_test)), random_state=SAMPLE_SEED)
# Look the labels up by index label so they stay aligned with the sampled rows.
test_example_y = y_test.loc[test_example_x.index]

# Predict the sampled rows.
y_pred = model.predict(test_example_x)

# Map both predicted and stored labels back through the encoder.
y_pred_decoded = label_encoder.inverse_transform(y_pred)
test_example_y_decoded = label_encoder.inverse_transform(test_example_y)

for predicted_label, actual_label in zip(y_pred_decoded, test_example_y_decoded):
    print(f"Predicted: {predicted_label}, Actual: {actual_label}")

Predicted: NEG, Actual: NEG
Predicted: CTX-M-15, Actual: CTX-M-15
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: TEM-1, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: SHV-OSBL, Actual: SHV-OSBL
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: SHV-OSBL(b), Actual: SHV-OSBL(b)
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: TEM-OSBL(b), Actual: TEM-OSBL(b)
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: NEG, Actual: NEG
Predicted: TEM-OSBL, A

In [20]:
# Predict and decode the single row whose index label is 109620.
# The row is looked up in the full X_test frame rather than in the random sample held
# in test_example_x, so the lookup does not depend on which rows happened to be sampled.
label_encoder.inverse_transform(model.predict(X_test.loc[[109620]]))

array(['CTX-M-15'], dtype=object)