In [1]:
# os and sys are needed here to extend the module search path.
import os
import sys
# Put the absolute form of '../../qbiome' (resolved against the current
# working directory) at the front of sys.path. This has to run before the
# imports below.
# NOTE(review): presumably data_formatter, quantizer and qnet_orchestrator
# are modules inside that directory — confirm.
sys.path.insert(0, os.path.abspath('../../qbiome'))

from data_formatter import DataFormatter
from quantizer import Quantizer
from qnet_orchestrator import QnetOrchestrator

In [2]:
# Directory holding the example inputs. The trailing slash is kept so file
# names can be appended to it directly.
dirname = 'example_data/'

# Build both input paths in one pass: `data` and `meta` are the two CSV files
# (relative abundance and characteristics, going by the file names) that are
# later passed to formatter.load_data.
data, meta = (
    dirname + fname
    for fname in (
        'SamplesByMetadata_otuDADA2_EMP_10249_ECAM_RSRC_TaxaRelativeAbundance.csv',
        'SamplesByMetadata_otuDADA2_EMP_10249_ECAM_RSRC_Characteristics.csv',
    )
)

# Instantiate the data formatter, the quantizer, and the qnet orchestrator

In [3]:
# Create the three helper objects used by the rest of the notebook.
formatter = DataFormatter()
quantizer = Quantizer()
# The orchestrator is constructed with the quantizer instance created above,
# so both names refer to the same Quantizer object.
qnet_orchestrator = QnetOrchestrator(quantizer)

# Load, quantize, and convert the data to qnet input format

In [4]:
# Keep the loaded result under its own name instead of overwriting `data`,
# which holds the CSV path defined earlier. Rebinding `data` here made the
# cell non-idempotent: on a second run load_data would be given the
# previously loaded object instead of a file path.
loaded_data = formatter.load_data(data, meta)

# Quantize the loaded data, then derive the two inputs used to train the qnet.
quantized = quantizer.quantize_df(loaded_data)
features, label_matrix = quantizer.get_qnet_inputs(quantized)

There are 29 unique biomes and 1216 unique samples
There are 311 unique days
There are 99 unique weeks


In [5]:
# Display the `features` value returned by quantizer.get_qnet_inputs.
features

Index(['Acidobacteriota_1', 'Acidobacteriota_9', 'Acidobacteriota_27',
       'Acidobacteriota_31', 'Acidobacteriota_35', 'Actinobacteriota_1',
       'Actinobacteriota_2', 'Actinobacteriota_3', 'Actinobacteriota_5',
       'Actinobacteriota_6',
       ...
       'unclassified_Bacteria_31', 'unclassified_Bacteria_34',
       'unclassified_Bacteria_36', 'unclassified_Bacteria_40',
       'unclassified_Bacteria_48', 'unclassified_Bacteria_57',
       'unclassified_Bacteria_62', 'unclassified_Bacteria_79',
       'unclassified_Bacteria_80', 'unclassified_Bacteria_88'],
      dtype='object', length=621)

In [6]:
# Display the `label_matrix` value returned by quantizer.get_qnet_inputs.
label_matrix

array([['', '', '', ..., '', '', ''],
       ['', '', '', ..., '', '', ''],
       ['', '', '', ..., '', '', ''],
       ...,
       ['', '', '', ..., '', '', ''],
       ['', '', '', ..., '', '', ''],
       ['E', '', '', ..., '', '', '']], dtype='<U1')

# Train, save, and load the model

In [7]:
# Inspect the orchestrator's `model` attribute before train_qnet is called.
# NOTE(review): this cell shows no output, so the attribute is presumably
# None at this point — confirm.
qnet_orchestrator.model

In [8]:
# Expected runtime: under 5 minutes.
# The out_fname used here is the same file name that is later passed to
# load_qnet.
qnet_orchestrator.train_qnet(
    features,
    label_matrix,
    alpha=0.2,
    min_samples_split=2,
    out_fname='biome_net.joblib',
)

In [9]:
# Feature names reported by the orchestrator's model after train_qnet has run.
qnet_orchestrator.model.feature_names

Index(['Acidobacteriota_1', 'Acidobacteriota_9', 'Acidobacteriota_27',
       'Acidobacteriota_31', 'Acidobacteriota_35', 'Actinobacteriota_1',
       'Actinobacteriota_2', 'Actinobacteriota_3', 'Actinobacteriota_5',
       'Actinobacteriota_6',
       ...
       'unclassified_Bacteria_31', 'unclassified_Bacteria_34',
       'unclassified_Bacteria_36', 'unclassified_Bacteria_40',
       'unclassified_Bacteria_48', 'unclassified_Bacteria_57',
       'unclassified_Bacteria_62', 'unclassified_Bacteria_79',
       'unclassified_Bacteria_80', 'unclassified_Bacteria_88'],
      dtype='object', length=621)

In [10]:
# This cell demonstrates loading a previously trained model.
# Clear the in-memory model first, so that anything found on `model`
# afterwards did not simply carry over from the training cell.
qnet_orchestrator.model = None
# Same file name that was given as out_fname to train_qnet.
# NOTE(review): presumably load_qnet repopulates qnet_orchestrator.model — confirm.
qnet_orchestrator.load_qnet('biome_net.joblib')

In [11]:
# The loaded model reports the same feature names as the model did right
# after training (compare the two displayed outputs).
qnet_orchestrator.model.feature_names

Index(['Acidobacteriota_1', 'Acidobacteriota_9', 'Acidobacteriota_27',
       'Acidobacteriota_31', 'Acidobacteriota_35', 'Actinobacteriota_1',
       'Actinobacteriota_2', 'Actinobacteriota_3', 'Actinobacteriota_5',
       'Actinobacteriota_6',
       ...
       'unclassified_Bacteria_31', 'unclassified_Bacteria_34',
       'unclassified_Bacteria_36', 'unclassified_Bacteria_40',
       'unclassified_Bacteria_48', 'unclassified_Bacteria_57',
       'unclassified_Bacteria_62', 'unclassified_Bacteria_79',
       'unclassified_Bacteria_80', 'unclassified_Bacteria_88'],
      dtype='object', length=621)