In [13]:
import os
import sys
sys.path.insert(0, os.path.abspath('../qbiome'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
plt.style.use('ggplot')

from data_formatter import DataFormatter
from quantizer import Quantizer
from qnet_orchestrator import QnetOrchestrator
from forecaster import Forecaster

In [2]:
# Folder (relative to the working directory) holding the example CSV files.
dirname = 'example_data/'
# Path of the taxa relative-abundance table.
data = os.path.join(dirname, 'SamplesByMetadata_otuDADA2_EMP_10249_ECAM_RSRC_TaxaRelativeAbundance.csv')
# Path of the sample characteristics (metadata) table.
meta = os.path.join(dirname, 'SamplesByMetadata_otuDADA2_EMP_10249_ECAM_RSRC_Characteristics.csv')

# Instantiate the data formatter, quantizer, qnet orchestrator, and forecaster

In [3]:
# Build the pipeline objects used throughout the notebook. Order matters:
# each later object is constructed from the one before it.
# Loader for the raw CSV tables (its load_data method is called below).
formatter = DataFormatter()
# Converts between numeric values and the quantized qnet representation.
quantizer = Quantizer()
# Handles the qnet model; constructed with the quantizer instance.
qnet_orchestrator = QnetOrchestrator(quantizer)
# Produces forecasts; constructed with the qnet orchestrator instance.
forecaster = Forecaster(qnet_orchestrator)

# Load, quantize, and convert the data to qnet input format

In [4]:
# Load the two CSV files. NOTE: `data` is rebound here from the CSV path
# string to the value returned by load_data, so re-running this cell on its
# own passes that returned object (not a path) as the first argument --
# re-run the path-definition cell first.
data = formatter.load_data(data, meta)
# Quantize the loaded table.
quantized = quantizer.quantize_df(data)
# Convert to qnet inputs: the feature names and the label matrix.
features, label_matrix = quantizer.get_qnet_inputs(quantized)

There are 29 unique biomes and 1216 unique samples
There are 311 unique days
There are 99 unique weeks


In [5]:
# Two quick size checks, shown together as a tuple:
#   - the largest week value present in the loaded data
#   - the number of biome_week feature names returned by get_qnet_inputs
data.week.max(), len(features)

(98, 621)

In [6]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,sample_id,subject_id,variable,week,value
7,MBSMPL0020-6-10,1,Actinobacteriota,27,0.36665
8,MBSMPL0020-6-10,1,Bacteroidota,27,0.507248
9,MBSMPL0020-6-10,1,Campilobacterota,27,0.002032
10,MBSMPL0020-6-10,1,Desulfobacterota,27,0.005058
11,MBSMPL0020-6-10,1,Firmicutes,27,0.057767


# Load a pre-trained qnet

In [7]:
# Load a previously trained qnet from a file path relative to the working
# directory, so the notebook must be run from the folder containing it.
# NOTE(review): the .joblib extension suggests a pickle-based format; if so,
# only load model files from a trusted source -- confirm how load_qnet reads it.
qnet_orchestrator.load_qnet('biome_net.joblib')

# Construct a new patient

In [45]:
# Derive a synthetic patient from the records of subject '1': drop the
# per-sample identifier, relabel the subject as '101', and scale every value
# by a constant factor of 0.8 (a fixed perturbation; nothing is randomized).
new_patient = (
    data.loc[data['subject_id'] == '1']
    .drop(columns='sample_id')
    .assign(
        subject_id='101',
        value=lambda frame: frame['value'] * 0.8,
    )
)

In [46]:
# Pretend only the first 6 weeks were observed for the new patient:
# every value recorded after week 6 is replaced with NaN.
after_week_6 = new_patient['week'] > 6
new_patient['value'] = new_patient['value'].mask(after_week_6)

In [47]:
# Display the new patient's records; values for weeks after 6 are now NaN.
new_patient

Unnamed: 0,subject_id,variable,week,value
7,101,Actinobacteriota,27,
8,101,Bacteroidota,27,
9,101,Campilobacterota,27,
10,101,Desulfobacterota,27,
11,101,Firmicutes,27,
...,...,...,...,...
6202,101,Campilobacterota,6,0.066674
6203,101,Desulfobacterota,6,0.006581
6204,101,Firmicutes,6,0.398054
6205,101,Fusobacteriota,6,0.095685


In [48]:
# Tag every record with a combined "<variable>_<week>" feature name.
feature_names = new_patient['variable'] + '_' + new_patient['week'].astype(str)
melted = pd.DataFrame({
    'subject_id': new_patient['subject_id'],
    'variable': feature_names,
    'value': new_patient['value'],
})

# Pivot to wide format: one row per subject, one column per feature name.
wide = melted.pivot_table(index='subject_id', columns='variable')
to_quantize = wide['value'].reset_index()

In [49]:
# Display the wide table. Features whose values were all masked to NaN do not
# appear as columns (pivot_table's default dropna=True).
to_quantize

variable,subject_id,Actinobacteriota_1,Actinobacteriota_3,Actinobacteriota_6,Bacteroidota_1,Bacteroidota_3,Bacteroidota_6,Campilobacterota_1,Campilobacterota_3,Campilobacterota_6,...,Fusobacteriota_3,Fusobacteriota_6,Proteobacteria_1,Proteobacteria_3,Proteobacteria_6,Synergistota_1,Synergistota_3,Verrucomicrobiota_1,Verrucomicrobiota_3,Verrucomicrobiota_6
0,101,0.025334,0.038834,0.035042,0.165764,0.223907,0.211332,9.3e-05,0.007591,0.066674,...,0.012895,0.095685,0.009486,0.009009,0.126614,0.000278,5.8e-05,0.028718,0.001395,0.045441


In [41]:
# Quantize the new patient's masked records. The result gets its own name so
# the dataset-level `quantized` frame produced when the full data was loaded
# is not silently replaced by patient-level data.
new_patient_quantized = quantizer.quantize_df(new_patient)
# Convert this patient into a format acceptable by the qnet. The feature names
# are bound to a real name rather than `_`: assigning to `_` shadows IPython's
# "last output" variable and stops it from being updated in this session.
new_patient_features, new_patient_matrix = quantizer.get_qnet_inputs(
    new_patient_quantized)

In [43]:
# Shape of the new patient's qnet input matrix.
# NOTE(review): the recorded output is (1, 166), while the full dataset yields
# 621 feature names -- confirm this matrix is aligned with the feature set the
# loaded qnet expects before using it for forecasting.
new_patient_matrix.shape

(1, 166)

# Forecast the data starting from `week 10` to the last week

In [10]:
# Slow step: takes about 10 minutes to run.
# forecast_data also accepts an `end_week` argument (e.g. data.week.max());
# when it is omitted, the max end week in the data is used.
forecasted = forecaster.forecast_data(label_matrix, start_week=10)

KeyboardInterrupt: 

# Inspect the original data and the forecasted data

In [None]:
# First rows of the original data, for comparison with the forecast below.
data.head()

In [None]:
# First rows of the forecasted data returned by forecast_data.
forecasted.head()

In [None]:
# Biomes to compare, one facet per biome.
BIOMES = ['Actinobacteriota', 'Bacteroidota', 'Firmicutes', 'Proteobacteria', 'unclassified_Bacteria']

# Stack the original and forecasted records, tagging each row with its
# origin, then keep only the selected biomes.
concat = pd.concat([
    data.assign(source='original'),
    forecasted.assign(source='forecasted'),
])
concat = concat.loc[concat['variable'].isin(BIOMES)]

# One panel per biome, two panels per row, independent y-axes; each panel
# draws one line per source without confidence bands.
line_style = dict(hue='source', ci=None, marker='o',
                  linewidth=2, alpha=0.75, markersize=5)
g = sns.FacetGrid(concat, col='variable', col_wrap=2, sharey=False, margin_titles=True)
g.map_dataframe(sns.lineplot, 'week', 'value', **line_style)
g.set_titles(row_template='{row_name}', col_template='{col_name}')
g.add_legend()

The forecast aligns pretty well with the original data.

Next we zoom in to look at the first 20 weeks. Note that there is still some conversion distortion due to quantization-dequantization.

In [None]:
# Narrow the comparison frame to weeks up to and including week 20.
concat = concat.loc[concat['week'] <= 20]

# Same faceted line plot as before, now on the zoomed-in frame.
line_style = dict(hue='source', ci=None, marker='o',
                  linewidth=2, alpha=0.75, markersize=5)
g = sns.FacetGrid(concat, col='variable', col_wrap=2, sharey=False, margin_titles=True)
g.map_dataframe(sns.lineplot, 'week', 'value', **line_style)
g.set_titles(row_template='{row_name}', col_template='{col_name}')
g.add_legend()

# Apply the random forest regressor in the quantizer

In [None]:
# Convert the quantized label matrix of the full dataset back to a DataFrame.
dequantized = quantizer.dequantize_to_df(label_matrix)
# Reshape the dequantized frame into the plot format.
dequantized_plot = quantizer.melt_into_plot_format(dequantized)
# Fit the quantizer's random forest from the original data and its
# dequantized counterpart.
# NOTE(review): presumably this learns a correction for the
# quantization-dequantization distortion mentioned above -- confirm.
quantizer.fit_random_forest(data, dequantized_plot)

In [None]:
# Run the forecast through the random forest regressor held by the quantizer.
avg_forecasted_regressed = quantizer.apply_random_forest_regressor(forecasted)

In [None]:
# Stack the original records with the regressed forecast, tagging each row
# with its origin, then keep the selected biomes and weeks up to week 20.
concat = pd.concat([
    data.assign(source='original'),
    avg_forecasted_regressed.assign(source='forecasted'),
])
concat = concat.loc[concat['variable'].isin(BIOMES)]
concat = concat.loc[concat['week'] <= 20]

# One panel per biome, two panels per row, independent y-axes; each panel
# draws one line per source without confidence bands.
line_style = dict(hue='source', ci=None, marker='o',
                  linewidth=2, alpha=0.75, markersize=5)
g = sns.FacetGrid(concat, col='variable', col_wrap=2, sharey=False, margin_titles=True)
g.map_dataframe(sns.lineplot, 'week', 'value', **line_style)
g.set_titles(row_template='{row_name}', col_template='{col_name}')
g.add_legend()