## MICROBIOME DATA PROCESSING PIPELINE
#### THIS CODE SHOWS HOW TO USE CLASSES IN ProcessingFunctions.py to prepare microbiome data for further use in regression models
#

#### STEPS
##### 1. filtering rare bacteria using MicrobiomeDataPreprocessing.filter_rare_features()
##### 2. transform data with the Centered Log Ratio (CLR) transform using CLRTransformer (fit() / transform())
##### 3. change dataframe into a supervised problem using MicrobiomeDataPreprocessing.make_supervised()
##### 4. split data into train and test using MicrobiomeTraintestSplit.last_block_split() or MicrobiomeTraintestSplit.blocked_split()


#

In [None]:
import pandas as pd
import random
import numpy as np

import statsmodels.api as sm

import os
import sys

In [2]:
# Make the current working directory importable so the project-local
# ProcessingFunctions module (presumably located there) can be imported.
MODULE_PATH = os.path.abspath('.') 
if MODULE_PATH not in sys.path:
    # Append only when missing, so re-running this cell adds no duplicates.
    sys.path.append(MODULE_PATH)

# Helper classes used in the processing steps below.
from ProcessingFunctions import MicrobiomeDataPreprocessing, CLRTransformer, MicrobiomeTraintestSplit

In [3]:
# Toy count table used throughout this walkthrough:
# 14 rows (one per sample) by 3 columns (one per ASV).
names = ['asv1', 'asv2', 'asv3']

counts = np.array(
    [
        [0, 10, 12],
        [19, 20, 10],
        [0, 22, 3],
        [26, 35, 3],
        [8, 18, 55],
        [0, 0, 60],
        [13, 0, 65],
        [0, 0, 60],
        [13, 0, 65],
        [0, 0, 60],
        [13, 0, 65],
        [4, 3, 70],
        [0, 0, 30],
        [0, 12, 25],
    ],
    dtype=float,  # counts are stored as float64
)

# Wrap the raw matrix in a DataFrame with the ASV names as column labels.
df = pd.DataFrame(data=counts, columns=names)

In [4]:
df.head(2)

Unnamed: 0,asv1,asv2,asv3
0,0.0,10.0,12.0
1,19.0,20.0,10.0


In [5]:
# Preprocessing helper; provides filter_rare_features() and make_supervised()
# used in steps 1 and 3 below.
processing= MicrobiomeDataPreprocessing()

#### 1. FILTER RARE BACTERIA

In [6]:
# Remove rare features (bacteria) using the method's default settings.
# For this toy table all three ASV columns are still present afterwards.
# NOTE(review): the rarity criterion is defined in ProcessingFunctions.py and
# is not visible here.
filtered_df = processing.filter_rare_features(df)

#### 2. CLR TRANSFORM

In [7]:
# Centered log-ratio transform of the filtered counts.
# axis=1: judging by the displayed result, each row's CLR values sum to ~0,
# i.e. the transform is applied per row (sample).
# is_pseudo_global=True: presumably a single pseudo-count shared across the
# whole table replaces zeros before taking logs — TODO confirm in
# ProcessingFunctions.py.
transformer = CLRTransformer(is_pseudo_global=True, axis=1)
transformer.fit(filtered_df)

# Same frame is used for fitting and transforming; row index and column names
# are preserved in the output.
df_transformed = transformer.transform(filtered_df)

In [8]:
df_transformed.head(2)

Unnamed: 0,asv1,asv2,asv3
0,-3.933536,1.875607,2.057929
1,0.196854,0.248147,-0.445


#### 3. CONVERT DATAFRAME TO SUPERVISED PROBLEM
#### _a. with transformed target variable_

In [9]:
# Number of past timesteps used as predictors for each target row.
lag = 3
# Build the supervised frame: columns are named <feature>_lag0 (current row,
# i.e. the target) through <feature>_lag3 (three rows earlier). The first
# `lag` rows have no complete history and are absent from the result
# (its index starts at 3).
df_clr_supervised = processing.make_supervised(df_transformed, lag)

In [10]:
df_clr_supervised.head(2)

Unnamed: 0,asv1_lag0,asv2_lag0,asv3_lag0,asv1_lag1,asv2_lag1,asv3_lag1,asv1_lag2,asv2_lag2,asv3_lag2,asv1_lag3,asv2_lag3,asv3_lag3
3,0.620744,0.917996,-1.53874,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445,-3.933536,1.875607,2.057929
4,-0.912941,-0.10201,1.014951,0.620744,0.917996,-1.53874,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445


#### _b. target variable on original scale_

In [11]:
# Put the target (lag0) columns back on the original, untransformed count
# scale while keeping the CLR-transformed lagged predictors.
#
# make_supervised() dropped the first `lag` rows (the supervised frame starts
# at index 3 for lag = 3), so skip the same number of rows here. Using `lag`
# instead of a literal 3 keeps the rows aligned if the lag is changed.
y = filtered_df.iloc[lag:]

# The lag0 columns come first, one per feature in filtered_df, so their count
# is taken from the data rather than fixed at 3. `.values` drops the labels so
# the assignment is purely positional (column names differ: asv1 vs asv1_lag0).
df_clr_supervised.iloc[:, :y.shape[1]] = y.values

In [12]:
df_clr_supervised.head(2)

Unnamed: 0,asv1_lag0,asv2_lag0,asv3_lag0,asv1_lag1,asv2_lag1,asv3_lag1,asv1_lag2,asv2_lag2,asv3_lag2,asv1_lag3,asv2_lag3,asv3_lag3
3,26.0,35.0,3.0,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445,-3.933536,1.875607,2.057929
4,8.0,18.0,55.0,0.620744,0.917996,-1.53874,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445


#### 4. SPLIT INTO TRAIN AND TEST

In [13]:
# Train/test splitter. prc_split=0.1 is presumably the fraction of rows held
# out for testing (the splits below put 1 of 11 rows in the test set) —
# confirm in ProcessingFunctions.py.
splitter = MicrobiomeTraintestSplit(prc_split=0.1)

#### _a. use last timesteps as test_

In [29]:
# Hold out the end of the series: in the recorded run the test set is the
# final row (index 13) and the 10 earlier rows form the training set.
train, test = splitter.last_block_split(df_clr_supervised)

In [30]:
train.shape, test.shape

((10, 12), (1, 12))

In [31]:
train.head(2)

Unnamed: 0,asv1_lag0,asv2_lag0,asv3_lag0,asv1_lag1,asv2_lag1,asv3_lag1,asv1_lag2,asv2_lag2,asv3_lag2,asv1_lag3,asv2_lag3,asv3_lag3
3,26.0,35.0,3.0,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445,-3.933536,1.875607,2.057929
4,8.0,18.0,55.0,0.620744,0.917996,-1.53874,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445


In [32]:
test.head()

Unnamed: 0,asv1_lag0,asv2_lag0,asv3_lag0,asv1_lag1,asv2_lag1,asv3_lag1,asv1_lag2,asv2_lag2,asv3_lag2,asv1_lag3,asv2_lag3,asv3_lag3
13,0.0,12.0,25.0,-2.302585,-2.302585,4.60517,-0.858173,-1.145855,2.004028,1.487356,-4.584151,3.096794


#### _b. use timesteps from different parts of the series as test_

In [22]:
# Alternative split that takes test rows from elsewhere in the series: in the
# recorded run the single test row is index 11 rather than the final row.
# NOTE(review): whether the block position is random or deterministic is not
# visible here — check ProcessingFunctions.py before relying on reproducibility.
train, test = splitter.blocked_split(df_clr_supervised)

In [23]:
train.shape, test.shape

((10, 12), (1, 12))

In [24]:
train.head(2)

Unnamed: 0,asv1_lag0,asv2_lag0,asv3_lag0,asv1_lag1,asv2_lag1,asv3_lag1,asv1_lag2,asv2_lag2,asv3_lag2,asv1_lag3,asv2_lag3,asv3_lag3
3,26.0,35.0,3.0,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445,-3.933536,1.875607,2.057929
4,8.0,18.0,55.0,0.620744,0.917996,-1.53874,-3.734257,2.863344,0.870913,0.196854,0.248147,-0.445


In [26]:
test.head()

Unnamed: 0,asv1_lag0,asv2_lag0,asv3_lag0,asv1_lag1,asv2_lag1,asv3_lag1,asv1_lag2,asv2_lag2,asv3_lag2,asv1_lag3,asv2_lag3,asv3_lag3
11,4.0,3.0,70.0,1.487356,-4.584151,3.096794,-2.533634,-2.533634,5.067268,1.487356,-4.584151,3.096794
