In [1]:
import os
import sys
import numpy as np
import pandas as pd
import joblib

from scipy.stats import gmean
from pathlib import Path
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler

# Load self-written transformers

In [2]:
# Root of the project checkout; presumably where the `utils` package imported below lives.
# TODO load from CONFIG file
# Relative alternative: MODULE_PATH = os.path.abspath(os.path.join('..'))
MODULE_PATH = os.path.abspath('/storage/pszczerbiak/microbiome_interactions_project/')
# Append only when missing, so re-running the cell does not add duplicate entries.
if sys.path.count(MODULE_PATH) == 0:
    sys.path.append(MODULE_PATH)

In [3]:
from utils.transformers import RCLRTransformer, CLRTransformer, Log1pMinMaxScaler, IdentityScaler

# Load data

In [4]:
# Base directory of the datasets.  TODO load from CONFIG file
MAIN_PATH = Path("/storage/zkarwowska/microbiome-interactions/")
# Tables are read from INPUT_PATH; transformed tables and fitted scalers go under OUTPUT_PATH.
INPUT_PATH = MAIN_PATH.joinpath("datasets/processed/ready_datasets")
OUTPUT_PATH = MAIN_PATH.joinpath("datasets/processed/ready_datasets_transformed")

In [5]:
def load_df(filepath, clear_cols=False):
    """Read a CSV table into a DataFrame sorted by its index.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Passed straight to `pd.read_csv`; the first column becomes the index
        and the first row supplies the column names.
    clear_cols : bool, default False
        If True, replace the column names with consecutive integers
        1..n_columns.

    Returns
    -------
    pd.DataFrame
        Rows sorted by index, with the index name removed.
    """
    df = pd.read_csv(filepath, index_col=0, header=0).sort_index()
    if clear_cols:
        # Number the columns from this frame's own width; the function must not
        # depend on notebook-level globals.
        df.columns = range(1, len(df.columns) + 1)
    df.index.name = None
    return df

In [6]:
# One table per dataset, keyed by the dataset name used throughout the notebook.
data_raw = {
    'donorA': load_df(INPUT_PATH / "donorA_rarefied_interpolated_feces.csv"),
    'donorB': load_df(INPUT_PATH / "donorB_rarefied_interpolated_feces.csv"),
    'female': load_df(INPUT_PATH / "female_rarefied_interpolated_feces.csv"),
    'male': load_df(INPUT_PATH / "male_rarefied_interpolated_feces.csv"),
}

In [7]:
# Report (rows, columns) of every raw table.
for dataset_name, table in data_raw.items():
    print(dataset_name, table.shape)

donorA (365, 1531)
donorB (253, 1545)
female (186, 544)
male (443, 1219)


## Common part (columns shared by all datasets)

In [20]:
# An inner join keeps only the columns found in every dataset; just the labels are needed.
cols_common = pd.concat(
    objs=data_raw.values(),
    join='inner',
).columns

In [21]:
# Restrict every raw table to the shared columns.
data_common = {
    dataset_name: table[cols_common]
    for dataset_name, table in data_raw.items()
}

In [22]:
# Report (rows, columns) of every table after restricting to the shared columns.
for dataset_name, table in data_common.items():
    print(dataset_name, table.shape)

donorA (365, 166)
donorB (253, 166)
female (186, 166)
male (443, 166)


### Sort by abundance

In [23]:
# Per-column total over the rows of all datasets stacked together.
abundance_total = pd.concat(data_common).sum()
# Column labels ordered from the largest total to the smallest.
cols_sorted_by_abundance = abundance_total.sort_values(ascending=False).index

In [24]:
# Reorder the columns of every table by decreasing total; only values of existing keys are replaced.
for dataset_name, table in data_common.items():
    data_common[dataset_name] = table[cols_sorted_by_abundance]

### Reset column names

In [25]:
# Disabled step: renaming the columns to consecutive integers (1..n) is currently switched off.
# for k, v in data_raw.items():
#     data_common[k].columns = range(1, len(data_common[k].columns)+1)
#     data_raw[k].columns = range(1, len(data_raw[k].columns)+1)

# Transform data

## Create scalers / transformers

In [None]:
# Baseline scalers.
scaler_id = IdentityScaler()
scaler_std = StandardScaler()
scaler_minmax = MinMaxScaler()

# Quantile transformers to a uniform output distribution, one per number of quantiles.
scaler_quantile10, scaler_quantile50, scaler_quantile100, scaler_quantile150 = (
    QuantileTransformer(n_quantiles=n_q, output_distribution='uniform')
    for n_q in (10, 50, 100, 150)
)

# CLR variants: axis=0 -> compute gmean for each species, axis=None -> compute gmean globally;
# each one is built with is_pseudo_global switched off and on.
# transformer_rclr = RCLRTransformer(axis=0) is not created: it cannot be used here since it
# needs the `mask` parameter to be always present.
transformer_clr_0_False = CLRTransformer(is_pseudo_global=False, axis=0)
transformer_clr_0_True = CLRTransformer(is_pseudo_global=True, axis=0)
transformer_clr_None_False = CLRTransformer(is_pseudo_global=False, axis=None)
transformer_clr_None_True = CLRTransformer(is_pseudo_global=True, axis=None)

scaler_log1pminmax = Log1pMinMaxScaler()

In [None]:
# Registry: short name (it ends up in the output file names) -> scaler / transformer object.
scalers = dict([
    ('id', scaler_id),
    ('std', scaler_std),
    ('minmax', scaler_minmax),
    ('quantile10', scaler_quantile10),
    ('quantile50', scaler_quantile50),
    ('quantile100', scaler_quantile100),
    ('quantile150', scaler_quantile150),
    # ('rclr', transformer_rclr),  # disabled together with the RCLR transformer
    ('clr_0_False', transformer_clr_0_False),
    ('clr_0_True', transformer_clr_0_True),
    ('clr_None_False', transformer_clr_None_False),
    ('clr_None_True', transformer_clr_None_True),
    ('log1pminmax', scaler_log1pminmax),
])

## Common part

### Save original datasets

In [28]:
# Write every shared-column table, untransformed, to <OUTPUT_PATH>/common/<dataset>.csv.
common_dir = OUTPUT_PATH / 'common'
# `to_csv` does not create missing folders, so make sure the target exists on a fresh setup.
common_dir.mkdir(parents=True, exist_ok=True)
for k, v in data_common.items():
    v.to_csv(common_dir / f'{k}.csv')

### Save scaled datasets

In [29]:
# For every (scaler, dataset) pair: fit the scaler on that dataset alone, then write the
# transformed table to <OUTPUT_PATH>/common/<dataset>_<scaler>.csv and the fitted scaler
# to <OUTPUT_PATH>/common/scaler_<dataset>_<scaler>.obj.
common_dir = OUTPUT_PATH / 'common'
# `to_csv` / `joblib.dump` do not create missing folders.
common_dir.mkdir(parents=True, exist_ok=True)
for name, scaler in scalers.items():
    for k, v in data_common.items():
        # Take the bare values so the frame below is built positionally, whatever
        # container type the transformer returns.
        scaled_values = np.asarray(scaler.fit_transform(v))
        # Keep the original row index and column labels so the scaled table lines up
        # with the untransformed one saved for the same dataset.
        scaled_df = pd.DataFrame(scaled_values, index=v.index, columns=v.columns)
        # Save transformed dataframe
        scaled_df.to_csv(common_dir / f'{k}_{name}.csv')
        # Save scaler now: the same object is re-fitted on the next dataset.
        joblib.dump(scaler, common_dir / f'scaler_{k}_{name}.obj')