In [11]:
import warnings
warnings.filterwarnings("ignore")
from skbio.stats.composition import clr

from statsmodels.stats import multitest
import numpy as np
import pandas as pd
import pyreadr
from scipy.stats import mannwhitneyu
from pathlib import Path

In [12]:
def to_clr(df, pseudocount_vector):
    """Close each row of ``df``, add a per-row pseudocount and apply the CLR transform.

    Rows are treated as the compositions: every row is divided by its own
    sum, entry ``k`` of ``pseudocount_vector`` is added to every value of
    row ``k``, and the shifted rows are handed to scikit-bio's ``clr``.

    Parameters
    ----------
    df : pandas.DataFrame
        One composition per row (the caller in this notebook passes samples
        as rows and features as columns).
    pseudocount_vector : array-like of length ``len(df)``
        One pseudocount per row of ``df``. Any 1-D array-like is accepted
        (NumPy array, list, pandas Series).

    Returns
    -------
    pandas.DataFrame
        CLR-transformed values with the same index and columns as ``df``.
    """
    # Row-wise closure: each row now sums to 1.
    proportions = df.div(df.sum(axis=1), axis=0)

    # Column vector of shape (n_rows, 1) so pandas broadcasts entry k across row k.
    offsets = np.asarray(pseudocount_vector, dtype=float).reshape(-1, 1)

    # NOTE(review): the pseudocount is added AFTER closure, so it should be on
    # the proportion scale; confirm that callers derive it accordingly.
    shifted = proportions + offsets

    # Some scikit-bio versions squeeze singleton dimensions out of the result;
    # restoring the input shape keeps single-row / single-column frames working.
    clr_values = np.asarray(clr(shifted.to_numpy())).reshape(shifted.shape)

    return pd.DataFrame(clr_values, index=shifted.index, columns=shifted.columns)

In [13]:
# Default location of the per-run CSV files (one sub-folder per transformation).
DEFAULT_RESULTS_DIR = '/Users/zkarwowska/Desktop/EMBL_project/zeevi_dataset_v5/results/baseline'

# Values accepted for the ``transformation`` argument of ``process_data``.
SUPPORTED_TRANSFORMATIONS = ('clr', 'log')


def _load_arm_counts(base_path, prefix, N, delta_t):
    """Read one arm's counts/metadata and return the counts at ``delta_t``, samples as rows."""
    counts = pyreadr.read_r(base_path / f"{prefix}counts_n{N}.rds")[None]
    metadata = pyreadr.read_r(base_path / f"{prefix}metadata_n{N}.rds")[None]

    # Keep only the samples recorded at the requested delta.t value.
    sample_names = metadata.loc[metadata['delta.t'] == delta_t, 'sample_name']

    # Samples are selected as columns of the counts table, then transposed to rows.
    return counts[sample_names].T


def _transform_counts(sample_counts, transformation):
    """Apply the requested transformation to a samples-by-features table."""
    if transformation == 'clr':
        # Per-sample pseudocount: a tenth of that sample's smallest positive value.
        pseudocounts = sample_counts[sample_counts > 0].min(axis=1).values * 0.1
        return to_clr(sample_counts, pseudocounts)

    # 'log' (the only other value that passes validation in process_data).
    return np.log(sample_counts + 0.001)


def process_data(i, N, transformation, sim_dir, efs,
                 output_dir=DEFAULT_RESULTS_DIR, delta_t=15):
    """Run a per-feature Mann-Whitney U test (intervention vs placebo) for one replicate.

    Reads ``counts_n{N}.rds`` / ``metadata_n{N}.rds`` and their ``placebo_``
    counterparts plus ``gt_features.tsv`` from ``<sim_dir>/rep{i}``, keeps the
    samples whose ``delta.t`` equals ``delta_t``, transforms both arms, tests
    every feature, merges the p-values with the ground-truth table and writes
    the result to
    ``<output_dir>/<transformation>/baseline_sampling_{N}_efs_{efs}_rep_{i}.csv``.

    Parameters
    ----------
    i : int
        Replicate number (selects the ``rep{i}`` folder).
    N : int
        Sample-size tag used in the input and output file names.
    transformation : {'clr', 'log'}
        Transformation applied to the counts before testing.
    sim_dir : str or pathlib.Path
        Folder that contains the ``rep{i}`` sub-folders.
    efs
        Effect-size label; stored in the ``effect_size`` column and in the
        output file name.
    output_dir : str or pathlib.Path, optional
        Root folder for the result CSVs. Created if it does not exist.
    delta_t : optional
        Value of the metadata column ``delta.t`` that selects the samples
        to compare (default 15).

    Returns
    -------
    pandas.DataFrame
        The table that was written to disk, indexed by feature.

    Raises
    ------
    ValueError
        If ``transformation`` is not one of the supported values.
    """
    # Fail before any file is read instead of hitting an unbound variable later.
    if transformation not in SUPPORTED_TRANSFORMATIONS:
        raise ValueError(
            f"Unknown transformation {transformation!r}; "
            f"expected one of {SUPPORTED_TRANSFORMATIONS}"
        )

    base_path = Path(sim_dir) / f"rep{i}"

    intervention_counts = _load_arm_counts(base_path, "", N, delta_t)
    placebo_counts = _load_arm_counts(base_path, "placebo_", N, delta_t)
    gt_file = pd.read_csv(base_path / 'gt_features.tsv', sep='\t')

    intervention_values = _transform_counts(intervention_counts, transformation)
    placebo_values = _transform_counts(placebo_counts, transformation)

    # One two-sided Mann-Whitney U test per feature (column).
    records = []
    for feature in intervention_values.columns:
        _, p_value = mannwhitneyu(
            intervention_values[feature].to_numpy(),
            placebo_values[feature].to_numpy(),
            alternative='two-sided',
        )
        records.append({'feature': feature, 'pvalue': p_value})

    # Explicit columns keep the frame well-formed even when there are no features.
    results_df = pd.DataFrame(records, columns=['feature', 'pvalue'])
    results_df = pd.merge(
        results_df.set_index('feature'),
        gt_file.set_index('feature'),
        left_index=True,
        right_index=True,
    )
    results_df['effect_size'] = efs
    results_df['transformation'] = transformation

    # Make sure the target folder exists so the run does not fail at the very end.
    target_dir = Path(output_dir) / transformation
    target_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(target_dir / f'baseline_sampling_{N}_efs_{efs}_rep_{i}.csv')

    return results_df

In [15]:
# Run the baseline test over the whole simulation grid:
# effect size x transformation x sample size x replicate.
Wilcoxon_results = pd.DataFrame()
collected_frames = []

for efs in [125, 15, 2, 3, 5]:
    sim_dir = f"/Users/zkarwowska/new_embl_folder/zeevi_dataset_v5/simulation/efs{efs}/"
    for transformation in ['clr', 'log']:
        for N in [40]:
            for i in range(1, 11):
                results = process_data(i, N, transformation, sim_dir, efs)

                # process_data persists its own CSV; if it also hands back a
                # table, tag it with sample size and replicate so rows stay
                # distinguishable once all runs are stacked.
                if isinstance(results, pd.DataFrame):
                    collected_frames.append(results.assign(sampling_n=N, rep=i))

# Concatenate once at the end (avoids quadratic row-by-row growth); the
# feature index of each table is kept.
if collected_frames:
    Wilcoxon_results = pd.concat(collected_frames)