In [1]:
import warnings
warnings.filterwarnings("ignore")
from skbio.stats.composition import clr

from statsmodels.stats import multitest
import numpy as np
import pandas as pd
import pyreadr
from scipy.stats import wilcoxon
from pathlib import Path

### WILCOXON PAIRED TEST

In [2]:
def to_clr(df, pseudocount_vector):
    """Centred log-ratio (CLR) transform of a table with one composition per row.

    Every row of ``df`` is divided by its row sum, that row's pseudocount is
    added to each of its entries, and scikit-bio's ``clr`` is applied to the
    resulting table.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table with one composition per row. The caller in this
        notebook passes samples as rows and features as columns.
    pseudocount_vector : array-like
        One pseudocount per row of ``df``, in row order.
        NOTE(review): the pseudocount is added *after* the rows have been
        rescaled to sum to 1, so it should be on that scale -- confirm
        against how the caller computes it.

    Returns
    -------
    pandas.DataFrame
        CLR values carrying the same index and columns as ``df``.
    """
    proportions = df.div(df.sum(axis=1), axis=0)

    # np.asarray lets lists / Series work as well as ndarrays; the
    # (n_rows, 1) column shape broadcasts one pseudocount across all columns
    # of the matching row.
    pseudocounts = np.asarray(pseudocount_vector).reshape(-1, 1)
    shifted = proportions + pseudocounts

    # scikit-bio may squeeze its output, which would collapse a single-row or
    # single-column table to 1-D; restore the 2-D shape so the labels below
    # always line up with the values.
    clr_values = np.asarray(clr(shifted.to_numpy())).reshape(shifted.shape)
    return pd.DataFrame(clr_values, index=shifted.index, columns=shifted.columns)

In [3]:
# Use pathlib for path handling
#i=1; N=10; sampling =[10, 19]
#sim_dir = "/Users/zkarwowska/Desktop/EMBL_project/zeevi_dataset_v5/simulation/efs2/"
def process_data(i, N, sampling, transformation, sim_dir, efs,
                 out_dir='/Users/zkarwowska/Desktop/EMBL_project/zeevi_dataset_v5/results/wilcoxon/placebo/'):
    """Run a per-feature paired Wilcoxon test (intervention vs. placebo) for one replicate.

    Reads the intervention and placebo tables of replicate ``i`` from
    ``sim_dir/rep{i}``, keeps the samples whose ``delta.t`` is listed in
    ``sampling``, transforms both tables, tests every feature with
    ``scipy.stats.wilcoxon`` and writes the p-values (joined with
    ``gt_features.tsv`` on ``feature``) to a CSV file.

    Parameters
    ----------
    i : int
        Replicate number; selects the ``rep{i}`` sub-directory.
    N : int
        Value used in the input file names (``counts_n{N}.rds`` etc.) and in
        the output file name.
    sampling : list
        ``delta.t`` values to keep; its length is part of the output file name.
    transformation : {'clr', 'log_relab'}
        ``'clr'`` applies ``to_clr`` with a per-sample pseudocount of
        0.1 x the smallest positive value of that sample; ``'log_relab'``
        applies ``log(x + 0.01)``.
    sim_dir : str or path-like
        Directory containing the ``rep{i}`` folders.
    efs : object
        Stored verbatim in the ``effect_size`` column and in the file name.
    out_dir : str or path-like, optional
        Root output directory; the file is written to
        ``out_dir/<transformation>/``. Defaults to the location this notebook
        has always written to.

    Returns
    -------
    pandas.DataFrame
        The table that was written to disk, indexed by feature.

    Raises
    ------
    ValueError
        If ``transformation`` is not recognised, or if the two arms do not
        contain the same number of selected samples.
    """
    # Validate first so a typo fails immediately with a clear message rather
    # than as an undefined-variable error after every file has been read.
    if transformation not in ('clr', 'log_relab'):
        raise ValueError(
            f"Unknown transformation {transformation!r}; expected 'clr' or 'log_relab'"
        )

    base_path = Path(sim_dir) / f"rep{i}"

    # pyreadr returns a dict of objects; an .rds file holds one, under key None.
    counts_intervention = pyreadr.read_r(base_path / f"counts_n{N}.rds")[None]
    metadata_intervention = pyreadr.read_r(base_path / f"metadata_n{N}.rds")[None]

    counts_placebo = pyreadr.read_r(base_path / f"placebo_counts_n{N}.rds")[None]
    metadata_placebo = pyreadr.read_r(base_path / f"placebo_metadata_n{N}.rds")[None]

    gt_file = pd.read_csv(base_path / 'gt_features.tsv', sep='\t')

    # Keep only the samples taken at the requested delta.t values.
    filtered_metadata_intervention = metadata_intervention[metadata_intervention['delta.t'].isin(sampling)].copy()
    filtered_metadata_placebo = metadata_placebo[metadata_placebo['delta.t'].isin(sampling)].copy()

    # Columns of the count tables are sample names; transpose to samples x features.
    filtered_counts_intervention = counts_intervention[filtered_metadata_intervention['sample_name']].T
    filtered_counts_placebo = counts_placebo[filtered_metadata_placebo['sample_name']].T

    if transformation == 'clr':
        # Per-sample pseudocount: one tenth of the smallest positive entry.
        # NOTE(review): this is computed on the un-normalised table, while
        # to_clr adds it after rescaling each row to sum to 1 -- consistent
        # only if the input rows already sum to 1; confirm.
        pseudocount_vector_intervention = filtered_counts_intervention[filtered_counts_intervention > 0].min(axis=1).values * 0.1
        pseudocount_vector_placebo = filtered_counts_placebo[filtered_counts_placebo > 0].min(axis=1).values * 0.1

        intervention_transformed = to_clr(filtered_counts_intervention, pseudocount_vector_intervention)
        placebo_transformed = to_clr(filtered_counts_placebo, pseudocount_vector_placebo)
    else:  # 'log_relab'
        intervention_transformed = np.log(filtered_counts_intervention + 0.01)
        placebo_transformed = np.log(filtered_counts_placebo + 0.01)

    # The signed-rank test pairs observation k of one arm with observation k
    # of the other, so both arms must have the same number of samples.
    # NOTE(review): pairing relies on the row order of the two metadata
    # tables matching -- confirm they are ordered consistently.
    if len(intervention_transformed) != len(placebo_transformed):
        raise ValueError(
            "Paired Wilcoxon test needs equally sized arms, got "
            f"{len(intervention_transformed)} intervention and "
            f"{len(placebo_transformed)} placebo samples"
        )

    results = []
    for feature in intervention_transformed.columns:
        x1 = intervention_transformed[feature].to_numpy()
        x2 = placebo_transformed[feature].to_numpy()

        _, p_value = wilcoxon(x1, x2,
                              zero_method='zsplit',
                              alternative='two-sided')
        results.append({'feature': feature,
                        'pvalue': p_value})

    # Explicit columns keep set_index working even when no feature was tested.
    results_df = pd.DataFrame(results, columns=['feature', 'pvalue'])
    results_df = pd.merge(results_df.set_index('feature'), gt_file.set_index('feature'), left_index=True, right_index=True)
    results_df['effect_size'] = efs

    out_file = Path(out_dir) / transformation / f'wilcoxon_sampling_{N}_efs_{efs}_rep_{i}_d_{len(sampling)}.csv'
    # Create the target folder on first use instead of failing inside to_csv.
    out_file.parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(out_file)
    return results_df


In [4]:
#sim_dir = "/Users/zkarwowska/Desktop/EMBL_project/zeevi_dataset_v5/simulation/efs2/"
# Directory process_data writes its CSVs to. The "already computed" check below
# must look here, otherwise finished runs are never detected and skipped.
wd = '/Users/zkarwowska/Desktop/EMBL_project/zeevi_dataset_v5/results/wilcoxon/placebo/'

# Sets of delta.t values to keep, from sparsest to densest.
Sampling = [[10, 19], [10, 12, 16, 18], [10, 12, 14, 16, 18, 19]]

Wilcoxon_results = pd.DataFrame()
# efs values double as the suffix of the simulation directory (efs{efs});
# presumably 125 and 15 encode 1.25 and 1.5 -- TODO confirm.
for efs in [125, 15, 2, 3, 5]:
    sim_dir = f"/Users/zkarwowska/new_embl_folder/zeevi_dataset_v5/simulation/efs{efs}/"
    for transformation in ['clr', 'log_relab']:
        for N in [10, 20, 30, 40, 50, 60, 70, 80]:
            for sampling in Sampling:
                for i in range(1, 11):
                    my_file = (
                        Path(wd)
                        / transformation
                        / f'wilcoxon_sampling_{N}_efs_{efs}_rep_{i}_d_{len(sampling)}.csv'
                    )
                    # Only compute combinations whose result file is missing,
                    # so an interrupted run can be resumed.
                    if not my_file.is_file():
                        process_data(i, N, sampling, transformation, sim_dir, efs)