# [GSE85217] Tracking outliers - Cook's Distance

In [1]:
# import lib
import pandas as pd
#import modin.pandas as pd
import numpy as np
import os
import math
from scipy.stats import f
from collections import OrderedDict

import statsmodels.formula.api as smf
import statsmodels.api as sm

from concurrent.futures import ProcessPoolExecutor, as_completed

import sys
sys.path.insert(1,'/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/code/local_version/fun')

from parser import Data

In [2]:
path_data='/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/data/in/'
path_exp_mat = path_data + 'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab_parsed.txt'
path_meta = path_data + 'GSE85217_Cavalli_subgroups_information_parsed.csv'

data=Data()
exp_mat=data.add_exp_mat(path_exp_mat,index_col="genes_name",inplace=True)
metadata=data.add_meta(path_meta=path_meta,index_col="samples_name",inplace=True)

exp_mat = exp_mat.T

In [3]:
import threading
import psutil
import time

class ResourceMonitor:
    def __init__(self):
        # Event to control when to stop the monitoring thread
        self._stop_event = threading.Event()
        self.monitor_thread = threading.Thread(target=self.monitor_resources, daemon=True)

    def monitor_resources(self):
        while not self._stop_event.is_set():
            print(f"RAM Usage: {psutil.virtual_memory().percent}% | CPU Usage: {psutil.cpu_percent()}%")
            time.sleep(1)

    def start(self):
        # Start the daemon thread
        self.monitor_thread.start()

    def stop(self):
        # Signal the thread to stop and wait for it to finish
        self._stop_event.set()
        self.monitor_thread.join()  # Wait for the thread to terminate cleanly

In [14]:
def trimfn(x: float) -> int:
    return 2 if x >= 23.5 else 1 if x >= 3.5 else 0

def trimmed_mean(x: np.ndarray, trim: float = 0.1, **kwargs) -> np.ndarray:
    
    assert trim <= 0.5
    
    kwargs.setdefault('axis',0)
        
    axis = kwargs.get("axis")
    s = np.sort(x,**kwargs)
    n = x.shape[axis]
    ntrim = math.floor(n * trim)
    return np.take(s, np.arange(ntrim, n - ntrim), axis).mean(**kwargs)

def chunkify_trim(exp_mat:pd.DataFrame,design_series:pd.Series):

    ns = design_series.value_counts()
        
    for var in design_series.unique():
        lvl = trimfn(ns[var])
        yield exp_mat[design_series == var, :], lvl

def process_variance(exp_mat:np.ndarray,lvl:int,trim_ratio:tuple=(1 / 3, 1 / 4, 1 / 8),scale:list=[2.04, 1.86, 1.51]) -> np.ndarray:
    var_means = trimmed_mean(x=exp_mat, trim=trim_ratio[lvl], axis = 0)
    sqerror_var = (exp_mat - var_means) ** 2
    variance_means = trimmed_mean(sqerror_var, trim=trim_ratio[lvl], axis=0)
    variance_means *= scale[lvl]
    return variance_means

def trimmed_design_variance(exp_mat: np.ndarray, design_series: pd.Series) -> np.ndarray:
        
    trim_variances = [process_variance(exp_mat,lvl).tolist() for exp_mat, lvl in chunkify_trim(np.array(exp_mat),design_series)]
    return np.array(trim_variances).max(axis=0)

def robust_method_of_moment_disp(exp_mat:np.ndarray,design_series:pd.Series) -> np.ndarray:
    
    v = trimmed_design_variance(np.array(exp_mat),design_series)

    m = exp_mat.mean(axis=0)
    alphas = (v - m) / m**2

    # cannot use the typical min_disp = 1e-8 here or else all counts in the same
    # group as the outlier count will get an extreme Cook's distance
    min_disp = 0.04
    np.maximum(alphas, min_disp, out=alphas)
    return alphas

def estimate_cook_distance(regdata:np.ndarray,formula:str,alpha:np.ndarray):
    model = smf.glm(formula=formula, data=regdata,family=sm.families.NegativeBinomial(alpha=1/alpha)).fit()
    cooks_d = model.get_influence().cooks_distance[0]
    outliers_index = np.where(cooks_d > f_cutoff)[0]
    return outliers_index if len(outliers_index)>0 else None

def gen_reg_data(exp_mat:pd.DataFrame,design_series:pd.Series,alphas:np.ndarray):

    for i,gene in enumerate(exp_mat.columns):
        yield pd.concat([exp_mat.iloc[:,i],design_series],axis=1), f"{gene} ~ C({design_series.name})", alphas[i]

In [12]:
design_series = metadata['Subtype']

outliers={}

alphas = robust_method_of_moment_disp(exp_mat,design_series)

#genes = exp_mat.columns
#exp_mat[design_series.name] = design_series

exp_mat.columns = exp_mat.columns.str.replace('-','_')
exp_mat.columns = exp_mat.columns.str.replace(':', '', regex=False)

m = len(design_series)
p = len(design_series.unique()) + 1
f_cutoff = f.ppf(0.99, p, m - p)

monitor = ResourceMonitor()
monitor.start()

with ProcessPoolExecutor(max_workers=7) as executor: 
    
    futures = {executor.submit(estimate_cook_distance, regd, form, alpha):regd.columns[0] for regd, form, alpha in gen_reg_data(exp_mat,design_series,alphas)}

    for future in as_completed(futures):
        idx = futures[future]
        try:
            outliers[idx] = future.result()
        except Exception as e:
            print(f"Erreur dans le job {idx}: {e}")

monitor.stop()

  yield pd.concat([exp_mat.iloc[:,i],design_series],axis=1), f"{gene} ~ C({design_series.name})", alphas[i]


RAM Usage: 73.9% | CPU Usage: 6.9%
RAM Usage: 76.9% | CPU Usage: 92.5%
RAM Usage: 78.6% | CPU Usage: 76.6%
RAM Usage: 79.1% | CPU Usage: 100.0%
RAM Usage: 79.7% | CPU Usage: 100.0%
RAM Usage: 79.7% | CPU Usage: 100.0%
RAM Usage: 79.8% | CPU Usage: 100.0%
RAM Usage: 80.0% | CPU Usage: 100.0%
RAM Usage: 80.1% | CPU Usage: 100.0%
RAM Usage: 80.2% | CPU Usage: 100.0%
RAM Usage: 80.3% | CPU Usage: 100.0%
RAM Usage: 80.3% | CPU Usage: 100.0%
RAM Usage: 80.3% | CPU Usage: 100.0%
RAM Usage: 81.0% | CPU Usage: 100.0%
RAM Usage: 81.6% | CPU Usage: 100.0%
RAM Usage: 81.6% | CPU Usage: 100.0%
RAM Usage: 81.7% | CPU Usage: 100.0%
RAM Usage: 81.7% | CPU Usage: 100.0%
RAM Usage: 81.8% | CPU Usage: 100.0%
RAM Usage: 81.9% | CPU Usage: 82.2%
RAM Usage: 81.8% | CPU Usage: 100.0%
RAM Usage: 81.9% | CPU Usage: 100.0%
RAM Usage: 81.9% | CPU Usage: 100.0%
RAM Usage: 82.0% | CPU Usage: 100.0%
RAM Usage: 82.0% | CPU Usage: 100.0%
RAM Usage: 82.0% | CPU Usage: 100.0%
RAM Usage: 82.1% | CPU Usage: 99.9%
RAM Usa

In [13]:
[i for i in outliers.values() if i is not None]

[]

Il ne semble pas y avoir d'outliers dans le jeu de donn√©es