# [GSE85217] Tracking outliers - Cook's Distance

In [1]:
!jupyter-lab enable widgetsnbextension

[35m[C 2024-11-03 22:54:56.040 ServerApp][m No such file or directory: /home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/code/local_version/nb/enable


## Import lib

In [5]:
# lib
#import modin.pandas as pd
import pandas as pd
import numpy as np
import os
from collections import OrderedDict
import umap

# fig
import matplotlib.pyplot as plt
import seaborn as sns

# local lib
import sys
sys.path.insert(1,'/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/code/local_version/fun')

from parser import Data

In [6]:
path_data='/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/data/in/'
path_exp_mat = path_data + 'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab_parsed.txt'
path_meta = path_data + 'GSE85217_Cavalli_subgroups_information_parsed.csv'

data=Data()
data.add_exp_mat(path_exp_mat,index_col="genes_name")
data.add_meta(path_meta=path_meta,index_col="samples_name")

In [7]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [8]:
def get_design_matrix(metadata,colname,**kwargs):
    return pd.get_dummies(metadata[colname],dtype=int,**kwargs)

In [76]:
design=get_design_matrix(data.meta,colname="Subtype")

In [10]:
data.meta["Subtype"].value_counts()

Subtype
Group4_gamma    119
Group4_beta     109
Group4_alpha     98
SHH_delta        76
Group3_alpha     67
SHH_alpha        65
WNT_alpha        49
SHH_gamma        47
Group3_gamma     40
Group3_beta      37
SHH_beta         35
WNT_beta         21
Name: count, dtype: int64

In [11]:
data_nb=pd.concat([data.exp_mat.loc['TSPAN6',data.meta.index.to_list()],data.meta['Subtype']],axis=1)

In [12]:
nb_model=smf.glm('TSPAN6 ~ Subtype',data=data_nb,family=sm.families.NegativeBinomial()).fit()



**PyDESeq2**

In [13]:
import math

def trimmed_mean(x: np.ndarray, trim: float = 0.1):

    assert trim <= 0.5

    assert trim <= 0.5
    if "axis" in kwargs:
        axis = kwargs.get("axis")
        s = np.sort(x, axis=axis)
        n = x.shape[axis]
        ntrim = floor(n * trim)
        return np.take(s, np.arange(ntrim, n - ntrim), axis).mean(axis)
    else:
        n = len(x)
        s = np.sort(x)
        ntrim = math.floor(n * trim)
        return s[ntrim : n - ntrim].mean()

def trimmed_variance(x:np.ndarray, trim:float = 0.125, axis:int = 0):

    rm = trimmed_mean(x, trim=trim)
    sqerror = (x - rm) ** 2
    return 1.51 * trimmed_mean(sqerror, trim=trim)

def robust_method_of_moments_disp(norm_counts:  np.ndarray, **kwargs):

    v = trimmed_mean(norm_counts, **kwargs)

    m = norm_counts.mean()
    alpha = (v - m) / m**2
    min_disp=0.04
    np.maximum(alpha, min_disp, out=alpha)
    return alpha

def trimmed_group_variance(counts:np.ndarray, design:pd.DataFrame):

    trimratio=(1/3,1/4,1/8)

    def trimfn(x:float)-> int :
        return 2 if x >= 23.5 else 1 if x >= 3.5 else 0

    group_ratio=np.array([trimratio[trimfn(x)] for x in design.sum(axis=0)])

    group_means=pd.DataFrame(data=np.zeros((design.shape[0],1)),index=design.index.to_list())

    def get_index(design:pd.DataFrame,colname:str,value:int=1):
        return design.index[design[colname]==value].to_list()

    for i,colname in enumerate(design.columns):
        samples_group=get_index(design,colname)
        
        #print(samples_group)
        #group_means.loc[samples_group,]=1
        #group_means[samples_group,]=trimmed_mean(x=counts.loc[samples_group])

        

    
    #design_ratio = design * group_ratio

    #exp_sum=np.dot(counts,design_ratio.sum(axis=1))

    return group_means

In [47]:
import math

def trimmed_mean(x: np.ndarray, trim: float = 0.1,**kwargs):

    assert trim <= 0.5

    assert trim <= 0.5
    if "axis" in kwargs:
        axis = kwargs.get("axis")
        s = np.sort(x, axis=axis)
        n = x.shape[axis]
        ntrim = math.floor(n * trim)
        return np.take(s, np.arange(ntrim, n - ntrim), axis).mean(axis)
    else:
        n = len(x)
        s = np.sort(x)
        ntrim = math.floor(n * trim)
        return s[ntrim : n - ntrim].mean()

In [48]:
def trimmed_cell_variance(counts: np.ndarray, cells: pd.Series) -> np.ndarray:
    """Return trimmed variance of counts according to condition.

    Compute the variance after trimming data of its smallest and largest elements,
    grouped by cohorts, and return the max across cohorts.
    The trim factor is a function of data size.

    Parameters
    ----------
    counts : ndarray
        Sample-wise gene counts.

    cells : pandas.Series
        Cohort affiliation of each sample.

    Returns
    -------
    ndarray :
        Gene-wise trimmed variance estimate.
    """
    # how much to trim at different n
    trimratio = (1 / 3, 1 / 4, 1 / 8)
    # returns an index for the vector above for three sample size bins

    def trimfn(x: float) -> int:
        return 2 if x >= 23.5 else 1 if x >= 3.5 else 0

    ns = cells.value_counts()
    sqerror = np.zeros_like(counts)

    for lvl in cells.unique():
        cell_means = np.array(trimmed_mean(counts[cells == lvl, :], trim=trimratio[trimfn(ns[lvl])], axis=0))
        sqerror[cells == lvl, :] = counts[cells == lvl, :] - cell_means[None, :]

    sqerror **= 2

    varEst = np.zeros((len(ns), counts.shape[1]), dtype=float)
    for i, lvl in enumerate(cells.unique()):
        scale = [2.04, 1.86, 1.51][trimfn(ns[lvl])]
        varEst[i, :] = scale * trimmed_mean(
            sqerror[cells == lvl, :], trim=trimratio[trimfn(ns[lvl])], axis=0
        )

    return varEst.max(axis=0)

In [49]:
#group_ratio=trimmed_group_variance(counts=data.exp_mat.loc['TSPAN6'],design=design)
#group_ratio
#data.exp_mat.loc['TSPAN6',get_index(design,colname='Group3_alpha']
#trimmed_mean(x=data.exp_mat.loc['TSPAN6',get_index(design,colname='Group3_alpha')])

In [55]:
norm=np.array(data.exp_mat.T)

In [54]:
v=trimmed_cell_variance(counts=np.array(data.exp_mat.T),cells=data.meta['Subtype'])

In [56]:
m = norm.mean(0)

In [58]:
alpha = (v - m) / m**2

In [61]:
minDisp = 0.04

In [64]:
np.maximum(alpha, minDisp, out=alpha)

array([0.04, 0.04, 0.04, ..., 0.04, 0.04, 0.04])

## Test

In [77]:
design

(763, 12)

In [85]:
df=pd.concat([data.exp_mat.T,design],axis=1)

In [86]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import f


In [89]:
model_nb = smf.glm("TSPAN6 ~ Group3_alpha+Group3_beta+Group3_gamma+Group4_alpha+Group4_beta+Group4_gamma+SHH_alpha+SHH_beta+SHH_delta+SHH_gamma+WNT_alpha+WNT_beta", data=df, family=sm.families.NegativeBinomial(alpha=0.04)).fit()

In [93]:
#Calcul de la distance de Cook pour chaque observation
influence_nb = model_nb.get_influence()
cooks_d_nb = influence_nb.cooks_distance[0]

# Définir le seuil de la loi F
m = design.shape[0]  # Nombre total d'échantillons
p = len(df.columns)  # Nombre total de paramètres (intercept + groupes)
f_cutoff = f.ppf(0.99, p, m - p)  # Seuil de Cook à 99%

# Identification des outliers potentiels
outliers_nb = np.where(cooks_d_nb > f_cutoff)[0]
print("Indices des valeurs aberrantes potentielles:", outliers_nb)
print("Distances de Cook des valeurs aberrantes:", cooks_d_nb[outliers_nb])
print("Seuil de la distance de Cook (F-distribution):", f_cutoff)


Indices des valeurs aberrantes potentielles: []
Distances de Cook des valeurs aberrantes: []
Seuil de la distance de Cook (F-distribution): nan


In [87]:
design.columns

Index(['Group3_alpha', 'Group3_beta', 'Group3_gamma', 'Group4_alpha',
       'Group4_beta', 'Group4_gamma', 'SHH_alpha', 'SHH_beta', 'SHH_delta',
       'SHH_gamma', 'WNT_alpha', 'WNT_beta'],
      dtype='object')

In [88]:
data.exp_mat

Unnamed: 0_level_0,MB_SubtypeStudy_55001,MB_SubtypeStudy_55002,MB_SubtypeStudy_55003,MB_SubtypeStudy_55004,MB_SubtypeStudy_55005,MB_SubtypeStudy_55006,MB_SubtypeStudy_55007,MB_SubtypeStudy_55008,MB_SubtypeStudy_55009,MB_SubtypeStudy_55010,...,MB_SubtypeStudy_55754,MB_SubtypeStudy_55755,MB_SubtypeStudy_55756,MB_SubtypeStudy_55757,MB_SubtypeStudy_55758,MB_SubtypeStudy_55759,MB_SubtypeStudy_55760,MB_SubtypeStudy_55761,MB_SubtypeStudy_55762,MB_SubtypeStudy_55763
genes_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,8.810256,8.406470,8.006128,8.156085,7.731048,8.805311,8.902279,7.729933,8.280679,8.030932,...,7.325913,7.965514,7.667200,8.048013,7.583528,8.237443,8.879233,8.027227,8.198219,7.813899
TNMD,4.022827,3.911187,3.686010,3.555897,4.026585,3.935621,4.212156,3.799432,3.906987,3.751994,...,4.012010,3.900344,3.985817,3.617402,3.818528,4.539719,4.023908,4.211934,3.863124,3.740711
DPM1,10.107225,9.861771,10.564233,9.967972,10.090847,10.286749,10.454325,10.752282,10.537412,9.996035,...,9.524450,10.299305,10.364686,10.427957,9.797075,10.238513,10.914443,10.017931,10.507630,10.277027
SCYL3,7.582515,8.461777,8.186035,7.879770,8.216220,8.145683,7.570284,8.240852,7.967473,7.892234,...,7.657312,8.073610,7.709410,7.699548,8.630737,7.963948,7.969378,7.563576,7.645268,7.704992
C1orf112,6.831354,8.085670,8.229447,7.873316,7.916045,7.199585,7.594701,7.830568,7.487867,6.658625,...,7.188234,7.432958,6.604045,7.385075,8.161891,7.505519,7.918162,7.380863,7.078234,7.551968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMEM75,4.234460,4.087673,3.908403,3.874417,4.017526,3.832650,3.817916,3.848514,3.850468,3.875604,...,4.029876,3.843575,3.934983,3.939291,4.075465,3.891615,3.819015,3.856994,3.977876,3.861199
C18orf12,3.616717,4.373555,3.663950,3.806443,3.855606,3.648231,3.837931,3.855303,3.624859,3.447341,...,4.423391,3.917796,3.850005,3.722847,4.441848,3.749983,3.761595,4.210553,4.231942,3.872206
OR12D2,2.229336,2.713571,4.108732,2.820107,2.961778,2.343902,2.302823,2.596473,2.879190,2.301474,...,2.559844,3.709618,2.706699,2.574850,2.502059,2.716143,2.831135,2.785208,3.051605,2.670175
MIR4697HG,7.862705,9.117510,7.531721,7.103699,7.207010,7.995286,7.872361,8.724588,7.355940,9.163241,...,9.190299,8.685701,9.407069,6.754088,7.260324,8.282783,6.366572,5.098592,8.404701,7.297745


In [None]:
def get_index(design:pd.DataFrame,colname:str,value:int=1):
        return design.index[design[colname]==value].to_list()

In [None]:
counts=data.exp_mat

In [None]:
count_trim=pd.DataFrame(np.zeros_like(counts),index=counts.index.to_list(),columns=counts.columns)

In [None]:
group_series=data.meta['Subtype']
[i for i,j in enumerate(list(group_series=='SHH_beta')) if j]

In [None]:
group_series

In [None]:
def trimmed_cell_variance(counts: np.ndarray, cells: pd.Series) -> np.ndarray:
    """Return trimmed variance of counts according to condition.

    Compute the variance after trimming data of its smallest and largest elements,
    grouped by cohorts, and return the max across cohorts.
    The trim factor is a function of data size.

    Parameters
    ----------
    counts : ndarray
        Sample-wise gene counts.

    cells : pandas.Series
        Cohort affiliation of each sample.

    Returns
    -------
    ndarray :
        Gene-wise trimmed variance estimate.
    """
    # how much to trim at different n
    trimratio = (1 / 3, 1 / 4, 1 / 8)
    # returns an index for the vector above for three sample size bins

    def trimfn(x: float) -> int:
        return 2 if x >= 23.5 else 1 if x >= 3.5 else 0

    ns = cells.value_counts()
    sqerror = np.zeros_like(counts)

    for lvl in cells.unique():
        cell_means = cast(
            np.ndarray,
            trimmed_mean(
                counts[cells == lvl, :], trim=trimratio[trimfn(ns[lvl])], axis=0
            ),
        )
        sqerror[cells == lvl, :] = counts[cells == lvl, :] - cell_means[None, :]

    sqerror **= 2

    varEst = np.zeros((len(ns), counts.shape[1]), dtype=float)
    for i, lvl in enumerate(cells.unique()):
        scale = [2.04, 1.86, 1.51][trimfn(ns[lvl])]
        varEst[i, :] = scale * trimmed_mean(
            sqerror[cells == lvl, :], trim=trimratio[trimfn(ns[lvl])], axis=0
        )

    return varEst.max(axis=0)


In [None]:
def trimmed_variance(
    x: np.ndarray, trim: float = 0.125, axis: int = 0
) -> Union[float, np.ndarray]:
    """Return trimmed variance.

    Compute the variance after trimming data of its smallest and largest quantiles.

    Parameters
    ----------
    features : ndarray
        Data whose trimmed variance to compute.

    trim : float
        Fraction of data to trim at each end. (default: ``0.125``).

    axis : int
        Dimension along which to compute variance. (default: ``0``).

    Returns
    -------
    float or ndarray
        Trimmed variances.
    """
    rm = trimmed_mean(x, trim=trim, axis=axis)
    sqerror = (x - rm) ** 2
    # scale due to trimming of large squares
    return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis)

In [None]:
def trimmed_mean(x: np.ndarray, trim: float = 0.1, **kwargs) -> Union[float, np.ndarray]:
    """Return trimmed mean.

    Compute the mean after trimming data of its smallest and largest quantiles.

    Parameters
    ----------
    x : ndarray
        Data whose mean to compute.

    trim : float
        Fraction of data to trim at each end. (default: ``0.1``).

    **kwargs
        Keyword arguments, useful to pass axis.

    Returns
    -------
    float or ndarray :
        Trimmed mean.
    """
    assert trim <= 0.5
    if "axis" in kwargs:
        axis = kwargs.get("axis")
        s = np.sort(x, axis=axis)
        n = x.shape[axis]
        ntrim = floor(n * trim)
        return np.take(s, np.arange(ntrim, n - ntrim), axis).mean(axis)
    else:
        n = len(x)
        s = np.sort(x)
        ntrim = floor(n * trim)
        return s[ntrim : n - ntrim].mean()



In [None]:
def robust_method_of_moments_disp(
    normed_counts: np.ndarray, design_matrix: pd.DataFrame
) -> np.ndarray:
    """Perform dispersion estimation using a method of trimmed moments.

    Used for outlier detection based on Cook's distance.

    Parameters
    ----------
    normed_counts : ndarray
        Array of deseq2-normalized read counts. Rows: samples, columns: genes.

    design_matrix : pandas.DataFrame
        A DataFrame with experiment design information (to split cohorts).
        Indexed by sample barcodes. Unexpanded, *with* intercept.

    Returns
    -------
    ndarray
        Trimmed method of moment dispersion estimates.
        Used for outlier detection based on Cook's distance.
    """
    # if there are 3 or more replicates in any cell
    three_or_more = n_or_more_replicates(design_matrix, 3)
    if three_or_more.any():
        # 1 - group rows by unique combinations of design factors
        # 2 - keep only groups with 3 or more replicates
        # 3 - filter the counts matrix to only keep rows in those groups
        filtered_counts = normed_counts[three_or_more.values, :]
        filtered_design = design_matrix.loc[three_or_more, :]
        cell_id = pd.Series(
            filtered_design.groupby(
                filtered_design.columns.values.tolist()
            ).grouper.group_info[0],
            index=filtered_design.index,
        )
        v = trimmed_cell_variance(filtered_counts, cell_id)
    else:
        v = trimmed_variance(normed_counts)

    m = normed_counts.mean(0)
    alpha = (v - m) / m**2
    # cannot use the typical min_disp = 1e-8 here or else all counts in the same
    # group as the outlier count will get an extreme Cook's distance
    minDisp = 0.04
    np.maximum(alpha, minDisp, out=alpha)
    return alpha
