# Statistical analysis

---

In [1]:
import os
import sys
import glob
import logging
import time

import matplotlib.pyplot as plt
import plotly.express as px

import numpy as np
import pandas as pd
from clustergram import Clustergram as CGram
from itertools import combinations

import scipy
from scipy import signal
from scipy import stats

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression as LR

# Data frames combining

In [3]:
samp_list = ['C0002', 'D0001', 'E0002']
data_path = os.path.join(''.join(sys.path[0].split('glia')), 'data_glia')

components_df_list = []
peaks_df_list = []
for samp_to_upload in samp_list:
    samp_upload_path = f'{data_path}/{samp_to_upload}'
    components_df_list.append(pd.read_csv(f'{samp_upload_path}/components_df.csv'))
    peaks_df_list.append(pd.read_csv(f'{samp_upload_path}/peaks_properties_df.csv'))
    print(samp_upload_path)

total_component_df = pd.concat(components_df_list)
total_component_df.to_csv(f'{data_path}/total_components_df.csv')

total_peak_df = pd.concat(peaks_df_list)
total_peak_df.to_csv(f'{data_path}/total_peaks_properties_df.csv')

/home/wisstock/bio/scripts/microca/data_glia/C0002
/home/wisstock/bio/scripts/microca/data_glia/D0001
/home/wisstock/bio/scripts/microca/data_glia/E0002


# Preprocessing

#### DF uploading

In [18]:
samp_list = ['C0002', 'D0001', 'E0002']
data_path = os.path.join(''.join(sys.path[0].split('glia')), 'data_glia')

total_component_df = pd.read_csv(f'{data_path}/total_components_df.csv')
print(total_component_df.head())

total_peak_df = pd.read_csv(f'{data_path}/total_peaks_properties_df.csv')
print(total_peak_df.head())

  reg_name  frame_num      time  comp  profile_raw   profile_C  profile_ddf
0    C0002          0  0.000000     0    53.587025 -161.175541     0.019450
1    C0002          1  1.091983     0    52.102883 -161.175541     0.005571
2    C0002          2  2.183965     0    51.916756 -161.175541    -0.006909
3    C0002          3  3.275948     0    52.376493 -161.175541     0.023312
4    C0002          4  4.367931     0    53.452555 -161.175541     0.028068


### DF sorting

#### Mark profiles inactive in ctrl period 

In [51]:
sort_peaks_df = total_peak_df.copy()
groups = sort_peaks_df.comp.unique()

no_ctrl_comp = {}
for s in range(len(samp_list)):
    s_val = samp_list[s]
    comps = []
    for g in range(len(groups)):
        g_val = groups[g]
        df = sort_peaks_df[(sort_peaks_df['sample'] == s_val) & (sort_peaks_df['comp'] == g_val)]
        app_val = list(df.app_group.unique())
        if app_val:
            if 'ctrl' in app_val:
                continue
            else:
                comps.append(g_val)
                print(s_val, g_val, df.shape, app_val)
        else:
            continue
    no_ctrl_comp.update({s_val:comps})
    
print(no_ctrl_comp)

C0002 1 (1, 12) ['C5a']
C0002 15 (5, 12) ['C5a', 'wash']
C0002 16 (1, 12) ['C5a']
D0001 5 (2, 12) ['wash']
D0001 11 (4, 12) ['wash']
D0001 14 (6, 12) ['C5a', 'wash']
E0002 15 (2, 12) ['wash']
E0002 16 (8, 12) ['C5a', 'wash']
E0002 14 (4, 12) ['C5a', 'wash']
{'C0002': [1, 15, 16], 'D0001': [5, 11, 14], 'E0002': [15, 16, 14]}


In [27]:
sort_peaks_df = total_peak_df.copy()

no_ctrl_peaks_rows = sort_peaks_df.groupby(['sample', 'comp']).apply(lambda x: pd.eval("x['app_group'].isin 'ctrl'"))
sort_peaks_df['no_ctrl'] = no_ctrl_peaks_rows.values  
print(sort_peaks_df.shape)


(604, 13)


# Descriptive stat.

### Pairwise KS-test

#### No ctrl profiles

In [26]:
no_ctrl_df = sort_peaks_df[sort_peaks_df['no_ctrl'] == True]
print(no_ctrl_df.shape)

stat_param = 'AUC_dF'

groups = no_ctrl_df.app_group.unique()
groups_combinations = list(combinations(groups, 2))

for samp in samp_list:
    samp_df = no_ctrl_df[no_ctrl_df['sample'] == samp]
    print(f'Sample {samp}')
    for g_comb in groups_combinations:
        group_1 = samp_df[stat_param][samp_df['app_group'] == g_comb[0]]
        group_2 = samp_df[stat_param][samp_df['app_group'] == g_comb[1]]
        ks_test = stats.ks_2samp(group_1, group_2)
        print(f'KS test for {g_comb} groups, p-value={ks_test[1]}')
    print('')

(508, 13)
Sample C0002
KS test for ('C5a', 'wash') groups, p-value=0.8503118422929637

Sample D0001
KS test for ('C5a', 'wash') groups, p-value=0.21473938572276535

Sample E0002
KS test for ('C5a', 'wash') groups, p-value=0.7611074603092557



In [23]:
# https://towardsdatascience.com/comparing-sample-distributions-with-the-kolmogorov-smirnov-ks-test-a2292ad6fee5
fig = px.ecdf(no_ctrl_df,
              x=stat_param,
              color='app_group',
              animation_frame='sample')
fig.show()

fig = px.histogram(no_ctrl_df,
              x=stat_param,
              color='app_group',
              animation_frame='sample',
              marginal='box',
              opacity=0.75)
fig.show()

#### Ctrl profiles

In [29]:
ctrl_df = sort_peaks_df[sort_peaks_df['no_ctrl'] == False]
print(ctrl_df.shape)
stat_param = 'AUC_dF'

groups = ctrl_df.app_group.unique()
groups_combinations = list(combinations(groups, 2))

print(groups)

for samp in samp_list:
    samp_df = ctrl_df[ctrl_df['sample'] == samp]
    print(f'Sample {samp}')
    for g_comb in groups_combinations:
        group_1 = samp_df[stat_param][samp_df['app_group'] == g_comb[0]]
        group_2 = samp_df[stat_param][samp_df['app_group'] == g_comb[1]]
        ks_test = stats.ks_2samp(group_1, group_2)
        print(f'KS test for {g_comb} groups, p-value={ks_test[1]}')
    print('')

(96, 13)
['ctrl']
Sample C0002

Sample D0001

Sample E0002



# Dim. reduction and clasterization

#### Features importance estimation

In [None]:
# https://python-bloggers.com/2021/01/3-essential-ways-to-calculate-feature-importance-in-python/
# https://stackoverflow.com/questions/50796024/feature-variable-importance-after-a-pca-analysis

test_features_vals = work_df[['rise', 'decay', 'FWHM', 'integral_dF', 'amp_dF']].values

test_pca = PCA()
test_features_pca = test_pca.fit(X=test_features_vals)

print(test_features_pca)

plt.plot(test_features_pca.explained_variance_ratio_.cumsum(), lw=3, color='#087E8B')
plt.show()


## PCA

#### Features estimation

In [None]:
# https://stackoverflow.com/questions/50796024/feature-variable-importance-after-a-pca-analysis

def myplot(score,coeff,labels=None):
    xs = score[:,0]
    ys = score[:,1]
    n = coeff.shape[0]
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    plt.scatter(xs * scalex,ys * scaley, c = y)
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        if labels is None:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')


features = work_df[['rise', 'decay', 'FWHM', 'integral_dF', 'amp_dF']].values
pca = PCA()
principal = pca.fit_transform(features)

plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()
myplot(principal[:,0:2], np.transpose(pca.components_[0:2, :]))
plt.show()

#### PCA calc

In [None]:
def PCA_calc(features_vals, factor_df, n=2):
    tic = time.perf_counter()

    if n == 2:
        ax_list = ['x', 'y']
    elif n == 3:
        ax_list = ['x', 'y', 'z']
    else:
        logging.fatal('Incorrect dimension number!')

    pca = PCA(n_components=n)
    principal_res = pca.fit_transform(features_vals)
    df_principal = pd.DataFrame(data = principal_res,
                                columns = ax_list)
    df_principal = pd.concat([factor_df, df_principal], axis = 1)

    toc = time.perf_counter()
    logging.info(f'PCA calc in {toc - tic:0.4f} seconds')
    return df_principal

n_components = 3
features_vals = work_df[['rise', 'decay', 'FWHM', 'integral_dF', 'amp_dF']].values
factor_df = work_df[['sample', 'comp', 'app_group', 'peak_i', 'peak_time', ]]

pca_df = PCA_calc(features_vals=features_vals, factor_df=factor_df, n=n_components)

#### PCA plot

In [None]:
group_factor = 'app_group'

if n_components == 2:
    fig = px.scatter(pca_df,
                     x='x', y='y',
                     color=group_factor,
                     symbol=group_factor)
    dot_size = 6
elif n_components == 3:
    fig = px.scatter_3d(pca_df,
                        x='x', y='y', z='z',
                        color=group_factor,
                        symbol=group_factor)
    dot_size = 2
else:
    logging.fatal('Incorrect n')

fig.update_traces(marker=dict(size=8,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                  legend= {'itemsizing': 'constant'},
                  scene=dict(xaxis=dict(showaxeslabels=False, showticklabels=False, showbackground=False, title=''),
                             yaxis=dict(showaxeslabels=False, showticklabels=False, showbackground=False, title=''), 
                             zaxis=dict(showaxeslabels=False, showticklabels=False, showbackground=False, title='')))
fig.show()

## LDA

#### LDA calc

In [None]:
def LDA_calc(features_vals, factor_df, group_column=None):
    tic = time.perf_counter()

    group_vals = factor_df.loc[:,group_column].values

    lda = LDA()
    lda_fit = lda.fit_transform(X=features_vals, y=group_vals)
    df_lda = pd.DataFrame({'lda1':lda_fit[:,0], 'lda2':lda_fit[:,1]})
    df_lda = pd.concat([factor_df, df_lda], axis = 1)

    toc = time.perf_counter()
    logging.info(f'PCA calc in {toc - tic:0.4f} seconds')
    return df_lda

features_vals = work_df[['rise', 'decay', 'FWHM', 'integral_abs', 'amp_dF']].values
factor_df = work_df[['sample', 'comp', 'app_group', 'peak_i', 'peak_time', ]]
lda_df = LDA_calc(features_vals=features_vals, factor_df=factor_df, group_column='app_group')


print(lda_df.head())

# plotting
# fig = px.box(lda_df, x="app_group", y="lda", color='app_group', points='all')
fig = px.scatter(lda_df, x="lda1", y="lda2", color='app_group', symbol='app_group', width=700, height=500)

fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()

## Clustergram

In [None]:
cgram = CGram(range(1, 5), n_init=1000)

cgram.fit(work_df[['rise', 'decay', 'FWHM', 'integral_abs', 'amp_dF']])

In [None]:
ax = cgram.plot(figsize=(10, 8))
ax.yaxis.grid(False)