In [3]:
import pandas as pd
from sklearn import linear_model
import numpy as np
from scipy.stats import gmean
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import RepeatedKFold
import warnings
from scipy.stats import pearsonr


In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings emitted by the model fits further down — confirm that is intended.
warnings.filterwarnings('ignore')
# Use seaborn's 'darkgrid' style for all figures drawn below.
sns.set_style('darkgrid')

In [4]:
# Location of the donor A table that the next cell loads with pd.read_csv.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this location exists.
file_donorA = '/storage/zkarwowska/microbiome-interactions/datasets/processed/ready_datasets_transformed/common/donorA.csv'

In [5]:
# Load the table (first CSV column becomes the index) and keep only its first 30 columns.
df = pd.read_csv(file_donorA, index_col=[0]).iloc[:, :30]

In [6]:
# centred log-ratio (CLR) transform
def clr_transform(x):
    """Return the centred log-ratio of ``x``.

    A constant of 1e-10 is added to every value first (presumably so that
    zeros do not end up inside the logarithm), then the log of the geometric
    mean of the offset values is subtracted from the log of each offset value.

    Parameters
    ----------
    x : array-like of numbers
        Values to transform; they are treated as one composition.

    Returns
    -------
    Same container type as ``np.log(x + 1e-10)``, holding the CLR values.
    """
    offset_x = x + 1e-10
    log_geometric_mean = np.log(gmean(offset_x))
    return np.log(offset_x) - log_geometric_mean

In [7]:
# DataFrame.apply works column by column here (axis=0 is the default, spelled out for clarity),
# so each column is centred on its own geometric mean.
# NOTE(review): CLR is usually computed within a sample; confirm the per-column direction is intended.
clr_df = df.apply(clr_transform, axis=0)

In [8]:
# First-order differencing: each row minus the previous one.
# The first row has no predecessor and is all-NaN after diff(), so it is dropped.
clr_diff_df = clr_df.diff(periods=1).iloc[1:]

In [9]:
def make_lagged_df(df, maxlag):
    """Build a table of lagged copies of every column of ``df``.

    For each lag ``i`` in ``range(1, maxlag)`` (``maxlag`` itself is excluded)
    the whole frame is shifted down by ``i`` rows and its columns are renamed
    to ``'<column>_lag<i>'``. The shifted frames are placed side by side,
    ordered by lag and then by the original column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input series, one column per variable. It is not modified.
    maxlag : int
        Exclusive upper bound of the lags to generate; also the number of
        leading rows removed from the result.

    Returns
    -------
    pandas.DataFrame
        Lagged features with the first ``maxlag`` rows removed. When no lag is
        requested (``maxlag <= 1``) an empty DataFrame is returned.
    """
    lagged_frames = []
    for lag in range(1, maxlag):
        # shift() returns a new frame, so relabelling it leaves `df` untouched.
        shifted = df.shift(lag)
        # format() instead of `col + '...'` so non-string column labels work too.
        shifted.columns = ['{}_lag{}'.format(col, lag) for col in df.columns]
        lagged_frames.append(shifted)

    if not lagged_frames:
        # pd.concat rejects an empty list; mirror the empty result of the lag-free case.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    lagged_df = pd.concat(lagged_frames, axis=1)

    return lagged_df.iloc[maxlag:]

## lasso regression

In [10]:
def partial_granger_lasso(df_lag0, maxlag, lagged_df):
    """Score every (target, predictor, lag) triple with a lasso-based partial causality.

    For each target column of ``df_lag0`` two ``MultiTaskLassoCV`` models are
    compared:

    * the *full* model, fitted on every column of ``lagged_df``;
    * the *reduced* model, fitted on ``lagged_df`` without the column
      ``'<predictor>_lag<lag>'``.

    The reported value is ``score(full) - score(reduced)`` rounded to three
    decimals, where ``score`` is the estimator's ``.score`` evaluated on the
    same data the model was fitted on.

    Parameters
    ----------
    df_lag0 : pandas.DataFrame
        Unlagged series; every column is used both as a target and as a
        predictor name.
    maxlag : int
        Lags ``range(1, maxlag)`` are evaluated (``maxlag`` itself excluded).
    lagged_df : pandas.DataFrame
        Lagged features with columns named ``'<column>_lag<lag>'``; its rows
        must line up with the rows of ``df_lag0``.

    Returns
    -------
    pandas.DataFrame
        Columns ``otu`` (predictor), ``lag``, ``partial_causality`` and
        ``target``. The per-target tables are stacked without resetting the
        index. Empty (with these columns) when ``df_lag0`` has no columns.
    """
    result_columns = ['otu', 'lag', 'partial_causality', 'target']
    if len(df_lag0.columns) == 0:
        # Nothing to score; pd.concat would reject an empty list below.
        return pd.DataFrame(columns=result_columns)

    lags = list(range(1, maxlag))

    # None of these depend on the target or the predictor, so build them once.
    # With an integer random_state the splitter yields the same folds on every fit.
    all_variables = np.array(lagged_df)
    cv = RepeatedKFold(n_repeats=3, n_splits=5, random_state=1)
    lasso_alphas = np.linspace(1e-1, 1, 10)

    per_target_tables = []
    for t in df_lag0.columns:

        target = np.array(df_lag0[t]).reshape(-1, 1)

        # The full model only depends on the target, so it is fitted once per
        # target instead of once per (predictor, lag) pair.
        full_score = None
        if lags:
            full_model = MultiTaskLassoCV(cv=cv, alphas=lasso_alphas, random_state=0).fit(all_variables, target)
            full_score = full_model.score(all_variables, target)

        records = []
        for col in df_lag0.columns:
            for lag in lags:

                y_name = '{}_lag{}'.format(col, lag)

                # target explained by everything except the candidate predictor
                confounders = np.array(lagged_df.drop([y_name], axis=1))
                reduced_model = MultiTaskLassoCV(cv=cv, alphas=lasso_alphas, random_state=0).fit(confounders, target)
                reduced_score = reduced_model.score(confounders, target)

                # gain in score obtained by adding the candidate predictor back
                partial_granger = full_score - reduced_score

                records.append((col, lag, np.round(partial_granger, 3)))

        target_table = pd.DataFrame(records, columns=['otu', 'lag', 'partial_causality'])
        target_table['target'] = t
        per_target_tables.append(target_table)

    return pd.concat(per_target_tables)

In [11]:
# Lag settings for the original (unshuffled) series.
# make_lagged_df and partial_granger_lasso both iterate over range(1, maxlag),
# so maxlag = 2 yields lag-1 features only.
maxlag = 2
lagged_df = make_lagged_df(clr_diff_df, maxlag)
# Drop the same first `maxlag` rows that make_lagged_df removes, so targets and features line up.
df_lag0 = clr_diff_df.iloc[maxlag:]

# Score every (target, predictor, lag) combination on the true data.
partial_causality_df = partial_granger_lasso(df_lag0, maxlag, lagged_df)

# TESTS

### test on shuffled data

In [12]:
def permutate_cols(x):
    """Return the values of ``x`` in random order.

    The order is drawn from NumPy's global random state via
    ``np.random.permutation``; ``x`` itself is left unchanged.
    """
    return np.random.permutation(x)

In [13]:
# Shuffle every column independently (DataFrame.apply calls permutate_cols once per column).
# NOTE(review): no random seed is set in the visible cells, so the shuffle differs between runs.
shuffled_df = df.apply(permutate_cols)

In [14]:
# Same preprocessing as for the true data: CLR per column, then first differences.
shuffled_clr_df = shuffled_df.apply(clr_transform)
# diff() leaves an all-NaN first row, which is dropped.
shuffled_clr_diff_df = shuffled_clr_df.diff(periods=1).iloc[1:]

In [15]:
# Same lag settings as for the true data (maxlag is re-assigned to the same value).
maxlag = 2
shuffled_lagged_df = make_lagged_df(shuffled_clr_diff_df, maxlag)
# Drop the same first `maxlag` rows that make_lagged_df removes.
shuffled_df_lag0 = shuffled_clr_diff_df.iloc[maxlag:]

In [16]:
# Score every (target, predictor, lag) combination on the shuffled data.
shuffled_partial_df = partial_granger_lasso(shuffled_df_lag0, maxlag,shuffled_lagged_df)

In [None]:
# Compare the coefficients obtained on the original and on the shuffled data.
# Rows are paired by position; both tables come from partial_granger_lasso, which emits
# rows in column order, so they line up as long as both inputs share the same columns.
# pearsonr is the function imported in the notebook's first cell.
shuffled_scores = shuffled_partial_df['partial_causality']
true_scores = partial_causality_df['partial_causality']
pearsonr(shuffled_scores, true_scores)

In [None]:
# Put the true-data and shuffled-data coefficients side by side; pandas suffixes the clashing
# 'partial_causality' column as '_x' (left table: true data) and '_y' (right table: shuffled data).
# NOTE(review): this rebinds `df`, which until here held the table loaded from file_donorA;
# earlier cells that read `df` will see the merged table if they are re-run after this one.
df = pd.merge(partial_causality_df, shuffled_partial_df, on = ['target', 'otu', 'lag'])

In [None]:
# Joint distribution of the coefficients on true vs. shuffled data, coloured by lag.
# sns.jointplot builds its own figure (sized through `height`), so no plt.figure() call
# precedes it: that call only produced an extra, empty figure.
# Note that `ax` is the grid object returned by jointplot, not a matplotlib Axes.
ax = sns.jointplot(data = df,
              x = 'partial_causality_x',
              y = 'partial_causality_y', 
              hue = 'lag',
              s=200,
              edgecolor = 'black',
              alpha = .5,
                   height = 8,
             )
x = 'causality coefficient on true data'
y = 'causality coefficient on shuffled data'

ax.set_axis_labels(x, y, fontsize=12)
plt.tight_layout()
# The jointplot figure is the current figure, so plt.savefig writes it.
# The 'plots' directory must already exist.
plt.savefig('plots/partial_causality_shuffle2.png')

In [None]:
import matplotlib.patches as mpatches

# One colour per data set, shared by the markers and the legend patches so the two
# cannot drift apart ('C0' is the first colour of the active matplotlib colour cycle).
shuffled_color = 'C0'
true_color = 'orange'

plt.figure(figsize = [8, 8])
# x is the row index of each table; y is the coefficient.
sns.scatterplot(x=shuffled_partial_df['partial_causality'].index, y=shuffled_partial_df['partial_causality'], s = 100, edgecolor = 'black', alpha = .5, color = shuffled_color)
sns.scatterplot(x=partial_causality_df['partial_causality'].index, y=partial_causality_df['partial_causality'], s = 100, marker='o', edgecolor = 'black', alpha = .5, color = true_color)

# The first scatter (blue) is the shuffled data and the second (orange) is the true data;
# the legend labels follow that mapping.
blue = mpatches.Patch(color=shuffled_color, label='partial_causality on shuffled data')
orange = mpatches.Patch(color=true_color, label='partial_causality on true data')

plt.legend(handles=[orange, blue], loc='center', bbox_to_anchor=(1.25, 1))

# The legend sits outside the axes; bbox_inches='tight' keeps it inside the saved image.
# The 'plots' directory must already exist.
plt.savefig('plots/partial_causality_shuffle.png', bbox_inches='tight')


## test 2: partial causality vs PACF

In [None]:
from statsmodels.tsa.stattools import pacf

In [None]:
# Keep only the self-interaction rows (predictor == target), visiting the
# series in df_lag0 column order.
DF = pd.concat([
    partial_causality_df[
        (partial_causality_df['otu'] == col) & (partial_causality_df['target'] == col)
    ]
    for col in df_lag0.columns
])

In [None]:
# Partial autocorrelation of every series at lags 1, 2 and 3, in df_lag0 column order.
pacfLag1 = []
pacfLag2 = []
pacfLag3 = []

# pacf returns nlags + 1 values (position 0 is lag 0). Positions 1..3 are read below,
# so at least 3 lags must be requested: with nlags = maxlag = 2 the lookup of
# position 3 raised IndexError. For maxlag >= 3 the call is unchanged.
pacf_nlags = max(maxlag, 3)

for col in df_lag0.columns:
    pacf_coeff = pacf(df_lag0[col], nlags = pacf_nlags, method='ols')
    pacfLag1.append(pacf_coeff[1])
    pacfLag2.append(pacf_coeff[2])
    pacfLag3.append(pacf_coeff[3])


In [None]:
def _rows_with_pacf(causality_df, lag, pacf_values):
    """Return the rows of ``causality_df`` for ``lag`` with a ``pacf`` column added.

    ``pacf_values`` must be ordered like the selected rows. When the table has
    no row for ``lag`` (that lag was never scored) an empty frame with an empty
    float ``pacf`` column is returned instead of raising on a length mismatch.
    """
    # copy() so the new column is written to an independent frame, not to a slice of the input.
    subset = causality_df[causality_df['lag'] == lag].copy()
    if subset.empty:
        return subset.assign(pacf=pd.Series(dtype=float))
    subset['pacf'] = pacf_values
    return subset


# DF and the pacfLag* lists were both built by looping over df_lag0.columns,
# so within one lag their rows/values are in the same order.
DF_lag1 = _rows_with_pacf(DF, 1, pacfLag1)
DF_lag2 = _rows_with_pacf(DF, 2, pacfLag2)
DF_lag3 = _rows_with_pacf(DF, 3, pacfLag3)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way
# (row labels are kept). Empty frames are skipped; if all are empty the result is empty.
_lag_frames = [frame for frame in (DF_lag1, DF_lag2, DF_lag3) if not frame.empty]
pacf_causality_df = pd.concat(_lag_frames) if _lag_frames else DF_lag1

In [None]:
# Self-interaction partial causality (x) against the PACF of the same series (y).
plt.figure(figsize=(6, 6))

sns.scatterplot(
    x='partial_causality',
    y='pacf',
    data=pacf_causality_df,
    alpha=0.4,
    s=150,
)

In [None]:
#linear regression
def partial_granger(target, target_name):
    """Partial Granger causality of every lagged predictor on one target (OLS).

    Reads the notebook-level frames ``df_lag0`` (unlagged series) and
    ``lagged_df`` (lagged features). For every column of ``lagged_df`` two
    ordinary least-squares models of the target are compared:

    * restricted: all lagged columns except the candidate predictor;
    * full: all lagged columns.

    The coefficient is ``log(var(restricted residuals)) - log(var(full residuals))``.
    It is never negative for nested in-sample OLS fits; a perfect fit (zero
    residual variance) yields an infinite or NaN value.

    Parameters
    ----------
    target : int
        Positional index of the target column in ``df_lag0``.
    target_name : str
        Label written to the ``target`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``predictor``, ``partial_granger`` and ``target``; one row per
        column of ``lagged_df``.
    """

    def _residual_variance(features, response):
        # Variance of the OLS residuals of `response` regressed on `features`.
        if features.shape[1] == 0:
            # No regressor left: the intercept-only fit predicts the mean.
            return np.var(response - response.mean())
        # scikit-learn's fit() takes (features, response) in that order.
        model = linear_model.LinearRegression().fit(features, response)
        return np.var(response - model.predict(features))

    #target variable
    x = np.array(df_lag0.iloc[:,target]).reshape(-1,1)
    #predictors (confounders + candidate)
    y = np.array(lagged_df)

    name = []
    partial_granger_coeff = []

    if len(lagged_df.columns) > 0:
        # The full model does not depend on the candidate predictor: fit it once.
        full_resid_var = _residual_variance(y, x)

    for i in range(0, len(lagged_df.columns)):
        #confounders: every lagged column except the candidate predictor
        z = np.array(lagged_df.drop(lagged_df.columns[i], axis=1))

        # X conditioned on confounders only
        restricted_resid_var = _residual_variance(z, x)

        #partial Granger's causality: log ratio of residual variances
        coefficient = np.log(restricted_resid_var) - np.log(full_resid_var)

        name.append(lagged_df.columns[i])
        partial_granger_coeff.append(coefficient)

    part_granger_df = pd.DataFrame(list(zip(name, partial_granger_coeff)), columns = ['predictor', 'partial_granger'])
    part_granger_df['target'] = target_name

    return part_granger_df

# Run the linear variant once per target column and collect the per-target tables.
pgc = [
    partial_granger(position, column_name)
    for position, column_name in enumerate(df_lag0.columns)
]