In [24]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
from pingouin import partial_corr
from pingouin import pairwise_corr
from corr_networks import partial_correlations

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
def partial_correlations(vars, data, method):
    corr_dfs = []
    corr_mat = np.zeros((len(vars), len(vars)))

    for i in range(len(vars)):
        for j in range(i+1, len(vars)):
            covar_list = vars[:i] + vars[i+1:j] + vars[j+1:]
            corr_df = partial_corr(data=data, x=vars[i], y=vars[j], covar=covar_list, alternative='two-sided', method=method)
            corr_df["x"] = i
            corr_df["y"] = j
            corr_mat[i, j] = corr_df.loc[method, "r"]
            corr_mat[j, i] = corr_df.loc[method, "r"]
            corr_dfs.append(corr_df)

    corr_info = pd.concat(corr_dfs)
    np.fill_diagonal(corr_mat, 1)
    return corr_info, corr_mat  

In [None]:
gss_file = "C:/Users/vicvi/big-datasets/social_values/GSS_sas/gss7222_r3.sas7bdat"
variable_list = ["VOTE68", "PARTYID", "POLVIEWS"]

df = make_variable_df(gss_file, variable_list)

In [3]:
# Define mean vector and covariance matrix
dim = 4
random_mat = 2 * np.random.rand(dim, dim) - 1 # get a matrix
random_cov_mat = np.dot(random_mat, random_mat.T) # make it pos semi-definite

std_deviations = np.sqrt(np.diag(random_cov_mat))
random_cor_mat = random_cov_mat / np.outer(std_deviations, std_deviations)
    
mean = np.zeros((dim,))  # Mean vector

# Generate random samples from the multivariate normal distribution
num_samples = 5000
samples = np.random.multivariate_normal(mean, random_cov_mat, size=num_samples)

sample_df = pd.DataFrame(samples)

# Print the first few samples

precision_matrix = np.linalg.inv(random_cov_mat)
    
# Calculate the partial correlation matrix
partial_cor_mat = - precision_matrix / np.sqrt(np.outer(np.diag(precision_matrix), np.diag(precision_matrix)))
    
# Set diagonal elements to 1
np.fill_diagonal(partial_cor_mat, 1)


print("Cov mat")
print(random_cov_mat)

print("Corr mat")
print(random_cor_mat)

print("Partial corr mat")
print(partial_cor_mat)

sample_df.head()

Cov mat
[[1.55660147 0.9113298  1.55170456 1.20734355]
 [0.9113298  1.31098807 0.57686224 0.60363693]
 [1.55170456 0.57686224 1.85921498 1.22803975]
 [1.20734355 0.60363693 1.22803975 1.13586251]]
Corr mat
[[1.         0.63795123 0.91212742 0.90798624]
 [0.63795123 1.         0.36949447 0.49466769]
 [0.91212742 0.36949447 1.         0.84505453]
 [0.90798624 0.49466769 0.84505453 1.        ]]
Partial corr mat
[[ 1.          0.77228542  0.82384801  0.58900551]
 [ 0.77228542  1.         -0.67384914 -0.26638182]
 [ 0.82384801 -0.67384914  1.         -0.10960967]
 [ 0.58900551 -0.26638182 -0.10960967  1.        ]]


Unnamed: 0,0,1,2,3
0,0.175278,0.798412,-0.28848,0.018721
1,-0.667435,0.421628,-0.002318,-0.581811
2,-0.291943,0.455635,0.04581,-0.378361
3,-1.328126,-1.597854,-0.724473,-0.61851
4,1.291938,3.8e-05,1.421131,1.708675


In [11]:
pearson_corr_df, pearson_corr_mat = partial_correlations(list(range(dim)), sample_df, "pearson")
print(pearson_corr_mat)

[[ 1.          0.77289132  0.82567284  0.5860598 ]
 [ 0.77289132  1.         -0.67569273 -0.26306476]
 [ 0.82567284 -0.67569273  1.         -0.10943745]
 [ 0.5860598  -0.26306476 -0.10943745  1.        ]]


In [12]:
sample_df_ord = pd.DataFrame()

for var in list(range(dim)):
    num_ordinal_values = np.random.randint(2, 11)
    # signed = np.random.rand() > 0.5
    var_std = np.sqrt(random_cov_mat[var, var])

    interval_spread = np.random.rand() * var_std
    leftmost_border = interval_spread * ((num_ordinal_values - 2)/ 2) + (np.random.rand() - 0.5)
    
    print(num_ordinal_values)
    cutoffs = interval_spread * np.arange(num_ordinal_values - 1) - leftmost_border
    cutoffs = np.concatenate(([-np.inf], cutoffs, [np.inf]))
    print(cutoffs)

    sample_df_ord[var] = pd.cut(sample_df[var], bins=cutoffs, labels=np.arange(num_ordinal_values)).cat.codes

7
[       -inf -1.33249301 -0.91784874 -0.50320447 -0.0885602   0.32608407
  0.74072834         inf]
9
[       -inf -4.03282245 -2.98433953 -1.9358566  -0.88737368  0.16110925
  1.20959217  2.2580751   3.30655802         inf]
2
[      -inf 0.49778988        inf]
5
[       -inf -0.28341825 -0.19380304 -0.10418783 -0.01457262         inf]


In [17]:
spearman_corr_df, spearman_corr_mat = partial_correlations(list(range(dim)), sample_df_ord, "spearman")
spearman_corr_mat

array([[ 1.        ,  0.53125325,  0.58348842,  0.63130475],
       [ 0.53125325,  1.        , -0.27885483, -0.14230966],
       [ 0.58348842, -0.27885483,  1.        , -0.01317824],
       [ 0.63130475, -0.14230966, -0.01317824,  1.        ]])

In [18]:
print(pearson_corr_mat)
pearson_corr_mat

[[ 1.          0.77289132  0.82567284  0.5860598 ]
 [ 0.77289132  1.         -0.67569273 -0.26306476]
 [ 0.82567284 -0.67569273  1.         -0.10943745]
 [ 0.5860598  -0.26306476 -0.10943745  1.        ]]


array([[ 1.        ,  0.77289132,  0.82567284,  0.5860598 ],
       [ 0.77289132,  1.        , -0.67569273, -0.26306476],
       [ 0.82567284, -0.67569273,  1.        , -0.10943745],
       [ 0.5860598 , -0.26306476, -0.10943745,  1.        ]])

In [21]:
random_cor_mat

array([[1.        , 0.63795123, 0.91212742, 0.90798624],
       [0.63795123, 1.        , 0.36949447, 0.49466769],
       [0.91212742, 0.36949447, 1.        , 0.84505453],
       [0.90798624, 0.49466769, 0.84505453, 1.        ]])

In [61]:
corr_dfs = []
for i in range(len(vars)):
    for j in range(i+1, len(vars)):
        covar_list = vars[:i] + vars[i+1:j] + vars[j+1:]
        print(f"{vars[i]}, {vars[j]}", covar_list)
        corr_df = partial_corr(data=sample_df_ord, x=vars[i], y=vars[j], covar=covar_list, alternative='two-sided', method='spearman')
        corr_df["x"] = vars[i]
        corr_df["y"] = vars[j]
        corr_dfs.append(corr_df)

0, 1 [2, 3]
0, 2 [1, 3]
0, 3 [1, 2]
1, 2 [0, 3]
1, 3 [0, 2]
2, 3 [0, 1]


In [51]:
corr_df = pd.concat(corr_dfs)

In [62]:
corr_df

Unnamed: 0,n,r,CI95%,p-val,x,y
spearman,5000,-0.033487,"[-0.06, -0.01]",0.017909,2,3


In [None]:
from data_metadata import load_gss_sas



load_gss_sas(filename, vars_to_load, metadataonly=False)