In [95]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
from pingouin import partial_corr
from pingouin import pairwise_corr

from corr_networks import pairwise_correlations
from corr_networks import pairwise_polychoric_correlations
from corr_networks import precision_mat_to_partial_corr
from corr_networks import cov_mat_to_regularized_partial_corr
from corr_networks import my_pairwise_correlations

from data_metadata import load_gss_sas 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
gss_file = "C:/Users/vicvi/big-datasets/social_values/GSS_sas/gss7222_r3.sas7bdat"
variable_list = ["VOTE68", "PARTYID", "POLVIEWS", "HOMOSEX"]

df, meta = load_gss_sas(gss_file, variable_list)

In [81]:
# Define mean vector and covariance matrix
dim = 4
random_mat = 2 * np.random.rand(dim, dim) - 1 # get a matrix
random_cov_mat = np.dot(random_mat, random_mat.T) # make it pos semi-definite

std_deviations = np.sqrt(np.diag(random_cov_mat))
random_cor_mat = random_cov_mat / np.outer(std_deviations, std_deviations)
    
mean = np.zeros((dim,))  # Mean vector

# Generate random samples from the multivariate normal distribution
num_samples = 5000
samples = np.random.multivariate_normal(mean, random_cov_mat, size=num_samples)

sample_df = pd.DataFrame(samples)

# Print the first few samples

precision_matrix = np.linalg.inv(random_cov_mat)
partial_corr_mat = precision_mat_to_partial_corr(precision_matrix)

sample_df_ord = pd.DataFrame()

for var in list(range(dim)):
    num_ordinal_values = np.random.randint(2, 11)
    # signed = np.random.rand() > 0.5
    var_std = np.sqrt(random_cov_mat[var, var])

    interval_spread = np.random.rand() * var_std
    leftmost_border = interval_spread * ((num_ordinal_values - 2)/ 2) + (np.random.rand() - 0.5)
    
    cutoffs = interval_spread * np.arange(num_ordinal_values - 1) - leftmost_border
    cutoffs = np.concatenate(([-np.inf], cutoffs, [np.inf]))
    sample_df_ord[var] = pd.cut(sample_df[var], bins=cutoffs, labels=np.arange(num_ordinal_values)).cat.codes


print(sample_df.head())
print(sample_df_ord.head())

print("Cov mat")
print(random_cov_mat)

print("Corr mat")
print(random_cor_mat)

print("Partial corr mat")
print(partial_corr_mat)

          0         1         2         3
0  1.628361 -0.259324 -1.050410 -0.527896
1  3.205095 -0.699433 -0.934915 -0.493218
2  1.279144 -0.314402  0.753300  0.314517
3  0.191228  0.125083  0.773049 -0.384813
4  2.365126 -0.484185 -0.996463 -0.869565
   0  1  2  3
0  4  0  4  0
1  4  0  4  0
2  4  0  6  1
3  4  4  6  0
4  4  0  4  0
Cov mat
[[ 1.54695214 -0.26597986 -0.30160722 -0.3323204 ]
 [-0.26597986  0.09889098 -0.03375579 -0.10324549]
 [-0.30160722 -0.03375579  0.87176609  0.49419805]
 [-0.3323204  -0.10324549  0.49419805  0.76987995]]
Corr mat
[[ 1.         -0.68003609 -0.25971876 -0.30451385]
 [-0.68003609  1.         -0.11496614 -0.37418058]
 [-0.25971876 -0.11496614  1.          0.60323918]
 [-0.30451385 -0.37418058  0.60323918  1.        ]]
Partial corr mat
[[ 1.         -0.89846877  0.07968497 -0.76784893]
 [-0.89846877  1.          0.13712877 -0.79810409]
 [ 0.07968497  0.13712877  1.          0.44953927]
 [-0.76784893 -0.79810409  0.44953927  1.        ]]


In [84]:
# polychoric procedure 
polychoric_corr_mat = pairwise_polychoric_correlations([0, 1, 2, 3], sample_df_ord)
polychoric_partial_corr_mat = cov_mat_to_regularized_partial_corr(polychoric_corr_mat)
print("polychoric_partial_corr")
print(polychoric_partial_corr_mat)

polychoric_partial_corr
[[ 1.         -0.90986514  0.05393421 -0.78272285]
 [-0.90986514  1.          0.11767968 -0.81152128]
 [ 0.05393421  0.11767968  1.          0.42499887]
 [-0.78272285 -0.81152128  0.42499887  1.        ]]


In [96]:
# pearson procedure 1 on original data
my_pairwise_correlations(list(range(dim)), sample_df, method="pearson")

array([[ 1.        , -0.8978681 ,  0.06998312, -0.7674722 ],
       [-0.8978681 ,  1.        ,  0.12756936, -0.79385832],
       [ 0.06998312,  0.12756936,  1.        ,  0.44069435],
       [-0.7674722 , -0.79385832,  0.44069435,  1.        ]])

In [98]:
# pearson procedure 2 on original data
pearson_corr_df, pearson_corr_mat = pairwise_correlations(list(range(dim)), sample_df, "pearson")
pearson_corr_mat

array([[ 1.        , -0.8978681 ,  0.06998312, -0.7674722 ],
       [-0.8978681 ,  1.        ,  0.12756936, -0.79385832],
       [ 0.06998312,  0.12756936,  1.        ,  0.44069435],
       [-0.7674722 , -0.79385832,  0.44069435,  1.        ]])

In [89]:
# spearman procedure 1 on original data
my_pairwise_correlations(list(range(dim)), sample_df, method="spearman")

array([[ 1.        , -0.85919144,  0.01690616, -0.69490074],
       [-0.85919144,  1.        ,  0.079675  , -0.72644659],
       [ 0.01690616,  0.079675  ,  1.        ,  0.42953566],
       [-0.69490074, -0.72644659,  0.42953566,  1.        ]])

In [93]:
# spearman procedure 2 on original data
spearman_corr_df, spearman_corr_mat = pairwise_correlations(list(range(dim)), sample_df, "spearman")
spearman_corr_mat

array([[ 1.        , -0.85919144,  0.01690616, -0.69490074],
       [-0.85919144,  1.        ,  0.079675  , -0.72644659],
       [ 0.01690616,  0.079675  ,  1.        ,  0.42953566],
       [-0.69490074, -0.72644659,  0.42953566,  1.        ]])

In [91]:
# spearman procedure 1 on ordinalized data
my_pairwise_correlations(list(range(dim)), sample_df_ord, method="spearman")

array([[ 1.        , -0.6166378 , -0.10222241, -0.36117204],
       [-0.6166378 ,  1.        , -0.00269639, -0.4182649 ],
       [-0.10222241, -0.00269639,  1.        ,  0.4415108 ],
       [-0.36117204, -0.4182649 ,  0.4415108 ,  1.        ]])

In [92]:
# spearman procedure 2 on ordinalized data
spearman_corr_df, spearman_corr_mat = pairwise_correlations(list(range(dim)), sample_df_ord, "spearman")
spearman_corr_mat


array([[ 1.        , -0.6166378 , -0.10222241, -0.36117204],
       [-0.6166378 ,  1.        , -0.00269639, -0.4182649 ],
       [-0.10222241, -0.00269639,  1.        ,  0.4415108 ],
       [-0.36117204, -0.4182649 ,  0.4415108 ,  1.        ]])

In [75]:
pearson_corr_df, pearson_corr_mat = partial_correlations(list(range(dim)), sample_df, "pearson")
print(pearson_corr_mat)

[[ 1.          0.99923368  0.99857396]
 [ 0.99923368  1.         -0.99691586]
 [ 0.99857396 -0.99691586  1.        ]]


In [17]:
spearman_corr_df, spearman_corr_mat = partial_correlations(list(range(dim)), sample_df_ord, "spearman")
spearman_corr_mat

array([[ 1.        ,  0.53125325,  0.58348842,  0.63130475],
       [ 0.53125325,  1.        , -0.27885483, -0.14230966],
       [ 0.58348842, -0.27885483,  1.        , -0.01317824],
       [ 0.63130475, -0.14230966, -0.01317824,  1.        ]])

In [51]:
corr_df = pd.concat(corr_dfs)

In [62]:
corr_df

Unnamed: 0,n,r,CI95%,p-val,x,y
spearman,5000,-0.033487,"[-0.06, -0.01]",0.017909,2,3


In [None]:
from data_metadata import load_gss_sas



load_gss_sas(filename, vars_to_load, metadataonly=False)