In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
from pingouin import partial_corr
from pingouin import pairwise_corr

from corr_networks import pairwise_correlations
from corr_networks import pairwise_polychoric_correlations
from corr_networks import precision_mat_to_partial_corr
from corr_networks import cov_mat_to_regularized_partial_corr
from corr_networks import my_pairwise_correlations

from data_metadata import load_gss_sas

In [109]:
# Number of decimals NumPy shows when printing arrays in the cells below
print_precision = 3
np.set_printoptions(precision=print_precision)

In [81]:
# Seed NumPy's global generator so that the random covariance matrix, the samples,
# and the np.random draws made further down in this cell (ordinal cutoffs) come out
# the same on every Restart & Run All. The legacy global seed is used on purpose:
# the rest of the cell calls np.random.* directly.
SEED = 0
np.random.seed(SEED)

# Define mean vector and covariance matrix
dim = 4
random_mat = 2 * np.random.rand(dim, dim) - 1  # entries uniform in [-1, 1)
random_cov_mat = random_mat @ random_mat.T  # A @ A.T is symmetric positive semi-definite

# Correlation matrix: divide each covariance by the product of the two standard deviations
std_deviations = np.sqrt(np.diag(random_cov_mat))
random_cor_mat = random_cov_mat / np.outer(std_deviations, std_deviations)

mean = np.zeros((dim,))  # Mean vector

# Generate random samples from the multivariate normal distribution
num_samples = 5000
samples = np.random.multivariate_normal(mean, random_cov_mat, size=num_samples)

# One column per variable, labelled 0 .. dim-1
sample_df = pd.DataFrame(samples)

# Invert the covariance matrix used to generate the samples to get its precision
# matrix, then hand that to precision_mat_to_partial_corr (from corr_networks).
# The result is printed further down under the "Partial corr mat" heading.
# NOTE(review): presumably the helper returns the partial-correlation matrix implied
# by the precision matrix - confirm against its implementation.

precision_matrix = np.linalg.inv(random_cov_mat)
partial_corr_mat = precision_mat_to_partial_corr(precision_matrix)

# Turn every continuous column into integer category codes using randomly placed,
# equally spaced thresholds.
sample_df_ord = pd.DataFrame()

for col in range(dim):
    n_levels = np.random.randint(2, 11)  # 2 to 10 categories (upper bound is exclusive)
    col_std = np.sqrt(random_cov_mat[col, col])

    # Distance between neighbouring thresholds: a random fraction of the column's std
    step = np.random.rand() * col_std
    # Half the span of the thresholds plus a random shift drawn from [-0.5, 0.5)
    shift = step * ((n_levels - 2) / 2) + (np.random.rand() - 0.5)

    # n_levels - 1 inner thresholds, closed off with infinite outer edges so that
    # every sample lands in exactly one of the n_levels bins
    inner_edges = step * np.arange(n_levels - 1) - shift
    bin_edges = np.concatenate(([-np.inf], inner_edges, [np.inf]))
    binned = pd.cut(sample_df[col], bins=bin_edges, labels=np.arange(n_levels))
    sample_df_ord[col] = binned.cat.codes


# Preview the continuous samples and their ordinal encodings
for frame in (sample_df, sample_df_ord):
    print(frame.head())

# Print each matrix under its own heading
labelled_matrices = (
    ("Cov mat", random_cov_mat),
    ("Corr mat", random_cor_mat),
    ("Partial corr mat", partial_corr_mat),
)
for heading, matrix in labelled_matrices:
    print(heading)
    print(matrix)

          0         1         2         3
0  1.628361 -0.259324 -1.050410 -0.527896
1  3.205095 -0.699433 -0.934915 -0.493218
2  1.279144 -0.314402  0.753300  0.314517
3  0.191228  0.125083  0.773049 -0.384813
4  2.365126 -0.484185 -0.996463 -0.869565
   0  1  2  3
0  4  0  4  0
1  4  0  4  0
2  4  0  6  1
3  4  4  6  0
4  4  0  4  0
Cov mat
[[ 1.54695214 -0.26597986 -0.30160722 -0.3323204 ]
 [-0.26597986  0.09889098 -0.03375579 -0.10324549]
 [-0.30160722 -0.03375579  0.87176609  0.49419805]
 [-0.3323204  -0.10324549  0.49419805  0.76987995]]
Corr mat
[[ 1.         -0.68003609 -0.25971876 -0.30451385]
 [-0.68003609  1.         -0.11496614 -0.37418058]
 [-0.25971876 -0.11496614  1.          0.60323918]
 [-0.30451385 -0.37418058  0.60323918  1.        ]]
Partial corr mat
[[ 1.         -0.89846877  0.07968497 -0.76784893]
 [-0.89846877  1.          0.13712877 -0.79810409]
 [ 0.07968497  0.13712877  1.          0.44953927]
 [-0.76784893 -0.79810409  0.44953927  1.        ]]


In [101]:
# Polychoric procedure on the ordinalized samples.
# The column ids are derived from `dim`, as in the other correlation cells, instead
# of a hard-coded [0, 1, 2, 3] that would go stale if `dim` were changed.
polychoric_corr_mat = pairwise_polychoric_correlations(list(range(dim)), sample_df_ord)
# alpha=0.1 is forwarded to the helper unchanged.
# NOTE(review): presumably a regularisation strength - confirm in corr_networks.
polychoric_partial_corr_mat = cov_mat_to_regularized_partial_corr(polychoric_corr_mat, alpha=0.1)
print("polychoric_partial_corr")
print(polychoric_partial_corr_mat)

polychoric_partial_corr
[[ 1.         -0.68614763 -0.06857862 -0.39751576]
 [-0.68614763  1.         -0.         -0.44634694]
 [-0.06857862 -0.          1.          0.43508295]
 [-0.39751576 -0.44634694  0.43508295  1.        ]]


In [96]:
# Pearson on the continuous samples, procedure 1 (my_pairwise_correlations);
# the call's return value is the cell output.
column_ids = list(range(dim))
my_pairwise_correlations(column_ids, sample_df, method="pearson")

array([[ 1.        , -0.8978681 ,  0.06998312, -0.7674722 ],
       [-0.8978681 ,  1.        ,  0.12756936, -0.79385832],
       [ 0.06998312,  0.12756936,  1.        ,  0.44069435],
       [-0.7674722 , -0.79385832,  0.44069435,  1.        ]])

In [98]:
# Pearson on the continuous samples, procedure 2 (pairwise_correlations).
# The helper returns a pair; only the matrix half is displayed.
column_ids = list(range(dim))
pearson_corr_df, pearson_corr_mat = pairwise_correlations(column_ids, sample_df, "pearson")
pearson_corr_mat

array([[ 1.        , -0.8978681 ,  0.06998312, -0.7674722 ],
       [-0.8978681 ,  1.        ,  0.12756936, -0.79385832],
       [ 0.06998312,  0.12756936,  1.        ,  0.44069435],
       [-0.7674722 , -0.79385832,  0.44069435,  1.        ]])

In [89]:
# Spearman on the continuous samples, procedure 1 (my_pairwise_correlations);
# the call's return value is the cell output.
column_ids = list(range(dim))
my_pairwise_correlations(column_ids, sample_df, method="spearman")

array([[ 1.        , -0.85919144,  0.01690616, -0.69490074],
       [-0.85919144,  1.        ,  0.079675  , -0.72644659],
       [ 0.01690616,  0.079675  ,  1.        ,  0.42953566],
       [-0.69490074, -0.72644659,  0.42953566,  1.        ]])

In [93]:
# Spearman on the continuous samples, procedure 2 (pairwise_correlations).
# The helper returns a pair; only the matrix half is displayed.
column_ids = list(range(dim))
spearman_corr_df, spearman_corr_mat = pairwise_correlations(column_ids, sample_df, "spearman")
spearman_corr_mat

array([[ 1.        , -0.85919144,  0.01690616, -0.69490074],
       [-0.85919144,  1.        ,  0.079675  , -0.72644659],
       [ 0.01690616,  0.079675  ,  1.        ,  0.42953566],
       [-0.69490074, -0.72644659,  0.42953566,  1.        ]])

In [91]:
# Spearman on the ordinalized samples, procedure 1 (my_pairwise_correlations);
# the call's return value is the cell output.
column_ids = list(range(dim))
my_pairwise_correlations(column_ids, sample_df_ord, method="spearman")

array([[ 1.        , -0.6166378 , -0.10222241, -0.36117204],
       [-0.6166378 ,  1.        , -0.00269639, -0.4182649 ],
       [-0.10222241, -0.00269639,  1.        ,  0.4415108 ],
       [-0.36117204, -0.4182649 ,  0.4415108 ,  1.        ]])

In [92]:
# Spearman on the ordinalized samples, procedure 2 (pairwise_correlations).
# NOTE: this rebinds spearman_corr_df / spearman_corr_mat, the same names the cell
# for the continuous samples assigns, so whichever cell ran last determines their value.
column_ids = list(range(dim))
spearman_corr_df, spearman_corr_mat = pairwise_correlations(column_ids, sample_df_ord, "spearman")
spearman_corr_mat


array([[ 1.        , -0.6166378 , -0.10222241, -0.36117204],
       [-0.6166378 ,  1.        , -0.00269639, -0.4182649 ],
       [-0.10222241, -0.00269639,  1.        ,  0.4415108 ],
       [-0.36117204, -0.4182649 ,  0.4415108 ,  1.        ]])

In [3]:
# Location of the GSS SAS file. Set the GSS_SAS_PATH environment variable to point
# at a copy on another machine; when it is unset the original local path is used,
# so existing behaviour is unchanged.
gss_file = os.environ.get(
    "GSS_SAS_PATH",
    "C:/Users/vicvi/big-datasets/social_values/GSS_sas/gss7222_r3.sas7bdat",
)

# GSS variables requested from the loader
variable_list = [
    "VOTE68", "VOTE72", "PARTYID", "POLVIEWS", "HOMOSEX", "NATENRGY", "AFFRMACT",
    "CONJUDGE", "HELPPOOR", "NATSPAC", "NATENVIR", "NATCITY", "NATDRUG",
]

# load_gss_sas comes from data_metadata; `df` is what the correlation cells below consume.
# NOTE(review): `meta` is presumably the file's variable metadata - confirm in data_metadata.
df, meta = load_gss_sas(gss_file, variable_list)

In [157]:
# Spearman correlations over the full GSS variable list (procedure 1).
# The first returned value is bound to `corr_vars` rather than `vars`, so the
# built-in vars() is not shadowed by this cell.
corr_vars, corr_mat = my_pairwise_correlations(variable_list, df, method="spearman")
print(variable_list)
print(corr_vars)
print(corr_mat)

            VOTE68   PARTYID  POLVIEWS   HOMOSEX  NATENRGY  AFFRMACT  \
VOTE68    1.000000  0.051568       NaN  0.080867       NaN       NaN   
PARTYID   0.051568  1.000000  0.387075 -0.083713  0.226117  0.262673   
POLVIEWS       NaN  0.387075  1.000000 -0.282770  0.280324  0.237890   
HOMOSEX   0.080867 -0.083713 -0.282770  1.000000 -0.187196 -0.089864   
NATENRGY       NaN  0.226117  0.280324 -0.187196  1.000000  0.095734   
AFFRMACT       NaN  0.262673  0.237890 -0.089864  0.095734  1.000000   
CONJUDGE -0.054491 -0.026190 -0.003245 -0.029414 -0.038925 -0.038012   
HELPPOOR       NaN  0.306078  0.283892 -0.093198  0.188198  0.269168   
NATSPAC   0.057836 -0.117738 -0.004816 -0.201215  0.095704 -0.024902   
NATENVIR -0.074510  0.157388  0.230077 -0.183861  0.398716  0.132512   
NATCITY  -0.106682  0.151537  0.166147 -0.091848  0.213761  0.135285   
NATDRUG   0.049842  0.099848  0.057658  0.036699  0.124395  0.098139   

          CONJUDGE  HELPPOOR   NATSPAC  NATENVIR   NATCITY   NA

In [5]:
# Spearman correlations over a subset of the GSS variables (procedure 1).
# The subset is written once and reused, instead of repeating the literal for the
# call and for the print.
subset_variables = ["VOTE68", "VOTE72", "PARTYID", "POLVIEWS", "HOMOSEX", "NATENRGY", "AFFRMACT", "CONJUDGE"]

# A copy is passed so the requested list printed below is the full request no matter
# what the helper does with its argument. The first returned value is bound to
# `corr_vars` rather than `vars`, so the built-in vars() is not shadowed.
corr_vars, corr_mat = my_pairwise_correlations(list(subset_variables), df, method="spearman")
print(subset_variables)
print(corr_vars)
print(corr_mat)

[[ 1.          0.49947949  0.05156841         nan  0.080867           nan
          nan -0.05449085]
 [ 0.49947949  1.         -0.00756989 -0.05765403 -0.03853483         nan
          nan  0.00397666]
 [ 0.05156841 -0.00756989  1.          0.38707504 -0.08371342  0.2261172
   0.26267347 -0.02618952]
 [        nan -0.05765403  0.38707504  1.         -0.28277007  0.28032371
   0.23788984 -0.00324458]
 [ 0.080867   -0.03853483 -0.08371342 -0.28277007  1.         -0.18719565
  -0.0898642  -0.02941421]
 [        nan         nan  0.2261172   0.28032371 -0.18719565  1.
   0.09573424 -0.0389247 ]
 [        nan         nan  0.26267347  0.23788984 -0.0898642   0.09573424
   1.         -0.03801178]
 [-0.05449085  0.00397666 -0.02618952 -0.00324458 -0.02941421 -0.0389247
  -0.03801178  1.        ]]
['VOTE68', 'VOTE72', 'PARTYID', 'POLVIEWS', 'HOMOSEX', 'NATENRGY', 'AFFRMACT', 'CONJUDGE']
['PARTYID', 'POLVIEWS', 'HOMOSEX', 'NATENRGY', 'AFFRMACT', 'CONJUDGE']
[[ 1.00000000e+00  3.11129040e-01  4.96

In [118]:
# Spearman correlations among four NAT* variables, procedure 2 (pairwise_correlations);
# only the matrix half of the returned pair is printed.
nat_vars = ["NATSPAC", "NATENVIR", "NATCITY", "NATDRUG"]
corr_df, corr_mat = pairwise_correlations(nat_vars, df, method="spearman")
print(corr_mat)

[[ 1.     0.07  -0.057 -0.061]
 [ 0.07   1.     0.209  0.095]
 [-0.057  0.209  1.     0.186]
 [-0.061  0.095  0.186  1.   ]]


In [132]:
# 4x4 float grid holding 0..15 in row-major order, with two entries set to NaN
arr = np.arange(16, dtype=float).reshape(4, 4)
arr[[1, 2], [1, 3]] = np.nan  # positions (1, 1) and (2, 3)

In [133]:
# Display the array to check where the two NaN entries ended up
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4., nan,  6.,  7.],
       [ 8.,  9., 10., nan],
       [12., 13., 14., 15.]])

In [147]:
# Indices of the rows of `arr` that contain no NaN, as an (n, 1) column vector so
# they broadcast against a (1, m) set of column indices when indexing.
row_has_nan = np.isnan(arr).any(axis=1)
rows_to_keep = np.flatnonzero(~row_has_nan)[:, np.newaxis]

In [146]:
# Indices used to select columns of `arr`, as a (1, m) row vector.
# NOTE(review): the NaN check reduces over axis=1, exactly as for rows_to_keep, so
# these are the NaN-free *row* indices reused as column indices. That matches the
# NaN-free columns only when the matrix is symmetric - confirm this is intended.
nan_per_row = np.isnan(arr).any(axis=1)
cols_to_keep = np.flatnonzero(~nan_per_row)[np.newaxis, :]

In [148]:
# Sub-array formed by crossing the kept row indices with the kept column indices
arr[np.ix_(rows_to_keep.ravel(), cols_to_keep.ravel())]

array([[ 0.,  3.],
       [12., 15.]])