In [1]:
# This is an example of how we see 
# the package work. The functions listed here
# are probably the only ones that should be exposed, ie documented.
# others should br prepended with a double underscore
#  
# The cognet directory has the following "modules"
# which are seprate .py files containing clases and functions
# The modules are cognet.py, dataFormatter.py, model.py, util.py, viz.py
# we will write the viz.py later.
import sys

from quasinet.qnet import qdistance
from cognet.cognet import cognet as cg
from cognet.dataFormatter import dataFormatter
from cognet.model import model 
#import cognet.util
import pandas as pd
import numpy as np

yr = '2018'
POLEFILE='GSS/data/polar_vectors.csv'
QPATH='GSS/data/gss_'+yr+'.joblib'
IMMUTABLE_FILE='GSS/data/immutable.csv'
GSSDATA = 'GSS/data/gss_'+yr+'.csv'

In [2]:
# testing dataFormatter
data = dataFormatter(samples=GSSDATA)
# load the sample data
# have option for test/train split
# make checks to ensure we will not throw errors at qnet construction 
print(data.samples[:2])
features,samples = data.format_samples('train') # default trains and tests using half
all_samples = True
if all_samples: # use all samples to train, instead of half
    features,samples = data.Qnet_formatter()

# format data for Qnet training and fitting
print(samples.shape)

# set mutable and immutable vars either from list or file
im_vars_df = pd.read_csv(IMMUTABLE_FILE, names=['vars'])
im_vars_list = im_vars_df.vars.to_list()
mutable_vars, immutable_vars = data.mutable_variables(immutable_list=im_vars_list)
mutable_vars, immutable_vars = data.mutable_variables(IMMUTABLE_FILE=IMMUTABLE_FILE)

            wrkstat HRS1 HRS2 evwork        wrkslf  wrkgovt OCC10 PRESTG10  \
0  temp not working    e    c    NaN  someone else  private     b        c   
1  working fulltime    c    e    NaN  someone else  private     b        d   

  PRESTG105PLUS INDUS10  ...    neisafe rlooks rgroomed rweight rhlthend wtss  \
0             c       c  ...  very safe    NaN      NaN     NaN      NaN    e   
1             d       c  ...  very safe    NaN      NaN     NaN      NaN    c   

  wtssnr wtssall vstrat vpsu  
0      e       e   3301    1  
1      c       c   3301    1  

[2 rows x 1034 columns]
(1784, 1034)


In [3]:
# testing model functionality
# can either input features and samples directly, or infer from data obj
model_ = model()

# qnet construction parameters, 
# choose to either load or fit qnet from scratch
# and to either load from url or local repo
test_model_buildqnet = False
url_load = True
if test_model_buildqnet:
        print("fitting")
        model_.fit(data_obj=data,
                   min_samples_split=2,
                   alpha=0.05,
                   max_depth=-1,
                   max_feats=-1,
                   early_stopping=False,
                   verbose=0,
                   random_state=None,
                   njobs=8)
        print("fitted")
        model_.export_dot("GSS/results/tmp_dot_modelclass.dot",
                        generate_trees=True)
        model_.save("GSS/results/tmp_nodelclass.joblib")
        #model_.load("tmp_nodelclass.joblib")
else:
    if url_load:
        QNETFILE = 'https://zenodo.org/record/5781768/files/gss_2018.joblib'
    else:
        QNETFILE = 'GSS/data/gss_2018.joblib'
    model_.load(QNETFILE)

In [4]:
# testing cognet
# set some paramaters in instantiating cognet class 
# if loading from model obj, no need to use load_data func, otherwise, load_data
Cg = cg()
print(len(model_.features))
Cg.load_from_model(model_, data, 'all')

1034


In [5]:
# distance calculation for individual samples    
# we have a nsteps parameter (for sample 1 and sample2)
# which qsamples the sample1 and sample2 if set before
# computing distance. Note qsampling must only 
# change mutable varaibles, so need to compute base-freq
distance = Cg.distance(samples[1],samples[3],nsteps1=5, nsteps2=5)
print("class-computed distance:", distance)
qdistance_ = qdistance(samples[1],samples[3],Cg.qnet,Cg.qnet)
print("actual:{}".format(qdistance_))

class-computed distance: 0.11440414114623486
actual:0.11418596000727711


In [6]:
# produce stats on how many column names actually match
stats = Cg.set_poles(POLEFILE,"R","L",steps=120, VERBOSE=True)

# compute polar distance matrix
dmatrix = Cg.polar_separation(nsteps=0)

4 pole features not found in sample features


In [7]:
Cg.samples

Unnamed: 0,wrkstat,HRS1,HRS2,evwork,wrkslf,wrkgovt,OCC10,PRESTG10,PRESTG105PLUS,INDUS10,...,neisafe,rlooks,rgroomed,rweight,rhlthend,wtss,wtssnr,wtssall,vstrat,vpsu
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1779,,,,,,,,,,,...,,,,,,,,,,
1780,,,,,,,,,,,...,,,,,,,,,,
1781,,,,,,,,,,,...,,,,,,,,,,
1782,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# distance calculation for individual samples    
# we have a nsteps parameter (for sample 1 and sample2)
# which qsamples the sample1 and sample2 if set before
# computing distance. Note qsampling must only 
# change mutable varaibles, so need to compute base-freq
distance = Cg.distance(samples[1],samples[3],nsteps1=5, nsteps2=5)
print("class-computed distance:", distance)
qdistance_ = qdistance(samples[1],samples[3],Cg.qnet,Cg.qnet)
print("actual:{}".format(qdistance_))

In [8]:
# distance calculation for individual samples after setting poles
print("distance calculations")
distance = Cg.distance(Cg.samples.fillna('').values.astype(str)[3],Cg.samples.iloc[5].values.astype(str),nsteps1=0, nsteps2=0)
print("class-computed distance:", distance)
qdistance_ = qdistance(samples[3],samples[5],Cg.qnet,Cg.qnet)
print("actual:{}".format(qdistance_))

distance calculations
class-computed distance: 0.019529232334225974
actual:0.10533663368244263


In [10]:
#------------------
# the following are for single samples

# dissonance
dissonance_array = Cg.dissonance(1)
print("dissonance:", dissonance_array)

#ideology
Cg.num_qsamples = 5
ideology_index = Cg.ideology(3,pole_1="R",pole_2="L")
print("ideology:", ideology_index)

# disperion
dispersion_ = Cg.dispersion(3)
print("Dispersion:", dispersion_)

# compute distance from each pole
array_distances = Cg.polarDistance(1)
print("distance from poles:", array_distances)

# random mask and reconstruction
returndict = {}
rederr,r_prob,rand_err,s,qs,s_rand,mask_ = Cg.randomMaskReconstruction(index=1, 
                                                                       return_dict=returndict,
                                                                       index_colname="feature_names",
                                                                       output_dir="GSS/results/recon_results/",
                                                                       file_name="recon_tmp.csv",
                                                                       save_samples=True)# sample=np.array(samples[1]))
print("reconstruction results:", rederr, r_prob, rand_err)
#-------------------

dissonance: [0. 0. 0. ... 0. 0. 0.]
ideology: [0.003099287738931686, 0.08561949144556606, 0.08523027395823583, 0.1255828822993992]
Dispersion: [0.046598937906543014, 0.12738269562063353]
distance from poles: [0.021635271857159282, 0.022334688414062873]
reconstruction results: 57.24466031539579 0.24629629629629626 0.5578646696579473


In [None]:
# the following are for arrays of samples
# multiprocessing suffices

# set sammple sizeN
Cg.set_nsamples(40)
    
# computing polar_indices makes sure that dissonance matrix only takes in polar cols
Cg.compute_polar_indices()
dissonance_array = Cg.dissonance_matrix(outfile='GSS/results/DISSONANCE_matrix.csv')
print("dissonance array:", dissonance_array[:2])
print('----------------------------------------------------------------------\n')

# random mask and reconstruction
recon_df = Cg.randomMaskReconstruction_multiple('GSS/results/randomMaskRecon_test.csv')
print("reconstruction results", recon_df[:2])
print('----------------------------------------------------------------------\n')

# ideology indices
ideology_index = Cg.compute_DLI_samples('ideology','GSS/results/ideology.csv')
print("ideology indices", ideology_index)
print('----------------------------------------------------------------------\n')

# dispersion
local_dispersion = Cg.compute_DLI_samples('dispersion', 'GSS/results/dispersion_test.csv')
print("dispersion array", local_dispersion)
print('----------------------------------------------------------------------\n')

# polar distances
polar_array = Cg.polarDistance_multiple('GSS/results/polarDistance_multiple_test.csv')
print("polar distances array",polar_array)
print('----------------------------------------------------------------------\n')

dissonance array:    spkcom  colcom  libcom  spkmil  colmil  libmil  libhomo  libmslm  gunlaw  \
0     0.0     0.0     0.0     0.0     0.0     0.0      0.0      0.0     0.0   
1     0.0     0.0     0.0     0.0     0.0     0.0      0.0      0.0     0.0   

   grass  ...  shotgun  rowngun  viruses  intmil   abpoorw  godchnge  \
0    0.0  ...      0.0      0.0      0.0     0.0  0.000000  0.853822   
1    0.0  ...      0.0      0.0      0.0     0.0  0.769949  0.916401   

   prayfreq  religcon  religint  comfort  
0  0.000000  0.000000  0.827436      0.0  
1  0.960481  0.323764  0.000000      0.0  

[2 rows x 35 columns]
----------------------------------------------------------------------

reconstruction results       rederr    r_prob  rand_err  \
0  27.094920  0.312887  0.700943   
1  21.486421  0.296487  0.671613   

                                               mask_  
0  [wrkstat, HRS1, wrkslf, wrkgovt, PRESTG10, IND...  
1  [wrkstat, wrkgovt, OCC10, PRESTG10, PRESTG105P...  
------

In [None]:
# compute qdistance matrix for small set of samples
# set nsamples first to set the number of samples to be included in matrix
Cg.MAX_PROCESSES = 2
Cg.set_nsamples(30)
distance_matrix=Cg.distfunc_multiples("GSS/results/distfunc_multiples_testing.csv")
print(distance_matrix)

In [None]:
# compute qdistance matrix for small set of samples
# set nsamples first to set the number of samples to be included in matrix
distance_matrix=Cg.distfunc_multiples("GSS/results/distfunc_multiples_testing.csv")
print("local distance matrix:", distance_matrix)

In [None]:
# write files to compute qdistance matrix for large set of samples
# execute generated shell script to run mpi parallelization on midway
Cg.dmat_filewriter("GSS/GSS_cognet.py", "GSS/data/gss_2018.joblib",
                   MPI_SETUP_FILE="GSS/GSS_mpi_setup.sh",
                   MPI_RUN_FILE="GSS/GSS_mpi_run.sh",
                   MPI_LAUNCHER_FILE="GSS/GSS_mpi_launcher.sh",
                   YEARS='2018',NODES=4,T=14)

In [None]:
## embedding
## embed generated Qdist Matrix
Cg.year = '2018'
Cg.embed('examples_results/distfunc_multiples_testing.csv', 'embed', 'examples_results/',EMBED_BINARY='cognet/cognet/bin/__embed__.so')
#pd.read_csv('examples_results/embed_E_2018.csv')
