In [1]:
# This is an example of how we see
# the package work. The functions listed here
# are probably the only ones that should be exposed, i.e. documented;
# others should be prepended with a double underscore.
#
# The cognet directory has the following "modules",
# which are separate .py files containing classes and functions.
# The modules are cognet.py, dataFormatter.py, model.py, util.py, viz.py;
# we will write viz.py later.
import sys

from quasinet.qnet import qdistance
from cognet.cognet import cognet as cg
from cognet.dataFormatter import dataFormatter
from cognet.model import model 
#import cognet.util
import pandas as pd
import numpy as np

# Survey year; the year-specific paths below (QPATH, GSSDATA) are built from it.
yr = '2018'
POLEFILE = 'examples_data/polar_vectors.csv'
QPATH = f'examples_data/gss_{yr}.joblib'
IMMUTABLE_FILE = 'examples_data/immutable.csv'
GSSDATA = f'examples_data/gss_{yr}.csv'

In [2]:
# --- dataFormatter walkthrough ---
# Build the formatter from the GSSDATA path and print a two-row slice of
# the loaded samples.
# (design notes kept from the original: have an option for test/train split;
#  make checks so that qnet construction will not throw errors)
data = dataFormatter(samples=GSSDATA)
print(data.samples[:2])

# Request the 'train' formatting and report the resulting shape
# (original note: default trains and tests using half).
features, samples = data.format_samples('train')
print(samples.shape)

# Format the data for Qnet training and fitting.
data.Qnet_formatter()

# Mutable / immutable variables can be supplied as a list ...
im_vars_df = pd.read_csv(IMMUTABLE_FILE, names=['vars'])
im_vars_list = im_vars_df['vars'].to_list()
mutable_vars, immutable_vars = data.mutable_variables(immutable_list=im_vars_list)
# ... or as a file path; this second call rebinds the same two names.
mutable_vars, immutable_vars = data.mutable_variables(IMMUTABLE_FILE=IMMUTABLE_FILE)

            wrkstat HRS1 HRS2 evwork        wrkslf  wrkgovt OCC10 PRESTG10  \
0  temp not working    e    c    NaN  someone else  private     b        c   
1  working fulltime    c    e    NaN  someone else  private     b        d   

  PRESTG105PLUS INDUS10  ...    neisafe rlooks rgroomed rweight rhlthend wtss  \
0             c       c  ...  very safe    NaN      NaN     NaN      NaN    e   
1             d       c  ...  very safe    NaN      NaN     NaN      NaN    c   

  wtssnr wtssall vstrat vpsu  
0      e       e   3301    1  
1      c       c   3301    1  

[2 rows x 1034 columns]
(892, 1034)


In [3]:
# testing model functionality
# can either input features and samples directly, or infer from data obj
model_ = model()

# qnet construction parameters:
# set test_model_buildqnet to True to fit a qnet from scratch,
# or leave it False to load the previously saved one
test_model_buildqnet = False
if test_model_buildqnet:
    print("fitting")
    model_.fit(data_obj=data,
               min_samples_split=2,
               alpha=0.05,
               max_depth=-1,
               max_feats=-1,
               early_stopping=False,
               verbose=0,
               random_state=None,
               njobs=8)
    print("fitted")
    model_.export_dot("tmp_dot_modelclass.dot",
                      generate_trees=True)
    model_.save("tmp_nodelclass.joblib")
    #model_.load("tmp_nodelclass.joblib")
else:
    # QPATH is defined with the other path constants at the top of the
    # notebook and evaluates to the same path that used to be repeated
    # here as a literal, so the year only has to be changed in one place.
    model_.load(QPATH)

In [4]:
# Run the Qnet formatter again, this time keeping what it returns,
# and display the shape of the returned samples as the cell result.
featurenames, samples = data.Qnet_formatter()
samples.shape

(1784, 1034)

In [19]:
# testing cognet
# set some parameters in instantiating cognet class
# if loading from model obj, no need to use load_data func, otherwise, load_data
#
# The class is imported as `cg` (`from cognet.cognet import cognet as cg`);
# that import never binds the bare name `cognet`, so calling `cognet()`
# raises NameError on a fresh kernel. Use the alias instead.
Cg = cg()
print(len(model_.features))
Cg.load_from_model(model_, data, 'all')
# produce stats on how many column names actually match
stats = Cg.set_poles(POLEFILE, "R", "L", steps=120, VERBOSE=True)

# compute polar distance matrix
dmatrix = Cg.polar_separation(nsteps=0)

1034
0 pole features not found in sample features


In [20]:
# random mask and reconstruction for a single sample (index 4)
returndict = {}
rederr, r_prob, rand_err, sample, qsampled, random_sample, mask_ = Cg.randomMaskReconstruction(
    index=4, return_dict=returndict)  # sample=np.array(samples[1]))
print("reconstruction results:", rederr, r_prob, rand_err, sample, random_sample, mask_)

# Limit the multi-sample computations below to 10 samples.
# The former `count = 0` / `if count == 0:` guard was always true because
# `count` was reset immediately before the test, so the call is made directly.
Cg.set_nsamples(10)

# random mask and reconstruction over the selected samples
recon_df = Cg.randomMaskReconstruction_multiple('examples_results/randomMaskRecon_test.csv')
print("reconstruction results", recon_df[:2])

reconstruction results: 66.13180499038299 0.2944444444444445 0.4652875671546094 ['' '' '' ... '' '' ''] ['' '' '' ... '' '' ''] ['pray', 'prayer', 'bible', 'conlabor', 'pillok', 'viruses', 'abpoorw', 'religcon', 'religint']
reconstruction results           0         2         3         1         4         6         7  \
0   18.8008    24.097    18.248    19.171   17.1936   21.9266   25.8351   
1  0.304011  0.291859  0.302863  0.299534  0.297779  0.299763  0.301303   

          8         5         9  
0   9.22127   17.8168   19.6548  
1  0.298393  0.298786  0.288187  


In [19]:
from quasinet.qnet import Qnet
import time

# Time a Qnet fit on the first `nsample` samples for n_jobs = 1 .. 9.
# Slow: in the recorded run the first two fits took about 1043 s and 604 s.
nsample = 10
for n_workers in range(1, 10):
    start = time.time()
    print(start)
    Qnet_ = Qnet(n_jobs=n_workers, feature_names=featurenames)
    print("fitting")

    test_samples = samples[:nsample]
    print(test_samples)
    Qnet_.fit(test_samples)

    end = time.time()
    print(end)
    total_time = end - start
    print("njobs: ", n_workers, " | elapsed time: ", total_time)


1638657521.265449
fitting
[['temp not working' 'e' 'c' ... 'e' '3301' '1']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '1']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '1']
 ...
 ['retired' 'e' 'e' ... 'c' '3301' '2']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '2']
 ['keeping house' 'e' 'e' ... 'c' '3301' '2']]
1638658564.3172483
njobs:  1  | elapsed time:  1043.0517992973328
1638658564.3179076
fitting
[['temp not working' 'e' 'c' ... 'e' '3301' '1']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '1']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '1']
 ...
 ['retired' 'e' 'e' ... 'c' '3301' '2']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '2']
 ['keeping house' 'e' 'e' ... 'c' '3301' '2']]
1638659168.0808773
njobs:  2  | elapsed time:  603.7629697322845
1638659168.0825222
fitting
[['temp not working' 'e' 'c' ... 'e' '3301' '1']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '1']
 ['working fulltime' 'c' 'e' ... 'c' '3301' '1']
 ...
 ['retired' 'e' 'e' ... 'c' '3301' '2']
 ['working fulltime' 

KeyboardInterrupt: 

In [12]:
from quasinet.qnet import Qnet
import time

# Time a Qnet fit with n_jobs=6 for sample counts of 1, 10 and 100.
for nsample in (1, 10, 100):
    start = time.time()
    print(start)

    Qnet_ = Qnet(n_jobs=6, feature_names=featurenames)
    print("fitting")
    test_samples = samples[:nsample]
    Qnet_.fit(test_samples)

    end = time.time()
    print(end)
    total_time = end - start
    print("nsamples: ", nsample, " | elapsed time: ", total_time)


1638660651.9638221
fitting
1638660690.0304549
nsamples:  1  | elapsed time:  38.06663274765015
1638660690.0307827
fitting
1638660899.1668923
nsamples:  10  | elapsed time:  209.1361095905304
1638660899.1671965
fitting
1638661693.904029
nsamples:  100  | elapsed time:  794.7368323802948


In [6]:
# --- cognet walkthrough ---
# Instantiate the cognet class (imported as `cg`) and populate it from the
# model object. Per the original note, load_data is only needed when the
# instance is NOT loaded from a model obj.
Cg = cg()
n_model_features = len(model_.features)
print(n_model_features)
Cg.load_from_model(model_, data, 'all')

1034


In [7]:
# Distance between two individual samples.
# The nsteps parameters (one each for sample 1 and sample 2) qsample the
# corresponding sample, if set, before the distance is computed.
# Note qsampling must only change mutable variables, so base-freq
# needs to be computed.
distance = Cg.distance(samples[1], samples[3], nsteps1=5, nsteps2=5)
print("class-computed distance:", distance)

# Reference value from calling quasinet's qdistance directly on the same pair.
qdistance_ = qdistance(samples[1], samples[3], Cg.qnet, Cg.qnet)
print(f"actual:{qdistance_}")

class-computed distance: 0.09605605005536405
actual:0.09726189681525353


In [8]:
# Set the poles from POLEFILE ("R" and "L") and collect stats on how many
# column names actually match.
stats = Cg.set_poles(POLEFILE, "R", "L", steps=120, VERBOSE=True)

# Polar distance matrix.
dmatrix = Cg.polar_separation(nsteps=0)

0 pole features not found in sample features


In [22]:
#------------------
# Single-sample computations

# dissonance
dissonance_array = Cg.dissonance(1)
print("dissonance:", dissonance_array)

# random mask and reconstruction
returndict = {}
recon_result = Cg.randomMaskReconstruction(1, returndict)  # sample=np.array(samples[1]))
rederr, r_prob, rand_err, s, qs, s_rand, mask_ = recon_result
print("reconstruction results:", rederr, r_prob, rand_err)

# ideology (num_qsamples is set on the instance before the call)
Cg.num_qsamples = 5
ideology_index = Cg.ideology(3, pole_1="R", pole_2="L")
print("ideology:", ideology_index)

# dispersion
dispersion_ = Cg.dispersion(3)
print("Dispersion:", dispersion_)

# distance from each pole
array_distances = Cg.polarDistance(1, returndict)
print("distance from poles:", array_distances)
#-------------------

dissonance: [0. 0. 0. ... 0. 0. 0.]
reconstruction results: 60.041315105813254 0.34206349206349207 0.6079032691205803
ideology: [0.13563834025518842, 0.09057258009984209, 0.07426032641060697, 0.12026285236567653]
Dispersion: [0.04480500841639735, 0.1194594397221057]
distance from poles: [0.021635271857159282, 0.022334688414062866]


In [9]:
# the following are for arrays of samples
# multiprocessing suffices

# set sample size to 10
# The former `count = 0` / `if count == 0:` guard was always true because
# `count` was reset immediately before the test, so the call is made directly.
Cg.set_nsamples(10)

# computing polar_indices makes sure that dissonance matrix only takes in polar cols
Cg.compute_polar_indices()
dissonance_array = Cg.dissonance_matrix(output_file='examples_results/DISSONANCE_matrix.csv')
print("dissonance array:", dissonance_array[:2])
print('----------------------------------------------------------------------\n')

# random mask and reconstruction
recon_df = Cg.randomMaskReconstruction_multiple('examples_results/randomMaskRecon_test.csv')
print("reconstruction results", recon_df[:2])
print('----------------------------------------------------------------------\n')

# ideology indices
ideology_index = Cg.compute_DLI_samples('ideology','examples_results/ideology.csv')
print("ideology indices", ideology_index)
print('----------------------------------------------------------------------\n')

# dispersion
local_dispersion = Cg.compute_DLI_samples('dispersion', 'examples_results/dispersion_test.csv')
print("dispersion array", local_dispersion)
print('----------------------------------------------------------------------\n')

# polar distances
polar_array = Cg.polarDistance_multiple('examples_results/polarDistance_multiple_test.csv')
print("polar distances array",polar_array)
print('----------------------------------------------------------------------\n')

dissonance array:      0    1    2         3    4    5    6    8         7    9
0  0.0  0.0  0.0  0.742767  0.0  0.0  0.0  0.0  0.675424  0.0
1  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.000000  0.0
----------------------------------------------------------------------

reconstruction results            1          0         2          6          3          5          4  \
0  22.131751  21.218220  6.905767  22.586894  20.704863  16.464651  14.335385   
1   0.287309   0.310591  0.300113   0.291010   0.292836   0.294539   0.304725   

           9          7          8  
0  20.479017  27.600058  18.311126  
1   0.292954   0.292608   0.299661  
----------------------------------------------------------------------

ideology indices           0         3         1         2         5         7         4  \
0  0.092942  0.075258  0.052030  0.028209  0.013110  0.113126  0.057992   
1  0.153999  0.159029  0.150432  0.165401  0.149743  0.151961  0.147609   
2  0.141856  0.149197  0.143635 

In [14]:
# compute qdistance matrix for small set of samples
# set nsamples first to set the number of samples to be included in matrix
distance_matrix = Cg.distfunc_multiples("examples_results/distfunc_multiples_testing.csv")
# (removed a stray extra closing parenthesis that made this line a SyntaxError)
print("local distance matrix:", distance_matrix)

1034
0 pole features not found in sample features
local distance matrix:           0         1         2         3         4         5         6  \
0  0.000000  0.091232  0.134658  0.113836  0.086175  0.120154  0.079885   
1  0.091232  0.000000  0.108381  0.114186  0.079264  0.085646  0.086029   
2  0.134658  0.108381  0.000000  0.129788  0.120588  0.076976  0.130525   
3  0.113836  0.114186  0.129788  0.000000  0.109474  0.105337  0.127201   
4  0.086175  0.079264  0.120588  0.109474  0.000000  0.101664  0.093314   
5  0.120154  0.085646  0.076976  0.105337  0.101664  0.000000  0.115825   
6  0.079885  0.086029  0.130525  0.127201  0.093314  0.115825  0.000000   
7  0.079909  0.102618  0.145231  0.117701  0.075187  0.126603  0.094005   
8  0.126323  0.098386  0.111161  0.093618  0.124169  0.094520  0.115584   
9  0.096016  0.092799  0.139336  0.130243  0.066111  0.124740  0.110569   

          7         8         9  
0  0.079909  0.126323  0.096016  
1  0.102618  0.098386  0.092799  

In [None]:
# write files to compute qdistance matrix for large set of samples
# execute generated shell script to run mpi parallelization on midway
#
# QPATH and yr are defined with the other constants at the top of the
# notebook and evaluate to the same values that used to be repeated here
# as literals, so changing the year there carries through to this call.
Cg.dmat_filewriter("GSS_cognet.py", QPATH,
                   MPI_SETUP_FILE="GSS_mpi_setup.sh",
                   MPI_RUN_FILE="GSS_mpi_run.sh",
                   MPI_LAUNCHER_FILE="GSS_mpi_launcher.sh",
                   YEARS=yr, NODES=4, T=14)

In [None]:
## embedding
## embed the generated Qdist matrix; the year is set on the instance first
Cg.year = '2018'
Cg.embed(
    'examples_results/distfunc_multiples_testing.csv',
    'embed',
    'examples_results/',
    EMBED_BINARY='cognet/cognet/bin/__embed__.so',
)
#pd.read_csv('examples_results/embed_E_2018.csv')
