In [1]:
# This is an example of how we see
# the package work. The functions listed here
# are probably the only ones that should be exposed, i.e. documented;
# others should be prepended with a double underscore.
#
# The cognet directory has the following "modules",
# which are separate .py files containing classes and functions.
# The modules are cognet.py, dataFormatter.py, model.py, util.py, viz.py;
# we will write viz.py later.
import sys

from quasinet.qnet import qdistance
from cognet.cognet import cognet as cg
from cognet.dataFormatter import dataFormatter
from cognet.model import model 
#import cognet.util
import pandas as pd
import numpy as np

# Year tag embedded in the data and joblib file names below.
yr = '2018'

# Input file locations, all relative to the working directory.
POLEFILE = 'examples_data/polar_vectors.csv'
QPATH = 'examples_data/gss_{}.joblib'.format(yr)
IMMUTABLE_FILE = 'examples_data/immutable.csv'
GSSDATA = 'examples_data/gss_{}.csv'.format(yr)

In [2]:
# ---------------------------------
# dataFormatter walkthrough
# Build the formatter from the sample data file.
data = dataFormatter(samples=GSSDATA)
# Authors' notes on what the formatter should offer:
#   - an option for a test/train split
#   - checks so that qnet construction does not complain later
#   - data.train() returns training data, data.test() returns test data
print(data.samples[:2])  # peek at the first two rows
features, samples = data.format_samples("train")

# Mutable and immutable variables can be set from a list ...
im_vars_df = pd.read_csv(IMMUTABLE_FILE, names=["vars"])
im_vars_list = im_vars_df["vars"].to_list()
mutable_vars, immutable_vars = data.mutable_variables(
    immutable_list=im_vars_list
)
# ... or from a file; this second call rebinds the two names assigned above.
mutable_vars, immutable_vars = data.mutable_variables(
    IMMUTABLE_FILE=IMMUTABLE_FILE
)
# -------------------------------------

            wrkstat HRS1 HRS2 evwork        wrkslf  wrkgovt OCC10 PRESTG10  \
0  temp not working    e    c    NaN  someone else  private     b        c   
1  working fulltime    c    e    NaN  someone else  private     b        d   

  PRESTG105PLUS INDUS10  ...    neisafe rlooks rgroomed rweight rhlthend wtss  \
0             c       c  ...  very safe    NaN      NaN     NaN      NaN    e   
1             d       c  ...  very safe    NaN      NaN     NaN      NaN    c   

  wtssnr wtssall vstrat vpsu  
0      e       e   3301    1  
1      c       c   3301    1  

[2 rows x 1034 columns]


In [3]:
# -------------------------------------
# Model functionality.
# Features and samples can either be passed in directly or inferred
# from a data object (the data_obj route is the one shown here).
model_ = model()

# Set to True to infer the qnet from `data` and export/save it;
# leave False to load the previously saved file at QPATH instead.
test_model_buildqnet = False
if test_model_buildqnet:
    model_.fit(data_obj=data)
    model_.export_dot("tmp_dot_modelclass.dot",
                      generate_trees=True)
    model_.save("tmp_nodelclass.joblib")
    #model_.load("tmp_nodelclass.joblib")
else:
    # QPATH is derived from `yr` in the configuration cell, so the
    # loaded file follows the configured year instead of a repeated literal.
    model_.load(QPATH)
# -------------------------------------

updating


In [4]:
# -------------------------------------
# cognet walkthrough
# Some parameters can be set when instantiating the cognet class.
# When loading from a model object there is no need to call load_data;
# otherwise load_data is required.

Cg = cg()
print(len(model_.features))
Cg.load_from_model(model_, data, "all")

# Distance between two individual samples.
# nsteps1 / nsteps2 belong to sample 1 and sample 2: when set, the
# corresponding sample is q-sampled before the distance is computed.
# Note that q-sampling must only change mutable variables, so the
# base frequency needs to be computed.
distance = Cg.distance(samples[1], samples[3], nsteps1=5, nsteps2=5)
print(distance)
# Same pair through quasinet's qdistance, printed for comparison.
qdistance_ = qdistance(samples[1], samples[3], Cg.qnet, Cg.qnet)
print("actual:{}".format(qdistance_))
#distance = Cg.distance(data.samples[0],data.samples[1])
#distance = Cg.distance(data.test[0],data.test[1])
# -------------------------------------

1034
0.05510828907752185
actual:0.05007398056944518


In [5]:
#------------------
# Set the poles from POLEFILE; the returned stats describe how many
# column names actually match.
stats = Cg.set_poles(POLEFILE, "R", "L", steps=120)

# Polar distance matrix.
dmatrix = Cg.polar_separation(nsteps=0)
#------------------

4 pole features not found in sample features


In [6]:
#------------------
# Single-sample computations
dissonance_array = Cg.dissonance(1)
returndict = dict()
(rederr, r_prob, rand_err,
 s, qs, s_rand, mask_) = Cg.randomMaskReconstruction(1, returndict)  # sample=np.array(samples[1]))

#ideology_index = Cg.compute_DLI_sample(3)
Cg.num_qsamples = 5
ideology_index = Cg.ideology(3, pole_1="R", pole_2="L")

# Dispersion of one individual sample.
Dispersion_ = Cg.dispersion(3)
print(Dispersion_)
# Distance of the sample from each pole.
array_distances = Cg.polarDistance(1, returndict)
#-------------------

-62.57824932507554 0.39649122807017545 0.6090770223489306 ['' '' '' ... '' '' ''] ['' '' '' ... '' '' ''] ['' '' '' ... '' '' ''] ['spkcom', 'libcom', 'colmil', 'libmil', 'libmslm', 'gunlaw', 'grass', 'reliten', 'pray', 'abdefect', 'abrape', 'absingle', 'abany', 'pillok', 'owngun', 'intmil', 'abpoorw', 'godchnge', 'religcon']
[0.04520903458061827, 0.12564842241405852]


In [7]:
#-------------------
# Computations over arrays of samples; multiprocessing suffices for
# every call in this cell.
# NOTE(review): count is reset to 0 immediately before the check, so the
# guard always passes and set_nsamples(10) runs each time the cell runs.
count = 0
if not count:
    Cg.set_nsamples(10)
    count = count + 1

# Computing the polar indices first makes sure the dissonance matrix
# only takes in the polar columns.
Cg.compute_polar_indices()
dissonance_array = Cg.dissonance_matrix(
    output_file='examples_results/DISSONANCE_matrix.csv'
)

dataframes, error_array = Cg.randomMaskReconstruction_multiple(
    'examples_results/randomMaskRecon_test.csv'
)

ideology_index = Cg.compute_DLI_samples(
    'ideology', 'examples_results/ideology.csv'
)

local_dispersion = Cg.compute_DLI_samples(
    'dispersion', 'examples_results/dispersion_test.csv'
)

# Distance from each pole.
array_distances = Cg.polarDistance_multiple(
    'examples_results/polarDistance_multiple_test.csv'
)
#-------------------

In [8]:
# The following must use parallelization.
# The next one must use MPI and hence will not run
# with MPI without maybe a separate script.
# But look here: https://stackoverflow.com/questions/25772289/python-multiprocessing-within-mpi
distance_matrix = Cg.distfunc_multiples(
    "examples_results/distfunc_multiples_testing.csv"
)


In [8]:
# dmat_filewriter is given a Python script name, the joblib path and the
# names of the MPI setup/run/launcher shell files
# (presumably it writes these files for the distance-matrix job — TODO confirm).
# QPATH and yr come from the configuration cell so that the path and the
# year stay in sync with the rest of the notebook instead of being retyped.
Cg.dmat_filewriter(
    "GSS_cognet.py",
    QPATH,
    MPI_SETUP_FILE="GSS_mpi_setup.sh",
    MPI_RUN_FILE="GSS_mpi_run.sh",
    MPI_LAUNCHER_FILE="GSS_mpi_launcher.sh",
    YEARS=yr,
    NODES=4,
    T=14,
)

In [14]:
## embedding
## embed generated Qdist Matrix
# Take the year from the configuration cell rather than repeating the literal.
Cg.year = yr
Cg.embed(
    'examples_results/distfunc_multiples_testing.csv',
    'embed',
    'examples_results/',
    EMBED_BINARY='cognet/cognet/bin/__embed__.so',
)
#pd.read_csv('examples_results/embed_E_2018.csv')
