In [18]:
# This is an example of how we see 
# the package work. The functions listed here
# are probably the only ones that should be exposed, ie documented.
# others should br prepended with a double underscore
#  
# The cognet directory has the following "modules"
# which are seprate .py files containing clases and functions
# The modules are cognet.py, dataFormatter.py, model.py, util.py, viz.py
# we will write the viz.py later.
import sys

from quasinet.qnet import qdistance
from cognet.cognet import cognet as cg
from cognet.dataFormatter import dataFormatter
from cognet.model import model 
#import cognet.util
import pandas as pd
import numpy as np

yr = '2018'
POLEFILE='GSS/data/polar_vectors.csv'
QPATH='GSS/data/gss_'+yr+'.joblib'
IMMUTABLE_FILE='GSS/data/immutable.csv'
GSSDATA = 'GSS/data/gss_'+yr+'.csv'

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cognet.util import assert_None, assert_array_dimension
class dataFormatter1:
    """format data to be suitable for Qnet training and testing
    """

    def __init__(self,
                 samples):
        """init

        Args:
            samples ([str], optional): 2D array with rows as observations and columns as features.
        """
        self.samples = pd.read_csv(samples)
        self.features = {}
        self.nan_cols = []
        self.immutable_vars = None
        self.mutable_vars = None
        self.test_size = None
        self.random_state = None
        self.train_data = None
        self.test_data = None

    def __train_test_split(self,
                           test_size,
                           train_size=None,
                           random_state=None):
        """split the samples into training and testing samples

        Args:
          test_size (float): fraction of sample to take as test_size.
          train_size (float): fraction of sample to take as train_size. Defaults to None, and 1-test_size
          random_state (int, optional): random seed to split samples dataset . Defaults to None.
        """
        self.test_size = test_size
        self.random_state = random_state
        self.train_data, self.test_data = train_test_split(self.samples,
                                                           test_size=test_size,
                                                           train_size=train_size,
                                                           random_state=random_state)
    
    def Qnet_formatter(self,
                         samples=None,
                         key=None):
        """format data for Qnet input

        Args:
          samples ([str], optional): 2D array with rows as observations and columns as features.
          key (str): Either 'train' or 'test' key, to determine which set of features
        
        Returns:
            features and samples of either the train and test dataset
        """
        # if not isinstance(samples, np.ndarray):
        #     raise ValueError('Samples must be in numpy array form!')
        if samples is None:
            samples = self.samples
        features = np.array(samples.columns.astype(str)[:])
        samples = samples.replace("nan","").fillna("").values.astype(str)[:]
        # remove columns that are all NaNs
        not_all_nan_cols = ~np.all(samples == '', axis=0)
        self.nan_cols = np.all(samples == '', axis=0)

        samples = samples[:, not_all_nan_cols]
        
        features = features[not_all_nan_cols]
        features = list(features)
        if key is not None:
            self.features[key] = features
        return features, samples

    def format_samples(self,
                       key,
                       test_size=.5):
        """formats samples and featurenames, either all, train, or test
        
        Args:
          key (str): 'all', 'train', or 'test', corresponding to sample type

        Returns:
            samples and featurenames: formatted
        """
        
        
        if all(x is None for x in [self.train_data,
                                       self.test_data,
                                       self.samples]):
            raise ValueError("Split samples into test and train datasets or input samples first!")
        if key == 'train':
            self.__train_test_split(1-test_size)
            samples = self.train_data
        elif key == 'test':
            self.__train_test_split(test_size)
            samples = self.test_data
        elif key == 'all':
            samples = self.samples
        else:
            raise ValueError("Invalid key, key must be either 'all', 'test', or 'train")
        
        return self.Qnet_formatter(samples, key=key)
    
    def __set_varcase(self,
                      lower,
                      key='train',
                      vars=None):
        """set the features to all upper or lowercase

        Args:
          lower (bool): If true, set vars to lowercase, else to uppercase
          key (str, optional): Whether to set train or test features. Defaults to 'train'.
          vars ([str]): Mutable and immutable vars/features. Defaults to None.

        Returns:
          features, vars: formatted to either upper or lower case
        """
        if lower:
            features = [x.lower() for x in self.features[key]]
            if var is not None:
                vars = [x.lower() for x in vars]
        else:
            features = [x.upper() for x in self.features[key]]
            if vars is not None:
                vars = [x.upper() for x in vars]
        return features, vars

    def __interpretvars(self,
                        lower,
                        IMMUTABLE,
                        FILE=None,
                        LIST=None):
        """read in vars from file and set mutable, immutable

        Args:
          lower (bool): Whether to set variables to lowercase (True) or not (False)
          IMMUTABLE (book): IMMUTABLE if True, MUTABLE otherwise
          FILE (str, optional): file with vars in singular column. Defaults to None.
          LIST ([str], optional): 1D array of vars. Defaults to None.
          
        Returns:
          mutable vars, immutable vars: list
        """
        if IMMUTABLE:
            immutable_vars = np.array(LIST)
            if FILE is not None:
                immutable_vars = pd.read_csv(FILE,index_col=0).transpose()
            #assert_array_dimension(immutable_vars, 1)
            features, immutable_vars = self.__set_varcase(lower,
                                                          vars=immutable_vars)
            mutable_vars = [x for x in features
                            if x not in immutable_vars]
            immutable_vars = [x for x in immutable_vars
                              if x in features]
            invalid_vars = [x for x in immutable_vars
                            if x not in features]
        else:
            mutable_vars = LIST
            if FILE is not None:
                mutable_vars = pd.read_csv(FILE,index_col=0).transpose()
            #assert_array_dimension(mutable_vars, 1)
            features, mutable_vars = self.__set_varcase(lower,
                                                        vars=mutable_vars)
            immutable_vars = [x for x in features
                              if x not in mutable_vars]
            mutable_vars = [x for x in mutable_vars
                            if x in features]
            invalid_vars = [x for x in mutable_vars
                            if x not in features]
        if len(invalid_vars) != 0:
            print("{} vars not found".format(len(invalid_vars)))
            print("vars not found:{}".format(invalid_vars))
        return mutable_vars, immutable_vars

    def mutable_variables(self,
                immutable_list=None,
                IMMUTABLE_FILE=None,
                mutable_list=None,
                MUTABLE_FILE=None,
                lower=False):
        """set variables to be mutable or immutable

        Args:
          immutable_list (list)): 1D array of immutable variables. Defaults to None.
          IMMUTABLE_FILE (str, optional): file with immutable vars in singular column. Defaults to None.
          mutable_list (list, optional): 1D array of immutable variables. Defaults to None.
          MUTABLE_FILE (str, optional): file with mutable vars in singular column. Defaults to None.
          
        Returns:
          mutable_vars, immutable_vars: list
        """
        list_None = assert_None([immutable_list,mutable_list], raise_error=False)
        file_None = assert_None([IMMUTABLE_FILE,MUTABLE_FILE], raise_error=False)
        num_None = assert_None([immutable_list,mutable_list,
                                IMMUTABLE_FILE,MUTABLE_FILE], raise_error=False)
        if list_None == 0 or file_None == 0:
            raise ValueError("Only input either IMMUTABLE or MUTABLE vars, not both!")
        elif num_None == 4:
            raise ValueError("Too few inputs! One argument needed")
        elif num_None != 3:
            raise ValueError("Too many inputs! Only one argument needed")
        else:
            if IMMUTABLE_FILE is not None:
                mutable_vars, immutable_vars = self.__interpretvars(lower,
                                                                    IMMUTABLE=True,
                                                                    FILE=IMMUTABLE_FILE)
            elif MUTABLE_FILE is not None:
                mutable_vars, immutable_vars = self.__interpretvars(lower,
                                                                    IMMUTABLE=False,
                                                                    FILE=MUTABLE_FILE)
            elif immutable_list is not None:
                mutable_vars, immutable_vars = self.__interpretvars(lower,
                                                                    IMMUTABLE=True,
                                                                    LIST=immutable_list)
            elif mutable_list is not None:
                mutable_vars, immutable_vars = self.__interpretvars(lower,
                                                                    IMMUTABLE=False,
                                                                    LIST=mutable_list)
        self.mutable_vars, self.immutable_vars = mutable_vars, immutable_vars
        return mutable_vars, immutable_vars            

In [38]:
# testing dataFormatter
data = dataFormatter1(samples=GSSDATA)
# load the sample data
# have option for test/train split
# make checks to ensure we will not throw errors at qnet construction 
print(data.samples[:2])
features,samples = data.format_samples('train') # default trains and tests using half
all_samples = True
if all_samples: # use all samples to train, instead of half
    features,samples = data.Qnet_formatter()

# format data for Qnet training and fitting
print(samples.shape)

# set mutable and immutable vars either from list or file
im_vars_df = pd.read_csv(IMMUTABLE_FILE, names=['vars'])
im_vars_list = im_vars_df.vars.to_list()
mutable_vars, immutable_vars = data.mutable_variables(immutable_list=im_vars_list)
mutable_vars, immutable_vars = data.mutable_variables(IMMUTABLE_FILE=IMMUTABLE_FILE)

            wrkstat HRS1 HRS2 evwork        wrkslf  wrkgovt OCC10 PRESTG10  \
0  temp not working    e    c    NaN  someone else  private     b        c   
1  working fulltime    c    e    NaN  someone else  private     b        d   

  PRESTG105PLUS INDUS10  ...    neisafe rlooks rgroomed rweight rhlthend wtss  \
0             c       c  ...  very safe    NaN      NaN     NaN      NaN    e   
1             d       c  ...  very safe    NaN      NaN     NaN      NaN    c   

  wtssnr wtssall vstrat vpsu  
0      e       e   3301    1  
1      c       c   3301    1  

[2 rows x 1034 columns]
(1784, 1034)


In [40]:
samples[0][5:20]

array(['private', 'b', 'c', 'c', 'c', 'never married', '', '', '', '',
       'e', 'e', '', '', ''], dtype='<U113')

In [11]:
# testing model functionality
# can either input features and samples directly, or infer from data obj
model_ = model()

# qnet construction parameters, 
# choose to either load or fit qnet from scratch
# and to either load from url or local repo
test_model_buildqnet = True
url_load = False
if test_model_buildqnet:
        print("fitting")
        model_.fit(data_obj=data,
                   min_samples_split=2,
                   alpha=0.05,
                   max_depth=-1,
                   max_feats=-1,
                   early_stopping=False,
                   verbose=0,
                   random_state=None,
                   njobs=2)
        print("fitted")
        #model_.export_dot("GSS/results/tmp_dot_modelclass.dot",
        #                generate_trees=True)
        #model_.save("GSS/results/tmp_nodelclass.joblib")
        #model_.load("tmp_nodelclass.joblib")
else:
    if url_load:
        QNETFILE = 'https://zenodo.org/record/5781768/files/gss_2018.joblib'
    else:
        QNETFILE = 'GSS/data/gss_2018.joblib'
    model_.load(QNETFILE)

fitting
20
training Qnet -------------
Qnet trained --------------
fitted


In [12]:
# testing cognet
# set some paramaters in instantiating cognet class 
# if loading from model obj, no need to use load_data func, otherwise, load_data
Cg = cg()
print(len(model_.features))
Cg.load_from_model(model_, data, 'all')

1035


In [5]:
# distance calculation for individual samples    
# we have a nsteps parameter (for sample 1 and sample2)
# which qsamples the sample1 and sample2 if set before
# computing distance. Note qsampling must only 
# change mutable varaibles, so need to compute base-freq
distance = Cg.distance(samples[1],samples[3],nsteps1=5, nsteps2=5)
print("class-computed distance:", distance)
qdistance_ = qdistance(samples[1],samples[3],Cg.qnet,Cg.qnet)
print("actual:{}".format(qdistance_))

class-computed distance: 0.11381436810159826
actual:0.11418596000727718


In [53]:
# produce stats on how many column names actually match
stats = Cg.set_poles(POLEFILE,"L","R", steps=25, VERBOSE=True, restrict=True) # steps=120

# compute polar distance matrix
dmatrix = Cg.polar_separation(nsteps=0)
dmatrix

4 pole features not found in sample features


array([[0.        , 0.03291745],
       [0.03291745, 0.        ]])

In [54]:
Cg.samples

Unnamed: 0,wrkstat,HRS1,HRS2,evwork,wrkslf,wrkgovt,OCC10,PRESTG10,PRESTG105PLUS,INDUS10,...,neisafe,rlooks,rgroomed,rweight,rhlthend,wtss,wtssnr,wtssall,vstrat,vpsu
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1779,,,,,,,,,,,...,,,,,,,,,,
1780,,,,,,,,,,,...,,,,,,,,,,
1781,,,,,,,,,,,...,,,,,,,,,,
1782,,,,,,,,,,,...,,,,,,,,,,


In [35]:
# distance calculation for individual samples after setting poles
print("distance calculations")
distance = Cg.distance(Cg.samples.fillna('').values.astype(str)[3],Cg.samples.iloc[5].values.astype(str),nsteps1=0, nsteps2=0)
print("class-computed distance:", distance)
qdistance_ = qdistance(samples[3],samples[5],Cg.qnet,Cg.qnet)
print("actual:{}".format(qdistance_))

distance calculations
class-computed distance: 0.019529232334225977
actual:0.10533663368244232


In [36]:
#------------------
# the following are for single samples

# dissonance
dissonance_array = Cg.dissonance(1)
print("dissonance:", dissonance_array)

#ideology
ideology_index = Cg.ideology(4,pole_1="R",pole_2="L")
print("ideology:", ideology_index)

# disperion
Cg.num_qsamples = 5
dispersion_ = Cg.dispersion(3)
print("Dispersion:", dispersion_)

# compute distance from each pole
array_distances = Cg.polarDistance(1)
print("distance from poles:", array_distances)

# random mask and reconstruction
returndict = {}
rederr,r_prob,rand_err,s,qs,s_rand,mask_ = Cg.randomMaskReconstruction(index=1, 
                                                                       return_dict=returndict,
                                                                       index_colname="feature_names",
                                                                       output_dir="GSS/results/recon_results/",
                                                                       file_name="recon_tmp.csv",
                                                                       save_samples=True)# sample=np.array(samples[1]))
print("reconstruction results:", rederr, r_prob, rand_err)
#-------------------

dissonance: [0. 0. 0. ... 0. 0. 0.]
ideology: [0.09588734152684872, 0.04156695908794076, 0.035403060899776705, 0.06428271020985754]
Dispersion: [0.04560526637909075, 0.12550698664640406]
distance from poles: [0.021635271857159282, 0.022334688414062866]
reconstruction results: 56.841097308886056 0.29393939393939394 0.5370033765577625


In [10]:
# the following are for arrays of samples
# multiprocessing suffices

# set sammple sizeN
Cg.set_nsamples(10)
Cg.MAX_PROCESSES = 2
# computing polar_indices makes sure that dissonance matrix only takes in polar cols
Cg.compute_polar_indices()
dissonance_array = Cg.dissonance_matrix(outfile='GSS/results/DISSONANCE_matrix.csv')
print("dissonance array:", dissonance_array[:2])
print('----------------------------------------------------------------------\n')

# random mask and reconstruction
recon_df = Cg.randomMaskReconstruction_multiple('GSS/results/randomMaskRecon_test.csv')
print("reconstruction results", recon_df[:2])
print('----------------------------------------------------------------------\n')

# ideology indices
ideology_index = Cg.compute_DLI_samples('ideology','GSS/results/ideology.csv')
print("ideology indices", ideology_index)
print('----------------------------------------------------------------------\n')

# dispersion
local_dispersion = Cg.compute_DLI_samples('dispersion', 'GSS/results/dispersion_test.csv')
print("dispersion array", local_dispersion)
print('----------------------------------------------------------------------\n')

# polar distances
polar_array = Cg.polarDistance_multiple('GSS/results/polarDistance_multiple_test.csv')
print("polar distances array",polar_array)
print('----------------------------------------------------------------------\n')

Number of Processes 2 has been set using class parameter
dissonance array:    spkcom  colcom  libcom  spkmil  colmil  libmil  libhomo  libmslm  gunlaw  \
0     0.0     0.0     0.0     0.0     0.0     0.0      0.0      0.0     0.0   
1     0.0     0.0     0.0     0.0     0.0     0.0      0.0      0.0     0.0   

   grass  ...  shotgun  rowngun  viruses  intmil   abpoorw  godchnge  \
0    0.0  ...      0.0      0.0      0.0     0.0  0.000000  0.853822   
1    0.0  ...      0.0      0.0      0.0     0.0  0.769949  0.916401   

   prayfreq  religcon  religint  comfort  
0  0.000000  0.000000  0.827436      0.0  
1  0.960481  0.323764  0.000000      0.0  

[2 rows x 35 columns]
----------------------------------------------------------------------

Number of Processes 2 has been set using class parameter
reconstruction results       rederr    r_prob  rand_err  \
0  27.801338  0.310224  0.669250   
1  34.239480  0.296225  0.672663   

                                               mask_  
0 

In [55]:
Cg.MAX_PROCESSES = 2

# ideology indices
ideology_index = Cg.compute_DLI_samples('ideology','GSS/results/ideology.csv', 
                        num_qsamples=20,
                        steps=25,
                        n_jobs=2,
                        pole_1=0,
                        pole_2=1,
                        processes=2)
print("ideology indices", ideology_index)
print('----------------------------------------------------------------------\n')

Number of Processes 2 has been set using class parameter
ideology indices       ideology        dR        dL        d0
0    -0.280151  0.030485  0.049533  0.067991
1    -0.141590  0.037044  0.046671  0.067991
2    -0.272653  0.031553  0.050091  0.067991
3    -0.169343  0.034574  0.046088  0.067991
4    -0.193166  0.033283  0.046417  0.067991
...        ...       ...       ...       ...
1779 -0.274000  0.032207  0.050837  0.067991
1780 -0.131350  0.036454  0.045384  0.067991
1781  0.073439  0.041729  0.036735  0.067991
1782 -0.320890  0.030146  0.051964  0.067991
1783 -0.327382  0.030358  0.052618  0.067991

[1784 rows x 4 columns]
----------------------------------------------------------------------



In [11]:
# compute qdistance matrix for small set of samples
# set nsamples first to set the number of samples to be included in matrix
Cg.MAX_PROCESSES = 2
Cg.set_nsamples(30)
distance_matrix=Cg.distfunc_multiples("GSS/results/distfunc_multiples_testing.csv")
print(distance_matrix)

Number of Processes 2 has been set using class parameter
          0         1         2         3         4         5         6   \
0   0.000000  0.091232  0.134658  0.113836  0.086175  0.120154  0.079885   
1   0.091232  0.000000  0.108381  0.114186  0.079264  0.085646  0.086029   
2   0.134658  0.108381  0.000000  0.129788  0.120588  0.076976  0.130525   
3   0.113836  0.114186  0.129788  0.000000  0.109474  0.105337  0.127201   
4   0.086175  0.079264  0.120588  0.109474  0.000000  0.101664  0.093314   
5   0.120154  0.085646  0.076976  0.105337  0.101664  0.000000  0.115825   
6   0.079885  0.086029  0.130525  0.127201  0.093314  0.115825  0.000000   
7   0.079909  0.102618  0.145231  0.117701  0.075187  0.126603  0.094005   
8   0.126323  0.098386  0.111161  0.093618  0.124169  0.094520  0.115584   
9   0.096016  0.092799  0.139336  0.130243  0.066111  0.124740  0.110569   
10  0.129384  0.100322  0.089773  0.110758  0.102230  0.081354  0.124399   
11  0.103177  0.106363  0.13811

In [12]:
# compute qdistance matrix for small set of samples different from qnet samples
samples = Cg.samples.iloc[:10]
Cg.distfunc_multiples("GSS/results/distfunc_multiples_samplestesting.csv", samples=samples)

Number of Processes 2 has been set using class parameter


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.091232,0.134658,0.113836,0.086175,0.120154,0.079885,0.079909,0.126323,0.096016
1,0.091232,0.0,0.108381,0.114186,0.079264,0.085646,0.086029,0.102618,0.098386,0.092799
2,0.134658,0.108381,0.0,0.129788,0.120588,0.076976,0.130525,0.145231,0.111161,0.139336
3,0.113836,0.114186,0.129788,0.0,0.109474,0.105337,0.127201,0.117701,0.093618,0.130243
4,0.086175,0.079264,0.120588,0.109474,0.0,0.101664,0.093314,0.075187,0.124169,0.066111
5,0.120154,0.085646,0.076976,0.105337,0.101664,0.0,0.115825,0.126603,0.09452,0.12474
6,0.079885,0.086029,0.130525,0.127201,0.093314,0.115825,0.0,0.094005,0.115584,0.110569
7,0.079909,0.102618,0.145231,0.117701,0.075187,0.126603,0.094005,0.0,0.134604,0.067742
8,0.126323,0.098386,0.111161,0.093618,0.124169,0.09452,0.115584,0.134604,0.0,0.136706
9,0.096016,0.092799,0.139336,0.130243,0.066111,0.12474,0.110569,0.067742,0.136706,0.0


In [13]:
# compute qdistance matrix for small set of samples
# set nsamples first to set the number of samples to be included in matrix
distance_matrix=Cg.distfunc_multiples("GSS/results/distfunc_multiples_testing.csv")
print("local distance matrix:", distance_matrix)

Number of Processes 2 has been set using class parameter
local distance matrix:           0         1         2         3         4         5         6   \
0   0.000000  0.091232  0.134658  0.113836  0.086175  0.120154  0.079885   
1   0.091232  0.000000  0.108381  0.114186  0.079264  0.085646  0.086029   
2   0.134658  0.108381  0.000000  0.129788  0.120588  0.076976  0.130525   
3   0.113836  0.114186  0.129788  0.000000  0.109474  0.105337  0.127201   
4   0.086175  0.079264  0.120588  0.109474  0.000000  0.101664  0.093314   
5   0.120154  0.085646  0.076976  0.105337  0.101664  0.000000  0.115825   
6   0.079885  0.086029  0.130525  0.127201  0.093314  0.115825  0.000000   
7   0.079909  0.102618  0.145231  0.117701  0.075187  0.126603  0.094005   
8   0.126323  0.098386  0.111161  0.093618  0.124169  0.094520  0.115584   
9   0.096016  0.092799  0.139336  0.130243  0.066111  0.124740  0.110569   
10  0.129384  0.100322  0.089773  0.110758  0.102230  0.081354  0.124399   
11  0.10

In [5]:
# write files to compute qdistance matrix for large set of samples
# execute generated shell script to run mpi parallelization on midway
Cg.dmat_filewriter("GSS/GSS_cognet.py",
                   MPI_SETUP_FILE="GSS_mpi_setup.sh",
                   MPI_RUN_FILE="GSS_mpi_run.sh",
                   MPI_LAUNCHER_FILE="GSS_mpi_launcher.sh",
                   YEARS='2018',NODES=4,T=14)

In [None]:
## embedding
## embed generated Qdist Matrix
Cg.year = '2018'
Cg.embed('examples_results/distfunc_multiples_testing.csv', 'embed', 'examples_results/',EMBED_BINARY='cognet/cognet/bin/__embed__.so')
#pd.read_csv('examples_results/embed_E_2018.csv')
