# Hello (Preliminaries 🐈😺😹)
In this Notebook we walk through the creation of a belief network from the raw GSS dataset.

As a preliminary, make sure you actually have the raw dataset. It should be located and named as follows: 

>CLEAN\datasets\raw_data\gss7222_r4.sas7bdat 

Okay. Now, first we need to import all the functions we will need.

In [19]:
# Load the autoreload extension
%load_ext autoreload

# Set autoreload mode to 2
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Make the project's source_code folder importable from this notebook.
import os
import sys

# Resolve to an absolute path so the sys.path entry stays valid even if the
# working directory changes later in the session. The location itself is
# unchanged: two levels up from the current working directory.
source_folder = os.path.abspath(os.path.join("..", "..", "source_code"))

# Guard against adding a duplicate entry when this cell is re-run.
if source_folder not in sys.path:
    sys.path.append(source_folder)

# 1. Read in the raw dataset and cache it. 
#    Note: when we import the dataset, we automatically discard all variables that we're not interested in. Edit the function if there are variables you'd like to keep.
from loaders.import_gss import import_dataset

# 2. Clean the raw dataset and derive special variables we are interested in. 
#    This involves: 
#                       a) normalising variables between -1 and 1.
#                       b) deriving new variables from existing ones.
from loaders.clean_raw_data import clean_datasets

# 3. Calculate the belief network.
#    This involves calculating the correlation matrix of the filtered dataset.
from generators.corr_make_network import calculate_correlation_matrix
from generators.corr_make_network import CorrelationMethod, EdgeSuppressionMethod

# 4. Visualize the belief network.
#    This involves visualizing the belief network in a graph.
from visualizers.network_visualizer import generate_html_visualization
from visualizers.temporal_network_visualizer import (
    generate_temporal_html_visualization,
    test_temporal_correlation_matrix
)


### Importing the raw dataset 😺
First we will run a script that filters the dataset down to only the variables we are interested in. 

Feel free to look at the code in `import_gss.py` to see which variables are included. But keep in mind that if you want to add more variables, you'll need to manually normalise them in `clean_raw_data.py`.

In [44]:
# Load the GSS data through the project loader. Only the first returned item is
# kept; the second is discarded via `_`.
# NOTE(review): what the second item holds is not visible from this notebook — confirm in import_gss.py.
df, _ = import_dataset()

Loading dataset from from cache...
Done! ✨


### Cleaning the raw dataset 😺
Next we will run a script that cleans the dataset and derives special variables. 

This will normalise all the variables between -1 and 1, and derive some special variables like "VOTELAST_DEMREP" (this tells you which major party the respondent voted for in the previous election).

Some variables 


In [45]:
# Run the cleaning step. It is called with no arguments, so it does not receive
# `df` from the previous cell.
# NOTE(review): presumably it loads the imported data itself (the printed output
# mentions a cache) — confirm in clean_raw_data.py.
cleaned_df = clean_datasets()

Loading dataset from from cache...
Done! ✨


### Calculating the belief network 😺

Now we will run a script that calculates the belief network. This will calculate the correlation matrix of the dataset, and then use that to create a belief network.

Here we can specify the years of interest, further filter the variables of interest, specify the method of correlation, whether we want partial correlations, and how we want to suppress edges.


In [None]:
# Build the belief network for the selected survey years: Pearson correlations,
# with partial correlations enabled and edges suppressed via regularization.
corr_matrix = calculate_correlation_matrix(
    cleaned_df,
    years_of_interest=list(range(2000, 2004)),  # 2000 through 2003 inclusive
    method=CorrelationMethod.PEARSON,
    partial=True,
    edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
    suppression_params={"regularization": 0.2},
)

# The matrix columns are the variables that ended up in the network.
variable_names = corr_matrix.columns.tolist()
print(variable_names)

['PARTYID', 'NATEDUC', 'HOMOSEX', 'NATROAD', 'PORNLAW', 'POLABUSE', 'ABHLTH', 'FEPRESCH', 'HELPOTH', 'NATAID', 'XMARSEX', 'SPANKING', 'POPULAR', 'SPKCOM', 'FEFAM', 'NATCITY', 'CONEDUC', 'LIBHOMO', 'DIVLAW', 'COLMIL', 'WORKHARD', 'COLATH', 'NATCHLD', 'SUICIDE2', 'NATFARE', 'FEPOL', 'NATARMS', 'GETAHEAD', 'CONSCI', 'POLMURDR', 'ABANY', 'CONPRESS', 'NATSCI', 'RACDIF3', 'CONMEDIC', 'TEENSEX', 'NATHEAL', 'HELPBLK', 'LETDIE1', 'COLHOMO', 'NATSPAC', 'NATENVIR', 'RACDIF2', 'CONARMY', 'CONCLERG', 'NATDRUG', 'CONLABOR', 'POLESCAP', 'PRAYER', 'OBEY', 'SPKHOMO', 'POLATTAK', 'HELPNOT', 'HELPPOOR', 'POSTLIFE', 'POLVIEWS', 'SEXEDUC', 'ABRAPE', 'CONBUS', 'ABDEFECT', 'POLHITOK', 'CONFINAN', 'CAPPUN', 'LIBATH', 'CONLEGIS', 'ABNOMORE', 'FECHLD', 'SPKMIL', 'CONJUDGE', 'TRUST', 'RACDIF4', 'LIBRAC', 'FAIR', 'AFFRMACT', 'PREMARSX', 'ABPOOR', 'SUICIDE1', 'NATMASS', 'COURTS', 'CONFED', 'CONTV', 'NATPARK', 'LIBMIL', 'NATCRIME', 'COLCOM', 'GUNLAW', 'NATSOC', 'THNKSELF', 'SPKRAC', 'RACDIF1', 'GRASS', 'LIBCOM', 'H

In [42]:
# Display the correlation matrix; rows and columns are both labelled by variable name.
corr_matrix

Unnamed: 0,RACDIF3,COLATH,SPKRAC,ABPOOR,LIBRAC,PREMARSX,CONJUDGE,RACDIF1,NATARMS,FAIR,...,RELIG_None,RELIG_Other,RELIG_Buddhism,RELIG_Hinduism,RELIG_Other_eastern_religions,RELIG_Muslim,RELIG_Orthodox_christian,RELIG_Christian,RELIG_Native_american,RELIG_Inter_nondenominational
RACDIF3,0.0,0.0,0.000000,0.0,0.000000,-0.0,0.0,0.044531,-0.00000,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0
COLATH,0.0,0.0,0.000000,0.0,-0.000000,-0.0,0.0,-0.000000,-0.00000,-0.0,...,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0
SPKRAC,0.0,0.0,0.000000,0.0,-0.148121,-0.0,0.0,-0.000000,-0.00000,-0.0,...,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0
ABPOOR,0.0,0.0,0.000000,0.0,-0.000000,-0.0,0.0,0.000000,-0.00698,-0.0,...,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
LIBRAC,0.0,-0.0,-0.148121,-0.0,0.000000,0.0,-0.0,0.000000,0.00000,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RELIG_Muslim,0.0,-0.0,-0.000000,-0.0,0.000000,0.0,0.0,-0.000000,-0.00000,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0
RELIG_Orthodox_christian,-0.0,-0.0,-0.000000,0.0,0.000000,0.0,0.0,-0.000000,-0.00000,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0
RELIG_Christian,-0.0,0.0,-0.000000,-0.0,0.000000,0.0,-0.0,-0.000000,0.00000,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0
RELIG_Native_american,-0.0,-0.0,-0.000000,0.0,0.000000,0.0,0.0,0.000000,-0.00000,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0


### Plotting the belief network
With the following code, we can plot a single correlation matrix (`generate_html_visualization`), or we can plot an interactive temporal network (`generate_temporal_html_visualization`).

In [6]:
# Static view: write the single correlation matrix computed above to an HTML
# file, with POLVIEWS highlighted.
generate_html_visualization(
    corr_matrix,
    highlight_nodes=['POLVIEWS'],
    output_path='delete_this_file.html',
)

# Temporal view: built from the cleaned dataframe rather than from corr_matrix,
# using the same correlation settings as the static network above.
# NOTE(review): time_window_length and step_size are presumably in years — confirm.
corr_mat_dict = generate_temporal_html_visualization(
    cleaned_df,
    output_path='delete_this_temporal_network.html',
    nodes_to_highlight=['POLVIEWS'],
    # Year range set explicitly instead of relying on the function's defaults.
    start_year=1972,
    end_year=2020,
    time_window_length=4,
    step_size=2,
    method=CorrelationMethod.PEARSON,
    partial=True,
    edge_suppression=EdgeSuppressionMethod.REGULARIZATION,
    suppression_params={'regularization': 0.2},
)


Network visualization has been saved to c:\Users\timbo\Github\BeliefNetworkEvo\CLEAN\notebooks\tutorials\delete_this_file.html
Is this shit on
Temporal network visualization has been saved to delete_this_temporal_network.html


### Basic analysis

We can look at some basic stuff, like a triad count and a balanced/unbalanced triad count.

In [7]:
# Import relative to the source_code folder that the setup cell puts on
# sys.path, matching the other project imports in this notebook (loaders,
# generators, visualizers). The previous `CLEAN.source_code...` form only
# resolves when the repository root also happens to be importable.
from analyzers.triad_analyzer import count_triads

dict_triads = count_triads(corr_matrix, return_names=True, return_sums=True)

# Number of positive triads.
print(dict_triads['positive_triads'])

# Number of negative triads, followed by the variable names forming each one.
print(dict_triads['negative_triads'])
print(dict_triads['negative_triad_nodes'])

# NOTE(review): the earlier version of this cell also referenced the keys
# 'positive_triad_nodes' and 'negative_triad_sums' (commented out); presumably
# they can be printed the same way — confirm against count_triads.

377
5
[('HELPOTH', 'WORKHARD', 'OBEY'), ('HELPOTH', 'WORKHARD', 'THNKSELF'), ('HELPOTH', 'OBEY', 'THNKSELF'), ('WORKHARD', 'OBEY', 'THNKSELF'), ('RELIG_Protestant', 'RELIG_Catholic', 'RELIG_None')]


We can also look at degree distribution. 

In [8]:
# Import relative to the source_code folder that the setup cell puts on
# sys.path, matching the other project imports in this notebook. The previous
# `CLEAN.source_code...` form only resolves when the repository root also
# happens to be importable.
from analyzers.degree_distribution import plot_degree_distribution

# Plot the degree distribution of the network on a linear (non-log) scale.
# NOTE(review): threshold=0 presumably counts every non-zero edge — confirm
# against plot_degree_distribution.
plot_degree_distribution(corr_matrix, threshold=0, log_scale=False)