#### Supplemental Materials 
For my systematic review on latent variable modeling approaches, here are some additional plots and data that might be of interest. If you discover any issues, please contact zoe.sandle@donders.ru.nl.

In [15]:
### read in data and activate the environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import plotly.graph_objects as go

# Folder that holds the review spreadsheets. The original local folder remains
# the default; set the REVIEW_DATA_DIR environment variable to run this notebook
# on another machine without editing the cell.
import os
DATA_DIR = os.environ.get('REVIEW_DATA_DIR', 'C:/Users/U727148/Desktop/DATA/REVIEW/')

# The working directory is still switched to the data folder so that the
# relative file names below (and any later relative paths) resolve inside it.
os.chdir(DATA_DIR)

# One workbook per table used in this notebook.
df = pd.read_excel('updated_from_cleaned_20250423.xlsx')
df_sankey_models_transformed = pd.read_excel('dfsankeymodels_updated_20250424.xlsx')
LCA_outcomes = pd.read_excel('LCA_outcomes.xlsx')
df_sankey_cups = pd.read_excel('dfsankeycu.xlsx')
df_sankey_beh = pd.read_excel('dfsankey_behavioral_variables.xlsx')

#### Sankey Plots 

In this first plot, the first column represents the general model used in all studies, the second column includes additional specifications (e.g. for factor analysis, whether it was exploratory or confirmatory, or which type of rotation was used). Follow-up analyses are shown in the third column. Hovering over each node shows the number of incoming and outgoing flows; hovering over the connections shows the first author and year. Each node can be moved around for readability or bundled using box or lasso select. 

In [None]:
# Sankey plot for model types: 'gm' -> 'mit' -> 'oa'.

# --- Nodes --------------------------------------------------------------------
# Every distinct value found in the three stage columns becomes one node.
stage_values = pd.concat([
    df_sankey_models_transformed[column] for column in ('gm', 'mit', 'oa')
])
labels = list(stage_values.unique())

# plotly refers to nodes by their position in `labels`.
label_map = dict(zip(labels, range(len(labels))))

# --- Links --------------------------------------------------------------------
# Each row contributes two links: gm -> mit (weighted by 'gm_count') and
# mit -> oa (weighted by 'oa_count'). All 'gm -> mit' links come first.
gm_ids = df_sankey_models_transformed['gm'].map(label_map).tolist()
mit_ids = df_sankey_models_transformed['mit'].map(label_map).tolist()
oa_ids = df_sankey_models_transformed['oa'].map(label_map).tolist()

sources = gm_ids + mit_ids
targets = mit_ids + oa_ids
values = [
    *df_sankey_models_transformed['gm_count'].tolist(),
    *df_sankey_models_transformed['oa_count'].tolist(),
]

# --- Colours ------------------------------------------------------------------
import random

def pastel_color():
    """Return a random 'rgba(r,g,b,0.6)' string with each channel in 100-255."""
    red, green, blue = (random.randint(100, 255) for _ in range(3))
    return f'rgba({red},{green},{blue},0.6)'

# One random colour per node, drawn in label order.
label_to_color = dict((name, pastel_color()) for name in labels)
node_color_list = [label_to_color[name] for name in labels]

# A link takes the colour of the node it starts from: the 'gm' node for the
# first half of the links, the 'mit' node for the second half.
link_colors = (
    [label_to_color[name] for name in df_sankey_models_transformed['gm']]
    + [label_to_color[name] for name in df_sankey_models_transformed['mit']]
)

# --- Figure -------------------------------------------------------------------
node_spec = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.9),
    label=labels,
    color=node_color_list,
    align="left",
)
link_spec = dict(
    source=sources,
    target=targets,
    value=values,
    color=link_colors,
    # Hover text per link: the row's 'nameyear', repeated for both segments.
    customdata=df_sankey_models_transformed['nameyear'].tolist() * 2,
    hovertemplate='%{customdata}<extra></extra>',
)
fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])

fig.update_layout(
    title_text="Sankey Diagram for model types",
    font_size=16.5,
    width=1500,
    height=800,
    hovermode='x',
)
fig.show()

# Standalone HTML copy; plotly.js is referenced from the CDN instead of embedded.
fig.write_html("C:/Users/U727148/Latent_Variable_Supplement/sankey_models_plot.html", include_plotlyjs="cdn")

#### Variable Plots

In these plots, I am modeling the flow of usage from (1) the variable, to (2) the scale used, to (3) the type of assessment (self- and other-report, experimental, observational). This is shown for the psychopathic traits/CU traits variable, the behavioral variable, and the cognitive variable (for the brain variables, there was not enough data available).


In [13]:
# Sankey plot for psychopathic/CU traits: 'p_or_cu' -> 'scale' -> 'reporting_type'.

# 1. Create unique labels (one node per distinct value across the three columns)
labels = list(pd.concat([
    df_sankey_cups['p_or_cu'],
    df_sankey_cups['scale'],
    df_sankey_cups['reporting_type']
]).unique())

# 2. Map labels to indices (plotly refers to nodes by their position in `labels`)
label_map = {label: idx for idx, label in enumerate(labels)}

# 3. Define sources, targets, and values.
#    Each row contributes two links: p_or_cu -> scale (weighted by 'pcu_count')
#    and scale -> reporting_type (weighted by 'reporttype_count').
sources = df_sankey_cups['p_or_cu'].map(label_map).tolist() + \
          df_sankey_cups['scale'].map(label_map).tolist()

targets = df_sankey_cups['scale'].map(label_map).tolist() + \
          df_sankey_cups['reporting_type'].map(label_map).tolist()

values = df_sankey_cups['pcu_count'].tolist() + \
         df_sankey_cups['reporttype_count'].tolist()

# 4. Generate pastel colors
import random

def pastel_color():
    """Return a random 'rgba(r,g,b,0.6)' string with each channel in 100-255."""
    r = lambda: random.randint(100, 255)
    return f'rgba({r()},{r()},{r()},0.6)'

label_to_color = {label: pastel_color() for label in labels}
node_color_list = [label_to_color[label] for label in labels]

# 5. Set link colors by source node's color (first half) and middle node (second half)
link_colors = [label_to_color[name] for name in df_sankey_cups['p_or_cu']] + \
              [label_to_color[name] for name in df_sankey_cups['scale']]

# 6. Hover text for the links: one entry per link, i.e. the per-row label
#    repeated once for each of the two link segments. The label must come from
#    the same table the links were built from, otherwise the hover text
#    describes rows of a different table.
if 'nameyear' in df_sankey_cups.columns:
    link_hover = df_sankey_cups['nameyear'].tolist() * 2
else:
    # NOTE(review): df_sankey_cups has no 'nameyear' column, so this keeps the
    # previous behaviour of borrowing it from df_sankey_models_transformed.
    # That is only correct if both tables are row-aligned - confirm, or add a
    # 'nameyear' column to dfsankeycu.xlsx.
    link_hover = df_sankey_models_transformed['nameyear'].tolist() * 2

# 7. Create the figure
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.9),
        label=labels,
        color=node_color_list,
        align="left"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color=link_colors,
        customdata=link_hover,
        hovertemplate='%{customdata}<extra></extra>'
    )
)])

fig.update_layout(
    title_text="Sankey Diagram for psychopathic/CU traits, scales, and assessment types",
    font_size=16.5,
    width=1500,
    height=800,
    hovermode='x'
)
fig.show()

# Standalone HTML copy; plotly.js is referenced from the CDN instead of embedded.
fig.write_html("C:/Users/U727148/Latent_Variable_Supplement/sankey_pcu_plot.html", include_plotlyjs="cdn")

In [27]:
# Attach frequency columns to df_sankey_beh: for every (new column, source
# column) pair below, the new column holds how many rows of the table share
# that row's value in the source column.
for count_column, source_column in (
    ('b_count', 'behavioral_variable'),
    ('ss_count', 'scale_summarized'),
    ('a_count', 'behavioral_assessment'),
    ('at_count', 'behavioral_assessment_type'),
):
    occurrences = df_sankey_beh[source_column].value_counts()
    df_sankey_beh[count_column] = df_sankey_beh[source_column].map(occurrences)



#### Behavioral Variables

In this plot, we see the behavioral variables, the scale they were investigated with, and reporting types. Because of the diversity of scales, I also added a node where I summarize the type of scale for easier viewing between the scale and the reporting type. 

In [44]:
# Sankey plot for behavioral variables:
# 'behavioral_variable' -> 'behavioral_assessment' -> 'scale_summarized'
# -> 'behavioral_assessment_type'.

# 1. Create unique labels (one node per distinct value across the four columns)
labels = list(pd.concat([
    df_sankey_beh['behavioral_variable'],
    df_sankey_beh['behavioral_assessment'],
    df_sankey_beh['scale_summarized'],
    df_sankey_beh['behavioral_assessment_type'],
]).unique())

# 2. Map labels to indices (plotly refers to nodes by their position in `labels`)
label_map = {label: idx for idx, label in enumerate(labels)}

# 3. Define sources, targets, and values.
#    Each row contributes three links, stored segment by segment:
#    variable -> assessment, assessment -> summarized scale,
#    summarized scale -> assessment type.
sources = df_sankey_beh['behavioral_variable'].map(label_map).tolist() + \
          df_sankey_beh['behavioral_assessment'].map(label_map).tolist() + \
          df_sankey_beh['scale_summarized'].map(label_map).tolist()

targets = df_sankey_beh['behavioral_assessment'].map(label_map).tolist() + \
          df_sankey_beh['scale_summarized'].map(label_map).tolist() + \
          df_sankey_beh['behavioral_assessment_type'].map(label_map).tolist()

# Link weights per segment: 'b_count', 'ss_count', 'at_count'.
# NOTE(review): no segment is weighted by an 'a_count' column (the count for
# 'behavioral_assessment') - confirm that 'ss_count' is intended for the
# middle segment.
values = df_sankey_beh['b_count'].tolist() + \
         df_sankey_beh['ss_count'].tolist() + \
         df_sankey_beh['at_count'].tolist()

# 4. Generate pastel colors
import random

def pastel_color():
    """Return a random 'rgba(r,g,b,0.6)' string with each channel in 100-255."""
    r = lambda: random.randint(100, 255)
    return f'rgba({r()},{r()},{r()},0.6)'

label_to_color = {label: pastel_color() for label in labels}
node_color_list = [label_to_color[label] for label in labels]

# 5. Set link colors per segment: the behavioral variable's color (first),
#    the summarized scale's color (second), the assessment type's color (third)
link_colors = [label_to_color[name] for name in df_sankey_beh['behavioral_variable']] + \
              [label_to_color[name] for name in df_sankey_beh['scale_summarized']] + \
              [label_to_color[name] for name in df_sankey_beh['behavioral_assessment_type']]

# 6. Hover text for the links: one entry per link, i.e. the per-row label
#    repeated once for each of the three link segments. The label must come
#    from the same table the links were built from, otherwise the hover text
#    describes rows of a different table.
if 'nameyear' in df_sankey_beh.columns:
    link_hover = df_sankey_beh['nameyear'].tolist() * 3
else:
    # NOTE(review): df_sankey_beh has no 'nameyear' column, so this keeps the
    # previous behaviour (labels borrowed from df_sankey_models_transformed,
    # repeated five times). Those labels are not guaranteed to line up with
    # the links of this plot - confirm, or add a 'nameyear' column to
    # dfsankey_behavioral_variables.xlsx.
    link_hover = df_sankey_models_transformed['nameyear'].tolist() * 5

# 7. Create the figure
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.9),
        label=labels,
        color=node_color_list,
        align="left"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
        color=link_colors,
        customdata=link_hover,
        hovertemplate='%{customdata}<extra></extra>'
    )
)])

fig.update_layout(
    title_text="Sankey Diagram for variables, scales, and assessment types",
    font_size=16.5,
    width=1700,
    height=1200,
    hovermode='x'
)
fig.show()

# Standalone HTML copy; plotly.js is referenced from the CDN instead of embedded.
fig.write_html("C:/Users/U727148/Latent_Variable_Supplement/sankey_behavior_plot.html", include_plotlyjs="cdn")

#### Cognitive variables

This is an overview plot for the studies using cognitive variables (which, of course, only 40.5% of studies actually did). On the left is the variable, in the middle is the name of the scale or test used, and on the right is the type of report.

In [79]:
# Sankey plot for cognitive variables: 'cv' -> 'ca' -> 'cat'.
# NOTE(review): df_sankey_cog (with its 'c_count' / 'cat_count' columns) is not
# created by the data-loading cell shown at the top of this notebook - confirm
# that the cell which builds it runs before this one on a fresh kernel.

# --- Nodes --------------------------------------------------------------------
# Every distinct value found in the three stage columns becomes one node.
stage_values = pd.concat([
    df_sankey_cog[column] for column in ('cv', 'ca', 'cat')
])
labels = list(stage_values.unique())

# plotly refers to nodes by their position in `labels`.
label_map = dict(zip(labels, range(len(labels))))

# --- Links --------------------------------------------------------------------
# Each row contributes two links: cv -> ca (weighted by 'c_count') and
# ca -> cat (weighted by 'cat_count'). All 'cv -> ca' links come first.
cv_ids = df_sankey_cog['cv'].map(label_map).tolist()
ca_ids = df_sankey_cog['ca'].map(label_map).tolist()
cat_ids = df_sankey_cog['cat'].map(label_map).tolist()

sources = cv_ids + ca_ids
targets = ca_ids + cat_ids
values = [
    *df_sankey_cog['c_count'].tolist(),
    *df_sankey_cog['cat_count'].tolist(),
]

# --- Colours ------------------------------------------------------------------
import random

def pastel_color():
    """Return a random 'rgba(r,g,b,0.6)' string with each channel in 100-255."""
    red, green, blue = (random.randint(100, 255) for _ in range(3))
    return f'rgba({red},{green},{blue},0.6)'

# One random colour per node, drawn in label order.
label_to_color = dict((name, pastel_color()) for name in labels)
node_color_list = [label_to_color[name] for name in labels]

# A link takes the colour of the node it starts from: the 'cv' node for the
# first half of the links, the 'ca' node for the second half.
link_colors = (
    [label_to_color[name] for name in df_sankey_cog['cv']]
    + [label_to_color[name] for name in df_sankey_cog['ca']]
)

# --- Figure -------------------------------------------------------------------
node_spec = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.9),
    label=labels,
    color=node_color_list,
    align="left",
)
link_spec = dict(
    source=sources,
    target=targets,
    value=values,
    color=link_colors,
    # Hover text per link: the row's 'nameyear', repeated for both segments.
    customdata=df_sankey_cog['nameyear'].tolist() * 2,
    hovertemplate='%{customdata}<extra></extra>',
)
fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])

fig.update_layout(
    title_text="Sankey Diagram for cognitive variables, assessments, and assessment types",
    font_size=16.5,
    width=1700,
    height=1200,
    hovermode='x',
)
fig.show()

# Standalone HTML copy; plotly.js is referenced from the CDN instead of embedded.
fig.write_html("C:/Users/U727148/Latent_Variable_Supplement/sankey_cog_plot.html", include_plotlyjs="cdn")