<a href="https://colab.research.google.com/github/polis-community/red-dwarf/blob/main/docs/notebooks/polis-implementation-results-docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Red-Dwarf Library

In [1]:
# Install red-dwarf with its `all` extras straight from the `main` branch on GitHub.
# NOTE(review): `@main` is a moving target, so re-running this notebook later may
# install different library code; pin a tag or commit if exact reproducibility matters.
%pip install --quiet red-dwarf[all]@git+https://github.com/polis-community/red-dwarf.git@main

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.1/116.1 kB[0m [31m230.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m188.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m242.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.4/69.4 kB[0m [31m226.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for red-dwarf (pyproject.toml) ... [?25l[?25hdone
  Building wheel for annoy (setup.py) ... [?25l[?25hdone


# Format Notebook

In [2]:
# Pretty-printer used later for nested Python values
from pprint import pprint

# pandas renders every table in this notebook, so its display is tuned here
import pandas as pd

# Display rules for floats in DataFrame output:
#   * magnitudes below ZERO_THRESHOLD are rendered as a plain '0' (like in Polis API),
#   * everything else is rendered with up to 15 significant figures (like in Polis API).
#
# Caveat: the 'g' presentation type still switches to scientific notation when
# the decimal exponent is below -4 or at least 15 (e.g. 1e-05 renders as '1e-05').
#
# This only affects display in this notebook. Full float values are stored.
ZERO_THRESHOLD = 1e-15


def _format_float(value):
    """Return the display string pandas should show for a single float."""
    if abs(value) < ZERO_THRESHOLD:
        return '0'
    return f'{value:.15g}'


pd.set_option('display.float_format', _format_float)

# Load Data

In [3]:
from reddwarf.data_loader import Loader

# Topic: What were the most significant developments in tech and politics in 2018?
# 5 groups, 65 ptpts (56 grouped), 43 comments (open)
REPORT_ID = "r2dfw8eambusb8buvecjt"

# We'll use the data_loader utility to simply load the conversation data
# (votes, comments, math, report) for this report ID.
loader = Loader(polis_id=REPORT_ID)

# Sanity checks: the loader must have resolved every identifier used below.
# `is not None` is the idiomatic singleton comparison (PEP 8).
assert loader.report_id is not None
assert loader.conversation_id is not None
assert loader.polis_instance_url is not None

# Lists: must be non-empty
assert len(loader.comments_data) > 0
assert len(loader.votes_data) > 0
# Objects: must contain at least one key
assert len(loader.conversation_data.keys()) > 0
assert len(loader.math_data.keys()) > 0
assert len(loader.report_data.keys()) > 0

# Links let the reader open the same conversation in the Polis web interfaces.
print("Conversation data loaded!")
print(f"Report interface: {loader.polis_instance_url}/report/{loader.report_id}")
print(f"Participation interface: {loader.polis_instance_url}/{loader.conversation_id}")

Conversation data loaded!
Report interface: https://pol.is/report/r2dfw8eambusb8buvecjt
Participation interface: https://pol.is/6jrufhr6dp


# Prepare Data

In [4]:
from reddwarf.utils.polismath import get_corrected_centroid_guesses

# Prepare two optional inputs, both taken from the Polis math data, that are
# passed to the clustering step so it can reproduce the Polis results.

# KMeans is only reproducible when it starts with previous cluster center guesses.
INIT_CLUSTER_CENTER_GUESSES = get_corrected_centroid_guesses(loader.math_data)

# Polis has some edge-case logic that keeps arbitrary [early] participants in
# the clustering algorithm for reasons that are hard to reproduce, so we
# borrow the full list of participants from the API response to reproduce exactly.
# See: https://github.com/compdemocracy/polis/pull/1893#issuecomment-2654666421
KEEP_PARTICIPANT_IDS = loader.math_data["in-conv"]

# Run Clustering

In [5]:
from reddwarf.utils.statements import process_statements
from reddwarf.implementations.polis import run_clustering

# process_statements returns four values; only the last two are needed here:
# the IDs of moderated-out statements and the IDs of meta statements.
_, _, mod_out_statement_ids, meta_statement_ids = process_statements(loader.comments_data)

result = run_clustering(
    votes=loader.votes_data,
    mod_out_statement_ids=mod_out_statement_ids,
    meta_statement_ids=meta_statement_ids,
    # If clustering is getting ready to find a new k, you may need to uncomment
    # this to properly reproduce the Polis visualization.
    #
    # force_group_count=len(INIT_CLUSTER_CENTER_GUESSES),
    init_centers=INIT_CLUSTER_CENTER_GUESSES,
    keep_participant_ids=KEEP_PARTICIPANT_IDS,
)

# List the attribute names available on the result object.
result_keys = list(vars(result))
print(f"Clustering result keys returned: {result_keys}")

Clustering result keys returned: ['raw_vote_matrix', 'filtered_vote_matrix', 'reducer', 'clusterer', 'group_comment_stats', 'statements_df', 'participants_df', 'participant_projections', 'statement_projections', 'group_aware_consensus', 'consensus', 'repness']


# Inspect Selected Statements

In [6]:
from reddwarf.data_presenter import print_selected_statements

# Link to the Polis math payload so the statements printed below can be
# checked against its consensus and repness sections.
math_url = f"{loader.polis_instance_url}/api/v3/math/pca2?conversation_id={loader.conversation_id}"
print(f"Compare with math (consensus & repness): {math_url}")
print()

# Print the selected statements using the clustering result and the statement data.
print_selected_statements(result=result, statements_data=loader.comments_data)

Compare with math (consensus & repness): https://pol.is/api/v3/math/pca2?conversation_id=6jrufhr6dp

# CONSENSUS STATEMENTS

## FOR AGREEMENT

* Authoritarian populist parties worldwide figured out how to weaponize trust and social media, winning elections.
    86% of everyone who voted on statement 28 agreed.

* We realized that information warfare is occurring by nonstate actors in destabilizing the international order
    80% of everyone who voted on statement 20 agreed.

* 2018 has been marked by the troubling rise of authoritarian leaders around the world.
    88% of everyone who voted on statement 39 agreed.

* The conversation about ethical uses of technology has reached a tipping point. Citizens, businesses and governments are on it, but baffled.
    77% of everyone who voted on statement 27 agreed.

* 2018 was the year Americans stopped thinking Silicon Valley was “different” or distinct from Wall St or the military industrial complex
    74% of everyone who voted on statement

# Inspect Statements DataFrame

In [10]:
# All statement-specific data from the overall processing lives in
# result.statements_df, which is displayed as the last expression of this cell.

print(f"Compare with math: {loader.polis_instance_url}/api/v3/math/pca2?conversation_id={loader.conversation_id}")
print(f"Compare with comments: {loader.polis_instance_url}/api/v3/comments?conversation_id={loader.conversation_id}&moderation=true&include_voting_patterns=true")
print()

EXPECTED_STATEMENT_COLUMNS = ["x", "y", "to_zero", "is_meta", "mean", "pc1", "pc2", "pc3", "group-aware-consensus", "group-aware-consensus-agree", "group-aware-consensus-disagree", "extremity", "n_agree", "n_disagree", "n_total", "priority"]
# Every column present in the DataFrame must be one of the documented columns.
# Collecting all offenders first makes a failure report which columns are new.
unexpected_statement_columns = [
    col for col in result.statements_df.columns
    if col not in EXPECTED_STATEMENT_COLUMNS
]
assert not unexpected_statement_columns, (
    f"Undocumented statements_df columns: {unexpected_statement_columns}"
)
# Columns:
#   * x: the X coordinate of this statement projected into the PCA space
#     * Compare with: math.pca.comment-projection[0]
#   * y: the Y coordinate of this statement projected into the PCA space
#     * Compare with: math.pca.comment-projection[1]
#   * to_zero: statement columns to be zero'd out
#     * Compare with: math.mod-out (from comments[].mod and comments[].is_meta)
#   * is_meta: whether the statement is a meta statement
#     * Compare with: math.meta-tids (from comments[].is_meta)
#   * mean: mean vote value (from PCA object)
#     * Compare with: math.pca.center
#   * pc1: first principal component (from PCA object)
#     * Compare with: math.pca.comps[0]
#   * pc2: second principal component (from PCA object)
#     * Compare with: math.pca.comps[1]
#   * pc3: third principal component (from PCA object, not yet implemented)
#     * Compare with: N/A
#   * group-aware-consensus: see below.
#   * group-aware-consensus-agree: group-aware consensus score for agreement
#     * Compare with: math.group-aware-consensus
#   * group-aware-consensus-disagree: group-aware consensus score for disagreement
#     * Compare with: N/A
#   * extremity: the extremity value of this statement
#     * Compare with: math.pca.comment-extremity
#   * n_agree: total participant agree votes
#     * Compare with: comments[].agree_count
#   * n_disagree: total participant disagree votes
#     * Compare with: comments[].disagree_count
#   * n_total: total participant votes (agree/disagree/pass)
#     * Compare with: comments[].count
#   * priority: priority metric (for comment routing to decide probability of showing statement to participants)
#     * Compare with: math.comment-priorities

result.statements_df

Compare with math: https://pol.is/api/v3/math/pca2?conversation_id=6jrufhr6dp
Compare with comments: https://pol.is/api/v3/comments?conversation_id=6jrufhr6dp&moderation=true&include_voting_patterns=true



Unnamed: 0_level_0,x,y,to_zero,is_meta,mean,pc1,pc2,pc3,extremity,n_agree,n_disagree,n_total,priority,group-aware-consensus,group-aware-consensus-agree,group-aware-consensus-disagree
statement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.0,0.0,True,False,0.0,0.0,0.0,,0.0,1,0,1,12.529726250088,0.0416666666666667,0.0416666666666667,0.0208333333333333
1,0.0,0.0,True,False,0.0,0.0,0.0,,0.0,12,2,15,1.82109888531028,0.0925925925925926,0.0925925925925926,0.0020576131687242
2,0.93125323362706,0.267836858293804,False,False,0.295454545454545,0.201569374735891,0.0579731764766448,,0.969004214542719,23,10,44,0.597439345460652,0.0367309458218549,0.0367309458218549,0.0005739210284664
3,0.8713698567369,-1.24063317713216,False,False,-0.155555555555556,0.114994607350936,-0.163726256959487,,1.51606599705644,13,20,45,0.303202861853377,0.0022250476795931,0.0022250476795931,0.0150190718372537
4,1.25036247826799,-0.635841691701309,False,False,0.25,0.2542379871112,-0.129286598589752,,1.40274765512762,27,14,47,1.41798505519592,0.0357954545454545,0.0357954545454545,0.0022194602272727
5,0.495300924914593,-0.9816133131322,False,False,0.510204081632653,0.15421256099003,-0.305626529863895,,1.09949429409144,34,9,49,1.57307801748817,0.064453125,0.064453125,0.000295928030303
6,1.47918156815449,-0.64607530535096,False,False,0.235294117647059,0.2949801742231,-0.128841117437291,,1.61412248969907,28,17,50,1.68931108856827,0.0181966726084373,0.0181966726084373,0.0029244652406417
7,1.0857280901523,-0.0552474073043842,False,False,0.333333333333333,0.248357972277262,-0.0126377259427525,,1.08713281698219,24,9,47,0.558957401010819,0.0206611570247934,0.0206611570247934,0.0004591368227731
8,1.94503779546219,0.939406921374527,False,False,-0.214285714285714,0.244271574277183,0.117977351446021,,2.16001328461257,14,23,42,0.907084748752989,0.0014462809917355,0.0014462809917355,0.0325413223140496
9,0.15562796216691,0.368641094648692,False,False,0.380952380952381,0.0383379904937573,0.0908124644534778,,0.400145372673497,24,9,41,0.437143764611501,0.0503496503496503,0.0503496503496503,0.0006216006216006


# Inspect KMeans Clusters

In [12]:
# Cluster centers are the only raw data we can't fit in DataFrames, so they are
# read directly from the clusterer object on the result.

print(f"Compare with math: {loader.polis_instance_url}/api/v3/math/pca2?conversation_id={loader.conversation_id}")
print()

# Compare with: math.group-clusters[].center
clusterer = result.clusterer
print(f"Cluster count: {clusterer.n_clusters}\n")
print(f"Cluster center coordinates:\n{clusterer.cluster_centers_}")

Compare with math: https://pol.is/api/v3/math/pca2?conversation_id=6jrufhr6dp

Cluster count: 5

Cluster center coordinates:
[[ 2.32499287 -0.6607373 ]
 [-0.25553636  0.23787153]
 [-1.27036435 -1.91307438]
 [ 1.33411692  1.42448734]
 [-3.25828158  1.41243537]]


# Inspect Participants DataFrame

In [13]:
# All participant-specific data from the overall processing lives in
# result.participants_df, which is displayed as the last expression of this cell.

print(f"Compare with math: {loader.polis_instance_url}/api/v3/math/pca2?conversation_id={loader.conversation_id}")
print()

EXPECTED_PARTICIPANT_COLUMNS = ["x", "y", "to_cluster", "cluster_id"]
# Every column present in the DataFrame must be one of the documented columns.
# Collecting all offenders first makes a failure report which columns are new.
unexpected_participant_columns = [
    col for col in result.participants_df.columns
    if col not in EXPECTED_PARTICIPANT_COLUMNS
]
assert not unexpected_participant_columns, (
    f"Undocumented participants_df columns: {unexpected_participant_columns}"
)
# Columns:
#   * x: the X coordinate of this participant projected into the PCA space
#     * Compare with: math.base-clusters.x
#   * y: the Y coordinate of this participant projected into the PCA space
#     * Compare with: math.base-clusters.y
#   * to_cluster: whether a participant meets criteria to be clustered.
#     * Compare with: math.in-conv
#   * cluster_id: Label assigned during k-means clustering. (null for unclustered)
#     * Compare with: math.group-clusters & math.base-clusters

print("Example of how to grab group members from DataFrame:")
example_groups = []
# Drop participants without a cluster_id (unclustered). dropna already returns
# a new DataFrame, so no extra copy is needed.
df = result.participants_df.dropna(subset=["cluster_id"])
# groupby yields one (cluster_id, rows) pair per cluster, in ascending
# cluster_id order; the index of those rows holds the participant IDs.
for cluster_id, cluster_rows in df.groupby("cluster_id"):
    members = cluster_rows.index.tolist()
    example_groups.append({"group_id": int(cluster_id), "members": members})

pprint(example_groups, width=120)
print()

result.participants_df

Compare with math: https://pol.is/api/v3/math/pca2?conversation_id=6jrufhr6dp

Example of how to grab group members from DataFrame:
[{'group_id': 0, 'members': [1, 8, 9, 11, 14, 15, 20, 24, 43, 52, 54]},
 {'group_id': 1, 'members': [7, 27, 31, 32, 33, 34, 36, 39, 40, 42, 45, 46, 47, 48, 51, 53, 56, 58, 60, 63]},
 {'group_id': 2, 'members': [6, 13, 19, 21, 30, 37, 44, 55, 57, 59]},
 {'group_id': 3, 'members': [0, 2, 3, 4, 10, 12, 16, 23, 49]},
 {'group_id': 4, 'members': [5, 17, 18, 22, 41, 50]}]



Unnamed: 0_level_0,x,y,to_cluster,cluster_id
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.97302193088359,1.268456660786,True,3
1,3.43041183143061,-1.1672088226505,True,0
2,1.0169233110404,0.47138822691382,True,3
3,1.12613927020858,0.0830780925636486,True,3
4,1.26019023171978,0.808216152001822,True,3
...,...,...,...,...
60,-0.39356512167222,0.755904365787626,True,1
61,-0.000307772339754042,-0.0357411185442997,False,
62,-0.0229975826580077,0.059268251962768,False,
63,-1.14175593513587,0.319439251062715,True,1


# Inspect Grouped Statements DataFrame

In [14]:
# All group-specific statement data from the overall processing lives in
# result.group_comment_stats, which is displayed as the last expression of this cell.

print(f"Compare with: {loader.polis_instance_url}/api/v3/math/pca2?conversation_id={loader.conversation_id}")
# Compare with: math.repness
print()

EXPECTED_GROUPED_STATEMENT_COLUMNS = ["na", "nd", "ns", "pa", "pd", "pat", "pdt", "ra", "rd", "rat", "rdt"]
# Every column present in the DataFrame must be one of the documented columns.
# Collecting all offenders first makes a failure report which columns are new.
unexpected_grouped_statement_columns = [
    col for col in result.group_comment_stats.columns
    if col not in EXPECTED_GROUPED_STATEMENT_COLUMNS
]
assert not unexpected_grouped_statement_columns, (
    f"Undocumented group_comment_stats columns: {unexpected_grouped_statement_columns}"
)
# Columns:
#   * na: agree count for statement in group
#   * nd: disagree count for statement in group
#   * ns: any vote count (seen) for statement in group (agree/disagree/pass)
#   * pa: probability of agree in group
#   * pd: probability of disagree in group
#   * pat: test z-score for probability of agree in group
#   * pdt: test z-score for probability of disagree in group
#   * ra: representativeness of agree in group
#   * rd: representativeness of disagree in group
#   * rat: test z-score for representativeness of agree in group
#   * rdt: test z-score for representativeness of disagree in group
#   * repness_order: the statement's order by repness within the group [not yet implemented, see select_representative_statements()]

result.group_comment_stats

Compare with: https://pol.is/api/v3/math/pca2?conversation_id=6jrufhr6dp



Unnamed: 0_level_0,Unnamed: 1_level_0,na,nd,ns,pa,pd,pat,pdt,ra,rd,rat,rdt
group_id,statement_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,0,0,0.5,0.5,1,1,0.75,1.5,0,0.866025403784438
0,1,4,0,4,0.833333333333333,0.166666666666667,2.23606797749979,-1.34164078649987,1.2037037037037,0.722222222222222,1.23201345076145,-0.221446294930535
0,2,7,0,7,0.888888888888889,0.111111111111111,2.82842712474619,-2.12132034355964,2.03921568627451,0.393939393939394,2.8521460064494,-0.962901907028613
0,3,6,2,10,0.583333333333333,0.25,0.904534033733291,-1.50755672288882,2.69791666666667,0.486842105263158,2.57884099459832,-1.48368383514863
0,4,8,0,8,0.9,0.1,3,-2.33333333333333,1.845,0.273333333333333,2.75743509005417,-1.52529354960215
...,...,...,...,...,...,...,...,...,...,...,...,...
4,38,0,0,2,0.25,0.25,-0.577350269189626,-0.577350269189626,0.821428571428571,0.575,0.0527750413650936,-0.396759800429077
4,39,1,0,1,0.666666666666667,0.333333333333333,1.4142135623731,0,0.8,3,0.51281164041655,1.40268475068489
4,40,0,0,0,0.5,0.5,1,1,0.666666666666667,6,0.467099366496914,2.33549683248457
4,41,0,0,0,0.5,0.5,1,1,0.6,6,0.314918328648887,2.33549683248457


# Inspect Vote Matrices

In [15]:
# The sparse vote matrix (participants as rows, statements as columns), with
# values exactly as they come from the votes.
result.raw_vote_matrix

statement_id,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,-1,1,-1,1,1,1,1,...,1,1,1,-1,1,-1,1,,,
1,,1,1,1,1,1,1,1,1,0,...,,,,,,,,,,
2,,1,1,-1,-1,1,1,1,1,,...,,,,,,,,,,
3,,1,0,-1,0,1,1,1,1,,...,,,,,,,,,,
4,,1,0,1,1,0,-1,0,0,1,...,-1,-1,-1,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,,,,,,,,,,,...,,,,,,,1,0,0,0
61,,,,,,,,,,,...,,,,,,,,1,,
62,,,,,,,,,,,...,,,,,,,,,0,0
63,,,,,,-1,,,,,...,,,,,,,1,1,1,


In [16]:
# The sparse vote matrix with the votes in `mod_out` columns zero'd out.
# In the output below, the zero'd columns (statements 0 and 1) show 0 for every
# participant, including those with no vote there in the raw matrix.
result.filtered_vote_matrix

statement_id,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,-1,1,-1,1,1,1,1,...,1,1,1,-1,1,-1,1,,,
1,0,0,1,1,1,1,1,1,1,0,...,,,,,,,,,,
2,0,0,1,-1,-1,1,1,1,1,,...,,,,,,,,,,
3,0,0,0,-1,0,1,1,1,1,,...,,,,,,,,,,
4,0,0,0,1,1,0,-1,0,0,1,...,-1,-1,-1,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0,0,,,,,,,,,...,,,,,,,1,0,0,0
61,0,0,,,,,,,,,...,,,,,,,,1,,
62,0,0,,,,,,,,,...,,,,,,,,,0,0
63,0,0,,,,-1,,,,,...,,,,,,,1,1,1,
