<a href="https://colab.research.google.com/github/polis-community/red-dwarf/blob/main/docs/notebooks/polis-implementation-results-docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install red-dwarf straight from the `main` branch of its GitHub repository.
# NOTE(review): `@main` is an unpinned, moving target, so a later re-run may
# install different code than the one that produced the outputs below; consider
# pinning to a tag or commit.
%pip install --quiet --no-cache-dir git+https://github.com/polis-community/red-dwarf.git@main

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.1/116.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m127.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for red-dwarf (pyproject.toml) ... [?25l[?25hdone


In [11]:
from reddwarf.data_loader import Loader

# Topic: What were the most significant developments in tech and politics in 2018?
# 5 groups, 65 ptpts (56 grouped), 43 comments (open)
REPORT_ID = "r2dfw8eambusb8buvecjt"

# We'll use the data_loader utility to simply load the conversation data
# (votes, comments, math and report data are all checked below).
loader = Loader(polis_id=REPORT_ID)

# Identifiers must be resolved. `is not None` is an identity test and cannot be
# fooled by a custom `__ne__`, unlike `!= None`.
assert loader.report_id is not None, "report_id was not resolved"
assert loader.conversation_id is not None, "conversation_id was not resolved"
assert loader.polis_instance_url is not None, "polis_instance_url was not resolved"

# Lists
assert 0 < len(loader.comments_data), "no comments were loaded"
assert 0 < len(loader.votes_data), "no votes were loaded"
# Objects
assert 0 < len(loader.conversation_data.keys()), "conversation data is empty"
assert 0 < len(loader.math_data.keys()), "math data is empty"
assert 0 < len(loader.report_data.keys()), "report data is empty"

# Plain string: there is nothing to interpolate here.
print("Conversation data loaded!")
print(f"Report interface: {loader.polis_instance_url}/report/{loader.report_id}")
print(f"Participation interface: {loader.polis_instance_url}/{loader.conversation_id}")

Conversation data loaded!
Report interface: https://pol.is/report/r2dfw8eambusb8buvecjt
Participation interface: https://pol.is/6jrufhr6dp


In [12]:
from reddwarf.utils.polismath import get_corrected_centroid_guesses

# Prepare some optional data to kickstart the clustering: initial cluster
# center guesses computed from the loaded math data. They are passed to
# run_clustering() as `init_centers`.
# NOTE(review): skip_correction=False presumably leaves the helper's correction
# step enabled -- confirm against get_corrected_centroid_guesses().
INIT_CLUSTER_CENTER_GUESSES = get_corrected_centroid_guesses(loader.math_data, skip_correction=False)

# Polis has some edge-case logic that keeps arbitrary [early] participants in
# the clustering algorithm for reasons that are hard to reproduce, so we
# borrow these from the API response to reproduce the results exactly.
# See: https://github.com/compdemocracy/polis/pull/1893#issuecomment-2654666421
KEEP_PARTICIPANT_IDS = loader.math_data["in-conv"]

In [32]:
from reddwarf.utils.statements import process_statements
from reddwarf.implementations.polis import run_clustering

# process_statements() is unpacked into four values; only the last two (the
# `mod_out` and meta statement ids) are needed for clustering.
_, _, mod_out_statement_ids, meta_statement_ids = process_statements(loader.comments_data)

result = run_clustering(
    votes=loader.votes_data,
    mod_out_statement_ids=mod_out_statement_ids,
    meta_statement_ids=meta_statement_ids,
    # If the clustering is getting ready to find a new k, you may need to
    # uncomment this to properly reproduce the Polis visualization.
    #
    # force_group_count=len(INIT_CLUSTER_CENTER_GUESSES),
    init_centers=INIT_CLUSTER_CENTER_GUESSES,
    keep_participant_ids=KEEP_PARTICIPANT_IDS,
)

# Iterating the vars() mapping yields the attribute names of the result
# object, so list() is enough -- no comprehension needed.
print(f"Keys: {list(vars(result))}")

Keys: ['raw_vote_matrix', 'filtered_vote_matrix', 'pca', 'projected_participants', 'projected_statements', 'kmeans', 'group_aware_consensus', 'group_comment_stats', 'statements_df', 'participants_df']


In [36]:
# All statement-specific data from the overall processing.
EXPECTED_STATEMENT_COLUMNS = [
    "x", "y", "to_zero", "is_meta", "mean", "pc1", "pc2", "pc3",
    "consensus", "extremity", "n_agree", "n_disagree", "n_total", "priority",
]

# Every column present in the frame must be one of the documented columns.
# Collecting the offenders first lets the assertion say exactly what is wrong.
unexpected_statement_columns = [
    col for col in result.statements_df.columns
    if col not in EXPECTED_STATEMENT_COLUMNS
]
assert not unexpected_statement_columns, (
    f"Undocumented columns in statements_df: {unexpected_statement_columns}"
)
# Columns:
#   * x: the X coordinate of this statement projected into the PCA space
#   * y: the Y coordinate of this statement projected into the PCA space
#   * to_zero: statement columns to be zero'd out
#   * is_meta: whether the statement is a meta statement
#   * mean: mean vote value (from PCA object)
#   * pc1: first principal component (from PCA object)
#   * pc2: second principal component (from PCA object)
#   * pc3: third principal component (from PCA object, not yet implemented)
#   * consensus: group-aware consensus score
#   * extremity: the extremity value of this statement
#   * n_agree: total participant agree votes
#   * n_disagree: total participant disagree votes
#   * n_total: total participant votes (agree/disagree/pass)
#   * priority: priority metric (for comment routing to decide probability of showing statement to participants)

# Last expression of the cell, so the notebook renders the frame.
result.statements_df

Unnamed: 0_level_0,x,y,to_zero,is_meta,mean,pc1,pc2,pc3,consensus,extremity,n_agree,n_disagree,n_total,priority
statement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.0,0.0,True,False,0.0,1.002155e-17,1.0463450000000001e-17,,0.041667,0.0,1,0,1,12.529726
1,0.0,0.0,True,False,0.0,5.5511150000000004e-17,-2.775558e-17,,0.092593,0.0,12,2,15,1.821099
2,0.931253,0.267837,False,False,0.295455,0.2015694,0.05797318,,0.036731,0.969004,23,10,44,0.597439
3,0.87137,-1.240633,False,False,-0.155556,0.1149946,-0.1637263,,0.002225,1.516066,13,20,45,0.303203
4,1.250362,-0.635842,False,False,0.25,0.254238,-0.1292866,,0.035795,1.402748,27,14,47,1.417985
5,0.495301,-0.981613,False,False,0.510204,0.1542126,-0.3056265,,0.064453,1.099494,34,9,49,1.573078
6,1.479182,-0.646075,False,False,0.235294,0.2949802,-0.1288411,,0.018197,1.614122,28,17,50,1.689311
7,1.085728,-0.055247,False,False,0.333333,0.248358,-0.01263773,,0.020661,1.087133,24,9,47,0.558957
8,1.945038,0.939407,False,False,-0.214286,0.2442716,0.1179774,,0.001446,2.160013,14,23,42,0.907085
9,0.155628,0.368641,False,False,0.380952,0.03833799,0.09081246,,0.05035,0.400145,24,9,41,0.437144


In [42]:
# All participant-specific data from the overall processing.
EXPECTED_PARTICIPANT_COLUMNS = ["x", "y", "to_cluster", "cluster_id"]

# Every column present in the frame must be one of the documented columns.
# Collecting the offenders first lets the assertion say exactly what is wrong.
unexpected_participant_columns = [
    col for col in result.participants_df.columns
    if col not in EXPECTED_PARTICIPANT_COLUMNS
]
assert not unexpected_participant_columns, (
    f"Undocumented columns in participants_df: {unexpected_participant_columns}"
)
# Columns:
#   * x: the X coordinate of this participant projected into the PCA space
#   * y: the Y coordinate of this participant projected into the PCA space
#   * to_cluster: whether a participant meets criteria to be clustered.
#   * cluster_id: Label assigned during k-means clustering. (null for unclustered)

# Last expression of the cell, so the notebook renders the frame.
result.participants_df

Unnamed: 0_level_0,x,y,to_cluster,cluster_id
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.973022,1.268457,True,3
1,3.430412,-1.167209,True,0
2,1.016923,0.471388,True,3
3,1.126139,0.083078,True,3
4,1.260190,0.808216,True,3
...,...,...,...,...
60,-0.393565,0.755904,True,1
61,-0.000308,-0.035741,False,
62,-0.022998,0.059268,False,
63,-1.141756,0.319439,True,1


In [44]:
# All group-specific statement data from the overall processing.
EXPECTED_GROUPED_STATEMENT_COLUMNS = [
    "na", "nd", "ns",
    "pa", "pd", "pat", "pdt",
    "ra", "rd", "rat", "rdt",
]

# Every column present in the frame must be one of the documented columns.
# Collecting the offenders first lets the assertion say exactly what is wrong.
unexpected_grouped_statement_columns = [
    col for col in result.group_comment_stats.columns
    if col not in EXPECTED_GROUPED_STATEMENT_COLUMNS
]
assert not unexpected_grouped_statement_columns, (
    f"Undocumented columns in group_comment_stats: {unexpected_grouped_statement_columns}"
)
# Columns:
#   * na: agree count for statement in group
#   * nd: disagree count for statement in group
#   * ns: any vote count for statement in group (agree/disagree/pass)
#   * pa: probability of agree in group
#   * pd: probability of disagree in group
#   * pat: test z-score for probability of agree in group
#   * pdt: test z-score for probability of disagree in group
#   * ra: representativeness of agree in group
#   * rd: representativeness of disagree in group
#   * rat: test z-score for representativeness of agree in group
#   * rdt: test z-score for representativeness of disagree in group
#   * repness_order: the statement's order by repness in group [not yet implemented,
#     so not in the expected list above; see select_representative_statements()]

# Last expression of the cell, so the notebook renders the frame.
result.group_comment_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,na,nd,ns,pa,pd,pat,pdt,ra,rd,rat,rdt
group_id,statement_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,0,0,0.500000,0.500000,1.000000,1.000000,0.750000,1.500000,0.000000,0.866025
0,1,4,0,4,0.833333,0.166667,2.236068,-1.341641,1.203704,0.722222,1.232013,-0.221446
0,2,7,0,7,0.888889,0.111111,2.828427,-2.121320,2.039216,0.393939,2.852146,-0.962902
0,3,6,2,10,0.583333,0.250000,0.904534,-1.507557,2.697917,0.486842,2.578841,-1.483684
0,4,8,0,8,0.900000,0.100000,3.000000,-2.333333,1.845000,0.273333,2.757435,-1.525294
...,...,...,...,...,...,...,...,...,...,...,...,...
4,38,0,0,2,0.250000,0.250000,-0.577350,-0.577350,0.821429,0.575000,0.052775,-0.396760
4,39,1,0,1,0.666667,0.333333,1.414214,0.000000,0.800000,3.000000,0.512812,1.402685
4,40,0,0,0,0.500000,0.500000,1.000000,1.000000,0.666667,6.000000,0.467099,2.335497
4,41,0,0,0,0.500000,0.500000,1.000000,1.000000,0.600000,6.000000,0.314918,2.335497


In [19]:
# The sparse vote matrix, with values exactly as they come from the votes data.
result.raw_vote_matrix

statement_id,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,,,
1,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
2,,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
3,,1.0,0.0,-1.0,0.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
4,,1.0,0.0,1.0,1.0,0.0,-1.0,0.0,0.0,1.0,...,-1.0,-1.0,-1.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,,,,,,,,,,,...,,,,,,,1.0,0.0,0.0,0.0
61,,,,,,,,,,,...,,,,,,,,1.0,,
62,,,,,,,,,,,...,,,,,,,,,0.0,0.0
63,,,,,,-1.0,,,,,...,,,,,,,1.0,1.0,1.0,


In [27]:
# The sparse vote matrix with the votes in `mod_out` statement columns zeroed out.
result.filtered_vote_matrix

statement_id,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,1.0,-1.0,1.0,-1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,,,
1,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
2,0.0,0.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
3,0.0,0.0,0.0,-1.0,0.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
4,0.0,0.0,0.0,1.0,1.0,0.0,-1.0,0.0,0.0,1.0,...,-1.0,-1.0,-1.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.0,0.0,,,,,,,,,...,,,,,,,1.0,0.0,0.0,0.0
61,0.0,0.0,,,,,,,,,...,,,,,,,,1.0,,
62,0.0,0.0,,,,,,,,,...,,,,,,,,,0.0,0.0
63,0.0,0.0,,,,-1.0,,,,,...,,,,,,,1.0,1.0,1.0,
