# Cluster Visualization Calculations

This notebook takes precalculated cluster membership for the US house of representatives and uses it to do the following:
- Determine the size of each cluster
- Determine the "polarity" (ratio of democrat and republican membership) of a given cluster

In [106]:
import pandas as pd

In [107]:
cluster_data_fp = "../../data/clusters/voter_clusters.csv"
cluster_df = pd.read_csv(cluster_data_fp)

This file contains the cluster membership for each individual house member for each combination of topic and subject in the top 5 overall subjects for legislation and their associated topics. The amount of topics for a given subject may vary.

In [108]:
cluster_df.head()

Unnamed: 0,voters,Government operations and politics_ Government operations and politics_cluster,Government operations and politics_ Government information and archives_cluster,Government operations and politics_ political campaign regulation_cluster,Government operations and politics_ Elections_cluster,Government operations and politics_ voting_cluster,Government operations and politics_ State and local government operations_cluster,Government operations and politics_ benefits_cluster,Government operations and politics_ personnel management_cluster,Government operations and politics_ Government employee pay_cluster,...,Health_ Health_cluster,Health_ Health programs administration and funding_cluster,Health_ Congressional oversight_cluster,Health_ Health promotion and preventive care_cluster,Health_ Prescription drugs_cluster,Health_ Health care costs and insurance_cluster,Health_ Department of Health and Human Services_cluster,Health_Administrative law and regulatory procedures_cluster,Health_ Government information and archives_cluster,Health_ Government studies and investigations_cluster
0,vote_A000374,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,0.0,...,2.0,2.0,2.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0
1,vote_A000370,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,2.0,0.0,1.0,2.0,1.0,2.0,2.0
2,vote_A000055,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0
3,vote_A000371,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,2.0,0.0,1.0,2.0,1.0,2.0,2.0
4,vote_A000372,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0


In [109]:
clusters = cluster_df.columns
print(len(clusters))
#print(clusters)

51


In [110]:
# Convert a single subject/topic combination into a series representing its clusters
cluster_size_df = cluster_df.groupby(clusters[1]).size()
cluster_size_df = cluster_size_df.reset_index()
cluster_name = cluster_size_df.columns[0]
del cluster_size_df[cluster_name]
cluster_size_df = cluster_size_df.rename(columns={0:cluster_name})

In [111]:
cluster_size_df.head()

Unnamed: 0,Government operations and politics_ Government operations and politics_cluster
0,212
1,85
2,219


In [112]:
# Join it with the remainder of the other subjects and topics
for c in clusters[2:]:
    cluster_sizes = cluster_df.groupby(c).size()
    cluster_sizes = cluster_sizes.reset_index()
    del cluster_sizes[c]
    cluster_sizes = cluster_sizes.rename(columns={0:c})
    cluster_size_df = pd.concat([cluster_size_df, cluster_sizes], axis=1)
    

In [113]:
# Export these
cluster_size_df.to_csv("cluster_sizes.csv")

Here we will join the cluster membership with party information about each House member.

In [114]:
nodes_df = pd.read_csv("../../data/kg/nodes.csv")
members_df = nodes_df[nodes_df["ntype"] == "member"]
parties_df = nodes_df[nodes_df["ntype"] == "party"]

In [115]:
members_df

Unnamed: 0,nid,ntype,nname
250,250,member,A000055
251,251,member,A000360
252,252,member,A000367
253,253,member,A000369
254,254,member,A000370
...,...,...,...
795,795,member,Y000033
796,796,member,Y000062
797,797,member,Y000064
798,798,member,Y000065


In [116]:
parties_df.head()

Unnamed: 0,nid,ntype,nname
0,0,party,D
1,1,party,I
2,2,party,ID
3,3,party,R


In [117]:
party_membership_df = pd.read_csv("../../data/kg/member_memberof_party.csv")
party_membership_df.head()

Unnamed: 0,src_nid,tgt_nid
0,250,3
1,251,3
2,252,1
3,252,3
4,253,3


In [118]:
party_membership_df = party_membership_df.join(members_df.set_index('nid'), on='src_nid')
party_membership_df = party_membership_df.join(parties_df.set_index('nid'), on='tgt_nid', rsuffix="_party", lsuffix="_member")

In [119]:
party_membership_df.head()

Unnamed: 0,src_nid,tgt_nid,ntype_member,nname_member,ntype_party,nname_party
0,250,3,member,A000055,party,R
1,251,3,member,A000360,party,R
2,252,1,member,A000367,party,I
3,252,3,member,A000367,party,R
4,253,3,member,A000369,party,R


In [120]:
cluster_df["voters"].head()

0    vote_A000374
1    vote_A000370
2    vote_A000055
3    vote_A000371
4    vote_A000372
Name: voters, dtype: object

In [121]:
def get_voter_id(vote_str):
    return vote_str[5:]

In [122]:
cluster_df["voters"] = cluster_df["voters"].apply(get_voter_id)
cluster_df.head()

Unnamed: 0,voters,Government operations and politics_ Government operations and politics_cluster,Government operations and politics_ Government information and archives_cluster,Government operations and politics_ political campaign regulation_cluster,Government operations and politics_ Elections_cluster,Government operations and politics_ voting_cluster,Government operations and politics_ State and local government operations_cluster,Government operations and politics_ benefits_cluster,Government operations and politics_ personnel management_cluster,Government operations and politics_ Government employee pay_cluster,...,Health_ Health_cluster,Health_ Health programs administration and funding_cluster,Health_ Congressional oversight_cluster,Health_ Health promotion and preventive care_cluster,Health_ Prescription drugs_cluster,Health_ Health care costs and insurance_cluster,Health_ Department of Health and Human Services_cluster,Health_Administrative law and regulatory procedures_cluster,Health_ Government information and archives_cluster,Health_ Government studies and investigations_cluster
0,A000374,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,0.0,...,2.0,2.0,2.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0
1,A000370,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,2.0,0.0,1.0,2.0,1.0,2.0,2.0
2,A000055,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0
3,A000371,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,2.0,0.0,1.0,2.0,1.0,2.0,2.0
4,A000372,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0


In [128]:
cluster_w_party_df = cluster_df.join(party_membership_df.set_index('nname_member'), on='voters').dropna()
cluster_w_party_df

Unnamed: 0,voters,Government operations and politics_ Government operations and politics_cluster,Government operations and politics_ Government information and archives_cluster,Government operations and politics_ political campaign regulation_cluster,Government operations and politics_ Elections_cluster,Government operations and politics_ voting_cluster,Government operations and politics_ State and local government operations_cluster,Government operations and politics_ benefits_cluster,Government operations and politics_ personnel management_cluster,Government operations and politics_ Government employee pay_cluster,...,Health_ Health care costs and insurance_cluster,Health_ Department of Health and Human Services_cluster,Health_Administrative law and regulatory procedures_cluster,Health_ Government information and archives_cluster,Health_ Government studies and investigations_cluster,src_nid,tgt_nid,ntype_member,ntype_party,nname_party
0,A000374,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,0.0,...,0.0,1.0,0.0,0.0,1.0,257.0,3.0,member,party,R
1,A000370,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,...,1.0,2.0,1.0,2.0,2.0,254.0,0.0,member,party,D
2,A000055,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,0.0,0.0,2.0,0.0,1.0,250.0,3.0,member,party,R
3,A000371,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,...,1.0,2.0,1.0,2.0,2.0,255.0,0.0,member,party,D
4,A000372,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,0.0,0.0,2.0,0.0,1.0,256.0,3.0,member,party,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,G000061,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,0.0,2.0,1.0,0.0,417.0,3.0,member,party,R
444,H001092,1.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,...,2.0,1.0,0.0,1.0,0.0,485.0,0.0,member,party,D
445,M000687,2.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,...,2.0,2.0,1.0,1.0,0.0,566.0,0.0,member,party,D
446,J000020,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,0.0,2.0,1.0,0.0,488.0,3.0,member,party,R


In [170]:
party_colors = ["#092573", "#250973", "#500973", "#730950", "#8f0303"]
subject_topic_names = list(cluster_df.columns[1:])

['Government operations and politics_ Government operations and politics_cluster', 'Government operations and politics_ Government information and archives_cluster', 'Government operations and politics_ political campaign regulation_cluster', 'Government operations and politics_ Elections_cluster', 'Government operations and politics_ voting_cluster', 'Government operations and politics_ State and local government operations_cluster', 'Government operations and politics_ benefits_cluster', 'Government operations and politics_ personnel management_cluster', 'Government operations and politics_ Government employee pay_cluster', 'Government operations and politics_ public corruption_cluster', 'Finance and financial sector_ Finance and financial sector_cluster', 'Finance and financial sector_ Government studies and investigations_cluster', 'Finance and financial sector_ Congressional oversight_cluster', 'Finance and financial sector_ Government information and archives_cluster', 'Finance a

In [191]:
# Dictionary that will hold the color data for each cluster - we will turn this into a dataframe
color_df_data = {}

# For each subject and topic combination
for st in subject_topic_names:
    col_list = [st, "nname_party"]
    st_df = cluster_w_party_df[col_list]
    # Get the number of members in each party in each cluster
    st_df = cluster_w_party_df.groupby(col_list).size().reset_index().rename(columns={0:"count"})
    clusters = list(st_df[st].unique())
    cluster_colors = []
    # Calculate the proportion of Republicans in each cluster
    for c in clusters:
        current_cluster_df = st_df[st_df[st] == c]
        total_members = current_cluster_df.sum()["count"]
        if "R" in current_cluster_df["nname_party"].values:
            num_repub = current_cluster_df["count"].loc[current_cluster_df["nname_party"] == "R"].iloc[0]
        else:
            num_repub = 0
        frac_repub = num_repub / total_members
        # Retrieve the color from a set list of colors that go from bluest (most democratic) to reddest (most republican)
        color_ind = int(round(frac_repub * (len(party_colors) - 1))) # We have to subtract 1 because Python uses 0 indexing
        color = party_colors[color_ind]
        cluster_colors.append(color)
    st_color_col_name = st + "_color"
    color_df_data[st_color_col_name] = cluster_colors
    color_df_data[st + "_size"] = cluster_size_df[st]

color_df = pd.DataFrame(data=color_df_data)
color_df.head()

Unnamed: 0,Government operations and politics_ Government operations and politics_cluster_color,Government operations and politics_ Government operations and politics_cluster_size,Government operations and politics_ Government information and archives_cluster_color,Government operations and politics_ Government information and archives_cluster_size,Government operations and politics_ political campaign regulation_cluster_color,Government operations and politics_ political campaign regulation_cluster_size,Government operations and politics_ Elections_cluster_color,Government operations and politics_ Elections_cluster_size,Government operations and politics_ voting_cluster_color,Government operations and politics_ voting_cluster_size,...,Health_ Health care costs and insurance_cluster_color,Health_ Health care costs and insurance_cluster_size,Health_ Department of Health and Human Services_cluster_color,Health_ Department of Health and Human Services_cluster_size,Health_Administrative law and regulatory procedures_cluster_color,Health_Administrative law and regulatory procedures_cluster_size,Health_ Government information and archives_cluster_color,Health_ Government information and archives_cluster_size,Health_ Government studies and investigations_cluster_color,Health_ Government studies and investigations_cluster_size
0,#8f0303,212,#092573,234,#8f0303,211,#8f0303,211,#8f0303,211,...,#8f0303,195,#8f0303,209,#500973,85,#8f0303,197,#500973,89
1,#500973,85,#8f0303,197,#500973,85,#500973,85,#500973,85,...,#092573,234,#500973,85,#092573,223,#500973,86,#8f0303,197
2,#092573,219,#500973,85,#092573,220,#092573,220,#092573,220,...,#500973,87,#092573,222,#8f0303,208,#092573,233,#092573,230


In [192]:
color_df.to_csv("viz_clusters.csv")