In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Problem 4: Clustering senators

In this problem, you will use the voting history from the 114th Congress to split Senators into clusters.

In [2]:
# Voting records of the 114th Senate, read straight from the course repository
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/114_congress.csv'
data = pd.read_csv(filepath_or_buffer=url)
# preview the first five senators
data.head(n=5)

Unnamed: 0,name,party,state,00001,00004,00005,00006,00007,00008,00009,00010,00020,00026,00032,00038,00039,00044,00047
0,Alexander,R,TN,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Ayotte,R,NH,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,Baldwin,D,WI,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,Barrasso,R,WY,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,Bennet,D,CO,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


The data contains all the votes from the 114th Senate. Each row contains the votes of an individual senator. Votes are coded as 0 for “No”, 1 for “Yes”, and 0.5 for “Abstain”.

In [3]:
# feature matrix X: every column except the three that identify the senator
X = data.drop(columns=['name', 'party', 'state']).to_numpy()
# party affiliation of each senator, aligned with the rows of X
party = data.loc[:, 'party'].to_numpy()
# senator names, aligned with the rows of X
names = data.loc[:, 'name'].to_numpy()

**Part 1:** Use kmeans to split the senators into two clusters. 
Do the two clusters correspond to the two parties?

In [10]:
def kmeans(X, k, max_iterations=1000, random_state=None):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : array-like of shape (m, n)
        One data point per row. Converted to a float ndarray; the caller's
        object is never modified.
    k : int
        Number of clusters, ``1 <= k <= m``.
    max_iterations : int, default 1000
        Upper bound on the number of mean/assignment updates.
    random_state : int or None, default None
        Seed for the random initial means and for re-seeding empty clusters.
        ``None`` draws from NumPy's global random state, so a preceding
        ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    clusters : ndarray of shape (m,)
        Index (``0 .. k-1``) of the mean closest to each data point.
    means : ndarray of shape (k, n)
        Cluster means from the last update step.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or ``k`` is outside ``1 .. m``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array with one data point per row")

    # number of datapoints
    m = X.shape[0]
    if not 1 <= k <= m:
        raise ValueError(f"k must be between 1 and the number of data points ({m}), got {k}")

    # The np.random module and a RandomState instance expose the same
    # choice/randint API, so both seeding modes share one code path.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def closest_mean(centers):
        # Euclidean distance from every point to every center, shape (m, k).
        # Broadcasting builds an (m, k, n) temporary, which is fine for the
        # small k used here.
        dist = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        return np.argmin(dist, axis=1)

    # initialization: k distinct data points serve as the first means
    means = X[rng.choice(m, k, replace=False)]
    clusters = closest_mean(means)

    for _ in range(max_iterations):
        # update means; if a cluster has no data points associated with it,
        # replace its mean with a random data point
        means = np.array([X[clusters == i].mean(axis=0)
                          if np.any(clusters == i)
                          else X[rng.randint(m)]
                          for i in range(k)])

        # update clusters
        new_clusters = closest_mean(means)

        # stop as soon as the assignment no longer changes
        converged = np.array_equal(new_clusters, clusters)
        clusters = new_clusters
        if converged:
            break

    return clusters, means

In [11]:
# kmeans picks its initial means at random; fix NumPy's global seed so that
# Restart & Run All reproduces the same clusters and the same cluster labels
np.random.seed(0)
clusters, means = kmeans(X,k=2)

In [12]:
# party breakdown of the senators assigned to cluster 0
data.loc[clusters == 0, 'party'].value_counts()

D    43
I     2
Name: party, dtype: int64

In [13]:
# party breakdown of the senators assigned to cluster 1
data.loc[clusters == 1, 'party'].value_counts()

R    54
D     1
Name: party, dtype: int64

**Part 2:** Find the senators who are in the cluster associated with the opposite party.

In [17]:
# k-means labels are arbitrary (they depend on the random initial means), so
# derive each cluster's party from its majority instead of assuming that
# cluster 1 is the Republican one
majority_party = data['party'].groupby(clusters).agg(lambda p: p.value_counts().idxmax())
cluster_party = majority_party.loc[clusters].to_numpy()
# Democrats or Republicans sitting in a cluster dominated by the other party;
# independents have no "opposite" party and are left out
crossed = data['party'].isin(['D', 'R']) & (data['party'] != cluster_party)
data.loc[crossed, 'name']

56    Manchin
Name: name, dtype: object

**Part 3:** More clusters could show wings of each party, or cross-party groups. Use kmeans to split the senators into 5 clusters, and analyze the two Democratic factions.

In [18]:
# kmeans picks its initial means at random; fix NumPy's global seed so that
# the cluster numbers inspected below stay the same from run to run
np.random.seed(0)
clusters, means = kmeans(X,k=5)

In [19]:
# party breakdown of the senators assigned to cluster 0
data.loc[clusters == 0, 'party'].value_counts()

D    6
R    2
Name: party, dtype: int64

In [20]:
# party breakdown of the senators assigned to cluster 1
data.loc[clusters == 1, 'party'].value_counts()

R    47
Name: party, dtype: int64

In [21]:
# party breakdown of the senators assigned to cluster 2
data.loc[clusters == 2, 'party'].value_counts()

D    31
I     1
Name: party, dtype: int64

In [22]:
# party breakdown of the senators assigned to cluster 3
data.loc[clusters == 3, 'party'].value_counts()

D    7
I    1
Name: party, dtype: int64

In [23]:
# party breakdown of the senators assigned to cluster 4
data.loc[clusters == 4, 'party'].value_counts()

R    5
Name: party, dtype: int64

In [26]:
# names of the senators assigned to cluster 0
data.loc[clusters == 0, 'name']

1        Ayotte
20      Collins
28     Donnelly
42     Heitkamp
56      Manchin
59    McCaskill
74         Reid
89       Tester
Name: name, dtype: object

In [27]:
# names of the senators assigned to cluster 2
data.loc[clusters == 2, 'name']

2        Baldwin
5     Blumenthal
7         Booker
9          Boxer
10         Brown
12      Cantwell
14        Cardin
21         Coons
29        Durbin
32     Feinstein
35       Franken
37    Gillibrand
44        Hirono
49         Kaine
54         Leahy
57        Markey
61      Menendez
62       Merkley
63      Mikulski
66        Murphy
67        Murray
68        Nelson
73          Reed
79       Sanders
81        Schatz
82       Schumer
85       Shaheen
93         Udall
95        Warner
96        Warren
97    Whitehouse
99         Wyden
Name: name, dtype: object

In [29]:
# names of the senators assigned to cluster 3
data.loc[clusters == 3, 'name']

4        Bennet
15       Carper
16        Casey
41     Heinrich
50         King
52    Klobuchar
71       Peters
87     Stabenow
Name: name, dtype: object