In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

from sklearn.cluster import KMeans, DBSCAN, HDBSCAN

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_csv('../data/raw-polls-updated.csv')

In [3]:
# Removing the unrated pollsters we identified earlier
unrated_pollsters = ['University of Maryland', 'Brigham Young University', 'Research America Inc.', 
                     'Insights West', 'Sacred Heart University', 'Hofstra University']

unrated_indices = []

for pollster in unrated_pollsters:
    unrated_indices += list(df[df['pollster'] == pollster].index)

df = df.drop(
    index = unrated_indices
    ).reset_index()# Resetting index fixes issues from dropping rows

In [4]:
# We're only going to select columns with measures on the poll level and
# not pollster level metrics. For this approach we will rate our clusters
# by how well they approximate pollster level differences.
features = [
    'year',
    'type_simple',
    'partisan',
    'samplesize',
    'margin_poll',
    'advancedplusminus',
    'imputed_600',
    'anon',
    'registered_voters',
    'averaged',
    'Text',
    'Live Phone',
    'Mail',
    'Face-to-Face',
    'IVR',
    'Online',
    'rightcall',
    'error',
    'calc_bias',
    'days_bt_polldate_election'
]

X = df[features]



# We're going to define these stages now because all subsequent pipelines will want
# to have access to them. Scaling is notably ommitted here because we will be comparing
# Standard and MinMax scaling with each clustering algorithm.
partisaner = Pipeline( # OneHotEncodes race type and partisan
    [
        ('ohe', OneHotEncoder(drop = 'first'))
    ]
)

preprocessing_pipe = Pipeline(
    [
        ('ct', ColumnTransformer(
            [
                ('patrisan_ct', partisaner, ['partisan', 'type_simple'])
            ],
            remainder = 'passthrough'
            )
        )
    ]
)

## K-Means Clustering

In [5]:
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('kmeans', KMeans(n_init = 30, random_state = 42)) # Default is 8 clusters
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "8-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)
clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

8-Means Silhouette Score: 0.4115341273020342


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1973,0.972884,5.638094,0.776736
1,1960,-1.317735,5.545255,0.811735
2,886,2.759526,4.545056,0.80474
3,1247,0.417674,6.191564,0.831997
4,1442,0.481553,3.768974,0.785021
5,1082,-3.509769,8.371774,0.799908
6,867,-0.7903,4.98278,0.782007
7,1298,0.051148,5.161009,0.829738


We will consider this model our 'baseline' so to speak, it has relatively well distributed clusters, some apparent differences in the average error, bias, and correct call rate (CCR), and has a silhouette score or 0.4115 which is not bad but not good. Before going any further lets try a number of values for $k$ ranging from 2 to 51 solely because there are 51 states plus DC and PR in our data (Note that we did not include state in our variables).

In [16]:
score = []

for k in range(2, 52):#
    cl  = Pipeline(
    [
        ('preprocessing', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('kmeans', KMeans(n_clusters = k, n_init = 30, random_state = 42))
    ]
)
    cl.fit(X)

    sil = silhouette_score(cl.transform(X), cl['kmeans'].labels_)

    score.append((k, cl['kmeans'].inertia_, sil)) 

score_df = pd.DataFrame(score, columns = ['k', 'inertia', 'silhouette_score'])

score_df.sort_values(
    by = ['silhouette_score'], ascending = False
    ).head(), score_df.loc[[0, 3, 48, 49], :]

(     k       inertia  silhouette_score
 0    2  16081.932580          0.588787
 1    3  14067.272930          0.559873
 26  28   4722.691140          0.535634
 20  22   5383.550621          0.528013
 22  24   5114.216432          0.527903,
      k       inertia  silhouette_score
 0    2  16081.932580          0.588787
 3    5  11568.173246          0.411342
 48  50   3496.351330          0.427591
 49  51   3483.726987          0.388677)

We see that our best performing values of $k$ are 2 and 3, with a few values in the mid 20s also performing well. We also see that 50 and 51 clusters performed poorly which is to be expected but was worth trying at least. Lets take a look at our 2 and 3 cluster models before moving on.

#### 2-Means Model

In [31]:
# k = 2
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('kmeans', KMeans(n_clusters = 2, n_init = 30, random_state = 42))
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "2-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

2-Means Silhouette Score: 0.5887869160751409


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5851,-0.141478,5.552343,0.795847
1,4904,-0.120648,5.457969,0.809543


We see these clusters are fairly even in size and also not particularly different in terms of poll accuracy metrics. Let's look at other measurable differences between our clusters.

In [22]:
clustered.groupby('cluster')['partisan'].value_counts(normalize = True)

cluster  partisan
0        NPL         0.908392
         D           0.050761
         R           0.040848
1        NPL         0.942496
         D           0.031403
         R           0.026101
Name: proportion, dtype: float64

We see more Non-Partisan polls in the first cluster but otherwise this isnt' too interesting of an angle so let's try our three most common methodologies.

In [24]:
clustered.groupby('cluster')[['Live Phone', 'IVR', 'Online']].mean()

Unnamed: 0_level_0,Live Phone,IVR,Online
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.027175,0.014698
1,0.00102,0.651101,0.463295


It looks like our 2-mean model is separating at least partially based on Live Phone methodology, that's interesting and worth keeping in mind for the 3-mean model. Before we move on, let's quickly consider a few more measures.

In [27]:
clustered.groupby('cluster')['type_simple'].value_counts(normalize  = True)

cluster  type_simple
0        Pres-G         0.249872
         Sen-G          0.227482
         House-G        0.182704
         Pres-P         0.180311
         Gov-G          0.159631
1        Pres-G         0.299959
         Sen-G          0.268352
         Gov-G          0.158238
         Pres-P         0.155791
         House-G        0.117659
Name: proportion, dtype: float64

No particular difference here, fewer House races in the second cluster but no clear standouts here.

In [29]:
clustered.groupby('cluster')[
    [
        'days_bt_polldate_election',
        'samplesize',
        'margin_poll'
    ]
].mean()

Unnamed: 0_level_0,days_bt_polldate_election,samplesize,margin_poll
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,10.432063,751.575799,2.660118
1,9.163744,1115.34533,2.87154


Cluster one has slightly earlier polls and much smaller sample sizes, likely due to live phone polling having lower response rates, margin is about the same however but let's quickly check the standard deviation for margin and bias because they both can have negative values.

In [30]:
clustered.groupby('cluster')[
    [
        'calc_bias',
        'margin_poll'
    ]
].std()

Unnamed: 0_level_0,calc_bias,margin_poll
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7.456059,14.971607
1,7.211605,13.321026


Slightly lower bias standard deviation for cluster two, we also see noticeably lower poll margin standard deviation. We have very slight evidence that cluster two is slightly more accurate it seems. Now onto the 3-means model.

#### 3-Means Model

In [32]:
# k = 3
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('kmeans', KMeans(n_clusters = 3, n_init = 30, random_state = 42))
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "3-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

3-Means Silhouette Score: 0.5598733195136475


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2767,-1.335276,5.461211,0.802855
1,2345,1.454529,5.43052,0.81258
2,5643,-0.201242,5.565639,0.79736


In [33]:
clustered.groupby('cluster')['partisan'].value_counts(normalize = True)

cluster  partisan
0        NPL         0.926997
         D           0.040477
         R           0.032526
1        NPL         0.955650
         D           0.026013
         R           0.018337
2        NPL         0.909268
         D           0.049265
         R           0.041467
Name: proportion, dtype: float64

We see that there again isn't much difference between error and CCR for this model but we do see a bit of a difference between bias for the three clusters. Its hard to make sense of that just yet but for now we can say that the polls in the first cluster trend towards Republican bias while the polls in the third cluster trend toward Democratic bias. The polls in cluster 2 however seem to trend towards neither. All three clusters are about the same amount of wrong (though cluster two is ~1% more accurate) but the ways in which they are wrong is different. We also see that the cluster sizes have diverged a bit, our third cluster is about half of our observations with the remaining two being about the same size. Let's check methodology, we'd expect to see some trends here that extend what we saw in the 2-means model.

In [34]:
clustered.groupby('cluster')[['Live Phone', 'IVR', 'Online']].mean()

Unnamed: 0_level_0,Live Phone,IVR,Online
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.057102,0.988435,0.0
1,0.026866,0.262687,0.993603
2,0.998582,0.000177,0.004962


As expected, our 3-means model managed to separate into the three main methodologies. Remember, these methodologies can be used together so some amount of overlap is expected. Cluster 1 is mainly composed of IVR polls with a handful of live phone polls and no online polls. Cluster 2 has mainly online polls but a quarter of the polls are IVR with a very small number of live phone polls. Lastly, cluster 3 is almost entirely live phone polls with a few IVR and online polls.

In [35]:
clustered.groupby('cluster')['type_simple'].value_counts(normalize  = True)

cluster  type_simple
0        Sen-G          0.250813
         Pres-G         0.236718
         Pres-P         0.192989
         Gov-G          0.173473
         House-G        0.146007
1        Pres-G         0.378678
         Sen-G          0.293390
         Gov-G          0.139446
         Pres-P         0.100213
         House-G        0.088273
2        Pres-G         0.246323
         Sen-G          0.224172
         Pres-P         0.186071
         House-G        0.183413
         Gov-G          0.160021
Name: proportion, dtype: float64

We see some slight differences here, namely cluster 2 has proportionally, more presidential general polls compared to the other groups. Again, we struggle to attribute much meaning here given how even the types are distributed across clusters.

In [36]:
clustered.groupby('cluster')[
    [
        'calc_bias',
        'margin_poll'
    ]
].std()

Unnamed: 0_level_0,calc_bias,margin_poll
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7.144516,13.018991
1,6.955245,13.480922
2,7.480267,15.082851


Checking the standard deviation of bias and poll margin is interesting, our second cluster had the lowest on average bias and has the lowest standard deviation for bias as well. Cluster 3, the biggest one, has the highest standard deviation for both metrics and is much higher for poll margin than the rest. We now may have some slight evidence that cluster 2 captures the 'most accurate' polls in some way. More importantly, we can say that online polls are actually generally well centered around 0 bias in the aggregate which is quite interesting.

### Standard Scaler

So far we've only worked with a min-max scaler and our original data. Min-max scaler scales values to between 0 and 1 depending on their distance from the minimum and maximum values for that feature which means that all of our categorical variables (coded as 0 and 1) are set as the maximum possible difference from each other. This likely led to our clusters being so well-defined by methodology. With a standard scaler we'd expect slightly different results so let's check and compare.

In [37]:
score = []

for k in range(2, 31): # Reducing max clusters from 52 to 30 
    cl  = Pipeline(
    [
        ('preprocessing', preprocessing_pipe),
        ('ss', StandardScaler()), # StandardScaler instead
        ('kmeans', KMeans(n_clusters = k, n_init = 30, random_state = 42))
    ]
)
    cl.fit(X)

    sil = silhouette_score(cl.transform(X), cl['kmeans'].labels_) 

    score.append((k, cl['kmeans'].inertia_, sil)) 

score_df = pd.DataFrame(score, columns = ['k', 'inertia', 'silhouette_score'])

score_df.sort_values(
    by = ['silhouette_score'], ascending = False
    ).head(), score_df.loc[[0, 1], :] # Making sure to check 2 and 3 means

(     k        inertia  silhouette_score
 0    2  224418.079078          0.653473
 20  22   90639.513827          0.314636
 14  16  105750.001434          0.314532
 22  24   87424.452103          0.306763
 23  25   85964.821563          0.305023,
    k        inertia  silhouette_score
 0  2  224418.079078          0.653473
 1  3  205136.317710          0.258471)

As expected, standard scaler shows very different results from the min-max scaler. Additionally, generally we have worse silhouette scores for standard scaled models. The notable exception is 2-means here which has the highest silhouette score we've seen so far so lets take a look at it.

In [38]:
# k = 2
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('ss', StandardScaler()),
        ('kmeans', KMeans(n_clusters = 2, n_init = 30, random_state = 42))
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "2-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

2-Means Silhouette Score: 0.6534730509132135


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,10107,-0.125055,5.509579,0.808647
1,648,-0.24,5.505123,0.699846


In [40]:
clustered.groupby('cluster')['partisan'].value_counts()

cluster  partisan
0        NPL         9937
         D            170
1        R            367
         D            281
Name: count, dtype: int64

Interesting, our model has only separated a small section of the polls but that section has a much lower CCR and twice the bias. We also see the exact same error which is notable but it's hard to discern what that tells us yet. We also see that this model is separating all of the Republican polls and about two-thirds of the of the Democratic polls. They are in the same cluster than has much lower CCR than the main cluster. It seems that second cluster is isolating less accurate polls due to partisan influence but we can't draw any hard conclusions from this information.

In [42]:
clustered.groupby('cluster')[['Live Phone', 'IVR', 'Online']].mean()

Unnamed: 0_level_0,Live Phone,IVR,Online
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.541902,0.30563,0.223805
1,0.584877,0.405864,0.148148


We have mostly an even distribution of poll methodoology here. There are some differences in IVR and online rates but given the sample size difference this isn't too notable. The standard scaler is very clearly working differently than the min-max scaler as we expected.

In [43]:
clustered.groupby('cluster')['type_simple'].value_counts(normalize  = True)

cluster  type_simple
0        Pres-G         0.275749
         Sen-G          0.244286
         Pres-P         0.175027
         Gov-G          0.161472
         House-G        0.143465
1        House-G        0.302469
         Sen-G          0.274691
         Pres-G         0.225309
         Gov-G          0.120370
         Pres-P         0.077160
Name: proportion, dtype: float64

Some slight differences here, the less accurate cluster twice the proportion of House races and very few Presidential Primary races which makes some sense given the partisan slant that our cluster seems to represent.

In [44]:
clustered.groupby('cluster')[
    [
        'calc_bias',
        'margin_poll'
    ]
].std()

Unnamed: 0_level_0,calc_bias,margin_poll
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7.353117,14.484143
1,7.226516,9.486006


Taking our quick look at standard deviation for margin and bias we see similar spread in bias but much less spread in margins. This again seems to point to systematic issues with the polls in the second cluster.

### PCA

Principle Component Analysis (PCA) is a method of dimensionality reduction that transforms our data into a set of linear combinations of our features that maximize the variance in our data. In essence, PCA is a tool that can help us separate the valuable information in our data from the noise. This may prove quite valuable because polling has a lot of noise. In the end, the goal of polling is to predict the future and there will always be randomness the poll will not take into account.

We will try PCA with both the standard scaler and the min-max scaler because they spread our data out differently so it follows that our PCA will behave differently with the different scalers.

#### Min-Max Scaler

In [46]:
score = []

for k in range(2, 31):
    cl  = Pipeline(
    [
        ('preprocessing', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        # Choosing the components that give us 85% of the variance in our data
        ('pca', PCA(n_components = 0.85)), 
        ('kmeans', KMeans(n_clusters = k, n_init = 30, random_state = 42))
    ]
)
    cl.fit(X)

    sil = silhouette_score(cl.transform(X), cl['kmeans'].labels_)

    score.append((k, cl['kmeans'].inertia_, sil))

score_df = pd.DataFrame(score, columns = ['k', 'inertia', 'silhouette_score'])

score_df.sort_values(
    by = ['silhouette_score'], ascending = False
    ).head(), score_df.loc[[0, 1], :]

(     k       inertia  silhouette_score
 0    2  13590.609948          0.633850
 1    3  11580.460107          0.598354
 19  21   3363.467289          0.596041
 20  22   3240.350545          0.595020
 18  20   3549.065515          0.589247,
    k       inertia  silhouette_score
 0  2  13590.609948          0.633850
 1  3  11580.460107          0.598354)

We see higher silhouette scores which is interesting, especially for the 2-means model. We also see that 3-means does only slightly better than the 20-22 means models. We'll investigate the 2 and 3 means models to see if they're any different from the clusters from before.

In [50]:
# k = 2
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('pca', PCA(n_components = 0.85)),
        ('kmeans', KMeans(n_clusters = 2, n_init = 30, random_state = 42))
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "2-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

2-Means Silhouette Score: 0.6338501807701804


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5849,-0.142424,5.553344,0.795777
1,4906,-0.119529,5.456814,0.809621


In [52]:
clustered.groupby('cluster')['partisan'].value_counts(normalize = True)

cluster  partisan
0        NPL         0.908360
         D           0.050778
         R           0.040862
1        NPL         0.942519
         D           0.031390
         R           0.026091
Name: proportion, dtype: float64

These clusters are nearly the same as the ones we had for our 2-mean min-max model without PCA decomposition. For that reason we'll do only a cursory glance at this model. It is interesting that exactly two polls got moved but because this model mainly separated by methodology we can't take too much away here.

In [53]:
clustered.groupby('cluster')[['Live Phone', 'IVR', 'Online']].mean()

Unnamed: 0_level_0,Live Phone,IVR,Online
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.026842,0.014703
1,0.001427,0.651243,0.463106


The same as before, we can be satisfied this model is performing very similarly to the one without PCA.

In [54]:
# k = 3
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('pca', PCA(n_components = 0.85)),
        ('kmeans', KMeans(n_clusters = 3, n_init = 30, random_state = 42))
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "3-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

3-Means Silhouette Score: 0.5983536672156453


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5651,-0.197356,5.568252,0.797646
1,2368,1.389307,5.465532,0.812289
2,2736,-1.313622,5.425464,0.802449


In [55]:
clustered.groupby('cluster')['partisan'].value_counts(normalize = True)

cluster  partisan
0        NPL         0.909397
         D           0.049195
         R           0.041409
1        NPL         0.956503
         D           0.025760
         R           0.017736
2        NPL         0.925804
         D           0.040936
         R           0.033260
Name: proportion, dtype: float64

Again, these results look quite familiar. We will check methodology again before moving along.

In [56]:
clustered.groupby('cluster')[['Live Phone', 'IVR', 'Online']].mean()

Unnamed: 0_level_0,Live Phone,IVR,Online
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.997346,0.0,0.005309
1,0.02576,0.260135,0.983108
2,0.058114,1.0,0.0


The few polls that moved around due to PCA seem to have made the clusters a bit purer with regards to methodology. So far it seems that PCA has improved our clustering but not changed them by a large margin.

#### Standard Scaler

In [58]:
score = []

for k in range(2, 31): # Reducing max clusters from 52 to 30 
    cl  = Pipeline(
    [
        ('preprocessing', preprocessing_pipe),
        ('ss', StandardScaler()), # StandardScaler instead
        ('pca', PCA(n_components = 0.85)),
        ('kmeans', KMeans(n_clusters = k, n_init = 30, random_state = 42))
    ]
)
    cl.fit(X)

    sil = silhouette_score(cl.transform(X), cl['kmeans'].labels_) 

    score.append((k, cl['kmeans'].inertia_, sil)) 

score_df = pd.DataFrame(score, columns = ['k', 'inertia', 'silhouette_score'])

score_df.sort_values(
    by = ['silhouette_score'], ascending = False
    ).head(), score_df.loc[0, :] # Making sure to check 2 means

(     k        inertia  silhouette_score
 0    2  192654.792815          0.675127
 19  21   69983.212980          0.300875
 10  12   98391.385355          0.298260
 12  14   85879.860042          0.295490
 18  20   71749.739544          0.295341,
 k                        2.000000
 inertia             192654.792815
 silhouette_score         0.675127
 Name: 0, dtype: float64)

Again, we see similar results as what we saw before applying PCA but let's take a look at our 2-means model again before moving on.

In [59]:
# k = 2
kmeans_pipe = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('ss', StandardScaler()),
        ('pca', PCA(n_components = 0.85)),
        ('kmeans', KMeans(n_clusters = 2, n_init = 30, random_state = 42))
    ]
).fit(X)

clustered = pd.concat(
    [X, pd.Series(kmeans_pipe['kmeans'].labels_, name = 'cluster')]
    , axis = 1)

print(
    "2-Means Silhouette Score: {}".format(
        silhouette_score(kmeans_pipe.transform(X), clustered['cluster'])
    )
)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

2-Means Silhouette Score: 0.6751270446822838


Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,10107,-0.125055,5.509579,0.808647
1,648,-0.24,5.505123,0.699846


We see the exact same values as we saw before along with the exact same cluster size so we can surmise that PCA did nothing except inflate our silhouette score due to maximizing the variance in our data.

It seems that PCA is valuable and does improve the performance of our clustering but does not actually change our clusters by much. Moving forward we will be applying PCA from the start to ensure our models are as effective as possible though it will likely make no major difference.

## DBSCAN

DBSCAN is a density based clustering algorithm that separates the data into clusters as appropriate and designates outliers of data points that don't fit into any possible cluster. It should perform differently from K-Means and we suspect that there will be a number of outliers identified.

### Min-Max Scaler

In [60]:
# DBSCAN doesn't have a transform method so we need to process our data separately
dbscan_prep = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('pca', PCA(n_components = 0.85))
    ]
).fit(X)

In [62]:
# Finding good values of epsilon and min_samples is crucial so we set a wide range
# 
score = []

for min_sample in range(5, 100, 5):
    for eps in range(2, 20): # The max value here came from trial and error (eps >= 1 throws an error)
        db = DBSCAN(eps = eps/20, min_samples = min_sample, n_jobs = 4)
        db.fit(dbscan_prep.transform(X))

        sil = silhouette_score(dbscan_prep.transform(X), db.labels_)

        score.append((eps/10, min_sample, sil))

score_db = pd.DataFrame(score, columns = ['eps', 'min_samples', 'silhouette_score'])
score_db.sort_values(by = ['silhouette_score'], ascending = False).head()

In [63]:
# eps = 0.9, min_samples = 20

Unnamed: 0,eps,min_samples,silhouette_score
61,0.9,20,0.422936
25,0.9,10,0.421476
43,0.9,15,0.416802
79,0.9,25,0.414523
7,0.9,5,0.413423


We have quite low silhouette scores here which is unfortunate but we can take a look at our best performing hyper-parameters and see if we get anything particularly useful.

In [65]:
dbscan = DBSCAN(eps = 0.9, min_samples = 20, n_jobs = 4).fit(
    dbscan_prep.transform(X)
    )

clustered = pd.concat([X, pd.Series(dbscan.labels_, name = 'cluster')], axis = 1)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
-1,28,-1.751071,6.143929,0.267857
0,10727,-0.127754,5.507655,0.803487


It looks like our model was only able to find 28 outliers and no clusters, but those outliers were much less accurate than the rest of the polls. Let's investigate the 28 before moving on.

In [73]:
# These two features and metrics were isolated by using .describe() 
# For the sake of brevity only the relevant features and metric are kept
clustered.groupby('cluster')[['year', 'anon']].agg(['min', 'mean', 'median', 'max'])

Unnamed: 0_level_0,year,year,year,year,anon,anon,anon,anon
Unnamed: 0_level_1,min,mean,median,max,min,mean,median,max
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
-1,2012,2018.25,2019.5,2020,0,0.964286,1.0,1
0,1998,2010.229235,2010.0,2020,0,0.043069,0.0,1


It seems our model has managed to isolate a strand of modern polls that were mostly sponsered by anonymous partisan operatives that were *much* less accurate than typical polls are. Let's check our partisan variable to see if we have any interesting result there.

In [74]:
clustered.groupby('cluster')['partisan'].value_counts()

cluster  partisan
-1       D             15
         R             13
 0       NPL         9937
         D            436
         R            354
Name: count, dtype: int64

An even amount of each which is less scandalous than it could have been but still a notable result. All in all, DBSCAN managed to identify 28 polls that performed quite poorly which is an accomplishment in itself with how poorly or models have done at identifying differences in CCR so far.

### Standard Scaler

In [75]:
dbscan_prep = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('ss', StandardScaler()),
        ('pca', PCA(n_components = 0.85))
    ]
).fit(X)

In [88]:
score = []

for min_sample in range(5, 100, 5):
        for eps in range(5, 30):
            # Avoiding a value error that comes up with some ratios of eps : min_samples
            # If the algorithm assigns everything to 1 group then the exception is triggered.
            # You can only calculate a silhouette score if there are 2 or more groups.
            try: 
                db = DBSCAN(eps = eps/20, min_samples = min_sample, n_jobs = 4)
                db.fit(dbscan_prep.transform(X))
                
                sil = silhouette_score(dbscan_prep.transform(X), db.labels_)
            except ValueError:
                sil = np.nan
            score.append((eps/10, min_sample, sil))

score_db = pd.DataFrame(score, columns = ['eps', 'min_samples', 'silhouette_score'])
score_db.sort_values(by = ['silhouette_score'], ascending = False).head()

Unnamed: 0,eps,min_samples,silhouette_score
24,2.9,5,0.095208
23,2.8,5,0.088517
49,2.9,10,0.078591
124,2.9,25,0.077756
99,2.9,20,0.07508


This is shockingly poor performance. We can check if the issue is with standard scaling or with applying PCA to standard scaled data but before that let's take a look at one of these models just in case.

In [91]:
dbscan = DBSCAN(eps = 2.9, min_samples = 5, n_jobs = 4).fit(
    dbscan_prep.transform(X)
    )

clustered = pd.concat([X, pd.Series(dbscan.labels_, name = 'cluster')], axis = 1)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
-1,147,-1.672517,11.900748,0.636054
0,9268,-0.276293,5.495708,0.810747
1,131,0.645802,4.969771,0.774809
2,413,0.111961,5.045182,0.857143
3,402,1.42607,4.126368,0.670398
4,9,3.04,5.511111,0.388889
5,15,1.776,3.176,0.9
6,9,-0.023333,7.081111,0.888889
7,33,-0.823333,4.688182,0.787879
8,7,9.614286,10.16,1.0


That's simple too many clusters to make much sense of but it does seem capable of separating some of the low CCR polls. It is very likely that the result is simply due to very small clusters for the most part so we're going to move on to checking if PCA is the issue here.

In [92]:
dbscan_prep = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('ss', StandardScaler()),
        # ('pca', PCA(n_components = 0.85)) No PCA this time
    ]
).fit(X)


score = []
# Same hyper-paramters as before
for min_sample in range(5, 100, 5):
        for eps in range(5, 30):
            try: 
                db = DBSCAN(eps = eps/20, min_samples = min_sample, n_jobs = 4)
                db.fit(dbscan_prep.transform(X))
                
                sil = silhouette_score(dbscan_prep.transform(X), db.labels_)
            except ValueError:
                sil = np.nan
            score.append((eps/10, min_sample, sil))

score_db = pd.DataFrame(score, columns = ['eps', 'min_samples', 'silhouette_score'])
score_db.sort_values(by = ['silhouette_score'], ascending = False).head()

Unnamed: 0,eps,min_samples,silhouette_score
24,2.9,5,0.065546
23,2.8,5,0.047925
22,2.7,5,0.038718
49,2.9,10,0.036884
48,2.8,10,0.03196


Interesting, it seems like standard scaler just doesn't work well with DBSCAN in this case. We will move on and set DBSCAN aside in favor of a different approach.

## Heriarchical DBSCAN (HDBSCAN)

The main flaw of DBSCAN is that it performs poorly on data with varying densities. That is, when some of the data is clustered tight and some of the data is clustered over a large area, DBSCAN has a hard time properly assigning clusters. HDBSCAN aims to solve this issue by automatically varying the density checked for clusters. In short, HDBSCAN can detect clusters with varying densities while DBSCAN cannot. A helpful reference can be found [here](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html).

### Min-Max Scaler

We will start with min-max scaling because standard scaling performed quite poorly with DBSCAN.

In [93]:
hdbscan_prep = Pipeline(
    [
        ('pre', preprocessing_pipe),
        ('min_max', MinMaxScaler()),
        ('pca', PCA(n_components = 0.85))
    ]
).fit(X)

In [95]:
for min_sample in range(15, 100, 5):
        for min_cluster_size in range(5, 100, 5):
            hdb = HDBSCAN(
                  min_cluster_size = min_cluster_size, 
                  min_samples = min_sample, 
                  n_jobs = 4
            )
            hdb.fit(hdbscan_prep.transform(X))

            sil = silhouette_score(hdbscan_prep.transform(X), hdb.labels_)

            score.append((min_cluster_size, min_sample, sil))

score_db = pd.DataFrame(score, columns = ['min_cluster', 'min_samples', 'silhouette_score'])
score_db.sort_values(by = ['silhouette_score'], ascending = False).head()

In [96]:
score_db.sort_values(by = ['silhouette_score'], ascending = False).head()

Unnamed: 0,min_cluster,min_samples,silhouette_score
603,75.0,45,0.382961
602,70.0,45,0.382961
605,85.0,45,0.379923
604,80.0,45,0.379923
607,95.0,45,0.376771


It looks like HDBSCAN is doing a worse job than DBSCAN but we should look at our best models before making any decisions here.

In [97]:
hdbscan = HDBSCAN(min_cluster_size = 75, min_samples = 45, n_jobs = 4).fit(
    hdbscan_prep.transform(X)
    )

clustered = pd.concat([X, pd.Series(hdbscan.labels_, name = 'cluster')], axis = 1)

clustered.groupby('cluster')[['calc_bias', 'error', 'rightcall']].agg(
    {
    'calc_bias' : ['count', 'mean'],
    'error' : ['mean'],
    'rightcall' : ['mean']
    }
)

Unnamed: 0_level_0,calc_bias,calc_bias,error,rightcall
Unnamed: 0_level_1,count,mean,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
-1,1414,-0.310035,6.184717,0.5686
0,474,1.520127,5.634768,0.862869
1,131,2.102214,4.472443,0.923664
2,481,-1.314699,4.578274,0.995842
3,85,-5.456471,7.740471,0.0
4,123,-3.306911,8.724634,0.979675
5,121,1.806777,4.159835,0.979339
6,372,-0.76,5.614516,0.865591
7,940,-0.304521,4.906777,0.994681
8,124,0.813145,6.568629,0.004032
