# The shape of the United States presidential elections
### There are many ways to map election results - you've probably not tried this one

This Jupyter notebook reproduces the results discussed in a [blog post](https://towardsdatascience.com/the-shape-of-the-united-states-presidential-elections-c336d80e4ddf) on Medium's Towards Data Science. Further details and explanations can be found in the [blog post](https://towardsdatascience.com/the-shape-of-the-united-states-presidential-elections-c336d80e4ddf).


## Libraries

In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex

import seaborn as sns
sns.set()

import utils
import plotting

from sklearn.preprocessing import MinMaxScaler
from gtda.mapper import make_mapper_pipeline
from gtda.mapper import plot_static_mapper_graph

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

ModuleNotFoundError: No module named 'seaborn'

## Data
As a first step, we will read the full dataset. As we are interested in investigating temporal evolution, we identified the numerical features that change between 2000 and 2016 and extracted them. Giotto's Mapper implementation expects a numpy ndarray. Thus, we extract an ndarray for each election year.

In [None]:
# Load the full dataset from the pickled DataFrame in the `data` directory.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
dataset_path = os.path.join('data', 'usa_election_full_dataset.pickle')
df = pd.read_pickle(dataset_path)

# preview the first rows as the cell output
df.head()

In [None]:
# extract and transform relevant columns
# (the selection and transformation live in the project-local helper
# `utils.get_data`; `data` is split into one array per election year below)
data = utils.get_data(df)

In [None]:
# split data per year
# `data_per_year` is indexed below by year strings such as '2016'.
# `df` is passed along as well, presumably to supply the election year of
# each row -- confirm in utils.split_data_by_year.
data_per_year = utils.split_data_by_year(data, df)

## Mapper
We want to use the first two principal components as a filter function. However, we observe that the resulting filter value distribution for the election year 2016 is
1. skewed in the first component and 
2. heavy-tailed in the second component.

We, therefore, find a transformation to obtain a more compact distribution. Thanks to Giotto's compatibility with Sklearn, we can simply define the filter function as a pipeline.

In [None]:
# define pca as initial filter function
# (n_components=2: only the first two principal components are kept)
pca = PCA(n_components=2)

In [None]:
# analyse filter values for 2016: project the 2016 data onto the two
# principal components and compare raw vs. log-transformed distributions
filtr_vals = pca.fit_transform(data_per_year['2016'])
pc1 = filtr_vals[:, 0]
pc2 = filtr_vals[:, 1]

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# (values, target axes, title) for each panel, in drawing order:
# left column is component 1, right column is component 2;
# top row is the raw component, bottom row its log transform
panels = [
    # pca 1: shift so the minimum maps to log(1) = 0
    (pc1, ax[0, 0], 'Original PCA 1'),
    (np.log(pc1 - min(pc1) + 1), ax[1, 0], 'Transformed PCA 1'),
    # pca 2: log of the absolute value (+1 keeps the argument >= 1)
    (pc2, ax[0, 1], 'PCA 2'),
    (np.log(np.abs(pc2) + 1), ax[1, 1], 'Transformed PCA 2'),
]

for panel_vals, panel_ax, panel_title in panels:
    sns.distplot(panel_vals, ax=panel_ax).set_title(panel_title)

In [None]:
# define filter function: a two-component PCA followed by the
# project-local log transform of the resulting 2D filter values
filter_steps = [
    ('pca', PCA(n_components=2)),
    ('trafo', FunctionTransformer(utils.log_transform_2d_filter_values)),
]
filter_func = Pipeline(filter_steps)

In [None]:
# perform mapper
# Build the Mapper pipeline from the PCA + log-transform filter and a
# MinMaxScaler; every option not passed here keeps its library default.
pipe = make_mapper_pipeline(filter_func=filter_func, scaler=MinMaxScaler())
# Fit on the 2016 data. The cells below read per-node metadata from the
# result via graph['node_metadata'].
graph = pipe.fit_transform(data_per_year['2016'])

### Plotting
Now that we performed the Mapper algorithm, we can analyse the result visually.

In [None]:
# retrieve the data points of every node (one entry per node of the graph)
node_elements_full = graph['node_metadata']['node_elements']

# colour each node (i.e. cluster) by the mean filter function value of the
# data points it contains
filter_values_2016 = filter_func.fit_transform(data_per_year['2016'])
node_color = np.array(
    utils.get_node_summary(node_elements_full,
                           filter_values_2016,
                           summary_stat=np.mean))

# custom plot options:
# 1. uniform node size
# 2. colour range spanning the smallest to the largest node colour
n_nodes = len(node_elements_full)
plotly_kwargs = dict(
    node_trace_marker_size=[1] * n_nodes,
    node_trace_marker_cmin=min(node_color),
    node_trace_marker_cmax=max(node_color),
)

# create the figure object (2D 'kk' layout, no colour-by-column dropdown)
fig = plot_static_mapper_graph(
    pipe,
    data_per_year['2016'],
    layout='kk',
    layout_dim=2,
    node_color_statistic=node_color,
    color_by_columns_dropdown=False,
    plotly_kwargs=plotly_kwargs,
)

# show figure and enable zooming via scrolling
fig.show(config={'scrollZoom': True})

### Colored by Economic Indicator
We can find different regions in the Mapper graph based on the features we used. As a first step, we only plot the big connected component and define the singletons/small connected components as one region. The regions of the big component are revealed by coloring the nodes by the mean value of a given feature. Here, we limit ourselves to the three most informative features.

In [None]:
# ids of the graph vertices treated as singletons / small components
vertices_to_remove = utils.get_small_cluster_ids()

# flatten the data points of those vertices into a single array
node_elements = graph['node_metadata']['node_elements']
nodes_to_remove = np.array([point
                            for vertex in vertices_to_remove
                            for point in node_elements[vertex]])

In [None]:
# features used to colour the Mapper graph, one figure per feature
relevant_cols = ['Personal income (thousands of dollars)',
                 'Per capita personal income',
                 'Per capita retirement and other']

# Everything in this section is the same for every feature, so it is computed
# once instead of once per loop iteration.
df_2016 = df[df['year'] == 2016]
node_elements = graph['node_metadata']['node_elements']
# NOTE(review): assumes utils.get_n_electors is deterministic and that
# utils.get_node_text does not modify the value it is given -- confirm in utils.
n_electors_per_node = utils.get_n_electors(
    node_elements_full,
    df_2016['n_electors'].reset_index(drop=True))

for col in relevant_cols:
    print(f'-- {col} --')

    # set node color to mean feature value
    df_filtered = utils.get_filtered_values(df_2016[col].values,
                                            nodes_to_remove)
    node_color = np.array([np.mean(df_filtered[members])
                           for members in node_elements])

    # set node text (node id -> data points, electors per node, colour, label)
    node_text = utils.get_node_text(
        dict(enumerate(node_elements_full)),
        n_electors_per_node,
        node_color,
        col)

    # set custom plot options: hover text, uniform node size and a colour
    # range spanning the node colours of this feature
    plotly_kwargs = {
        'node_trace_text': node_text,
        'node_trace_marker_size': [1] * len(node_elements_full),
        'node_trace_marker_cmin': min(node_color),
        'node_trace_marker_cmax': max(node_color)}

    # get figure object; clone_pipeline=False makes the plot use `pipe`
    # itself rather than a copy of it
    fig = plot_static_mapper_graph(pipe, data_per_year['2016'],
                                   layout='kk', layout_dim=2,
                                   node_color_statistic=node_color,
                                   color_by_columns_dropdown=True,
                                   plotly_kwargs=plotly_kwargs,
                                   clone_pipeline=False)

    # show figure and enable zooming via scrolling
    fig.show(config={'scrollZoom': True})

The six regions we identify are as follows:
1. High net worth: light green
2. High net worth per inhabitant: violet
3. High per capita retirement: pink
4. Elevated and average net worth: orange
5. Low net worth: yellow
6. Singletons: dark green

We can, on the one hand, color the Mapper graph accordingly:

In [None]:
# one hex colour per region id (0-5)
region_colors = ['#004e00', '#7CFC00', '#6f0043',
                 '#32a8a0', '#f8de00', '#f80096']
colorscale = dict(enumerate(region_colors))

# Mapper graph of the 2016 data ('kk' layout), coloured by region
fig = plotting.get_region_plot(
    pipe,
    data_per_year['2016'],
    'kk',
    node_elements_full,
    colorscale,
)
fig.show(config={'scrollZoom': True})


On the other hand, we can also color the counties on a map of the USA by the color of the region they belong to. The color of a county belonging to multiple regions is given by the mean of their colors.

In [None]:
# region id (0-5) -> hex colour, same palette as the region-coloured graph
region_palette = ['#004e00', '#7CFC00', '#6f0043',
                  '#32a8a0', '#f8de00', '#f80096']
colorscale = {region: colour for region, colour in enumerate(region_palette)}

# FIPS codes of the 2016 rows, used to place the counties on the map
county_fips = df[df['year'] == 2016]['fips'].tolist()

fig = plotting.get_county_plot_by_region(
    data_per_year['2016'],
    colorscale,
    node_elements_full,
    county_fips,
)
fig.show(config={'scrollZoom': True})

### Colored by Winner of Presidential Election
Even though we used only economic indicators for the Mapper algorithm, we can color the graph by the percentage of counties won by Democrats/Republicans. Thus, red nodes indicate a cluster where most counties were won by Republicans. In addition, we adjust the size of a node (<i>i.e.</i> cluster) based on the number of counties it represents.

In [None]:
# one figure per presidential election, most recent first
election_years = (2016, 2012, 2008, 2004, 2000)

for year in election_years:
    print(f'-- {year} Election --')
    # data_per_year is keyed by the year as a string
    year_data = data_per_year[str(year)]
    fig = plotting.get_graph_plot_colored_by_election_results(
        pipe, year, df, year_data)
    fig.show(config={'scrollZoom': True})

### Example for 3D Plot
Giotto's Mapper implementation also includes the possibility to plot the result in 3D!

In [None]:
# set layout
# NOTE: this variable is not passed to the plot below, which requests its own
# layout via layout='kk', layout_dim=3; it is kept so the name stays defined.
layout = graph.layout('kamada_kawai_3d')

# rows of the 2016 election, shared by everything below
df_2016 = df[df['year'] == 2016]

# per-node elector counts: used for the hover text, the marker size and the
# size reference, so compute them once instead of three times
n_electors_per_node = utils.get_n_electors(
    node_elements_full,
    df_2016['n_electors'].reset_index(drop=True))

# define node color: mean of the 'winner' column over the counties of a node
node_color = np.array(utils.get_node_summary(node_elements_full,
                                             df_2016['winner'].values,
                                             summary_stat=np.mean))

# define node text
node_text = utils.get_node_text(
    dict(enumerate(node_elements_full)),
    n_electors_per_node,
    node_color,
    'Percentage of Counties Won by Republicans')

# hover labels use the same reversed red-blue colormap as the markers;
# look the colormap up once rather than once per node
hover_cmap = get_cmap('RdBu_r')

# set custom plot options
plotly_kwargs = {
    'node_trace_marker_colorscale': 'RdBu',
    'node_trace_marker_reversescale': True,
    'node_trace_hoverlabel': dict(
        bgcolor=[rgb2hex(hover_cmap(value)) for value in node_color]),
    'node_trace_text': node_text,
    'node_trace_marker_size': n_electors_per_node,
    'node_trace_marker_sizeref': .05 / max(n_electors_per_node)}

# Plot the 2016 data: node_color, node_text and the marker sizes above all
# have one entry per node of the 2016 graph, so the figure must be built from
# the same data (the full multi-year `data` would yield a different graph).
fig = plot_static_mapper_graph(pipe, data_per_year['2016'],
                               layout='kk', layout_dim=3,
                               node_color_statistic=node_color,
                               color_by_columns_dropdown=True,
                               plotly_kwargs=plotly_kwargs)
fig.show()