In [21]:
import numpy as np
import pandas as pd

import bokeh.charts
import bokeh.palettes
import bokeh.plotting
from bokeh.io import output_file, output_notebook, push_notebook, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, Patch, DataRange1d, 
  ResetTool, PanTool, WheelZoomTool, BoxSelectTool, ResizeTool
)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

output_notebook()

In [2]:
# Directory holding the input files. Paths below are built by plain
# concatenation, so the value must end with a path separator.
BASE_DIR = "./"

# Tab-separated raw vote counts and the CSV of ballot geolocations.
RAW_RESULTS = "{}{}".format(BASE_DIR, "raw-votes-20.txt")
BALLOTS_GEOLOCATIONS = "{}{}".format(BASE_DIR, "ballots_geolocations.csv")


In [3]:
# Google-Maps view used by every map in this notebook: hybrid tiles centred on
# lat 32.0 / lng 35.0 at zoom level 8.
ISRAEL_MAP_OPTIONS = GMapOptions(
    lat=32.0,
    lng=35.0,
    map_type="hybrid",
    zoom=8,
)

def create_map():
    """Return a new, empty 1200x1000 ``GMapPlot`` titled "Israel".

    The plot gets its own ``DataRange1d`` ranges and its own reset,
    wheel-zoom and pan tools, and uses the shared ``ISRAEL_MAP_OPTIONS``.
    """
    map_tools = [ResetTool(), WheelZoomTool(), PanTool()]
    return GMapPlot(
        title="Israel",
        map_options=ISRAEL_MAP_OPTIONS,
        x_range=DataRange1d(),
        y_range=DataRange1d(),
        tools=map_tools,
        plot_width=1200,
        plot_height=1000,
    )


In [4]:
def all_colors(n):
    """Return ``n`` hex colour strings evenly spaced over the 24-bit RGB range.

    The integer range 0 .. 256**3 - 1 is sampled at ``n`` evenly spaced
    points (both ends included), each sample is truncated to an int and
    formatted as ``'#rrggbb'``.
    """
    max_rgb = 256 * 256 * 256 - 1
    palette = []
    for sample in np.linspace(0, max_rgb, n):
        palette.append('#%06x' % int(sample))
    return palette


# Load data

## Ballots Geolocations

In [5]:
# Geolocation table, indexed by ballot id.
ballots_geo = pd.read_csv(BALLOTS_GEOLOCATIONS)
ballots_geo = ballots_geo.set_index("id")

# Store both coordinates as strings.
# NOTE(review): presumably so identical locations compare exactly when they are
# later paired into a grouping key - confirm.
geo_columns = ['lat', 'long']
ballots_geo[geo_columns] = ballots_geo[geo_columns].astype('str')

## Raw data

In [6]:
# Hebrew column headers of the raw results file -> English names used below.
hebrew_to_english = {
    " שם ישוב": "city_name", 
    "סמל ישוב": "city_id", 
    "מספר קלפי": "ballot_id",
    "בזב": "bazab",
    "מצביעים": "total_votes",
    "פסולים": "invalid_votes",
    "כשרים": "valid_votes",
}

# Tab-separated file whose first row is the header.
ballots_raw = pd.read_csv(RAW_RESULTS, sep="\t", header=0)
ballots_raw = ballots_raw.rename(columns=hebrew_to_english)

# Ballot key "<city_id>-<ballot_id>", kept both as a column and as the index.
ballots_raw["id"] = ballots_raw.apply(
    lambda row: "-".join([str(row.city_id), str(row.ballot_id)]),
    axis=1,
)
ballots_raw.index = ballots_raw["id"].values

## Combine together

In [7]:
# Attach coordinates by matching each ballot's "id" column against the index
# of the geolocation table.
geo_coords = ballots_geo[["lat", "long"]]
ballots = ballots_raw.join(geo_coords, on="id")

# Drop the "external envelopes" rows (the city_name literal below).
is_external_envelope = ballots.city_name == 'מעטפות חיצוניות'
ballots = ballots[~is_external_envelope]


In [8]:
# Column names of the per-party vote counts in the results table
# (26 entries; the order is reused wherever party columns are selected).
ALL_PARTIES = [
    'אמת', 'ג', 'ודעם', 'ז', 'זך', 'זץ', 'טב',
    'י', 'יז', 'יך', 'יץ', 'כ', 'ל',
    'מחל', 'מרצ', 'נז', 'ני', 'נץ', 'ע',
    'פה', 'ף', 'ףץ', 'קנ', 'קץ', 'רק', 'שס',
]


---

# Group ballots by physical location
(Many ballots are located in the same building, so they are aggregated per location.)

In [9]:
# Key every ballot by its (lat, long) pair. `assign` returns a new frame, so
# the filtered `ballots` frame is not modified through chained assignment and
# the cell can be re-run safely.
ballots = ballots.assign(latlong=list(zip(ballots['lat'], ballots['long'])))

# Pick the count columns *before* summing, so text columns (city name, id,
# lat, long) are never aggregated.
summed_columns = ['bazab', 'total_votes', 'invalid_votes', 'valid_votes'] + ALL_PARTIES
location_totals = ballots.groupby('latlong')[summed_columns].sum()

# Re-attach the separate lat / long columns: one row per distinct location,
# matched to the group key held in the index of `location_totals`.
location_coords = ballots[['lat', 'long', 'latlong']].drop_duplicates()
geoagg_ballots = pd.merge(
    location_totals,
    location_coords,
    how='inner',
    left_index=True,
    right_on='latlong',
).set_index('latlong')

---

# Draw all ballots on GMap

In [11]:
plot = create_map()

# lat / long are stored as strings upstream, but glyph coordinates must be
# numeric, so cast them here instead of relying on implicit coercion.
# Marker size grows sub-linearly with the number of valid votes
# ((votes / 50) ** 0.6) and is clipped to the range 3..30.
cds = ColumnDataSource(data=dict(
    lon=geoagg_ballots['long'].astype(float),
    lat=geoagg_ballots['lat'].astype(float),
    radius=np.clip(np.power(geoagg_ballots['valid_votes'] / 50, 0.6), 3, 30),
))

plot.add_glyph(cds, Circle(x='lon', y='lat', size='radius', fill_color="navy", fill_alpha=0.6, radius_units="screen"))

show(plot)

---

## Normalizing votes to % of valid_votes
Each aggregated ballot location thus becomes a point in a NUMBER-OF-PARTIES-dimensional space, with every coordinate between 0 and 1.

In [10]:
# Per-location share of each party: party votes divided row-wise by the
# location's valid votes.
party_shares = geoagg_ballots[ALL_PARTIES].div(geoagg_ballots['valid_votes'], axis="index")

# Same frame as `geoagg_ballots`, with the party columns replaced by shares.
geoagg_ballots_norm = geoagg_ballots.copy()
geoagg_ballots_norm[ALL_PARTIES] = party_shares

# KMeans clustering

In [11]:
KMEANS_NUM_CLUSTERS = 8
# Fixed seed so the cluster assignment (and the colours derived from it) is
# the same on every run.
KMEANS_RANDOM_STATE = 0

kmeans = KMeans(n_clusters=KMEANS_NUM_CLUSTERS, random_state=KMEANS_RANDOM_STATE)
# Cluster on the normalised party shares only. The full frame also carries raw
# vote counts and the lat / long strings, which must not take part in the
# distance computation.
kmeans_clustering = kmeans.fit_predict(geoagg_ballots_norm[ALL_PARTIES])

# One colour per cluster, then one colour per location via its cluster label.
_colors = all_colors(KMEANS_NUM_CLUSTERS)
label_colors = [_colors[i] for i in kmeans_clustering]


In [14]:
plot = create_map()

# lat / long are stored as strings upstream, but glyph coordinates must be
# numeric, so cast them here instead of relying on implicit coercion.
# Marker size grows sub-linearly with the number of valid votes
# ((votes / 50) ** 0.6) and is clipped to the range 3..30.
cds = ColumnDataSource(data=dict(
    lon=geoagg_ballots['long'].astype(float),
    lat=geoagg_ballots['lat'].astype(float),
    radius=np.clip(np.power(geoagg_ballots['valid_votes'] / 50, 0.6), 3, 30),
    fill=label_colors,  # colour of the location's KMeans cluster
))

plot.add_glyph(cds, Circle(x='lon', y='lat', size='radius', fill_color='fill', fill_alpha=0.6))

show(plot)

In [12]:
# Fit a 2-component PCA on the normalised party shares.
# (`PCA` is imported in the notebook's import cell.)
pca = PCA(n_components=2)
pca.fit(geoagg_ballots_norm[ALL_PARTIES])

# One row per component; entries follow the ALL_PARTIES column order.
pca.components_

array([[  2.80680715e-01,  -4.40645372e-02,  -8.98365079e-01,
         -4.13516367e-04,  -1.38171385e-04,   5.16054054e-05,
          6.83909046e-02,  -1.55625355e-05,  -1.01310961e-04,
         -4.13590306e-25,  -3.41122036e-03,   8.67191375e-02,
          3.93174062e-02,   2.85809802e-01,   3.27636053e-02,
          3.60864424e-04,  -2.49183897e-05,  -4.91889555e-04,
         -1.06294670e-02,   1.22553861e-01,  -1.99522162e-04,
         -2.38056482e-04,   1.37199705e-02,   1.18917468e-02,
          3.99760374e-04,   1.54338729e-02],
       [  7.71387712e-01,  -1.75279389e-01,   1.09901363e-01,
         -1.01222839e-04,  -1.82645139e-05,   9.94430264e-05,
         -1.71519428e-01,  -2.09586686e-04,   1.75398144e-05,
          0.00000000e+00,   5.98147048e-04,  -3.10608833e-02,
         -1.11897917e-01,  -4.48577267e-01,   2.03869068e-01,
          2.95109061e-05,  -1.14464643e-04,  -1.09449130e-04,
          1.47494016e-03,   1.56288560e-01,   6.24884578e-05,
         -6.27970166e-04,

In [22]:
# One line per principal component; x is the party's position in ALL_PARTIES.
p = bokeh.plotting.figure(
    title="PCA component loadings per party",
    x_axis_label="index into ALL_PARTIES",
    y_axis_label="loading",
)
cs = ["red", "blue"]
# Materialise the range so the glyph receives a plain list of x values.
party_positions = list(range(len(ALL_PARTIES)))

# No push_notebook() here: the figure has not been displayed yet, so there is
# nothing to update. It is rendered by the show(p) call that follows.
for c, color in zip(pca.components_, cs):
    p.line(x=party_positions, y=c.tolist(), color=color)

In [23]:
# Render the figure `p` holding the PCA component lines.
show(p)