# ¿Es posible predecir la 2da vuelta de 2014?

In [1]:
import pandas as pd

import os

In [2]:
pd.set_option('future.no_silent_downcasting', True)

In [3]:
# Data locations, relative to the notebook's working directory.
DATA_PATH = os.path.join(os.pardir, 'data')
PRIMARY_PATH = os.path.join(DATA_PATH, 'primary')

# Election year whose runoff the models try to predict.
YEAR_TO_PREDICT = 2014

In [4]:
# Political leaning assigned to every candidate name found in the data set.
# Both spellings of "Sara Larraín"/"Sara Larráin" are listed on purpose.
# NOTE(review): a candidate missing from this mapping gets NaN in
# `inclinacion`; confirm that every candidate in the parquet file is covered.
support = {
    'Michelle Bachelet': 'izquierda',
    'Eduardo Frei': 'izquierda',
    'Sebastián Piñera': 'derecha',
    'Patricio Aylwin': 'izquierda',
    'Joaquín Lavín': 'derecha',
    'Ricardo Lagos': 'izquierda',
    'Evelyn Matthei': 'derecha',
    'Hernán Büchi': 'derecha',
    'Arturo Alessandri': 'derecha',
    'Francisco J. Errázuriz': 'derecha',
    'Marco Enríquez - Ominami': 'izquierda',
    'Tomás Hirsch': 'izquierda',
    'José Piñera': 'derecha',
    'Manfred Max Neef': 'izquierda',
    'Franco Parisi': 'derecha',
    'Alfredo Sfeir': 'izquierda',
    'Arturo Frei': 'derecha',
    'Cristian Reitze': 'izquierda',
    'Eugenio Pizarro': 'izquierda',
    'Gladys Marín': 'izquierda',
    'Jorge Arrate': 'izquierda',
    'Marcel Claude': 'izquierda',
    'Ricardo Israel': 'derecha',
    'Roxana Miranda': 'izquierda',
    'Sara Larraín': 'izquierda',
    'Tomás Jocelyn - Holt': 'derecha',
    'Sara Larráin': 'izquierda',
}

# Load the presidential results and derive the helper columns in one chain:
#   * region       -> upper-cased
#   * electoral_id -> "<REGION> - <electoral>" key for each electoral unit
#   * candidato    -> surrounding whitespace removed
#   * inclinacion  -> leaning looked up from `support`
# Rows are ordered by electoral unit, highest vote share first.
df = (
    pd.read_parquet(os.path.join(PRIMARY_PATH, 'pres-1990-2014.parquet'))
    .assign(
        region=lambda frame: frame['region'].str.upper(),
        electoral_id=lambda frame: frame['region'] + ' - ' + frame['electoral'],
        candidato=lambda frame: frame['candidato'].str.strip(),
        inclinacion=lambda frame: frame['candidato'].map(support),
    )
    .sort_values(['electoral_id', 'porcentaje'], ascending=[True, False])
)
df


Unnamed: 0,eleccion,tipo,region,electoral,candidato,votos,porcentaje,electoral_id,inclinacion
21336,2014,2,AISEN,AISEN,Michelle Bachelet,4283,0.702938,AISEN - AISEN,izquierda
1319,1990,0,AISEN,AISEN,Patricio Aylwin,4607,0.625356,AISEN - AISEN,izquierda
11660,2006,2,AISEN,AISEN,Michelle Bachelet,4294,0.569270,AISEN - AISEN,izquierda
4154,1994,0,AISEN,AISEN,Eduardo Frei,4373,0.556857,AISEN - AISEN,izquierda
15044,2010,2,AISEN,AISEN,Sebastián Piñera,4328,0.555370,AISEN - AISEN,derecha
...,...,...,...,...,...,...,...,...,...
16241,2014,1,VALPARAISO,ZAPALLAR,Ricardo Israel,23,0.008008,VALPARAISO - ZAPALLAR,derecha
5022,2000,1,VALPARAISO,ZAPALLAR,Tomás Hirsch,12,0.005362,VALPARAISO - ZAPALLAR,izquierda
5020,2000,1,VALPARAISO,ZAPALLAR,Sara Larráin,10,0.004468,VALPARAISO - ZAPALLAR,izquierda
5019,2000,1,VALPARAISO,ZAPALLAR,Arturo Frei,7,0.003128,VALPARAISO - ZAPALLAR,derecha


## Modelos Básicos

Cada modelo debe responder estas 3 preguntas:
- ¿Quién ganará la 2da vuelta?
- ¿Cuántas personas votarán por cada candidato?
- ¿Qué porcentaje de votos obtendrá cada candidato?

Según 3 niveles de desagregación:

- Circunscripción Electoral.
- Región.
- Chile.

In [5]:
# Predictions of every model, keyed by model name.
results = {}

# Train: every election before the target year plus the target year's rows
# with tipo == 1. Test: the target year's rows with tipo == 2.
is_target_year = df['eleccion'] == YEAR_TO_PREDICT
is_earlier_year = df['eleccion'] < YEAR_TO_PREDICT
train = df[is_earlier_year | (is_target_year & (df['tipo'] == 1))]
test = df[is_target_year & (df['tipo'] == 2)]

### Modelo: Simple

En primera vuelta hay muchos candidatos, pero en segunda vuelta sólo quedan dos. Por lo tanto, el modelo más simple es asumir que los votos de los candidatos de primera vuelta se suman a los dos candidatos de segunda vuelta.

Los votos de los candidatos de primera vuelta se transfieren al candidato de segunda vuelta que tenga la misma inclinación política.

Se asume que en la segunda vuelta votará la misma cantidad de personas que en la primera vuelta.

In [6]:
model = 'Simple'

# Rows of the target election with tipo == 1 (the first round, per the
# notebook's narrative); these are the only input to this model.
last_year = test.eleccion.max()
votes = train.query('eleccion == @last_year and tipo == 1')

# Pool the votes by political leaning inside every electoral unit.
# `votos` is selected *before* aggregating so that only that column is summed
# (the previous form summed every column, text columns included, and then
# threw the rest away).
result = (
    votes
    .groupby(['region', 'electoral_id', 'inclinacion'])['votos']
    .sum()
    .sort_index()
    .to_frame()
)

# Share of each leaning within its electoral unit. `transform('sum')` returns
# the unit total aligned row by row, so no index juggling is required.
result['porcentaje'] = (
    result['votos'] / result.groupby(level='electoral_id')['votos'].transform('sum')
)

results[model] = result
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,votos,porcentaje
region,electoral_id,inclinacion,Unnamed: 3_level_1,Unnamed: 4_level_1
AISEN,AISEN - AISEN,derecha,2707,0.372096
AISEN,AISEN - AISEN,izquierda,4568,0.627904
AISEN,AISEN - BAHIA MURTA,derecha,131,0.617925
AISEN,AISEN - BAHIA MURTA,izquierda,81,0.382075
AISEN,AISEN - BALMACEDA,derecha,93,0.394068
...,...,...,...,...
VALPARAISO,VALPARAISO - SAUSALITO,izquierda,19699,0.452175
VALPARAISO,VALPARAISO - VILLA ALEMANA,derecha,17440,0.424506
VALPARAISO,VALPARAISO - VILLA ALEMANA,izquierda,23643,0.575494
VALPARAISO,VALPARAISO - ZAPALLAR,derecha,1504,0.523677


## Resultados

- **q1:** ¿Quién ganará la 2da vuelta?
- **q2:** ¿Cuántas personas votarán por cada candidato?
- **q3:** ¿Qué porcentaje de votos obtendrá cada candidato?

---

- **ce:** Circunscripción Electoral.
- **r:** Región.
- **ch:** Chile.

**Ejemplo:**

- `q2_ce`: ¿Cuántas personas votarán por cada candidato en cada Circunscripción Electoral?

In [7]:
def _answer_questions(votes, label):
    """Answer the three questions at the three aggregation levels.

    Parameters
    ----------
    votes : pandas.DataFrame
        Vote table exposing ``region``, ``electoral_id`` and ``inclinacion``
        (as columns or as index levels) plus ``votos`` and ``porcentaje``
        columns.
    label : str
        Name under which the answers are filed ('real' or a model name).

    Returns
    -------
    dict
        Maps ``q{1,2,3}_{ce,r,ch}`` to the corresponding answer.
        ``q1_*`` and ``*_ch`` entries are frames with a single ``label``
        column; ``q2``/``q3`` entries for ``ce`` and ``r`` have one row per
        (model, inclinacion) pair and one column per electoral unit / region.
    """
    def _tag_rows(frame):
        # Prefix the leaning index with the model label.
        frame.index = pd.MultiIndex.from_product(
            [[label], frame.index], names=['model', 'inclinacion']
        )
        return frame

    votes_ce = votes.groupby(['electoral_id', 'inclinacion'])['votos'].sum()
    votes_r = votes.groupby(['region', 'inclinacion'])['votos'].sum()
    votes_ch = votes.groupby(['inclinacion'])['votos'].sum()

    share_ce = votes.groupby(['electoral_id', 'inclinacion'])['porcentaje'].sum()
    # Regional and national shares are recomputed from vote counts.
    share_r = votes_r / votes.groupby(['region'])['votos'].sum()
    share_ch = votes_ch / votes['votos'].sum()

    return {
        # q1: who wins the runoff?
        'q1_ce': votes_ce.unstack().idxmax(axis=1).to_frame(label),
        'q1_r': votes_r.unstack().idxmax(axis=1).to_frame(label),
        'q1_ch': pd.DataFrame({label: {'inclinacion': votes_ch.idxmax()}}),
        # q2: how many people vote for each candidate?
        'q2_ce': _tag_rows(votes_ce.unstack().T),
        'q2_r': _tag_rows(votes_r.unstack().T),
        'q2_ch': votes_ch.to_frame(label),
        # q3: what share of the votes does each candidate obtain?
        'q3_ce': _tag_rows(share_ce.unstack().T),
        'q3_r': _tag_rows(share_r.unstack().T),
        'q3_ch': share_ch.to_frame(label),
    }


test_results = test[['region', 'electoral_id', 'inclinacion', 'votos', 'porcentaje']].copy()

# Answers that are combined by joining a new column per model ...
joined_keys = ('q1_ce', 'q1_r', 'q1_ch', 'q2_ch', 'q3_ch')
# ... and answers that are combined by stacking (model, inclinacion) rows.
stacked_keys = ('q2_ce', 'q2_r', 'q3_ce', 'q3_r')

# Ground truth first; it is stored under the 'real' label.
df_results = _answer_questions(test_results, 'real')
for key in stacked_keys:
    df_results[key] = df_results[key].T

# Then append the answers of every model.
for model, result in results.items():
    answers = _answer_questions(result, model.lower())
    for key in joined_keys:
        # q3_ch is joined like the other Chile-level frames; transposing it
        # inside this loop (as before) only worked with exactly one model.
        df_results[key] = df_results[key].join(answers[key])
    for key in stacked_keys:
        df_results[key] = pd.concat([df_results[key].T, answers[key]]).T

# Chile-level q2/q3: one row ('CHILE') and (model, inclinacion) columns.
for i in range(2, 4):
    df_results[f'q{i}_ch'] = (
        df_results[f'q{i}_ch']
        .rename_axis('inclinacion')
        .reset_index()
        .assign(pais='CHILE')
        .set_index('pais')
        .pivot(columns='inclinacion')
    )
    df_results[f'q{i}_ch'].columns.names = ['model', 'inclinacion']

# Chile-level q1 only needs the 'pais' index (q2_ch already got it above).
df_results['q1_ch'] = df_results['q1_ch'].assign(pais='CHILE').set_index('pais')

# q1: turn the winner label into one boolean column per (model, inclinacion).
for q, level in zip(['q1_ce', 'q1_r', 'q1_ch'], ['electoral_id', 'region', 'pais']):
    df_results[q] = (
        df_results[q]
        .stack()
        .reset_index(name='inclinacion')
        .rename(columns={'level_1': 'model'})
        .assign(
            izquierda=lambda x: x['inclinacion'] == 'izquierda',
            derecha=lambda x: x['inclinacion'] == 'derecha',
        )
        .melt(
            id_vars=['model', level], value_vars=['izquierda', 'derecha'],
            var_name='inclinacion', value_name='value'
        )
        .pivot(index=['model', 'inclinacion'], columns=level, values='value')
        .fillna(False)
        .astype(bool)
        .T
    )

## Resultados por Área

In [8]:
# One wide table per aggregation level, with (question, model, inclinacion)
# columns.
area_results = {}
for area in ('ce', 'r', 'ch'):
    questions = [f'q{i}' for i in range(1, 4)]
    combined = pd.concat(
        [df_results[f'{question}_{area}'] for question in questions],
        axis=1,
        keys=questions,
        names=[area, 'model', 'inclinacion'],
    )
    # Normalise dtypes: q1 is a flag, q2 a vote count, q3 a percentage (0-100).
    combined['q1'] = combined['q1'].astype(bool)
    combined['q2'] = combined['q2'].astype(float)
    combined['q3'] = combined['q3'] * 100
    area_results[area] = combined
    display(combined)

ce,q1,q1,q1,q1,q2,q2,q2,q2,q3,q3,q3,q3
model,real,real,simple,simple,real,real,simple,simple,real,real,simple,simple
inclinacion,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda
electoral_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
AISEN - AISEN,False,True,False,True,1810.0,4283.0,2707.0,4568.0,29.706220,70.293780,37.209622,62.790378
AISEN - BAHIA MURTA,True,False,True,False,121.0,77.0,131.0,81.0,61.111111,38.888889,61.792453,38.207547
AISEN - BALMACEDA,False,True,False,True,60.0,133.0,93.0,143.0,31.088083,68.911917,39.406780,60.593220
AISEN - CHILE CHICO,False,True,False,True,354.0,1067.0,478.0,1197.0,24.912034,75.087966,28.537313,71.462687
AISEN - CISNES,False,True,False,True,245.0,527.0,324.0,601.0,31.735751,68.264249,35.027027,64.972973
...,...,...,...,...,...,...,...,...,...,...,...,...
VALPARAISO - SANTO DOMINGO,True,False,False,True,2988.0,2958.0,3078.0,3567.0,50.252270,49.747730,46.320542,53.679458
VALPARAISO - SAUSALITO,True,False,True,False,22741.0,13621.0,23866.0,19699.0,62.540564,37.459436,54.782509,45.217491
VALPARAISO - VILLA ALEMANA,False,True,False,True,15739.0,18078.0,17440.0,23643.0,46.541680,53.458320,42.450649,57.549351
VALPARAISO - ZAPALLAR,True,False,True,False,1500.0,1069.0,1504.0,1368.0,58.388478,41.611522,52.367688,47.632312


r,q1,q1,q1,q1,q2,q2,q2,q2,q3,q3,q3,q3
model,real,real,simple,simple,real,real,simple,simple,real,real,simple,simple
inclinacion,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda
region,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
AISEN,False,True,False,True,10638.0,20924.0,13788.0,23338.0,33.705088,66.294912,37.138394,62.861606
ANTOFAGASTA,False,True,False,True,47071.0,81241.0,72256.0,102090.0,36.6848,63.3152,41.444025,58.555975
ARAUCANIA,False,True,False,True,138027.0,196492.0,151918.0,229635.0,41.261333,58.738667,39.815701,60.184299
ARICA,False,True,False,True,21430.0,32984.0,30103.0,40014.0,39.383247,60.616753,42.932527,57.067473
ATACAMA,False,True,False,True,24046.0,55119.0,31468.0,66146.0,30.374534,69.625466,32.237179,67.762821
BIOBIO,False,True,False,True,254487.0,464898.0,302815.0,533664.0,35.375633,64.624367,36.201148,63.798852
COQUIMBO,False,True,False,True,64948.0,155340.0,78203.0,176869.0,29.483222,70.516778,30.659186,69.340814
LAGOS,False,True,False,True,94981.0,172232.0,106622.0,200355.0,35.545052,64.454948,34.732895,65.267105
MAGALLANES,False,True,False,True,15092.0,34001.0,19767.0,42042.0,30.741654,69.258346,31.980779,68.019221
MAULE,False,True,False,True,122989.0,257831.0,139608.0,290232.0,32.295835,67.704165,32.479062,67.520938


ch,q1,q1,q1,q1,q2,q2,q2,q2,q3,q3,q3,q3
model,real,real,simple,simple,real,real,simple,simple,real,real,simple,simple
inclinacion,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda,derecha,izquierda
pais,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
CHILE,False,True,False,True,2111891.0,3470379.0,2364834.0,4220974.0,37.832118,62.167882,35.908031,64.091969


## Métricas

In [9]:
metrics = dict()
pd.set_option('display.float_format', '{:.4f}'.format)
for area in ['ce', 'r', 'ch']:
    a_df = area_results[area].copy()

    # Per-question metric frames are kept in a plain dict instead of being
    # written to and read back from locals().
    per_question = dict()
    for question in ['q1', 'q2', 'q3']:
        answers = a_df[question]
        truth = answers['real']
        # Each model label appears once per leaning in the columns; keep
        # one entry per model, in order of first appearance.
        models = answers.columns.get_level_values(0).unique()
        if question == 'q1':
            # Percentage of leaning flags that match the real outcome.
            per_model = {
                model: (100*(truth == answers[model]).mean(axis=1)).to_frame('accuracy')
                for model in models
            }
        else:
            # Mean absolute error across the two leanings.
            per_model = {
                model: (truth-answers[model]).abs().mean(axis=1).to_frame('mae')
                for model in models
            }
        per_question[question] = pd.concat(per_model, axis=1)

    metrics[area] = pd.concat(per_question, axis=1, keys=['q1', 'q2', 'q3'])
    display(metrics[area])

Unnamed: 0_level_0,q1,q1,q2,q2,q3,q3
Unnamed: 0_level_1,real,simple,real,simple,real,simple
Unnamed: 0_level_2,accuracy,accuracy,mae,mae,mae,mae
electoral_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
AISEN - AISEN,100.0000,100.0000,0.0000,591.0000,0.0000,7.5034
AISEN - BAHIA MURTA,100.0000,100.0000,0.0000,7.0000,0.0000,0.6813
AISEN - BALMACEDA,100.0000,100.0000,0.0000,21.5000,0.0000,8.3187
AISEN - CHILE CHICO,100.0000,100.0000,0.0000,127.0000,0.0000,3.6253
AISEN - CISNES,100.0000,100.0000,0.0000,76.5000,0.0000,3.2913
...,...,...,...,...,...,...
VALPARAISO - SANTO DOMINGO,100.0000,0.0000,0.0000,349.5000,0.0000,3.9317
VALPARAISO - SAUSALITO,100.0000,100.0000,0.0000,3601.5000,0.0000,7.7581
VALPARAISO - VILLA ALEMANA,100.0000,100.0000,0.0000,3633.0000,0.0000,4.0910
VALPARAISO - ZAPALLAR,100.0000,100.0000,0.0000,151.5000,0.0000,6.0208


Unnamed: 0_level_0,q1,q1,q2,q2,q3,q3
Unnamed: 0_level_1,real,simple,real,simple,real,simple
Unnamed: 0_level_2,accuracy,accuracy,mae,mae,mae,mae
region,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
AISEN,100.0,100.0,0.0,2782.0,0.0,3.4333
ANTOFAGASTA,100.0,100.0,0.0,23017.0,0.0,4.7592
ARAUCANIA,100.0,100.0,0.0,23517.0,0.0,1.4456
ARICA,100.0,100.0,0.0,7851.5,0.0,3.5493
ATACAMA,100.0,100.0,0.0,9224.5,0.0,1.8626
BIOBIO,100.0,100.0,0.0,58547.0,0.0,0.8255
COQUIMBO,100.0,100.0,0.0,17392.0,0.0,1.176
LAGOS,100.0,100.0,0.0,19882.0,0.0,0.8122
MAGALLANES,100.0,100.0,0.0,6358.0,0.0,1.2391
MAULE,100.0,100.0,0.0,24510.0,0.0,0.1832


Unnamed: 0_level_0,q1,q1,q2,q2,q3,q3
Unnamed: 0_level_1,real,simple,real,simple,real,simple
Unnamed: 0_level_2,accuracy,accuracy,mae,mae,mae,mae
pais,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
CHILE,100.0,100.0,0.0,501769.0,0.0,1.9241


## Métricas Totales

In [10]:
# Average every metric over the rows of each aggregation level.
total_metrics = {}
for area in ('ce', 'r', 'ch'):
    summary = metrics[area].mean().reset_index()
    summary.columns = ['question', 'model', 'metric', 'value']
    summary = summary.set_index(['question', 'model']).sort_index()
    # Label the column axis with the aggregation level so the rendered table
    # identifies itself.
    summary.columns.name = area
    total_metrics[area] = summary
    display(summary)

Unnamed: 0_level_0,ce,metric,value
question,model,Unnamed: 2_level_1,Unnamed: 3_level_1
q1,real,accuracy,100.0
q1,simple,accuracy,96.5969
q2,real,mae,0.0
q2,simple,mae,902.4895
q3,real,mae,0.0
q3,simple,mae,2.7056


Unnamed: 0_level_0,r,metric,value
question,model,Unnamed: 2_level_1,Unnamed: 3_level_1
q1,real,accuracy,100.0
q1,simple,accuracy,100.0
q2,real,mae,0.0
q2,simple,mae,33451.2667
q3,real,mae,0.0
q3,simple,mae,1.7774


Unnamed: 0_level_0,ch,metric,value
question,model,Unnamed: 2_level_1,Unnamed: 3_level_1
q1,real,accuracy,100.0
q1,simple,accuracy,100.0
q2,real,mae,0.0
q2,simple,mae,501769.0
q3,real,mae,0.0
q3,simple,mae,1.9241
