### imports

In [1]:
# Standard library and IPython helpers (`display` renders frames from inside
# `if` blocks and loops, where a bare last-line expression is not available).
import os
from IPython.display import display

import pandas as pd
# Allow pandas to render up to 100 columns before it truncates wide frames.
pd.options.display.max_columns = 100

# qgrid grid helper; `sg` is a short alias for `show_grid`.
# NOTE(review): neither `qgrid` nor `sg` is used in the cells visible in this
# part of the notebook - confirm they are still needed.
import qgrid
from qgrid import show_grid as sg
# Grid defaults: do not force columns to fit the available width, use a
# default column width of 180 and show the toolbar.
qgrid.set_grid_option('forceFitColumns', False)
qgrid.set_grid_option('defaultColumnWidth', 180)
qgrid.set_defaults(show_toolbar=True)

### read the dataset from `.csv`

In [2]:
# Path of the input CSV, built relative to the notebook's working directory.
path_parts = ('..', 'data', 'output', 'Vybary_2015.csv')
fp = os.path.join(*path_parts)

# Load the whole file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=fp)

In [3]:
# Preview the frame that was just loaded from the CSV.
df

Unnamed: 0,region,district,district_type,H_n,H_%,K_n,K_%,L_n,L_%,U_n,U_%,electors,new_electors,ballots,voted_n,voted_%,early_voters,home_voters,pp_voters,against_all,spoilt
0,г. Менск,Завадскі,раён гораду,4428,3.64,9221,7.58,75931,62.42,3966,3.26,166138,465,121646,121639,73.22,48601,837,72201,27414,679
1,г. Менск,Ленінскі,раён гораду,4068,3.37,9952,8.24,77753,64.34,3451,2.86,165567,713,120995,120844,72.99,43996,2796,74052,25263,357
2,г. Менск,Маскоўскі,раён гораду,4753,3.21,9472,6.40,99879,67.50,3731,2.52,198655,376,148082,147972,74.49,60939,1791,85242,29289,848
3,г. Менск,Кастрычніцкі,раён гораду,3077,3.36,6569,7.17,61000,66.62,2365,2.58,124966,5,91631,91566,73.27,33855,962,56749,18165,390
4,г. Менск,Партызанскі,раён гораду,1870,3.79,3868,7.83,30542,61.85,1598,3.24,67502,109,49385,49379,73.15,19231,2164,27984,10914,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Рэспубліка,Гарадзенская вобласць. Агулам,Агулам,15691,2.27,32755,4.75,592490,85.89,7092,1.03,779638,2105,689841,689824,88.48,286439,64395,338990,34169,7627
157,Рэспубліка,Менская вобласць. Агулам,Агулам,32128,3.21,44155,4.41,858287,85.74,21763,2.17,1116445,2019,1001107,1001041,89.66,412680,97707,490654,37552,7156
158,Рэспубліка,Магілёўская вобласць. Агулам,Агулам,27173,3.66,25652,3.46,655396,88.30,7928,1.07,816030,1420,742575,742263,90.96,298268,59198,384797,19443,6671
159,Рэспубліка,г. Менск. Агулам,Агулам,32585,3.33,69134,7.07,642119,65.69,26628,2.72,1314164,3807,978800,977448,74.38,384285,20561,572602,199056,7926


In [4]:
# Column dtypes as inferred by `pd.read_csv` (no explicit dtypes were passed).
df.dtypes

region            object
district          object
district_type     object
H_n                int64
H_%              float64
K_n                int64
K_%              float64
L_n                int64
L_%              float64
U_n                int64
U_%              float64
electors           int64
new_electors       int64
ballots            int64
voted_n            int64
voted_%          float64
early_voters       int64
home_voters        int64
pp_voters          int64
against_all        int64
spoilt             int64
dtype: object

In [5]:
# The dataset must be complete: every cell of every column holds a value.
assert df.notna().all().all()

## data consistency checks

* check that `voted_n <= ballots`

In [6]:
# Select the rows in which `voted_n` exceeds `ballots`; there should be none.
exceeds_ballots = df['voted_n'].gt(df['ballots'])
wrong = df.loc[exceeds_ballots]
if len(wrong) != 0:
    # Show the offending rows before aborting the run.
    print('wrong records:')
    display(wrong)
    raise ValueError()

print('all is OK')

all is OK


* check that `ballots <= electors`

In [7]:
# Select the rows in which `ballots` exceeds `electors`; there should be none.
exceeds_electors = df['ballots'].gt(df['electors'])
wrong = df.loc[exceeds_electors]
if not wrong.empty:
    # Show the offending rows before aborting the run.
    print('wrong records:')
    display(wrong)
    raise ValueError()

print('all is OK')

all is OK


* check the total number of voters: `voted_n` must equal the sum of the votes for all candidates, `against_all` and `spoilt`

In [8]:
# Columns that together must add up to `voted_n`.
vote_cols = ['H_n', 'K_n', 'L_n', 'U_n', 'against_all', 'spoilt']

# Row-wise total, stored in a temporary `c_` column.
df['c_voted_n'] = df.loc[:, vote_cols].sum(axis='columns')

# Abort as soon as a single row disagrees with the reported number.
sum_differs = df['c_voted_n'].ne(df['voted_n'])
if sum_differs.any():
    raise ValueError()

print('all is OK')

all is OK


* check voter turnout: `voted_%` must equal `voted_n / electors * 100`, rounded to two decimals

In [9]:
# Turnout check: the reported `voted_%` must be `voted_n / electors * 100`
# rounded to two decimals.
turnout_exact = df['voted_n'] / df['electors'] * 100

# Rounded copy kept as a temporary `c_` column, like the other checks do.
df['c_voted_%'] = turnout_exact.round(2)

# A correctly rounded two-decimal value lies at most 0.005 away from the exact
# one.  Measuring that distance instead of testing two floats for equality
# keeps the check independent of binary representation noise and of the
# half-rounding convention used by the data source.  The small epsilon only
# absorbs floating-point error; a real mismatch differs by at least 0.01.
rounding_tol = 0.005 + 1e-9

# `~(... <= tol)` rather than `... > tol` so that NaN (e.g. zero electors) is
# reported as wrong instead of slipping through.
wrong = df[~((turnout_exact - df['voted_%']).abs() <= rounding_tol)]
if wrong.shape[0] > 0:
    print(f'found {wrong.shape[0]} records with inconsistent voted_%:')
    display(wrong)
    raise ValueError('reported voted_% does not match voted_n / electors * 100')

print('all is OK')

all is OK


* check the percentage of votes for each candidate (the candidate's votes relative to `voted_n`)

In [10]:
# A correctly rounded two-decimal percentage lies at most 0.005 away from the
# exact value; the epsilon only absorbs floating-point error.  Comparing
# distances avoids testing floats for exact equality.
share_tol = 0.005 + 1e-9

for candidate in ['H', 'K', 'L', 'U']:
    # Share of the candidate's votes among all voters, in percent.
    share_exact = df[f'{candidate}_n'] / df['voted_n'] * 100

    # Rounded copy kept as a temporary `c_` column so it shows up next to the
    # reported value when offending rows are displayed.
    df[f'c_{candidate}_%'] = share_exact.round(2)

    # `~(... <= tol)` so that NaN (e.g. `voted_n == 0`) is reported as wrong.
    wrong = df[~((share_exact - df[f'{candidate}_%']).abs() <= share_tol)]
    if wrong.shape[0] > 0:
        # Report and keep going so that every candidate gets checked.
        print(f'found {wrong.shape[0]} wrong records for candidate: {candidate}')
        display(wrong)
    else:
        print(f'\nall is OK for {candidate}\n')


all is OK for H


all is OK for K


all is OK for L


all is OK for U



* check that `voted electors = early voters + voted at home + voted at polling place`

In [11]:
# Voters counted per channel (early, at home, at the polling place) must add
# up to `voted_n`.  The sum is stored in a temporary `c_` column.
df['c_voted_n_2'] = df['early_voters'] + df['home_voters'] + df['pp_voters']

wrong = df[df['c_voted_n_2'] != df['voted_n']]
if wrong.shape[0] > 0:
    print(f'found {wrong.shape[0]} wrong records')
    display(wrong)
    # Stop here, like the other consistency checks do; without this the cell
    # would go on to print 'all is OK' right after listing the wrong records.
    raise ValueError('voted_n != early_voters + home_voters + pp_voters')

print('all is OK')

all is OK


* drop temporary columns

In [12]:
# The consistency checks stored their intermediate results in columns whose
# names start with `c_`; collect those names and remove the columns again.
tmp_cols = list(filter(lambda name: name.startswith('c_'), df.columns))
print(f'dropping columns: {tmp_cols}')
df.drop(columns=tmp_cols, inplace=True)

dropping columns: ['c_voted_n', 'c_voted_%', 'c_H_%', 'c_K_%', 'c_L_%', 'c_U_%', 'c_voted_n_2']


* check that the total row at the bottom of each region sheet contains correctly aggregated values

In [13]:
# Additive columns: the value in a total row must equal the sum over the
# rows it aggregates.
cols_to_check_sum = [
    'H_n', 'K_n', 'L_n', 'U_n',
    'electors', 'new_electors', 'ballots', 'voted_n', 
    'early_voters', 'home_voters', 'pp_voters',
    'against_all', 'spoilt'
]

# Regions that carry their own total row ("Агулам" in `district_type`).
# "Рэспубліка" is handled separately below.
regions = [
    'г. Менск', 'Менская вобласць', 'Брэсцкая вобласць',
    'Гарадзенская вобласць', 'Віцебская вобласць',
    'Магілёўская вобласць', 'Гомельская вобласць'
]


def _mismatched_total_columns(values, totals, columns, label):
    """Return the names of `columns` whose sum over `values` differs from `totals`.

    Parameters
    ----------
    values : DataFrame with the rows that are aggregated.
    totals : DataFrame expected to hold exactly one row with the reported totals.
    columns : list of column names to compare.
    label : name used in the error message.

    Raises
    ------
    AssertionError if `totals` does not contain exactly one row, because then
    there is nothing unambiguous to compare against.
    """
    assert len(totals) == 1, (
        f'expected exactly one total row for "{label}", found {len(totals)}'
    )
    computed = values[columns].sum()
    # Take the single total row as a Series so that both sides are Series
    # indexed by column name.  Comparing a Series with a one-row DataFrame
    # yields a DataFrame, and the built-in `all()` over a DataFrame iterates
    # its column labels (always truthy) - such a check can never fail.
    reported = totals[columns].iloc[0]
    differs = computed.ne(reported)
    return differs[differs].index.tolist()


for r in regions:
    sub = df.query(f'region == "{r}"')
    sub_values = sub.query('district_type != "Агулам"')
    sub_agg = sub.query('district_type == "Агулам"')
    bad_cols = _mismatched_total_columns(sub_values, sub_agg, cols_to_check_sum, r)
    res = not bad_cols
    assert res, f'wrong total sum found for region "{r}" in columns {bad_cols}'

# check republic separately cause these records have
# different value in district
republic_values = df.query('region == "Рэспубліка" and district != "Рэспубліка. Агулам"')
republic_agg = df.query('region == "Рэспубліка" and district == "Рэспубліка. Агулам"')
bad_cols = _mismatched_total_columns(
    republic_values, republic_agg, cols_to_check_sum, 'Рэспубліка'
)
check = not bad_cols
assert check, f'wrong total sum for "Рэспубліка" in columns {bad_cols}'

print('all is OK')

all is OK


* check that the total values from each region sheet are the same as the values in the sheet with republican stats

In [14]:
# Keep every total row ("Агулам") except the one whose district is
# "Рэспубліка. Агулам", then list them ordered by district name.
is_total = df['district_type'].eq('Агулам')
is_republic_total = df['district'].eq('Рэспубліка. Агулам')
sub = df[is_total & ~is_republic_total]
sub.sort_values(by='district')

Unnamed: 0,region,district,district_type,H_n,H_%,K_n,K_%,L_n,L_%,U_n,U_%,electors,new_electors,ballots,voted_n,voted_%,early_voters,home_voters,pp_voters,against_all,spoilt
54,Брэсцкая вобласць,Брэсцкая вобласць. Агулам,Агулам,21299,2.35,48305,5.32,782738,86.22,10720,1.18,1004274,4452,908360,907789,90.39,379432,55833,472524,38407,6320
153,Рэспубліка,Брэсцкая вобласць. Агулам,Агулам,21299,2.35,48305,5.32,782738,86.22,10720,1.18,1004274,4452,908360,907789,90.39,379432,55833,472524,38407,6320
100,Віцебская вобласць,Віцебская вобласць. Агулам,Агулам,30939,3.87,28799,3.61,697035,87.28,11171,1.4,876896,2767,798712,798659,91.08,345591,91175,361893,25935,4780
154,Рэспубліка,Віцебская вобласць. Агулам,Агулам,30939,3.87,28799,3.61,697035,87.28,11171,1.4,876896,2767,798712,798659,91.08,345591,91175,361893,25935,4780
74,Гарадзенская вобласць,Гарадзенская вобласць. Агулам,Агулам,15691,2.27,32755,4.75,592490,85.89,7092,1.03,779638,2105,689841,689824,88.48,286439,64395,338990,34169,7627
156,Рэспубліка,Гарадзенская вобласць. Агулам,Агулам,15691,2.27,32755,4.75,592490,85.89,7092,1.03,779638,2105,689841,689824,88.48,286439,64395,338990,34169,7627
152,Гомельская вобласць,Гомельская вобласць. Агулам,Агулам,42130,4.23,22626,2.27,874413,87.79,16829,1.69,1101235,182,996295,995989,90.44,417286,47904,530799,31663,8328
155,Рэспубліка,Гомельская вобласць. Агулам,Агулам,42130,4.23,22626,2.27,874413,87.79,16829,1.69,1101235,182,996295,995989,90.44,417286,47904,530799,31663,8328
126,Магілёўская вобласць,Магілёўская вобласць. Агулам,Агулам,27173,3.66,25652,3.46,655396,88.3,7928,1.07,816030,1420,742575,742263,90.96,298268,59198,384797,19443,6671
158,Рэспубліка,Магілёўская вобласць. Агулам,Агулам,27173,3.66,25652,3.46,655396,88.3,7928,1.07,816030,1420,742575,742263,90.96,298268,59198,384797,19443,6671


In [15]:
# Each regional total is expected twice in `sub`: once under its own region
# and once under "Рэспубліка".  Verify the pairing first - a district that
# occurs only once would pass the equality test below trivially.
pair_sizes = sub.groupby('district').size()
unpaired = pair_sizes[pair_sizes != 2]
if not unpaired.empty:
    print('totals that do not appear exactly twice:')
    display(unpaired)
    raise ValueError('every regional total must appear exactly twice')

# Apart from `region`, both rows of a pair must be identical, i.e. every
# remaining column holds exactly one distinct value within the group.
distinct_counts = sub.drop(columns='region').groupby('district').nunique()
res = (distinct_counts == 1).all(axis=1)
if not res.all():
    print('totals that differ between the two sheets:')
    display(sub[sub['district'].isin(res[~res].index)].sort_values('district'))
    raise ValueError('regional totals differ between region and republic sheets')

print('all is OK')

all is OK
