# Zimbabwe Presidential Elections Data Analysis

## 1-Import libraries

In [249]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [250]:
# change into the right folder
#cd zec_pres/

In [251]:
# check to see files
#ls

## 2-Function to Convert Excel Files to Dataframes

In [252]:
def excelToDF(xlsfile, cols):
    """ Takes an Excel File from the ZEC Election results for Zimbabwe presidential elections (one file for each province)
        Processes one file and returns two files, one with raw disaggregated data at polling station level
        And another with aggregated data at constituency and other levels that were preexisting in the
        original files
    """
    # import the provincial results file
    prov = pd.ExcelFile(xlsfile)
    #prov.sheet_names - shows that the first sheetname is the one of interest
    df = prov.parse(sheetname=prov.sheet_names[0])
    df = df.iloc[:,:len(cols)].copy()
    
    # change column names to correct ones, locate row with proper column names first
    column_index = np.where(df.iloc[:,0] == 'DISTRICT')[0][0]
    df.columns = cols

    # get propoer columns and remove null columns
    cols = [c for c in df.columns if type(c)==type('c') ]

    # make sure columns names have no spaces of or punctuation
    df = df.loc[:,cols].copy()
    df.columns = [c.lower().strip().replace(" ","_").replace(".","") for c in cols]
    
    # remove first few columns which are useless
    #column_index = np.where(df.iloc[:,0] == 'DISTRICT')[0][0]
    df = df[column_index+2:].copy()
    
    # get name of province
    namelist = xlsfile.split(" ")
    remove = ['July' ,'2018', 'Harmonised', 'Presidential', 'Election', 'Results.xlsx']
    name =" ".join([n for n in namelist if n not in remove])
     
    print("Processing dfs for {} province".format(name))
    
    # find the rows relevant for raw, disaggregated polling station data
    raw = (df.district.notnull() & df.constituency.notnull() & 
                df.local_authority.notnull() & df.ward_no.notnull() & df.polling_stations.notnull())

    # find the rows relevant for aggregated data
    agg = (df.district.isnull() & df.constituency.isnull() & 
                df.local_authority.isnull() & df.polling_stations.notnull()
          & df.station_code.isnull())
    
    # Make the raw and aggregated dfs
    draw, dagg = df[raw], df[agg]
   
    # make column for the province and add to the dfs
    draw['province'] = name
    dagg['province'] = name
   
    return draw, dagg 

## 3- Define a set of standard colums

In [253]:
# Get columns names to apply to all dfs
xlsfile = 'Bulawayo Metropolitan Province 2018 Harmonised Presidential Election Results.xlsx'
# import the provincial results file
prov = pd.ExcelFile(xlsfile)
#prov.sheet_names - shows that the first sheetname is the one of interest
df = prov.parse(sheetname=prov.sheet_names[0])

# change column names to correct ones
column_index = np.where(df.iloc[:,0] == 'DISTRICT')[0][0]
df.columns = df.loc[column_index]

# get propoer columns and remove null columns
cols = [c for c in df.columns if type(c)==type('c') ]
cols

['DISTRICT',
 'CONSTITUENCY ',
 'LOCAL AUTHORITY',
 'WARD NO.',
 'POLLING STATIONS',
 'STATION CODE',
 'Busha Joseph Makamba FreeZim Congress',
 'Chamisa Nelson MDC Alliance',
 'Chikanga  Everisto Washington Rebuild Zimbabwe',
 'Dzapasi Melbah # 1980 Freedom Movement Zimbabwe',
 'Gava Peter Mapfumo UDF',
 'Hlabangana Kwanele RPZ',
 'Kasiyamhuru Blessing ZPP',
 'Khupe Thokozani MDC-T',
 'Madhuku Lovemore NCA',
 'Mangoma Elton Steers Coalition of Democrats',
 'Manyika Noah Ngoni BZA',
 'Mapfumo Chiguvare Tonderayi Johannes Timothy PPPZ',
 'MARIYACHA Violet UDM',
 'Mhambi-Hove Divine NAPDR',
 'Mnangagwa Emmerson Dambudzo ZANU PF',
 'Moyo Donald Nkosana APA',
 'Mteki Bryn Taurai Independent',
 'Mugadza Willard Tawonezvi BCP',
 'Mujuru Joice Teurai Ropa PRC',
 'Munyanduri Tenda Peter NPF',
 'MutinhirI Ambrose NPF',
 'Shumba Kuzozvirava Doniel UDA',
 'Wilson Peter Harry DOP',
 'Total Votes Rejected ',
 'Ballot Paper Unaccounted for',
 'Total Votes Cast',
 'Total Valid Votes Cast']

## 4-Convert all Excel Files to DataFrames

In [254]:
# make sure the folder only contains the excel files of interest
# make a list of the filename to loop through later
provincial_filenames = os.listdir()

In [255]:
provincial_filenames

['Bulawayo Metropolitan Province 2018 Harmonised Presidential Election Results.xlsx',
 'Harare Metropolitan Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Manicaland Province Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Mashonaland Central Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Mashonaland East Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Mashonaland West  Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Masvingo Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Matabeleland North Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Matabeleland South Province July 2018 Harmonised Presidential Election Results.xlsx',
 'Midlands Province July 2018 Harmonised Presidential Election Results.xlsx']

In [256]:
# get lists of dfs, each containing an aggregated and disaggregated dataframe
# each list index represents a province
raw_dfs = [excelToDF(fn, cols)[0] for fn in provincial_filenames]
agg_dfs = [excelToDF(fn, cols)[1] for fn in provincial_filenames]

Processing dfs for Bulawayo Metropolitan Province province


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Processing dfs for Harare Metropolitan Province province
Processing dfs for Manicaland Province Province province
Processing dfs for Mashonaland Central Province province
Processing dfs for Mashonaland East Province province
Processing dfs for Mashonaland West  Province province
Processing dfs for Masvingo Province province
Processing dfs for Matabeleland North Province province
Processing dfs for Matabeleland South Province province
Processing dfs for Midlands Province province
Processing dfs for Bulawayo Metropolitan Province province
Processing dfs for Harare Metropolitan Province province
Processing dfs for Manicaland Province Province province
Processing dfs for Mashonaland Central Province province
Processing dfs for Mashonaland East Province province
Processing dfs for Mashonaland West  Province province
Processing dfs for Masvingo Province province
Processing dfs for Matabeleland North Province province
Processing dfs for Matabeleland South Province province
Processing dfs for 

## 5- Cancatenate DataFrames and Transfer to Excel for Analysis

In [259]:
# Concatenate
rconcat = pd.concat(raw_dfs)
aconcat = pd.concat(agg_dfs)

In [270]:
rconcat = rconcat.reset_index().drop(['index'], axis=1)
aconcat = aconcat.reset_index().drop(['index'], axis=1)

In [271]:
# Convert and save to Excel
rconcat.to_excel("zimelectionresults2018_bypollingstation.xlsx")
aconcat.to_excel("zimelectionresults2018_otheraggregates.xlsx")

## Random Code Snipets to Test

In [246]:
xlsfile='Matabeleland South2 Province July 2018 Harmonised Presidential Election Results.xlsx'

In [247]:
# import the provincial results file
prov = pd.ExcelFile(xlsfile)
#prov.sheet_names - shows that the first sheetname is the one of interest
df = prov.parse(sheetname=prov.sheet_names[0])
df = df.iloc[:,:len(cols)].copy()

In [262]:
#df

In [238]:
np.where(df.iloc[:,0] == 'DISTRICT')[0][0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [230]:
# change column names to correct ones, locate row with proper column names first
column_index = np.where(df.iloc[:,0] == 'DISTRICT')[0][0]
df.columns = cols

# get propoer columns and remove null columns
cols = [c for c in df.columns if type(c)==type('c') ]

# make sure columns names have no spaces of or punctuation
df = df.loc[:,cols].copy()
df.columns = [c.lower().strip().replace(" ","_").replace(".","") for c in cols]

# remove first few columns which are useless
column_index = np.where(df.iloc[:,0] == 'DISTRICT')[0][0]
df = df[column_index+1:].copy()

In [263]:
#df

In [164]:
# make sure columns names have no spaces of or punctuation
df = df.loc[:,cols].copy()
df.columns = [c.lower().strip().replace(" ","_").replace(".","") for c in cols]

# remove first 2 columns which are useless
df = df[3:].copy()

# get name of province
namelist = xlsfile.split(" ")
remove = ['July' ,'2018', 'Harmonised', 'Presidential', 'Election', 'Results.xlsx']
name =" ".join([n for n in namelist if n not in remove])

print("Processing dfs for {} province".format(name))

Processing dfs for Manicaland Province Province province


In [165]:
# find the rows relevant for raw, disaggregated polling station data
raw = (df.district.notnull() & df.constituency.notnull() & 
            df.local_authority.notnull() & df.ward_no.notnull() & df.polling_stations.notnull())

# find the rows relevant for aggregated data
agg = (df.district.isnull() & df.constituency.isnull() & 
            df.local_authority.isnull() & df.polling_stations.notnull()
      & df.station_code.isnull())

# Make the raw and aggregated dfs
draw, dagg = df[raw], df[agg]

# make column for the province and add to the dfs
draw['province'] = name
dagg['province'] = name

return draw, dagg 

AttributeError: 'DataFrame' object has no attribute 'district'