In [None]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

from time import sleep
from random import randint, randrange
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Request headers shared by every HTTP call in this notebook: a Firefox
# User-Agent string obtained from fake_useragent.
ua = UserAgent(verify_ssl=False)
firefox_agent = str(ua.firefox)
header = {'User-Agent': firefox_agent}

#### From the dashboard website, get the URLs for each NA constituency result.

In [None]:
# Download the GE-2018 result dashboard. Fail fast on an HTTP error status so
# that an error page is never silently parsed into an empty URL list, and use
# a timeout so an unresponsive server cannot hang the notebook indefinitely.
r = requests.get('https://www.ecp.gov.pk/resultdashboard/ge2018.aspx', headers=header, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")

# From the dashboard page, fetch URLs for all the NA seats.
# Each result card is a <div class="card card-hover">; the link is taken from
# the first <a> inside it. Cards without an anchor (or whose anchor carries no
# href) are skipped.
na_urls = []
for card in soup.find_all(name='div', attrs={'class': 'card card-hover'}):
    link = card.find('a')
    if link is None or not link.has_attr('href'):
        continue
    # Drop the first two characters of the href and percent-encode spaces;
    # the scraping loop prepends 'https://www.ecp.gov.pk' to this value.
    na_urls.append(link['href'][2:].replace(' ', '%20'))

#### Once the links for all the available constituencies have been retrieved, scrape the voting stats and data from each and store them as a list of dictionaries.

In [None]:
def _span_text(container, span_id):
    """Return the text of the <span> with id `span_id` found inside `container`."""
    return container.find('span', {'id': span_id}).getText()


# One dict per successfully scraped seat; turned into a DataFrame further down.
dataframe = []
count = 0

for na_url in na_urls:
    url = 'https://www.ecp.gov.pk' + na_url
    # The seat is the value of the first query parameter: everything after the
    # first '=' up to the next '&'. partition() keeps the whole remainder when
    # no '&' follows, whereas slicing with find('&') == -1 would drop the
    # last character.
    seat = na_url[na_url.find('=') + 1:].partition('&')[0]
    print('\n--- Fetching URL for Seat # : ' + str(seat))
    print('--- URL : ' + url)
    # The timeout keeps one unresponsive page from stalling the whole scrape.
    r = requests.get(url, headers=header, timeout=30)
    if r.status_code != 200:
        # Stop the scrape on the first failed page; seats collected so far
        # remain in `dataframe`.
        print('*** Abort. HTML Status Code ' + str(r.status_code))
        break
    print('--- HTML retrieved. Extracting Data')
    soup = BeautifulSoup(r.text, "lxml")

    # Extract seat info and result status.
    # The sub-heading text is split into the part inside the first pair of
    # parentheses (seat name) and whatever follows the ')' (result status).
    seatInfo = soup.find_all('span', {'id': 'ContentPlaceHolder2_lblSubHeading'})[0].getText()
    seatName = seatInfo[seatInfo.find('(') + 1 : seatInfo.find(')')]
    seatStatus = seatInfo[seatInfo.find(')') + 1 : ].strip()

    # Look the tables up once: the first holds the voting statistics, the
    # second the per-candidate results.
    tables = soup.find_all('table')

    # Extract voting statistics
    stats = tables[0]
    registeredVoters = _span_text(stats, 'ContentPlaceHolder1_lblRegVoters')
    votesPolled = _span_text(stats, 'ContentPlaceHolder1_lblVotesPolled')
    validVotes = _span_text(stats, 'ContentPlaceHolder1_lblValidVotes')
    rejectedVotes = _span_text(stats, 'ContentPlaceHolder1_lblRejVotes')
    polledToRegisteredRatio = _span_text(stats, 'ContentPlaceHolder1_lblTO').replace('%', '').strip()

    # Extract voting results
    voteCount = tables[1]
    votingResults = []
    for i in voteCount.find_all('tr'):
        row = i.find_all('p')
        # Rows without any <p> element carry no candidate data and are skipped.
        if len(row) != 0:
            votingResults.append({'candidateName': row[0].getText(),
                                  'candidateParty': row[1].getText(),
                                  'candidateVotes': int(row[2].getText())})

    data = {'seat': seat,
            'seatName': seatName,
            'seatStatus': seatStatus,
            'registeredVoters': int(registeredVoters),
            'votesPolled': int(votesPolled),
            'validVotes': int(validVotes),
            'rejectedVotes': int(rejectedVotes),
            # The page reports turnout as a percentage; store it as a fraction.
            'polledToRegRatio': float(polledToRegisteredRatio) / 100,
            'numberOfCandidates': len(votingResults),
            'votingResults': votingResults,
            }

    dataframe.append(data)
    print('--- ' + seat + ' data addedd succesfully.')
    count += 1

    # Sleep for a few seconds before moving on to the next seat
    sleep_seconds = randint(2, 10)
    print('--- Sleeping For : ', sleep_seconds, ' seconds.\n')
    sleep(sleep_seconds)

print('\n--- ' + str(count) + ' seats processed.')

#### We will also retrieve voter participation (turnout) rates, which are broken down by gender.

In [None]:
def _percent_to_fraction(text):
    """Convert a percentage string such as '45.3 %' or '45.3%' to a fraction (0.453)."""
    # Remove the percent sign wherever it sits instead of relying on the exact
    # ' %' spelling; this matches how the turnout figure is parsed in the
    # per-seat scraping loop.
    return float(text.replace('%', '').strip()) / 100


participationResult = []

# Fail fast on an HTTP error status rather than parsing an error page, and use
# a timeout so an unresponsive server cannot hang the notebook.
r = requests.get('https://www.ecp.gov.pk/frmstats.aspx', headers=header, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")
rows = soup.find_all('tr')

for i in rows[1:]: # We don't look at the first row since these are just headers.
    row = i.find_all('td')
    participationResult.append(
        {
        # Stripped because this value is later used as the merge key with the
        # per-seat results.
        'seat' : row[0].getText().strip(),
        'femaleTurnout' : _percent_to_fraction(row[1].getText()),
        'maleTurnout' : _percent_to_fraction(row[2].getText()),
        'totalTurnout' : _percent_to_fraction(row[3].getText()),
        }
    )

dfParticipation = pd.DataFrame(participationResult)

#### Store as a Pandas dataframe and store in a CSV

In [None]:
# Combine the scraped per-seat records with the turnout figures. The default
# inner join keeps only seats that appear in both frames.
df = pd.merge(pd.DataFrame(dataframe), dfParticipation, on='seat')
# Persist the combined raw data set.
df.to_csv('Election_2018_NA_Results_Raw.csv', index=False, encoding='utf-8')

#### The voting results are stored as a list of dictionaries inside the dataframe above. For easier access, we will convert them to their own dataframe and store it in a CSV as well.

In [None]:
# Flatten the per-seat candidate lists into one record per candidate, tagging
# every record with the seat it belongs to.
# A fresh dict is built for each record, so the candidate dicts stored inside
# `dataframe` are not modified. Mutating them in place would also change any
# other object that still references them (e.g. a frame built from
# `dataframe`) and would make the outcome depend on cell execution order.
resultsOnlyDataframe = [
    dict(candidate, seat=seatRecord['seat'], seatName=seatRecord['seatName'])
    for seatRecord in dataframe
    for candidate in seatRecord['votingResults']
]

In [None]:
# One row per candidate, built from the flattened list of result records.
dfResultsOnly = pd.DataFrame(data=resultsOnlyDataframe)

In [None]:
# Save the candidate-level results (without the index column) as UTF-8 CSV.
dfResultsOnly.to_csv(
    'Election_2018_NA_Results_VotingOnly.csv',
    index=False,
    encoding='utf-8',
)

## Winners & Runners-Up

In [None]:
# Adding margins from winners.
# For every candidate, marginFromWinner is the highest vote count in the
# candidate's seat minus the candidate's own votes (so it is 0 for the winner).
# transform('max') broadcasts each seat's maximum back onto that seat's rows,
# which avoids re-filtering the whole frame per seat inside an iterrows loop.
# It also keeps the column integer-typed; filling a new column one row at a
# time through .loc leaves it as float.
seatMaxVotes = dfResultsOnly.groupby('seat')['candidateVotes'].transform('max')
dfResultsOnly['marginFromWinner'] = seatMaxVotes - dfResultsOnly['candidateVotes']

#### Winners DF & CSV

In [None]:
def _margin_over_runner_up(votes):
    """Return the gap between the two highest values in `votes` (0 if there is only one)."""
    topTwo = votes.nlargest(2)
    return topTwo.iloc[0] - topTwo.iloc[-1]


# Row label of the top-polling candidate in every seat.
winnerIdx = dfResultsOnly.groupby(by='seat')[['candidateVotes']].idxmax()
# idxmax returns index labels, so select with .loc rather than positionally.
winnerResults = dfResultsOnly.loc[winnerIdx['candidateVotes'].values]
# A winner's own marginFromWinner is always 0; replace it with the win margin.
winnerResults = winnerResults.drop('marginFromWinner', axis=1)

# The win margin is the winner's lead over the runner-up, i.e. the runner-up's
# marginFromWinner. It is computed here directly from the vote counts so this
# cell does not depend on `runnerupResults`, which is only created by a later
# cell and is therefore undefined when the notebook is run top to bottom.
winMargin = (
    dfResultsOnly.groupby(by='seat')['candidateVotes']
    .agg(_margin_over_runner_up)
    .rename('winMargin')
    .reset_index()
)
winnerResults = winnerResults.merge(winMargin, on='seat')
winnerResults.to_csv('winnerResults.csv', index=False, encoding='utf-8')

#### Runners-Up DF & CSV

In [None]:
# The two highest vote counts per seat, indexed by (seat, original row label).
topTwoVotes = dfResultsOnly.groupby(by='seat')['candidateVotes'].nlargest(2)
# Keep the last of those rows for each seat: the runner-up, or the sole
# candidate when a seat has only one. 'level_1' is the column reset_index()
# creates for the unnamed original row label.
runnerUpIdx = topTwoVotes.reset_index().groupby('seat').last()['level_1'].values
# These are index labels, not positions, so select with .loc.
runnerupResults = dfResultsOnly.loc[runnerUpIdx]
runnerupResults.to_csv('runnerupResults.csv', index=False, encoding='utf-8')