In [3]:
import pandas as pd
import math
from google.cloud import storage


In [20]:
client = storage.Client(project="uk-election-406413")
bucket = client.bucket("ukelectiondata")
files = bucket.list_blobs()
file_names = [file.name for file in files]


In [353]:
election_results = bucket.blob(file_names[0])
election_results


<Blob: ukelectiondata, 1918-2019_election_results_by_constituency.xlsx, None>

In [354]:
# Commenting this out since I do not want to download the file every time I run the notebook,
# but if you want to trun locally youy will need to de comment it.

# election_results.download_to_filename('/Users/andreabrumana/code/willgreen93/UK_election/raw_data/1918-2019_election_results_by_constituency.xlsx')


In [16]:
# Need to install openpyxl. Run pip install openpyxl in terminal
data = pd.read_excel("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/1918-2019_election_results_by_constituency.xlsx", sheet_name=None)


In [9]:
# Just want the data from 2001 onwards
years_in_scope = list(data.keys())[-6:]
years_in_scope


['2001', '2005', '2010', '2015', '2017', '2019']

In [10]:
data['2019'].head()


Unnamed: 0.1,Unnamed: 0,2019 GENERAL ELECTION,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47
0,,Results by constituency,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,Conservative,,...,UUP,,,Alliance,,,Other,,,
2,,id,Constituency,County,Country/Region,Country,Electorate,,Votes,Vote share,...,Votes,Vote share,,Votes,Vote share,,Votes,Votes Share,Total votes,Turnout
3,,W07000049,ABERAVON,West Glamorgan,Wales,Wales,50750,,6518,0.206279,...,,,,,,,731,0.023134,31598,0.622621
4,,W07000058,ABERCONWY,Clwyd,Wales,Wales,44699,,14687,0.460913,...,,,,,,,,,31865,0.712879


In [24]:
def clean_data(data:dict, key:str):
    """A function that takes a dictionary of dataframes and a key and returns a cleaned dataframe"""

    # Access the dataframe
    temp_data = data[key]

    #Preparing the columns name
    two_rows = temp_data.iloc[1:3] # the columns name are in the second and third row of the temp_data dataframe
    new_list = []
    prev = None
    second_row = list(two_rows.iloc[1])

    for id, each in enumerate(list(two_rows.iloc[0])):
        if (not prev == "id")  and (isinstance(prev, str)):
            new_list.append(f"{prev}_{second_row[id]}")
        else:
            new_list.append(f"{each}_{second_row[id]}")
        prev = each

    cleaned_list = [element.lower().strip() for element in new_list]

    temp_data.columns = cleaned_list
    temp_data = temp_data.iloc[3:, 1:].reset_index(drop=True)
    temp_data.drop(columns='nan_nan', inplace=True)
    temp_data = temp_data.rename(columns=lambda x: x.replace('nan_', ''))
    temp_data = temp_data.rename(columns=lambda x: x.replace('_nan', ''))

    # There is an empty row, after the last row of the dataframe, and after it there is a legend of the data. Need to remove it
    empty_row_index = temp_data.index[temp_data.isnull().all(axis=1)].min()
    temp_data = temp_data.loc[:empty_row_index - 1, :]

    # Add an year column that depens on the key
    temp_data['year'] = int(key)
    temp_data = temp_data[["year", *temp_data.columns[:-1]]]

    # Clean data types
    votes_columns = [col for col in temp_data.columns if col.endswith('_votes')]
    share_columns = [col for col in temp_data.columns if col.endswith('share')]
    temp_data[votes_columns] = temp_data[votes_columns].astype(float)
    temp_data[share_columns] = temp_data[share_columns].astype(float)

    # Create homogenous columns in a new dataframe, and selecting just the columns we need
    clean_data = pd.DataFrame()

    clean_data['year'] = temp_data['year']
    clean_data['constituency_id'] = temp_data['id']
    clean_data['constituency'] = temp_data['constituency']
    clean_data['electorate'] = temp_data['electorate']
    # Conservative
    clean_data['conservative_votes'] = temp_data['conservative_votes']
    clean_data['conservative_vote_share'] = temp_data['conservative_vote share']
    # Labour
    clean_data['labour_votes'] = temp_data['labour_votes']
    clean_data['labour_vote_share'] = temp_data['labour_vote share']
    # Liberal Democrats
    clean_data['liberal_democrats_votes'] = temp_data['liberal democrats_votes']
    clean_data['liberal_democrats_vote_share'] = temp_data['liberal democrats_vote share']
    # Group all the parties, expect labour and conservative in "Other"
    clean_data['other_parties_votes'] = temp_data[votes_columns].sum(axis=1) - temp_data['conservative_votes'] - temp_data['labour_votes'] - temp_data['liberal democrats_votes']
    clean_data['other_parties_vote_share'] = temp_data[share_columns].sum(axis=1) - temp_data['conservative_vote share'] - temp_data['labour_vote share'] - temp_data['liberal democrats_vote share']
    clean_data['total_votes'] = temp_data['total votes']
    clean_data['tornout'] = temp_data['turnout']
    clean_data.fillna(0, inplace=True)

    return clean_data


In [25]:
# Testing the function for an year
clean_data(data, '2019')


Unnamed: 0,year,constituency_id,constituency,electorate,conservative_votes,conservative_vote_share,labour_votes,labour_vote_share,liberal_democrats_votes,liberal_democrats_vote_share,other_parties_votes,other_parties_vote_share,total_votes,tornout
0,2019,W07000049,ABERAVON,50750,6518.0,0.206279,17008.0,0.538262,1072.0,0.033926,7000.0,0.221533,31598,0.622621
1,2019,W07000058,ABERCONWY,44699,14687.0,0.460913,12653.0,0.397081,1821.0,0.057147,2704.0,0.084858,31865,0.712879
2,2019,S14000001,ABERDEEN NORTH,62489,7535.0,0.201401,4939.0,0.132013,2846.0,0.076070,22093.0,0.590517,37413,0.598713
3,2019,S14000002,ABERDEEN SOUTH,65719,16398.0,0.359306,3834.0,0.084009,5018.0,0.109952,20388.0,0.446733,45638,0.694441
4,2019,S14000003,AIRDRIE AND SHOTTS,64011,7011.0,0.176280,12728.0,0.320024,1419.0,0.035678,18614.0,0.468018,39772,0.621331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,2019,E14001059,WYTHENSHAWE AND SALE EAST,76313,13459.0,0.300699,23855.0,0.532965,3111.0,0.069506,4334.0,0.096830,44759,0.586519
646,2019,E14001060,YEOVIL,82468,34588.0,0.583665,3761.0,0.063466,18407.0,0.310614,2504.0,0.042254,59260,0.718582
647,2019,W07000041,YNYS MON,51925,12959.0,0.354536,10991.0,0.300695,0.0,0.000000,0.0,0.000000,36552,0.703938
648,2019,E14001061,YORK CENTRAL,74899,13767.0,0.278093,27312.0,0.551702,4149.0,0.083810,4277.0,0.086395,49505,0.660957


In [26]:
# Applying the function to all the years in scope
cleaned_election_results = pd.concat([clean_data(data, key) for key in years_in_scope])
cleaned_election_results


Unnamed: 0,year,constituency_id,constituency,electorate,conservative_votes,conservative_vote_share,labour_votes,labour_vote_share,liberal_democrats_votes,liberal_democrats_vote_share,other_parties_votes,other_parties_vote_share,total_votes,tornout
0,2001,1,BEDFORD,67762,13297.0,0.327682,19454.0,0.479411,6425.0,0.158333,1403.0,0.034575,40579,0.598846
1,2001,2,LUTON NORTH,67554,12210.0,0.312069,22187.0,0.567065,3795.0,0.096994,934.0,0.023872,39126,0.579181
2,2001,3,LUTON SOUTH,71439,11586.0,0.294427,21719.0,0.551930,4292.0,0.109070,1754.0,0.044573,39351,0.550834
3,2001,4,MID BEDFORDSHIRE,70794,22109.0,0.474055,14043.0,0.301106,9205.0,0.197371,1281.0,0.027467,46638,0.658785
4,2001,5,NORTH EAST BEDFORDSHIRE,69877,22586.0,0.499182,14009.0,0.309619,7409.0,0.163749,1242.0,0.027450,45246,0.647509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,2019,E14001059,WYTHENSHAWE AND SALE EAST,76313,13459.0,0.300699,23855.0,0.532965,3111.0,0.069506,4334.0,0.096830,44759,0.586519
646,2019,E14001060,YEOVIL,82468,34588.0,0.583665,3761.0,0.063466,18407.0,0.310614,2504.0,0.042254,59260,0.718582
647,2019,W07000041,YNYS MON,51925,12959.0,0.354536,10991.0,0.300695,0.0,0.000000,0.0,0.000000,36552,0.703938
648,2019,E14001061,YORK CENTRAL,74899,13767.0,0.278093,27312.0,0.551702,4149.0,0.083810,4277.0,0.086395,49505,0.660957


In [27]:
#Check the total share is 1
cleaned_election_results['check'] = cleaned_election_results['conservative_vote_share']+ cleaned_election_results['labour_vote_share']+cleaned_election_results['liberal_democrats_vote_share']+cleaned_election_results['other_parties_vote_share']
cleaned_election_results


Unnamed: 0,year,constituency_id,constituency,electorate,conservative_votes,conservative_vote_share,labour_votes,labour_vote_share,liberal_democrats_votes,liberal_democrats_vote_share,other_parties_votes,other_parties_vote_share,total_votes,tornout,check
0,2001,1,BEDFORD,67762,13297.0,0.327682,19454.0,0.479411,6425.0,0.158333,1403.0,0.034575,40579,0.598846,1.000000
1,2001,2,LUTON NORTH,67554,12210.0,0.312069,22187.0,0.567065,3795.0,0.096994,934.0,0.023872,39126,0.579181,1.000000
2,2001,3,LUTON SOUTH,71439,11586.0,0.294427,21719.0,0.551930,4292.0,0.109070,1754.0,0.044573,39351,0.550834,1.000000
3,2001,4,MID BEDFORDSHIRE,70794,22109.0,0.474055,14043.0,0.301106,9205.0,0.197371,1281.0,0.027467,46638,0.658785,1.000000
4,2001,5,NORTH EAST BEDFORDSHIRE,69877,22586.0,0.499182,14009.0,0.309619,7409.0,0.163749,1242.0,0.027450,45246,0.647509,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,2019,E14001059,WYTHENSHAWE AND SALE EAST,76313,13459.0,0.300699,23855.0,0.532965,3111.0,0.069506,4334.0,0.096830,44759,0.586519,1.000000
646,2019,E14001060,YEOVIL,82468,34588.0,0.583665,3761.0,0.063466,18407.0,0.310614,2504.0,0.042254,59260,0.718582,1.000000
647,2019,W07000041,YNYS MON,51925,12959.0,0.354536,10991.0,0.300695,0.0,0.000000,0.0,0.000000,36552,0.703938,0.655231
648,2019,E14001061,YORK CENTRAL,74899,13767.0,0.278093,27312.0,0.551702,4149.0,0.083810,4277.0,0.086395,49505,0.660957,1.000000


In [18]:
# save to a csv file
cleaned_election_results.to_csv("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/cleaned_election_results.csv", index=False)


In [21]:
# Upload the file to the bucket
cleaned_election_results_blob = bucket.blob("cleaned_election_results.csv")
cleaned_election_results_blob.upload_from_filename("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/cleaned_election_results.csv")
