In [20]:
import numpy as np
import pandas as pd


In [21]:
# The data is manually cleaned and will not change in future!

input_df = pd.read_csv("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/general_election_results.csv")


In [22]:
input_df.head()


Unnamed: 0,constituency_id,constituency_name,country/region,electorate,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,natSW_votes,natSW_share,oth_votes,oth_share,total_votes,turnout,election
0,W07000049,Aberavon,Wales,51242,3064.0,0.102,4140.0,0.138,18077.0,0.6,3545.0,0.118,1278.0,0.042,30104,0.587,2005
1,S14000001,Aberdeen North,Scotland,64834,3456.0,0.094,8762.0,0.239,15557.0,0.425,8168.0,0.223,691.0,0.019,36634,0.565,2005
2,S14000002,Aberdeen South,Scotland,65995,7134.0,0.171,13924.0,0.335,15272.0,0.367,4120.0,0.099,1171.0,0.028,41621,0.631,2005
3,S14000003,Airdrie & Shotts,Scotland,61865,3271.0,0.099,3792.0,0.114,19568.0,0.59,5484.0,0.165,1043.0,0.031,33158,0.536,2005
4,E14000530,Aldershot,South East,77644,20572.0,0.427,15238.0,0.317,9895.0,0.206,,,2436.0,0.051,48141,0.62,2005


In [23]:
input_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3246 entries, 0 to 3245
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   constituency_id    3246 non-null   object 
 1   constituency_name  3246 non-null   object 
 2   country/region     3246 non-null   object 
 3   electorate         3246 non-null   int64  
 4   con_votes          3182 non-null   float64
 5   con_share          3182 non-null   float64
 6   lib_votes          3131 non-null   float64
 7   lib_share          3131 non-null   float64
 8   lab_votes          3152 non-null   float64
 9   lab_share          3152 non-null   float64
 10  natSW_votes        491 non-null    float64
 11  natSW_share        491 non-null    float64
 12  oth_votes          3105 non-null   float64
 13  oth_share          3105 non-null   float64
 14  total_votes        3246 non-null   int64  
 15  turnout            3246 non-null   float64
 16  election           3246 

# Step 1. Simplify DF structure


In [24]:
# Clean the column names
input_df.columns = input_df.columns.str.strip()
# Fill the NaN with 0 (when NaN means 0, since no votes)
input_df.fillna(0, inplace=True)


In [25]:
# Remove and rename columns
input_df.drop(['electorate', 'turnout'], axis=1, inplace=True)
input_df.rename(columns={'election': 'year'}, inplace=True)


In [26]:
# group oth and natSW, and drop the ones that are not needed
input_df['other_votes'] = input_df['natSW_votes']+input_df['oth_votes']
input_df['other_share'] = input_df['natSW_share']+input_df['oth_share']
input_df.drop(['natSW_votes', 'oth_votes', 'natSW_share', 'oth_share'], axis=1, inplace=True)


In [27]:
input_df.head()


Unnamed: 0,constituency_id,constituency_name,country/region,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,total_votes,year,other_votes,other_share
0,W07000049,Aberavon,Wales,3064.0,0.102,4140.0,0.138,18077.0,0.6,30104,2005,4823.0,0.16
1,S14000001,Aberdeen North,Scotland,3456.0,0.094,8762.0,0.239,15557.0,0.425,36634,2005,8859.0,0.242
2,S14000002,Aberdeen South,Scotland,7134.0,0.171,13924.0,0.335,15272.0,0.367,41621,2005,5291.0,0.127
3,S14000003,Airdrie & Shotts,Scotland,3271.0,0.099,3792.0,0.114,19568.0,0.59,33158,2005,6527.0,0.196
4,E14000530,Aldershot,South East,20572.0,0.427,15238.0,0.317,9895.0,0.206,48141,2005,2436.0,0.051


# Step 2 - Get previous data per each year


In [28]:
# Get previous share per party by constituency
input_df['con_share_prev'] = input_df.groupby('constituency_id')['con_share'].shift(1)
input_df['lib_share_prev'] = input_df.groupby('constituency_id')['lib_share'].shift(1)
input_df['lab_share_prev'] = input_df.groupby('constituency_id')['lab_share'].shift(1)
input_df['other_share_prev'] = input_df.groupby('constituency_id')['other_share'].shift(1)


In [29]:
input_df = input_df[input_df.year != 2005]


In [30]:
input_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2600 entries, 646 to 3245
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   constituency_id    2600 non-null   object 
 1   constituency_name  2600 non-null   object 
 2   country/region     2600 non-null   object 
 3   con_votes          2600 non-null   float64
 4   con_share          2600 non-null   float64
 5   lib_votes          2600 non-null   float64
 6   lib_share          2600 non-null   float64
 7   lab_votes          2600 non-null   float64
 8   lab_share          2600 non-null   float64
 9   total_votes        2600 non-null   int64  
 10  year               2600 non-null   int64  
 11  other_votes        2600 non-null   float64
 12  other_share        2600 non-null   float64
 13  con_share_prev     2587 non-null   float64
 14  lib_share_prev     2587 non-null   float64
 15  lab_share_prev     2587 non-null   float64
 16  other_share_prev   258

In [31]:
# in 2010, do not have previous data for 13 constituencies (new ones), so we drop
input_df[input_df['con_share_prev'].isnull()]


Unnamed: 0,constituency_id,constituency_name,country/region,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,total_votes,year,other_votes,other_share,con_share_prev,lib_share_prev,lab_share_prev,other_share_prev
738,E14000603,Broadland,Eastern,24338.0,0.462,17046.0,0.324,7287.0,0.138,52676,2010,4005.0,0.076,,,,
767,E14000623,Central Devon,South West,27737.0,0.515,18507.0,0.344,3715.0,0.069,53873,2010,3914.0,0.073,,,,
774,E14000629,Chelsea And Fulham,London,24093.0,0.605,6473.0,0.162,7371.0,0.185,39856,2010,1919.0,0.048,,,,
780,E14000635,Chippenham,South West,21500.0,0.41,23970.0,0.458,3620.0,0.069,52385,2010,3295.0,0.063,,,,
872,E14000702,Filton And Bradley Stoke,South West,19686.0,0.408,12197.0,0.253,12772.0,0.264,48301,2010,3646.0,0.075,,,,
951,E14000767,Kenilworth And Southam,West Midlands,25945.0,0.536,13393.0,0.277,6949.0,0.143,48431,2010,2144.0,0.044,,,,
1002,E14000811,Meon Valley,South East,28818.0,0.562,16693.0,0.326,3266.0,0.064,51238,2010,2461.0,0.048,,,,
1006,E14000814,Mid Derbyshire,East Midlands,22877.0,0.483,9711.0,0.205,11585.0,0.245,47342,2010,3169.0,0.067,,,,
1152,E14000933,South Basildon And East Thurrock,Eastern,19624.0,0.439,5977.0,0.134,13852.0,0.31,44735,2010,5282.0,0.118,,,,
1162,E14000942,South Northamptonshire,East Midlands,33081.0,0.552,12603.0,0.21,10380.0,0.173,59890,2010,3826.0,0.064,,,,


In [32]:
input_df.dropna(inplace=True)


# Now let's put the code in just 1 function!


In [85]:
input_df = pd.read_csv("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/general_election_results.csv")
input_df


Unnamed: 0,constituency_id,constituency_name,country/region,electorate,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,natSW_votes,natSW_share,oth_votes,oth_share,total_votes,turnout,election
0,W07000049,Aberavon,Wales,51242,3064.0,0.102,4140.0,0.138,18077.0,0.600,3545.0,0.118,1278.0,0.042,30104,0.587,2005
1,S14000001,Aberdeen North,Scotland,64834,3456.0,0.094,8762.0,0.239,15557.0,0.425,8168.0,0.223,691.0,0.019,36634,0.565,2005
2,S14000002,Aberdeen South,Scotland,65995,7134.0,0.171,13924.0,0.335,15272.0,0.367,4120.0,0.099,1171.0,0.028,41621,0.631,2005
3,S14000003,Airdrie & Shotts,Scotland,61865,3271.0,0.099,3792.0,0.114,19568.0,0.590,5484.0,0.165,1043.0,0.031,33158,0.536,2005
4,E14000530,Aldershot,South East,77644,20572.0,0.427,15238.0,0.317,9895.0,0.206,,,2436.0,0.051,48141,0.620,2005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3241,E14001059,Wythenshawe & Sale East,North West,76313,13459.0,0.301,3111.0,0.070,23855.0,0.533,,,4334.0,0.097,44759,0.587,2019
3242,E14001060,Yeovil,South West,82468,34588.0,0.584,18407.0,0.311,3761.0,0.063,,,2504.0,0.042,59260,0.719,2019
3243,W07000041,Ynys Mon,Wales,51925,12959.0,0.355,,,10991.0,0.301,10418.0,0.285,2184.0,0.060,36552,0.704,2019
3244,E14001061,York Central,Yorkshire and The Humber,74899,13767.0,0.278,4149.0,0.084,27312.0,0.552,,,4277.0,0.086,49505,0.661,2019


In [157]:
def clean_election_results(election_results:pd.DataFrame):
    """Clean the election results data"""
    clean_election_results = election_results.copy()
    # Clean the column names
    clean_election_results.columns = clean_election_results.columns.str.strip()
    # Fill the NaN with 0 (when NaN means 0, since no votes)
    clean_election_results = clean_election_results.fillna(0)
    # Remove and rename columns
    clean_election_results.drop(['electorate', 'turnout'], axis=1, inplace=True)
    clean_election_results.rename(columns={'election': 'year'}, inplace=True)
    # group oth and natSW, and drop the ones that are not needed
    clean_election_results['oth_votes'] = clean_election_results['natSW_votes']+clean_election_results['oth_votes']
    clean_election_results['oth_share'] = clean_election_results['natSW_share']+clean_election_results['oth_share']
    clean_election_results.drop(['natSW_votes', 'natSW_share'], axis=1, inplace=True)
    # Get previous share per party by constituency
    clean_election_results['con_share_prev'] = clean_election_results.groupby('constituency_id')['con_share'].shift(1)
    clean_election_results['lib_share_prev'] = clean_election_results.groupby('constituency_id')['lib_share'].shift(1)
    clean_election_results['lab_share_prev'] = clean_election_results.groupby('constituency_id')['lab_share'].shift(1)
    clean_election_results['oth_share_prev'] = clean_election_results.groupby('constituency_id')['oth_share'].shift(1)
    # select data from 2010 onwards, and reset index
    clean_election_results = clean_election_results[clean_election_results.year != 2005].reset_index(drop=True)
    # in 2010, do not have previous data for 13 constituencies (new ones), so we drop
    clean_election_results.dropna(inplace=True)
    # reorder columns
    column_order = ['year', 'constituency_id', 'constituency_name', 'country/region',
                    'total_votes',
                    'con_votes', 'lab_votes', 'lib_votes', 'oth_votes',
                    'con_share', 'lab_share', 'lib_share', 'oth_share',
                    'con_share_prev', 'lab_share_prev', 'lib_share_prev', 'oth_share_prev',
                    ]
    clean_election_results = clean_election_results[column_order]
    # I need to  manually create data for 2024
    future_df = clean_election_results[clean_election_results.year == 2019]
    future_df['year'] = 2024
    future_df[['con_share_prev','lab_share_prev','lib_share_prev','oth_share_prev']] = future_df[['con_share','lab_share','lib_share','oth_share']]
    future_df[['total_votes','con_votes','lab_votes','lib_votes','oth_votes','con_share','lab_share','lib_share','oth_share']] = np.nan
    clean_election_results = pd.concat([clean_election_results, future_df], ignore_index=True)
    # Calculate the avg for GE
    clean_election_results['mean_con_share_ge'] = clean_election_results.groupby('year')['con_share_prev'].transform('mean')
    clean_election_results['mean_lab_share_ge'] = clean_election_results.groupby('year')['lab_share_prev'].transform('mean')
    clean_election_results['mean_lib_share_ge'] = clean_election_results.groupby('year')['lib_share_prev'].transform('mean')
    clean_election_results['mean_oth_share_ge'] = clean_election_results.groupby('year')['oth_share_prev'].transform('mean')
    return clean_election_results


In [158]:
output_df = clean_election_results(input_df)
output_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df['year'] = 2024
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df[['con_share_prev','lab_share_prev','lib_share_prev','oth_share_prev']] = future_df[['con_share','lab_share','lib_share','oth_share']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df[['total_votes','con_votes'

Unnamed: 0,year,constituency_id,constituency_name,country/region,total_votes,con_votes,lab_votes,lib_votes,oth_votes,con_share,...,lib_share,oth_share,con_share_prev,lab_share_prev,lib_share_prev,oth_share_prev,mean_con_share_ge,mean_lab_share_ge,mean_lib_share_ge,mean_oth_share_ge
0,2010,W07000049,Aberavon,Wales,30958.0,4411.0,16073.0,5034.0,5440.0,0.142,...,0.163,0.176,0.102,0.600,0.138,0.160,0.307918,0.365126,0.215917,0.110826
1,2010,W07000058,Aberconwy,Wales,29966.0,10734.0,7336.0,5786.0,6110.0,0.358,...,0.193,0.204,0.279,0.371,0.200,0.150,0.307918,0.365126,0.215917,0.110826
2,2010,S14000001,Aberdeen North,Scotland,37701.0,4666.0,16746.0,7001.0,9288.0,0.124,...,0.186,0.246,0.094,0.425,0.239,0.242,0.307918,0.365126,0.215917,0.110826
3,2010,S14000002,Aberdeen South,Scotland,43034.0,8914.0,15722.0,12216.0,6182.0,0.207,...,0.284,0.144,0.171,0.367,0.335,0.127,0.307918,0.365126,0.215917,0.110826
4,2010,S14000003,Airdrie & Shotts,Scotland,35849.0,3133.0,20849.0,2898.0,8969.0,0.087,...,0.081,0.250,0.099,0.590,0.114,0.196,0.307918,0.365126,0.215917,0.110826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3232,2024,E14001059,Wythenshawe & Sale East,North West,,,,,,,...,,,0.301,0.533,0.070,0.097,0.427471,0.328280,0.108432,0.135811
3233,2024,E14001060,Yeovil,South West,,,,,,,...,,,0.584,0.063,0.311,0.042,0.427471,0.328280,0.108432,0.135811
3234,2024,W07000041,Ynys Mon,Wales,,,,,,,...,,,0.355,0.301,0.000,0.345,0.427471,0.328280,0.108432,0.135811
3235,2024,E14001061,York Central,Yorkshire and The Humber,,,,,,,...,,,0.278,0.552,0.084,0.086,0.427471,0.328280,0.108432,0.135811


In [145]:
output_df['mean_con_share_ge'] = output_df.groupby('year')['con_share_prev'].transform('mean')
output_df['mean_lab_share_ge'] = output_df.groupby('year')['lab_share_prev'].transform('mean')
output_df['mean_lib_share_ge'] = output_df.groupby('year')['lib_share_prev'].transform('mean')
output_df['mean_oth_share_ge'] = output_df.groupby('year')['oth_share_prev'].transform('mean')


In [154]:
# get polling data
polling_df = pd.read_csv('/Users/andreabrumana/code/willgreen93/UK_election/raw_data/preprocessed/preprocessed_polling.csv')
elections_polling_df = output_df.merge(polling_df, on='year', how='left')
elections_polling_df


Unnamed: 0,year,constituency_id_x,constituency_name_x,country/region_x,total_votes_x,con_votes_x,lab_votes_x,lib_votes_x,oth_votes_x,con_share_x,...,lib_share_prev_y,oth_share_prev_y,mean_con_share_ge_y,mean_lab_share_ge_y,mean_lib_share_ge_y,mean_oth_share_ge_y,con_pre_ge_poll,lab_pre_ge_poll,lib_pre_ge_poll,oth_pre_ge_poll
0,2010,W07000049,Aberavon,Wales,30958.0,4411.0,16073.0,5034.0,5440.0,0.142,...,0.138,0.160,0.307918,0.365126,0.215917,0.110826,0.358,0.281,0.27,0.091
1,2010,W07000049,Aberavon,Wales,30958.0,4411.0,16073.0,5034.0,5440.0,0.142,...,0.200,0.150,0.307918,0.365126,0.215917,0.110826,0.358,0.281,0.27,0.091
2,2010,W07000049,Aberavon,Wales,30958.0,4411.0,16073.0,5034.0,5440.0,0.142,...,0.239,0.242,0.307918,0.365126,0.215917,0.110826,0.358,0.281,0.27,0.091
3,2010,W07000049,Aberavon,Wales,30958.0,4411.0,16073.0,5034.0,5440.0,0.142,...,0.335,0.127,0.307918,0.365126,0.215917,0.110826,0.358,0.281,0.27,0.091
4,2010,W07000049,Aberavon,Wales,30958.0,4411.0,16073.0,5034.0,5440.0,0.142,...,0.114,0.196,0.307918,0.365126,0.215917,0.110826,0.358,0.281,0.27,0.091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095764,2024,E14001062,York Outer,Yorkshire and The Humber,,,,,,,...,0.070,0.097,0.427471,0.328280,0.108432,0.135811,,,,
2095765,2024,E14001062,York Outer,Yorkshire and The Humber,,,,,,,...,0.311,0.042,0.427471,0.328280,0.108432,0.135811,,,,
2095766,2024,E14001062,York Outer,Yorkshire and The Humber,,,,,,,...,0.000,0.345,0.427471,0.328280,0.108432,0.135811,,,,
2095767,2024,E14001062,York Outer,Yorkshire and The Humber,,,,,,,...,0.084,0.086,0.427471,0.328280,0.108432,0.135811,,,,


In [151]:
# Save output to csv
output_df.to_csv("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/preprocessed/preprocessed_election_result.csv", index=False)
elections_polling_df.to_csv('/Users/andreabrumana/code/willgreen93/UK_election/raw_data/preprocessed/preprocessed_election_polling.csv', index=False)
