In [46]:
import numpy as np
import pandas as pd


In [47]:
# The data is manually cleaned and will not change in future!

input_df = pd.read_csv("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/general_election_results.csv")


In [48]:
input_df.head()


Unnamed: 0,constituency_id,constituency_name,country/region,electorate,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,natSW_votes,natSW_share,oth_votes,oth_share,total_votes,turnout,election
0,W07000049,Aberavon,Wales,51242,3064.0,0.102,4140.0,0.138,18077.0,0.6,3545.0,0.118,1278.0,0.042,30104,0.587,2005
1,S14000001,Aberdeen North,Scotland,64834,3456.0,0.094,8762.0,0.239,15557.0,0.425,8168.0,0.223,691.0,0.019,36634,0.565,2005
2,S14000002,Aberdeen South,Scotland,65995,7134.0,0.171,13924.0,0.335,15272.0,0.367,4120.0,0.099,1171.0,0.028,41621,0.631,2005
3,S14000003,Airdrie & Shotts,Scotland,61865,3271.0,0.099,3792.0,0.114,19568.0,0.59,5484.0,0.165,1043.0,0.031,33158,0.536,2005
4,E14000530,Aldershot,South East,77644,20572.0,0.427,15238.0,0.317,9895.0,0.206,,,2436.0,0.051,48141,0.62,2005


In [49]:
input_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3246 entries, 0 to 3245
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   constituency_id    3246 non-null   object 
 1   constituency_name  3246 non-null   object 
 2   country/region     3246 non-null   object 
 3   electorate         3246 non-null   int64  
 4   con_votes          3182 non-null   float64
 5   con_share          3182 non-null   float64
 6   lib_votes          3131 non-null   float64
 7   lib_share          3131 non-null   float64
 8   lab_votes          3152 non-null   float64
 9   lab_share          3152 non-null   float64
 10  natSW_votes        491 non-null    float64
 11  natSW_share        491 non-null    float64
 12  oth_votes          3105 non-null   float64
 13  oth_share          3105 non-null   float64
 14  total_votes        3246 non-null   int64  
 15  turnout            3246 non-null   float64
 16  election           3246 

# Step 1. Simplify DF structure


In [50]:
# Clean the column names
input_df.columns = input_df.columns.str.strip()
# Fill the NaN with 0 (when NaN means 0, since no votes)
input_df.fillna(0, inplace=True)


In [51]:
# Remove and rename columns
input_df.drop(['electorate', 'turnout'], axis=1, inplace=True)
input_df.rename(columns={'election': 'year'}, inplace=True)


In [54]:
# group oth and natSW, and drop the ones that are not needed
input_df['other_votes'] = input_df['natSW_votes']+input_df['oth_votes']
input_df['other_share'] = input_df['natSW_share']+input_df['oth_share']
input_df.drop(['natSW_votes', 'oth_votes', 'natSW_share', 'oth_share'], axis=1, inplace=True)


In [57]:
input_df.head()


Unnamed: 0,constituency_id,constituency_name,country/region,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,total_votes,year,other_votes,other_share
0,W07000049,Aberavon,Wales,3064.0,0.102,4140.0,0.138,18077.0,0.6,30104,2005,4823.0,0.16
1,S14000001,Aberdeen North,Scotland,3456.0,0.094,8762.0,0.239,15557.0,0.425,36634,2005,8859.0,0.242
2,S14000002,Aberdeen South,Scotland,7134.0,0.171,13924.0,0.335,15272.0,0.367,41621,2005,5291.0,0.127
3,S14000003,Airdrie & Shotts,Scotland,3271.0,0.099,3792.0,0.114,19568.0,0.59,33158,2005,6527.0,0.196
4,E14000530,Aldershot,South East,20572.0,0.427,15238.0,0.317,9895.0,0.206,48141,2005,2436.0,0.051


# Step 2 - Get previous data per each year


In [58]:
# Get previous share per party by constituency
input_df['con_share_prev'] = input_df.groupby('constituency_id')['con_share'].shift(1)
input_df['lib_share_prev'] = input_df.groupby('constituency_id')['lib_share'].shift(1)
input_df['lab_share_prev'] = input_df.groupby('constituency_id')['lab_share'].shift(1)
input_df['other_share_prev'] = input_df.groupby('constituency_id')['other_share'].shift(1)


In [63]:
input_df = input_df[input_df.year != 2005]


In [66]:
input_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2600 entries, 646 to 3245
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   constituency_id    2600 non-null   object 
 1   constituency_name  2600 non-null   object 
 2   country/region     2600 non-null   object 
 3   con_votes          2600 non-null   float64
 4   con_share          2600 non-null   float64
 5   lib_votes          2600 non-null   float64
 6   lib_share          2600 non-null   float64
 7   lab_votes          2600 non-null   float64
 8   lab_share          2600 non-null   float64
 9   total_votes        2600 non-null   int64  
 10  year               2600 non-null   int64  
 11  other_votes        2600 non-null   float64
 12  other_share        2600 non-null   float64
 13  con_share_prev     2587 non-null   float64
 14  lib_share_prev     2587 non-null   float64
 15  lab_share_prev     2587 non-null   float64
 16  other_share_prev   258

In [75]:
# in 2010, do not have previous data for 13 constituencies (new ones), so we drop
input_df[input_df['con_share_prev'].isnull()]


Unnamed: 0,constituency_id,constituency_name,country/region,con_votes,con_share,lib_votes,lib_share,lab_votes,lab_share,total_votes,year,other_votes,other_share,con_share_prev,lib_share_prev,lab_share_prev,other_share_prev,check


In [73]:
input_df.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df.dropna(inplace=True)


# Now let's put the code in just 1 function!


In [76]:
input_df = pd.read_csv("/Users/andreabrumana/code/willgreen93/UK_election/raw_data/input_general_election_results.csv")


In [77]:
def clean_election_results(input_df:pd.DataFrame):
    """Clean the election results data"""
    # Clean the column names
    input_df.columns = input_df.columns.str.strip()
    # Fill the NaN with 0 (when NaN means 0, since no votes)
    input_df.fillna(0, inplace=True)
    # Remove and rename columns
    input_df.drop(['electorate', 'turnout'], axis=1, inplace=True)
    input_df.rename(columns={'election': 'year'}, inplace=True)
    # group oth and natSW, and drop the ones that are not needed
    input_df['other_votes'] = input_df['natSW_votes']+input_df['oth_votes']
    input_df['other_share'] = input_df['natSW_share']+input_df['oth_share']
    input_df.drop(['natSW_votes', 'oth_votes', 'natSW_share', 'oth_share'], axis=1, inplace=True)
    # Get previous share per party by constituency
    input_df['con_share_prev'] = input_df.groupby('constituency_id')['con_share'].shift(1)
    input_df['lib_share_prev'] = input_df.groupby('constituency_id')['lib_share'].shift(1)
    input_df['lab_share_prev'] = input_df.groupby('constituency_id')['lab_share'].shift(1)
    input_df['other_share_prev'] = input_df.groupby('constituency_id')['other_share'].shift(1)
    # in 2010, do not have previous data for 13 constituencies (new ones), so we drop
    input_df.dropna(inplace=True)
    return input_df


In [78]:
output_df = clean_election_results(input_df)


In [79]:
output_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2595 entries, 43 to 3245
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   constituency_id    2595 non-null   object 
 1   constituency_name  2595 non-null   object 
 2   country/region     2595 non-null   object 
 3   con_votes          2595 non-null   float64
 4   con_share          2595 non-null   float64
 5   lib_votes          2595 non-null   float64
 6   lib_share          2595 non-null   float64
 7   lab_votes          2595 non-null   float64
 8   lab_share          2595 non-null   float64
 9   total_votes        2595 non-null   int64  
 10  year               2595 non-null   int64  
 11  other_votes        2595 non-null   float64
 12  other_share        2595 non-null   float64
 13  con_share_prev     2595 non-null   float64
 14  lib_share_prev     2595 non-null   float64
 15  lab_share_prev     2595 non-null   float64
 16  other_share_prev   2595