## PRESIDENTIAL ELECTIONS DATA

Importing the necessary libraries and reading the cleaned data

In [1]:
# Suppress warnings: replace warnings.warn with a function that ignores every call.
import warnings


def warn(*args, **kwargs):
    """Accept any positional or keyword arguments and do nothing."""
    return None


warnings.warn = warn

import numpy as np
import pandas as pd

In [2]:
# Read the cleaned results, drop the 'state_cen' and 'state_ic' columns and
# rename 'state_fips' to 'fips' in one chained expression.
presid = (
    pd.read_csv('data/cleaned_1976-2020-president.csv')
    .drop(columns=['state_cen', 'state_ic'])
    .rename(columns={'state_fips': 'fips'})
)

presid.head()

Unnamed: 0,year,state,state_po,fips,candidate,candidatevotes,totalvotes,party
0,1976,ALABAMA,AL,1,"CARTER, JIMMY",659170,1182850,DEMOCRAT
1,1976,ALABAMA,AL,1,"FORD, GERALD",504070,1182850,REPUBLICAN
2,1976,ALASKA,AK,2,"FORD, GERALD",71555,123574,REPUBLICAN
3,1976,ALASKA,AK,2,"CARTER, JIMMY",44058,123574,DEMOCRAT
4,1976,ARIZONA,AZ,4,"FORD, GERALD",418642,742719,REPUBLICAN


In [3]:
presid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224 entries, 0 to 1223
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year            1224 non-null   int64 
 1   state           1224 non-null   object
 2   state_po        1224 non-null   object
 3   fips            1224 non-null   int64 
 4   candidate       1224 non-null   object
 5   candidatevotes  1224 non-null   int64 
 6   totalvotes      1224 non-null   int64 
 7   party           1224 non-null   object
dtypes: int64(4), object(4)
memory usage: 76.6+ KB


Storing the names of the candidates here in case they are needed later, and deleting them from the dataset

In [4]:
# One candidate name per (year, party): the first non-null value in each group.
candidates = presid.groupby(by=['year', 'party'])['candidate'].first()
candidates

year  party     
1976  DEMOCRAT            CARTER, JIMMY
      REPUBLICAN           FORD, GERALD
1980  DEMOCRAT            CARTER, JIMMY
      REPUBLICAN         REAGAN, RONALD
1984  DEMOCRAT          MONDALE, WALTER
      REPUBLICAN         REAGAN, RONALD
1988  DEMOCRAT         DUKAKIS, MICHAEL
      REPUBLICAN      BUSH, GEORGE H.W.
1992  DEMOCRAT            CLINTON, BILL
      REPUBLICAN      BUSH, GEORGE H.W.
1996  DEMOCRAT            CLINTON, BILL
      REPUBLICAN           DOLE, ROBERT
2000  DEMOCRAT                 GORE, AL
      REPUBLICAN        BUSH, GEORGE W.
2004  DEMOCRAT              KERRY, JOHN
      REPUBLICAN        BUSH, GEORGE W.
2008  DEMOCRAT         OBAMA, BARACK H.
      REPUBLICAN           MCCAIN, JOHN
2012  DEMOCRAT         OBAMA, BARACK H.
      REPUBLICAN           ROMNEY, MITT
2016  DEMOCRAT         CLINTON, HILLARY
      REPUBLICAN       TRUMP, DONALD J.
2020  DEMOCRAT      BIDEN, JOSEPH R. JR
      REPUBLICAN       TRUMP, DONALD J.
Name: candidate, dtype:

In [5]:
# The names are kept in `candidates`; remove the column from the working frame.
presid = presid.drop(columns=['candidate'])

Transforming the DataFrame to leave one row for each year-state combination and put Democratic and Republican votes in separate columns. This is achieved by splitting the dataset into two parts, renaming the 'candidatevotes' column, dropping 'party' column and merging them back together.

In [6]:
# Split the long table by party, give each party's vote count its own column
# name, and join the halves back into one row per year/state.
merge_keys = ['year', 'state', 'state_po', 'fips', 'totalvotes']
dem_df = (
    presid[presid.party == 'DEMOCRAT']
    .rename(columns={'candidatevotes': 'd_votes'})
    .drop(columns=['party'])
)
rep_df = (
    presid[presid.party == 'REPUBLICAN']
    .rename(columns={'candidatevotes': 'r_votes'})
    .drop(columns=['party'])
)
# validate="one_to_one" makes pandas raise MergeError if either side has a
# duplicated key, instead of silently multiplying rows in the result.
presid = pd.merge(dem_df, rep_df, on=merge_keys, how="outer", validate="one_to_one")

presid.head()

Unnamed: 0,year,state,state_po,fips,d_votes,totalvotes,r_votes
0,1976,ALABAMA,AL,1,659170,1182850,504070
1,1976,ALASKA,AK,2,44058,123574,71555
2,1976,ARIZONA,AZ,4,295602,742719,418642
3,1976,ARKANSAS,AR,5,498604,767535,267903
4,1976,CALIFORNIA,CA,6,3742284,7803770,3882244


In [7]:
presid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        612 non-null    int64 
 1   state       612 non-null    object
 2   state_po    612 non-null    object
 3   fips        612 non-null    int64 
 4   d_votes     612 non-null    int64 
 5   totalvotes  612 non-null    int64 
 6   r_votes     612 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 33.6+ KB


Now, since numbers of votes are different state-by-state and in time, it's better to analyse them comparatively. Here, new values are created by calculating proportions of votes and differences between them for each year and state. By convention, Democratic votes will go first, so if the value of 'dr_percent' is positive, then the Democratic candidate won the election in that exact state and year, otherwise, the Republican candidate won.

In [8]:
# Vote shares in percent of the state's total votes, then the
# Democratic-minus-Republican margin.
for party in ('d', 'r'):
    presid[f'{party}_percent'] = presid[f'{party}_votes'].div(presid['totalvotes']).mul(100)
presid['dr_percent'] = presid['d_percent'].sub(presid['r_percent'])

presid.head()

Unnamed: 0,year,state,state_po,fips,d_votes,totalvotes,r_votes,d_percent,r_percent,dr_percent
0,1976,ALABAMA,AL,1,659170,1182850,504070,55.727269,42.614871,13.112398
1,1976,ALASKA,AK,2,44058,123574,71555,35.653131,57.904575,-22.251444
2,1976,ARIZONA,AZ,4,295602,742719,418642,39.799978,56.366136,-16.566158
3,1976,ARKANSAS,AR,5,498604,767535,267903,64.961728,34.904337,30.057392
4,1976,CALIFORNIA,CA,6,3742284,7803770,3882244,47.954822,49.748314,-1.793492


Adding a column encoding this : 'winner'=1 means the state was won by Democrats, 'winner'=0 means the state was won by Republicans.

In [9]:
# 1 when the margin is strictly positive (Democratic win), 0 otherwise.
presid['winner'] = presid['dr_percent'].gt(0).astype(int)
presid.head()

Unnamed: 0,year,state,state_po,fips,d_votes,totalvotes,r_votes,d_percent,r_percent,dr_percent,winner
0,1976,ALABAMA,AL,1,659170,1182850,504070,55.727269,42.614871,13.112398,1
1,1976,ALASKA,AK,2,44058,123574,71555,35.653131,57.904575,-22.251444,0
2,1976,ARIZONA,AZ,4,295602,742719,418642,39.799978,56.366136,-16.566158,0
3,1976,ARKANSAS,AR,5,498604,767535,267903,64.961728,34.904337,30.057392,1
4,1976,CALIFORNIA,CA,6,3742284,7803770,3882244,47.954822,49.748314,-1.793492,0


Adding a new column that shows the value of 'dr_percent' in the same state in the previous election. The values for 1976 are meaningless, since the data for 1972 is not available here; they are filled with a placeholder value of 100.

In [10]:
# Previous-election margin per state: order the rows by year, then shift within
# each state so every row receives that state's margin from the preceding
# election. Grouping by state (instead of shifting the raw array by 51
# positions) stays correct even if the frame is not laid out as exactly 51
# identically ordered rows per year. The result is aligned back on the index.
# A state's first election has no predecessor and keeps the placeholder 100.
presid['prev_dr_percent'] = (
    presid.sort_values('year', kind='stable')
    .groupby('state')['dr_percent']
    .shift(1)
    .fillna(100.0)
)

Checking the correctness of the 'prev_dr_percent' value in every row from year 1980 onwards.

In [11]:
# Verify 'prev_dr_percent' from 1980 onwards: each (year, state) row must hold
# exactly the 'dr_percent' recorded for the same state four years earlier.
# Both columns are indexed by (year, state) once, so every comparison is a
# direct lookup rather than a filter over the whole frame, and all() stops at
# the first mismatch (the previous `break` only left the inner loop).
margin_by_key = presid.set_index(['year', 'state'])['dr_percent']
prev_margin_by_key = presid.set_index(['year', 'state'])['prev_dr_percent']
states = presid['state'].unique()

good = all(
    prev_margin_by_key.loc[(year, state)] == margin_by_key.loc[(year - 4, state)]
    for year in range(1980, 2021, 4)
    for state in states
)

print(good)

True


The compared values have been equal every time, therefore, they are correct.

Adding the parameter showing the swing: how have the results in the state changed since the previous election. Again, if this parameter is positive, then the difference between the vote percentages has improved for Democrats, otherwise it has improved for Republicans.

In [12]:
# Swing = current margin minus the previous election's margin in the same state.
presid['dr_swing'] = presid['dr_percent'].sub(presid['prev_dr_percent'])
presid.tail()

Unnamed: 0,year,state,state_po,fips,d_votes,totalvotes,r_votes,d_percent,r_percent,dr_percent,winner,prev_dr_percent,dr_swing
607,2020,VIRGINIA,VA,51,2413568,4460524,1962430,54.109517,43.995504,10.114014,1,5.323706,4.790308
608,2020,WASHINGTON,WA,53,2369612,4087631,1584651,57.970301,38.766978,19.203323,1,15.706,3.497324
609,2020,WEST VIRGINIA,WV,54,235984,794652,545382,29.696521,68.631552,-38.935031,0,-42.153647,3.218616
610,2020,WISCONSIN,WI,55,1630866,3298041,1610184,49.449537,48.822437,0.6271,1,-0.764343,1.391443
611,2020,WYOMING,WY,56,73491,278503,193559,26.387867,69.49979,-43.111923,0,-45.76951,2.657587


Adding a column encoding this : 'swing'=1 means the state swung towards Democrats, 'swing'=0 means the state swung towards Republicans.

In [13]:
# 1 when the swing is strictly positive (towards Democrats), 0 otherwise.
presid['swing'] = presid['dr_swing'].gt(0).astype(int)
presid.tail()

Unnamed: 0,year,state,state_po,fips,d_votes,totalvotes,r_votes,d_percent,r_percent,dr_percent,winner,prev_dr_percent,dr_swing,swing
607,2020,VIRGINIA,VA,51,2413568,4460524,1962430,54.109517,43.995504,10.114014,1,5.323706,4.790308,1
608,2020,WASHINGTON,WA,53,2369612,4087631,1584651,57.970301,38.766978,19.203323,1,15.706,3.497324,1
609,2020,WEST VIRGINIA,WV,54,235984,794652,545382,29.696521,68.631552,-38.935031,0,-42.153647,3.218616,1
610,2020,WISCONSIN,WI,55,1630866,3298041,1610184,49.449537,48.822437,0.6271,1,-0.764343,1.391443,1
611,2020,WYOMING,WY,56,73491,278503,193559,26.387867,69.49979,-43.111923,0,-45.76951,2.657587,1


Here it can be seen that, although the overall distribution of 'swing' values is fairly balanced, in every election most states swing towards the same party, usually the party of the winner of the election. This spells trouble for the potential of predicting this parameter.

In [14]:
presid[presid.year!=1976].groupby('year')['swing'].value_counts()

year  swing
1980  0        50
      1         1
1984  0        43
      1         8
1988  1        49
      0         2
1992  1        50
      0         1
1996  1        30
      0        21
2000  0        49
      1         2
2004  0        33
      1        18
2008  1        46
      0         5
2012  0        45
      1         6
2016  0        38
      1        13
2020  1        44
      0         7
Name: count, dtype: int64

In [15]:
presid[presid.year!=1976]['swing'].value_counts()

swing
0    294
1    267
Name: count, dtype: int64

Calculating the national vote totals for every election in a new DataFrame.

In [16]:
# Sum the state-level vote counts per election year; as_index=False keeps
# 'year' as an ordinary column on a fresh default index.
presid_national = presid.groupby('year', as_index=False)[['d_votes', 'r_votes', 'totalvotes']].sum()
presid_national

Unnamed: 0,year,d_votes,r_votes,totalvotes
0,1976,40825839,39145771,81601344
1,1980,35480948,43898770,86496851
2,1984,37568137,54455073,92654861
3,1988,41809074,48886097,91586825
4,1992,44954303,39103872,104599780
5,1996,47401898,39198482,96389818
6,2000,50996062,50456169,105593982
7,2004,59028079,62028285,122349450
8,2008,69498459,59948283,131419253
9,2012,65899660,60932152,129139997


Note that the national vote totals calculated here do not match the final official results for 2020, presumably due to inaccuracies in the MIT dataset. For example, the official number of total votes cast is 158,429,631. However, for the purposes of this project, this difference of 0.1% is negligible.

Adding the same relative features as in the original dataset.

In [17]:
# National vote shares in percent and the Democratic-minus-Republican margin.
for party in ('d', 'r'):
    presid_national[f'{party}_percent'] = (
        presid_national[f'{party}_votes'].div(presid_national['totalvotes']).mul(100)
    )
presid_national['dr_percent'] = presid_national['d_percent'].sub(presid_national['r_percent'])
# Margin of the preceding election; the first row has none and gets the
# placeholder 100.
presid_national['prev_dr_percent'] = presid_national['dr_percent'].shift(1, fill_value=100)
presid_national['dr_swing'] = presid_national['dr_percent'].sub(presid_national['prev_dr_percent'])

presid_national

Unnamed: 0,year,d_votes,r_votes,totalvotes,d_percent,r_percent,dr_percent,prev_dr_percent,dr_swing
0,1976,40825839,39145771,81601344,50.030841,47.971968,2.058873,100.0,-97.941127
1,1980,35480948,43898770,86496851,41.01993,50.751871,-9.73194,2.058873,-11.790813
2,1984,37568137,54455073,92654861,40.546321,58.771955,-18.225634,-9.73194,-8.493694
3,1988,41809074,48886097,91586825,45.64966,53.376779,-7.727119,-18.225634,10.498515
4,1992,44954303,39103872,104599780,42.977436,37.384277,5.593158,-7.727119,13.320277
5,1996,47401898,39198482,96389818,49.177288,40.666621,8.510667,5.593158,2.917509
6,2000,50996062,50456169,105593982,48.294478,47.783186,0.511291,8.510667,-7.999375
7,2004,59028079,62028285,122349450,48.24548,50.697641,-2.452161,0.511291,-2.963453
8,2008,69498459,59948283,131419253,52.883012,45.616058,7.266953,-2.452161,9.719115
9,2012,65899660,60932152,129139997,51.029628,47.183021,3.846607,7.266953,-3.420347


Adding a column encoding the winner of national popular vote : 'pv_winner'=1 means the national popular vote was won by Democrats, 'pv_winner'=0 means the national popular vote was won by Republicans.

In [18]:
# 1 when the national margin is strictly positive (Democrats won the popular
# vote), 0 otherwise.
presid_national['pv_winner'] = presid_national['dr_percent'].gt(0).astype(int)

presid_national

Unnamed: 0,year,d_votes,r_votes,totalvotes,d_percent,r_percent,dr_percent,prev_dr_percent,dr_swing,pv_winner
0,1976,40825839,39145771,81601344,50.030841,47.971968,2.058873,100.0,-97.941127,1
1,1980,35480948,43898770,86496851,41.01993,50.751871,-9.73194,2.058873,-11.790813,0
2,1984,37568137,54455073,92654861,40.546321,58.771955,-18.225634,-9.73194,-8.493694,0
3,1988,41809074,48886097,91586825,45.64966,53.376779,-7.727119,-18.225634,10.498515,0
4,1992,44954303,39103872,104599780,42.977436,37.384277,5.593158,-7.727119,13.320277,1
5,1996,47401898,39198482,96389818,49.177288,40.666621,8.510667,5.593158,2.917509,1
6,2000,50996062,50456169,105593982,48.294478,47.783186,0.511291,8.510667,-7.999375,1
7,2004,59028079,62028285,122349450,48.24548,50.697641,-2.452161,0.511291,-2.963453,0
8,2008,69498459,59948283,131419253,52.883012,45.616058,7.266953,-2.452161,9.719115,1
9,2012,65899660,60932152,129139997,51.029628,47.183021,3.846607,7.266953,-3.420347,1


Dropping the columns that won't be needed anymore, and renaming the remaining columns to explicitly show that this is national data.

Also, adding a column encoding the winner of the national election : 'el_winner'=1 means the national election was won by Democrats, 'el_winner'=0 means the national election was won by Republicans. In 2000 and 2016, Republican candidates won the national election despite losing the popular vote. In all other elections, the winner of the national popular vote actually won the national election. 

In [19]:
# Keep only the national columns still needed and rename them so they remain
# distinguishable after merging with the state-level data. Chaining rename()
# returns a new frame, so the assignments below never write into a slice.
presid_national = (
    presid_national[['year', 'dr_percent', 'dr_swing', 'pv_winner']]
    .rename(columns={'dr_percent': 'dr_percent_national',
                     'dr_swing': 'dr_swing_national'})
)
# The election winner equals the popular-vote winner, except in 2000 and 2016,
# where Republicans won the election despite losing the popular vote.
# Rows are selected by year instead of hard-coded row/column positions.
presid_national['el_winner'] = presid_national['pv_winner']
presid_national.loc[presid_national['year'].isin([2000, 2016]), 'el_winner'] = 0
presid_national

Unnamed: 0,year,dr_percent_national,dr_swing_national,pv_winner,el_winner
0,1976,2.058873,-97.941127,1,1
1,1980,-9.73194,-11.790813,0,0
2,1984,-18.225634,-8.493694,0,0
3,1988,-7.727119,10.498515,0,0
4,1992,5.593158,13.320277,1,1
5,1996,8.510667,2.917509,1,1
6,2000,0.511291,-7.999375,1,0
7,2004,-2.452161,-2.963453,0,0
8,2008,7.266953,9.719115,1,1
9,2012,3.846607,-3.420347,1,1


Adding a few binary features encoding important information about a presidential election: whether an incumbent is running for reelection and which party won the previous two presidential elections.

In [20]:
# Election winner one and two elections back. Rows without a predecessor in
# this table are filled with 0, i.e. encoded as a Republican win.
el_winner_by_row = presid_national['el_winner'].to_numpy()
presid_national['d_el_winner_4y_ago'] = np.append(0, el_winner_by_row[:-1])
presid_national['r_el_winner_4y_ago'] = 1 - presid_national['d_el_winner_4y_ago']
presid_national['d_el_winner_8y_ago'] = np.append([0, 0], el_winner_by_row[:-2])
presid_national['r_el_winner_8y_ago'] = 1 - presid_national['d_el_winner_8y_ago']

# 1 = an incumbent president was running in that election. The values are keyed
# by year, so they cannot shift against the rows if the table's row order or
# length ever changes.
incumbent_running_by_year = {
    1976: 1, 1980: 1, 1984: 1, 1988: 0,
    1992: 1, 1996: 1, 2000: 0, 2004: 1,
    2008: 0, 2012: 1, 2016: 0, 2020: 1,
}
presid_national['incumbent_running'] = presid_national['year'].map(incumbent_running_by_year)
presid_national['incumbent_not_running'] = 1 - presid_national['incumbent_running']

presid_national

Unnamed: 0,year,dr_percent_national,dr_swing_national,pv_winner,el_winner,d_el_winner_4y_ago,r_el_winner_4y_ago,d_el_winner_8y_ago,r_el_winner_8y_ago,incumbent_running,incumbent_not_running
0,1976,2.058873,-97.941127,1,1,0,1,0,1,1,0
1,1980,-9.73194,-11.790813,0,0,1,0,0,1,1,0
2,1984,-18.225634,-8.493694,0,0,0,1,1,0,1,0
3,1988,-7.727119,10.498515,0,0,0,1,0,1,0,1
4,1992,5.593158,13.320277,1,1,0,1,0,1,1,0
5,1996,8.510667,2.917509,1,1,1,0,0,1,1,0
6,2000,0.511291,-7.999375,1,0,1,0,1,0,0,1
7,2004,-2.452161,-2.963453,0,0,0,1,1,0,1,0
8,2008,7.266953,9.719115,1,1,0,1,0,1,0,1
9,2012,3.846607,-3.420347,1,1,1,0,0,1,1,0


Adding the national values to the original dataset.

In [21]:
presid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             612 non-null    int64  
 1   state            612 non-null    object 
 2   state_po         612 non-null    object 
 3   fips             612 non-null    int64  
 4   d_votes          612 non-null    int64  
 5   totalvotes       612 non-null    int64  
 6   r_votes          612 non-null    int64  
 7   d_percent        612 non-null    float64
 8   r_percent        612 non-null    float64
 9   dr_percent       612 non-null    float64
 10  winner           612 non-null    int32  
 11  prev_dr_percent  612 non-null    float64
 12  dr_swing         612 non-null    float64
 13  swing            612 non-null    int32  
dtypes: float64(5), int32(2), int64(5), object(2)
memory usage: 62.3+ KB


In [22]:
# Attach the national features of the matching election year to every state row.
presid_final = presid.merge(presid_national, how="outer", on='year')

In [23]:
# Raw counts, single-party shares and the helper lag column are no longer needed.
raw_columns = ['d_votes', 'r_votes', 'd_percent', 'r_percent', 'totalvotes', 'prev_dr_percent']
presid_final = presid_final.drop(columns=raw_columns)

In [24]:
presid_final

Unnamed: 0,year,state,state_po,fips,dr_percent,winner,dr_swing,swing,dr_percent_national,dr_swing_national,pv_winner,el_winner,d_el_winner_4y_ago,r_el_winner_4y_ago,d_el_winner_8y_ago,r_el_winner_8y_ago,incumbent_running,incumbent_not_running
0,1976,ALABAMA,AL,1,13.112398,1,-86.887602,0,2.058873,-97.941127,1,1,0,1,0,1,1,0
1,1976,ALASKA,AK,2,-22.251444,0,-122.251444,0,2.058873,-97.941127,1,1,0,1,0,1,1,0
2,1976,ARIZONA,AZ,4,-16.566158,0,-116.566158,0,2.058873,-97.941127,1,1,0,1,0,1,1,0
3,1976,ARKANSAS,AR,5,30.057392,1,-69.942608,0,2.058873,-97.941127,1,1,0,1,0,1,1,0
4,1976,CALIFORNIA,CA,6,-1.793492,0,-101.793492,0,2.058873,-97.941127,1,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,2020,VIRGINIA,VA,51,10.114014,1,4.790308,1,4.448892,2.351825,1,1,0,1,1,0,1,0
608,2020,WASHINGTON,WA,53,19.203323,1,3.497324,1,4.448892,2.351825,1,1,0,1,1,0,1,0
609,2020,WEST VIRGINIA,WV,54,-38.935031,0,3.218616,1,4.448892,2.351825,1,1,0,1,1,0,1,0
610,2020,WISCONSIN,WI,55,0.627100,1,1.391443,1,4.448892,2.351825,1,1,0,1,1,0,1,0


In [25]:
presid_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   612 non-null    int64  
 1   state                  612 non-null    object 
 2   state_po               612 non-null    object 
 3   fips                   612 non-null    int64  
 4   dr_percent             612 non-null    float64
 5   winner                 612 non-null    int32  
 6   dr_swing               612 non-null    float64
 7   swing                  612 non-null    int32  
 8   dr_percent_national    612 non-null    float64
 9   dr_swing_national      612 non-null    float64
 10  pv_winner              612 non-null    int32  
 11  el_winner              612 non-null    int32  
 12  d_el_winner_4y_ago     612 non-null    int32  
 13  r_el_winner_4y_ago     612 non-null    int32  
 14  d_el_winner_8y_ago     612 non-null    int32  
 15  r_el_w

In [26]:
presid_final.groupby('year')['el_winner'].value_counts()

year  el_winner
1976  1            51
1980  0            51
1984  0            51
1988  0            51
1992  1            51
1996  1            51
2000  0            51
2004  0            51
2008  1            51
2012  1            51
2016  0            51
2020  1            51
Name: count, dtype: int64

Adding a column encoding the relationship between state vote margin and national vote margin: 'percent_vs_national'=1 means state vote margin is better for Democrats than national vote margin (i.e. the state was won by Democrats with a margin higher than national, or the state was won by Democrats while the national popular vote was won by Republicans, or the state was won by Republicans with a margin lower than national), 'percent_vs_national'=0 means state vote margin is better for Republicans than national vote margin.

In [27]:
# 1 when the state margin is strictly greater than the national margin; the two
# source columns are removed afterwards.
presid_final['percent_vs_national'] = (
    presid_final['dr_percent'].gt(presid_final['dr_percent_national']).astype(int)
)
presid_final = presid_final.drop(columns=['dr_percent', 'dr_percent_national'])

Adding a column encoding the relationship between state-by-state swing and national swing: 'swing_vs_national'=1 means state-by-state swing is better for Democrats than national swing (i.e. the swing towards Democrats in the state is higher than national, or the state swung towards Democrats while the nation swung towards Republicans, or the swing towards Republicans in the state is lower than national), 'swing_vs_national'=0 means state-by-state swing is better for Republicans than national swing.

In [28]:
# 1 when the state swing is strictly greater than the national swing; the two
# source columns are removed afterwards.
presid_final['swing_vs_national'] = (
    presid_final['dr_swing'].gt(presid_final['dr_swing_national']).astype(int)
)
presid_final = presid_final.drop(columns=['dr_swing', 'dr_swing_national'])

Adding a column encoding the difference between winner of the election in the state and winner of the national election: 'winner_vs_national_winner'=1 means the winners are different, 'winner_vs_national_winner'=0 means the winner is the same.

In [29]:
# 1 when the state winner differs from the national election winner.
presid_final['winner_vs_national_winner'] = (
    presid_final['winner'].ne(presid_final['el_winner']).astype(int)
)
presid_final = presid_final.drop(columns=['el_winner'])

Adding a column encoding the difference between winner of the election in the state and winner of the national popular vote: 'winner_vs_national_pv_winner'=1 means the winners are different, 'winner_vs_national_pv_winner'=0 means the winner is the same.

In [30]:
# 1 when the state winner differs from the national popular-vote winner.
presid_final['winner_vs_national_pv_winner'] = (
    presid_final['winner'].ne(presid_final['pv_winner']).astype(int)
)
presid_final = presid_final.drop(columns=['pv_winner'])

Dropping the data for 1976, which won't be used, since the swing features require knowledge of previous election results.

In [31]:
# Keep every election except 1976, whose swing features are placeholders.
presid_final = presid_final.loc[presid_final['year'].ne(1976)]

In [32]:
presid_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 561 entries, 51 to 611
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   year                          561 non-null    int64 
 1   state                         561 non-null    object
 2   state_po                      561 non-null    object
 3   fips                          561 non-null    int64 
 4   winner                        561 non-null    int32 
 5   swing                         561 non-null    int32 
 6   d_el_winner_4y_ago            561 non-null    int32 
 7   r_el_winner_4y_ago            561 non-null    int32 
 8   d_el_winner_8y_ago            561 non-null    int32 
 9   r_el_winner_8y_ago            561 non-null    int32 
 10  incumbent_running             561 non-null    int64 
 11  incumbent_not_running         561 non-null    int64 
 12  percent_vs_national           561 non-null    int32 
 13  swing_vs_national       

## ECONOMIC DATA

Reading the sainc.csv file, containing all the economic data from the sainc30, sainc35, sainc4 and sainc50 ipynb files.

In [33]:
# Load the combined sainc table. 'fips' shows up as object dtype in the summary
# below, presumably because of the repeated header rows that a later cell
# filters out (rows where fips == 'fips').
sainc=pd.read_csv('data/sainc.csv')

sainc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5411 entries, 0 to 5410
Data columns (total 50 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fips       5411 non-null   object 
 1   state      5411 non-null   object 
 2   TableName  5411 non-null   object 
 3   code       5411 non-null   object 
 4   features   5411 non-null   object 
 5   1976       5411 non-null   float64
 6   1977       5411 non-null   float64
 7   1978       5411 non-null   float64
 8   1979       5411 non-null   float64
 9   1980       5411 non-null   float64
 10  1981       5411 non-null   float64
 11  1982       5411 non-null   float64
 12  1983       5411 non-null   float64
 13  1984       5411 non-null   float64
 14  1985       5411 non-null   float64
 15  1986       5411 non-null   float64
 16  1987       5411 non-null   float64
 17  1988       5411 non-null   float64
 18  1989       5411 non-null   float64
 19  1990       5411 non-null   float64
 20  1991    

Only the data for the election years and the years right before an election is needed. Dropping the data for the other years.

In [34]:
# Years to discard: the first and second year after each election year
# (1977/1978, 1981/1982, ..., 2017/2018), in ascending order.
first_year_after_election = np.arange(1977, 2021, 4)
cols_to_drop = np.sort(
    np.concatenate((first_year_after_election, first_year_after_election + 1))
)
cols_to_drop

array([1977, 1978, 1981, 1982, 1985, 1986, 1989, 1990, 1993, 1994, 1997,
       1998, 2001, 2002, 2005, 2006, 2009, 2010, 2013, 2014, 2017, 2018])

In [35]:
# Column labels are strings, so the year numbers are converted before dropping.
sainc = sainc.drop(columns=list(map(str, cols_to_drop)))

In [36]:
sainc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5411 entries, 0 to 5410
Data columns (total 28 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fips       5411 non-null   object 
 1   state      5411 non-null   object 
 2   TableName  5411 non-null   object 
 3   code       5411 non-null   object 
 4   features   5411 non-null   object 
 5   1976       5411 non-null   float64
 6   1979       5411 non-null   float64
 7   1980       5411 non-null   float64
 8   1983       5411 non-null   float64
 9   1984       5411 non-null   float64
 10  1987       5411 non-null   float64
 11  1988       5411 non-null   float64
 12  1991       5411 non-null   float64
 13  1992       5411 non-null   float64
 14  1995       5411 non-null   float64
 15  1996       5411 non-null   float64
 16  1999       5411 non-null   float64
 17  2000       5411 non-null   float64
 18  2003       5411 non-null   float64
 19  2004       5411 non-null   float64
 20  2007    

Removing the extra header rows, which are in the dataset because it was created from multiple notebooks, and removing unnecessary stars from state names.

In [37]:
sainc[['fips','state']].drop_duplicates()

Unnamed: 0,fips,state
0,0,United States
38,1,Alabama
76,2,Alaska
114,4,Arizona
152,5,Arkansas
190,6,California
228,8,Colorado
266,9,Connecticut
304,10,Delaware
342,11,District of Columbia


Removing prefixes from feature names.

In [38]:
# Discard the repeated header rows (where the 'fips' cell holds the text 'fips'),
# then store the remaining codes as integers.
sainc = sainc.loc[sainc['fips'].ne('fips')].astype({'fips': 'int'})

In [39]:
def remove_star(x):
    """Return a state name without surrounding whitespace or a trailing '*'.

    Parameters
    ----------
    x : str
        Raw state name, e.g. ``'Alaska *'``.

    Returns
    -------
    str
        The name with leading/trailing whitespace removed, any trailing
        asterisks removed, and the whitespace that preceded them removed.
        Names without a star, and empty strings, come back merely stripped.
    """
    # Strip the marker itself rather than a fixed two characters, so a star
    # written without a preceding space ('Alaska*') does not cost the name its
    # last letter, and an empty string no longer raises IndexError.
    return x.strip().rstrip('*').rstrip()

In [40]:
# Clean every state name element by element.
sainc['state'] = sainc['state'].map(remove_star)

In [41]:
sainc[['TableName','code','features']].drop_duplicates()

Unnamed: 0,TableName,code,features
0,SAINC30,10,"Personal income (millions of dollars), Million..."
1,SAINC30,45,"Net earnings by place of residence, Millions o..."
2,SAINC30,50,"Personal current transfer receipts, Millions o..."
3,SAINC30,60,"Income maintenance benefits, Millions of dollars"
4,SAINC30,70,"Unemployment insurance compensation, Millions ..."
...,...,...,...
4845,SAINC50,70,"Federal government, Thousands of dollars"
4846,SAINC50,80,"Income taxes (net of refunds), Thousands of do..."
4847,SAINC50,81,"Income taxes (gross), Thousands of dollars"
4848,SAINC50,82,"Less: Refunds, Thousands of dollars"


In [42]:
def remove_prefix(x):
    """Strip a leading 'Equals: ', 'Less: ' or 'Plus: ' label from a feature name.

    Only the exact label (word, colon, single space) is removed. A name that
    merely begins with the same letters, such as 'Lessors ...', is returned
    unchanged, as is any name without one of these labels.
    """
    for prefix in ('Equals: ', 'Less: ', 'Plus: '):
        if x.startswith(prefix):
            return x[len(prefix):]
    return x

In [43]:
# Clean every feature name element-wise with remove_prefix.
sainc['features']=sainc['features'].map(remove_prefix)

Showing all feature names and which initial dataset(s) each of them comes from. There are a few duplicates.

In [44]:
# For each feature name, collect the distinct source tables it occurs in.
grouped=sainc.groupby('features')['TableName'].unique()
# Lift the row limit for this print only, so the listing is not truncated.
with pd.option_context('display.max_rows', None):
    print(grouped)

features
Average earnings per job (dollars), Dollars                                                              [SAINC30]
Average nonfarm proprietors' income, Dollars                                                             [SAINC30]
Average wages and salaries, Dollars                                                                      [SAINC30]
Contributions for government social insurance, Millions of dollars                                        [SAINC4]
Current transfer receipts of individuals from businesses , Thousands of dollars                          [SAINC35]
Current transfer receipts of individuals from governments, Thousands of dollars                          [SAINC35]
Current transfer receipts of nonprofit institutions, Thousands of dollars                                [SAINC35]
Disposable personal income, Thousands of dollars                                                         [SAINC50]
Dividends, interest, and rent, Millions of dollars                     

In [45]:
# Summarise sainc (row count, columns, non-null counts, dtypes) after the name clean-up.
sainc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5408 entries, 0 to 5410
Data columns (total 28 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fips       5408 non-null   int32  
 1   state      5408 non-null   object 
 2   TableName  5408 non-null   object 
 3   code       5408 non-null   object 
 4   features   5408 non-null   object 
 5   1976       5408 non-null   float64
 6   1979       5408 non-null   float64
 7   1980       5408 non-null   float64
 8   1983       5408 non-null   float64
 9   1984       5408 non-null   float64
 10  1987       5408 non-null   float64
 11  1988       5408 non-null   float64
 12  1991       5408 non-null   float64
 13  1992       5408 non-null   float64
 14  1995       5408 non-null   float64
 15  1996       5408 non-null   float64
 16  1999       5408 non-null   float64
 17  2000       5408 non-null   float64
 18  2003       5408 non-null   float64
 19  2004       5408 non-null   float64
 20  2007       54

Deleting the exact duplicates originating from the SAINC4 dataset: of the SAINC4 rows, only the three features listed in the cell below are kept.

In [46]:
# SAINC4 features that are retained; every other SAINC4 row is dropped.
sainc4_features_to_keep=[
    'Contributions for government social insurance, Millions of dollars',
    'Employee and self-employed contributions for government social insurance, Millions of dollars',
    'Nonfarm personal income, Millions of dollars',
]
from_sainc4=sainc['TableName'].eq('SAINC4')
sainc=sainc[~from_sainc4 | sainc['features'].isin(sainc4_features_to_keep)]

Deleting the features that already exist with different units.

In [47]:
# Feature names to remove; the strings must match the names in sainc exactly
# (including the stray space before the comma in the first entry).
other_unit_features=[
    'Earned Income Tax Credit (EITC) , Thousands of dollars',
    'Income maintenance benefits, Thousands of dollars',
    'Per capita personal income (dollars), Dollars',
    'Personal current transfer receipts (thousands of dollars), Thousands of dollars',
    'Personal income (thousands of dollars), Thousands of dollars',
    'Unemployment insurance compensation, Thousands of dollars',
]
is_other_unit=sainc['features'].isin(other_unit_features)
sainc=sainc.loc[~is_other_unit]

Deleting the features that are already calculated per capita.

In [48]:
# Absolute-value features to remove; the strings must match the names in sainc exactly.
absolute_features_to_drop=[
    'Disposable personal income, Thousands of dollars',
    'Dividends, interest, and rent, Millions of dollars',
    'Income maintenance benefits, Millions of dollars',
    'Net earnings by place of residence, Millions of dollars',
    'Personal current transfer receipts, Millions of dollars',
    'Personal income (millions of dollars), Millions of dollars',
    'Rental income of persons, Millions of dollars',
    'Retirement and other, Millions of dollars',
    'Unemployment insurance compensation, Millions of dollars',
    "Nonfarm proprietors' income, Millions of dollars",
    'Wages and salaries, Millions of dollars',
    'Earnings by place of work, Millions of dollars',
]
is_dropped_absolute=sainc['features'].isin(absolute_features_to_drop)
sainc=sainc.loc[~is_dropped_absolute]

In [49]:
# Summarise sainc again to see how many rows remain after the three filters.
sainc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3640 entries, 7 to 5410
Data columns (total 28 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fips       3640 non-null   int32  
 1   state      3640 non-null   object 
 2   TableName  3640 non-null   object 
 3   code       3640 non-null   object 
 4   features   3640 non-null   object 
 5   1976       3640 non-null   float64
 6   1979       3640 non-null   float64
 7   1980       3640 non-null   float64
 8   1983       3640 non-null   float64
 9   1984       3640 non-null   float64
 10  1987       3640 non-null   float64
 11  1988       3640 non-null   float64
 12  1991       3640 non-null   float64
 13  1992       3640 non-null   float64
 14  1995       3640 non-null   float64
 15  1996       3640 non-null   float64
 16  1999       3640 non-null   float64
 17  2000       3640 non-null   float64
 18  2003       3640 non-null   float64
 19  2004       3640 non-null   float64
 20  2007       36

In [50]:
# Recompute the feature -> source-table listing on the filtered data.
grouped=sainc.groupby('features')['TableName'].unique()
# The row limit is lifted for this print only, so all features are listed.
with pd.option_context('display.max_rows', None):
    print(grouped)

features
Average earnings per job (dollars), Dollars                                                      [SAINC30]
Average nonfarm proprietors' income, Dollars                                                     [SAINC30]
Average wages and salaries, Dollars                                                              [SAINC30]
Contributions for government social insurance, Millions of dollars                                [SAINC4]
Current transfer receipts of individuals from businesses , Thousands of dollars                  [SAINC35]
Current transfer receipts of individuals from governments, Thousands of dollars                  [SAINC35]
Current transfer receipts of nonprofit institutions, Thousands of dollars                        [SAINC35]
Earned Income Tax Credit (EITC), Thousands of dollars                                            [SAINC35]
Education and training assistance , Thousands of dollars                                         [SAINC35]
Employee and self-employed c

In [51]:
# Number of distinct feature names left in sainc.
len(sainc['features'].unique())

69

In the end, we have no more duplicates and 69 economic features. This amount was achieved unintentionally.

Dropping the columns showing the original source of data.

In [52]:
# Remove the two provenance columns by rebinding sainc to the reduced frame.
sainc=sainc.drop(columns=['TableName','code'])

Showing how many rows of the DataFrame correspond to each feature. The correct amount is 52: 50 states + D.C. + national data.

In [53]:
# Count how many rows each feature name has, then print the full listing.
rows_per_feature=sainc['features'].value_counts()
with pd.option_context('display.max_rows', None):
    print(rows_per_feature)

features
Personal current taxes, Thousands of dollars                                                     104
Personal dividend income, Millions of dollars                                                     52
Excluding family assistance, Thousands of dollars                                                 52
Veterans' pension and disability benefits, Thousands of dollars                                   52
Veterans' benefits, Thousands of dollars                                                          52
Unemployment compensation for veterans (UCX), Thousands of dollars                                52
Unemployment compensation for Federal civilian employees (UCFE), Thousands of dollars             52
Excluding state unemployment insurance compensation, Thousands of dollars                         52
State unemployment insurance compensation, Thousands of dollars                                   52
Family assistance, Thousands of dollars                                           

In [54]:
# Keep only the first occurrence of each fully identical row.
sainc=sainc.drop_duplicates()

After dropping the duplicate rows, the number of rows is correct, since 52*69=3588.

In [55]:
# Row count for the feature that previously appeared 104 times.
len(sainc[sainc['features']=='Personal current taxes, Thousands of dollars'])

52

In [56]:
# Summarise sainc after dropping the source columns and the duplicate rows.
sainc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3588 entries, 7 to 5410
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   fips      3588 non-null   int32  
 1   state     3588 non-null   object 
 2   features  3588 non-null   object 
 3   1976      3588 non-null   float64
 4   1979      3588 non-null   float64
 5   1980      3588 non-null   float64
 6   1983      3588 non-null   float64
 7   1984      3588 non-null   float64
 8   1987      3588 non-null   float64
 9   1988      3588 non-null   float64
 10  1991      3588 non-null   float64
 11  1992      3588 non-null   float64
 12  1995      3588 non-null   float64
 13  1996      3588 non-null   float64
 14  1999      3588 non-null   float64
 15  2000      3588 non-null   float64
 16  2003      3588 non-null   float64
 17  2004      3588 non-null   float64
 18  2007      3588 non-null   float64
 19  2008      3588 non-null   float64
 20  2011      3588 non-null   float64
 

Some features in the dataset are calculated per capita, while others are calculated as an absolute value. Here, per capita features are identified.

In [57]:
def is_per_capita(feature_name):
    """Tell whether a feature name marks an already normalised quantity.

    A name counts as per capita when it begins with 'Per ' (note the trailing
    space) or with 'Average'; every other name returns False.
    """
    return feature_name.startswith(('Per ', 'Average'))

In [58]:
# Distinct feature names as a Series, so the predicate can be applied element-wise.
ser=pd.Series(sainc.features.unique())
# Keep the names flagged by is_per_capita, as an alphabetically sorted list.
per_capita_features=sorted(ser[ser.apply(is_per_capita)])
per_capita_features

['Average earnings per job (dollars), Dollars',
 "Average nonfarm proprietors' income, Dollars",
 'Average wages and salaries, Dollars',
 'Per capita disposable personal income (dollars), Dollars',
 'Per capita dividends, Dollars',
 'Per capita dividends, interest, and rent, Dollars',
 'Per capita income maintenance benefits, Dollars',
 'Per capita interest, Dollars',
 'Per capita net earnings, Dollars',
 'Per capita personal current transfer receipts, Dollars',
 'Per capita personal income, Dollars',
 'Per capita rent, Dollars',
 'Per capita retirement and other, Dollars',
 'Per capita unemployment insurance compensation, Dollars']

Creating a new DataFrame from all pairs of state and year values. Instead of merging, new features will just be added to this DataFrame as columns after having their values sorted in the same way.

In [60]:
# One row per state; fips 0 (the national aggregate) is excluded.
df=sainc.loc[sainc.fips!=0,['fips','state']].drop_duplicates()
# Election years 1980..2020 as strings, the same form as sainc's year column labels.
years=[str(x) for x in np.arange(1980,2021,4)]
# Cross join pairs every state with every election year without hard-coding the
# number of states; rows are then ordered by (year, fips) with a fresh 0..n-1 index.
df=(df.merge(pd.DataFrame({'year':years}),how='cross')
      .sort_values(by=['year','fips'])
      .reset_index(drop=True))
df

Unnamed: 0,fips,state,year
0,1,Alabama,1980
1,2,Alaska,1980
2,4,Arizona,1980
3,5,Arkansas,1980
4,6,California,1980
...,...,...,...
556,51,Virginia,2020
557,53,Washington,2020
558,54,West Virginia,2020
559,55,Wisconsin,2020


The absolute values of the economic features don't matter as much, since most of them are measured in dollars, therefore they are affected by inflation. 

In order to address this discrepancy, 4 new features for the project were created out of each economic feature (their names were updated with the following prefixes):
1) '4ych_' : equals the current value (value during the election year) divided by the value during the year of the previous election (4 years previously),
2) '1ych_' : similar to 4ych_, but showing the change from 1 year previously
3) 'vn_' : equals the current value for a state divided by the current value for the whole U.S.
4) 'vn_4ych_' : equals the 4ych_ value for a state minus the 4ych_ value for the whole U.S.

All of these values are relative, and show the current value compared to either the national value or the value in the previous years.

Also, in order to address the discrepancy between per capita and non per capita features, the values of each non per capita feature were divided by the appropriate population value. These new features are additionally highlighted by the prefix 'pc_'.

In [61]:
# Every distinct feature name still present in sainc.
all_features=sainc.features.unique()

# Column selections for the three reference points of each election cycle:
# the previous election year, the year before the election, and the election
# year itself. Each list ends with the identifier columns 'fips' and 'state'.
prev4y_cols=[str(year) for year in range(1976,2017,4)]+['fips', 'state']
prev1y_cols=[str(year) for year in range(1979,2020,4)]+['fips', 'state']
cur_cols=[str(year) for year in range(1980,2021,4)]+['fips', 'state']

In [62]:
population_feature='Population (persons), Number of persons'

# All population rows of sainc (one per fips, including the national fips 0).
population_rows=sainc[sainc.features==population_feature]

# Long-format population for each reference point, ordered by (year, fips).
prev4y_population, prev1y_population, curr_population = (
    population_rows[cols]
    .melt(id_vars=['fips', 'state'])
    .sort_values(by=['variable','fips'])
    for cols in (prev4y_cols, prev1y_cols, cur_cols)
)

curr_population

Unnamed: 0,fips,state,variable,value
0,0,United States,1980,227224719.0
1,1,Alabama,1980,3900368.0
2,2,Alaska,1980,405315.0
3,4,Arizona,1980,2737774.0
4,5,Arkansas,1980,2288738.0
...,...,...,...,...
567,51,Virginia,2020,8590563.0
568,53,Washington,2020,7693612.0
569,54,West Virginia,2020,1784787.0
570,55,Wisconsin,2020,5832655.0


In [66]:
def _melt_by_year_and_fips(rows, columns):
    """Reshape the year columns named in `columns` into long form, ordered by (year, fips).

    `columns` holds the year labels plus 'fips' and 'state'. The index is reset
    after sorting, so for frames built from the same set of fips codes, position
    i refers to the same state and the same election cycle in each of them.
    """
    return (rows[columns]
            .melt(id_vars=['fips', 'state'])
            .sort_values(by=['variable','fips'])
            .reset_index(drop=True))


# Population values as plain arrays in (year, fips) order, so the divisions below
# are positional and actually rely on the sort rather than on index labels.
curr_pop=curr_population.sort_values(by=['variable','fips'])['value'].to_numpy()
prev1y_pop=prev1y_population.sort_values(by=['variable','fips'])['value'].to_numpy()
prev4y_pop=prev4y_population.sort_values(by=['variable','fips'])['value'].to_numpy()

# Keys of df, used to verify that every feature frame lines up with it row by row.
df_fips=df['fips'].to_numpy()
df_year=df['year'].astype(str).to_numpy()

# New columns are collected here and attached to df in a single step at the end.
new_columns={}

for feat_name in all_features:
    rows=sainc[sainc.features==feat_name]

    prev4y=_melt_by_year_and_fips(rows,prev4y_cols)
    prev1y=_melt_by_year_and_fips(rows,prev1y_cols)
    curr=(_melt_by_year_and_fips(rows,cur_cols)
          .rename(columns={'variable':'year','value':'current'}))
    curr['1yago']=prev1y['value'].to_numpy()
    curr['4yago']=prev4y['value'].to_numpy()

    # Absolute features are converted to per-person values and tagged with 'pc_'.
    # NOTE(review): population_feature is itself not in per_capita_features, so it
    # is divided by itself here and yields constant columns; consider excluding it.
    if (feat_name not in per_capita_features):
        curr['current']=curr['current']/curr_pop
        curr['1yago']=curr['1yago']/prev1y_pop
        curr['4yago']=curr['4yago']/prev4y_pop
        feat_name='pc_'+feat_name

    # Split state rows from the national rows (fips 0); .copy() makes the state
    # frame independent so that adding columns never writes into a view of curr.
    is_national=(curr['fips']==0)
    feat=curr[~is_national].copy()
    feat_national=curr.loc[is_national,['year','current','4yago']].copy()

    # Change of the state value relative to 4 years and 1 year earlier.
    feat['4ych_'+feat_name]=feat['current']/feat['4yago']
    feat['1ych_'+feat_name]=feat['current']/feat['1yago']

    # National level and national 4-year change, one row per election year.
    feat_national['n_4ych_']=feat_national['current']/feat_national['4yago']
    feat_national=(feat_national
                   .rename(columns={'current':'n_current'})
                   .drop(columns=['4yago']))

    # Attach the national values to every state row of the same year. validate
    # raises if a year has more than one national row instead of multiplying rows.
    feat=(pd.merge(feat, feat_national, on = ['year'], how = "outer", validate="many_to_one")
          .sort_values(by=['year','fips'])
          .reset_index(drop=True))

    # State value relative to the nation, and state change minus national change.
    feat['vn_'+feat_name]=feat['current']/feat['n_current']
    feat['vn_4ych_'+feat_name]=feat['4ych_'+feat_name]-feat['n_4ych_']

    # The columns are added to df by position, so the keys must match exactly.
    keys_match=(len(feat)==len(df)
                and bool((feat['fips'].to_numpy()==df_fips).all())
                and bool((feat['year'].astype(str).to_numpy()==df_year).all()))
    if not keys_match:
        raise ValueError(f'(year, fips) rows of feature {feat_name!r} do not line up with df')

    for prefix in ('4ych_','1ych_','vn_','vn_4ych_'):
        new_columns[prefix+feat_name]=feat[prefix+feat_name].to_numpy()

# Add all new columns at once (avoids fragmenting df with hundreds of inserts);
# dropping same-named columns first keeps the cell safe to re-run.
new_features=pd.DataFrame(new_columns,index=df.index)
df=pd.concat([df.drop(columns=list(new_columns),errors='ignore'),new_features],axis=1)

In [67]:
# verbose=True lists every column, so all generated feature names can be inspected.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 279 columns):
 #    Column                                                                                                    Dtype  
---   ------                                                                                                    -----  
 0    fips                                                                                                      int32  
 1    state                                                                                                     object 
 2    year                                                                                                      object 
 3    4ych_pc_Personal dividend income, Millions of dollars                                                     float64
 4    1ych_pc_Personal dividend income, Millions of dollars                                                     float64
 5    vn_pc_Personal dividend income, Millions of doll

Checking the correctness of the created features. Pulling out the value of '1ych_pc_Nonfarm personal income, Millions of dollars' in Mississippi in 2020.

In [68]:
# Select the single Mississippi / 2020 row together with the feature under test.
df.loc[(df['year']=='2020') & (df['state']=='Mississippi'),
       ['fips','state','year','1ych_pc_Nonfarm personal income, Millions of dollars']]

Unnamed: 0,fips,state,year,"1ych_pc_Nonfarm personal income, Millions of dollars"
534,28,Mississippi,2020,1.080109


Now pulling out the values of the original feature 'Nonfarm personal income, Millions of dollars' for Mississippi in 2020 and 2019, as well as the population values.

In [69]:
# Raw 2019 and 2020 values of the source feature for Mississippi.
sainc.loc[(sainc['features']=='Nonfarm personal income, Millions of dollars') & (sainc['state']=='Mississippi'),
          ['fips','state','features','2019','2020']]

Unnamed: 0,fips,state,features,2019,2020
4326,28,Mississippi,"Nonfarm personal income, Millions of dollars",115175.5,123924.2


In [70]:
# Raw 2019 and 2020 population values for Mississippi.
sainc.loc[(sainc['features']=='Population (persons), Number of persons') & (sainc['state']=='Mississippi'),
          ['fips','state','features','2019','2020']]

Unnamed: 0,fips,state,features,2019,2020
963,28,Mississippi,"Population (persons), Number of persons",2978227.0,2966786.0


Calculating the 1 year change of per capita value. It is correct.

In [71]:
# Manual recomputation: (2020 income / 2020 population) / (2019 income / 2019 population),
# with the Mississippi figures displayed by the two preceding cells.
(123924.2/2966786)/(115175.5/2978227) 

1.08010902095199

Checking for any null or abnormal values and seeing none.

In [72]:
# Rows of df containing at least one missing value (an empty result means none).
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,fips,state,year,"4ych_pc_Personal dividend income, Millions of dollars","1ych_pc_Personal dividend income, Millions of dollars","vn_pc_Personal dividend income, Millions of dollars","vn_4ych_pc_Personal dividend income, Millions of dollars","4ych_pc_Personal interest income, Millions of dollars","1ych_pc_Personal interest income, Millions of dollars","vn_pc_Personal interest income, Millions of dollars",...,"vn_pc_Income taxes (gross), Thousands of dollars","vn_4ych_pc_Income taxes (gross), Thousands of dollars","4ych_pc_Refunds, Thousands of dollars","1ych_pc_Refunds, Thousands of dollars","vn_pc_Refunds, Thousands of dollars","vn_4ych_pc_Refunds, Thousands of dollars","4ych_pc_Local governments, Thousands of dollars","1ych_pc_Local governments, Thousands of dollars","vn_pc_Local governments, Thousands of dollars","vn_4ych_pc_Local governments, Thousands of dollars"


In [73]:
# Smallest value over all numeric columns, printed with 10 decimals.
overall_min=df.describe().loc["min"].min()
print(f'{overall_min:.10f}')

-13.0213373490


In [74]:
# Per-column maxima with fips left out, so the identifier code cannot be the maximum.
column_maxima=df.drop(columns='fips').describe().loc["max"]
column_maxima.max()

48.440955836196956

## MERGE

In [75]:
# Summarise the election frame (columns, non-null counts, dtypes) before merging.
presid_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 561 entries, 51 to 611
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   year                          561 non-null    int64 
 1   state                         561 non-null    object
 2   state_po                      561 non-null    object
 3   fips                          561 non-null    int64 
 4   winner                        561 non-null    int32 
 5   swing                         561 non-null    int32 
 6   d_el_winner_4y_ago            561 non-null    int32 
 7   r_el_winner_4y_ago            561 non-null    int32 
 8   d_el_winner_8y_ago            561 non-null    int32 
 9   r_el_winner_8y_ago            561 non-null    int32 
 10  incumbent_running             561 non-null    int64 
 11  incumbent_not_running         561 non-null    int64 
 12  percent_vs_national           561 non-null    int32 
 13  swing_vs_national       

Dropping some unnecessary columns and changing the type of others to ensure successful merging

In [76]:
# Remove the two state label columns by rebinding presid_final to the reduced frame.
presid_final=presid_final.drop(columns=['state','state_po'])

In [77]:
# Both merge keys get the same integer dtype.
presid_final=presid_final.astype(dict.fromkeys(['fips','year'],'int32'))

In [78]:
# Convert the year key to int32, the dtype used for the merge keys.
df=df.assign(year=df['year'].astype('int32'))

Merging economic and election features for every pair of year and state values

In [79]:
# Outer join keeps unmatched rows from either side, so they show up as nulls in
# the check that follows. validate="one_to_one" makes the merge raise if
# (year, fips) is duplicated in either frame instead of silently multiplying rows.
df=pd.merge(df, presid_final, on = ['year','fips'], how = "outer", validate="one_to_one")

In [80]:
# List every column of the merged frame with its dtype.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 291 columns):
 #    Column                                                                                                    Dtype  
---   ------                                                                                                    -----  
 0    fips                                                                                                      int32  
 1    state                                                                                                     object 
 2    year                                                                                                      int32  
 3    4ych_pc_Personal dividend income, Millions of dollars                                                     float64
 4    1ych_pc_Personal dividend income, Millions of dollars                                                     float64
 5    vn_pc_Personal dividend income, Millions of doll

Checking if there are any null values. There should be none.

In [81]:
# Rows of the merged frame with at least one missing value (expected: none).
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,fips,state,year,"4ych_pc_Personal dividend income, Millions of dollars","1ych_pc_Personal dividend income, Millions of dollars","vn_pc_Personal dividend income, Millions of dollars","vn_4ych_pc_Personal dividend income, Millions of dollars","4ych_pc_Personal interest income, Millions of dollars","1ych_pc_Personal interest income, Millions of dollars","vn_pc_Personal interest income, Millions of dollars",...,d_el_winner_4y_ago,r_el_winner_4y_ago,d_el_winner_8y_ago,r_el_winner_8y_ago,incumbent_running,incumbent_not_running,percent_vs_national,swing_vs_national,winner_vs_national_winner,winner_vs_national_pv_winner


## ONE-HOT ENCODING STATES AND SAVING DATA

While the information about the year of the election will only be used for sample weighting, the information about the state where the election took place is obviously valuable and relevant to the outcome of any election. Therefore, it is one-hot encoded, adding 51 more features to the dataset (1 for each of the 50 states and the District of Columbia). 

In [91]:
# Columns to one-hot encode.
categ_col=['state']

In [92]:
# Replace each column listed in categ_col by one indicator column per distinct value.
df_final=pd.get_dummies(data=df,columns=categ_col)

In [93]:
# Display the encoded frame.
df_final

Unnamed: 0,fips,year,"4ych_pc_Personal dividend income, Millions of dollars","1ych_pc_Personal dividend income, Millions of dollars","vn_pc_Personal dividend income, Millions of dollars","vn_4ych_pc_Personal dividend income, Millions of dollars","4ych_pc_Personal interest income, Millions of dollars","1ych_pc_Personal interest income, Millions of dollars","vn_pc_Personal interest income, Millions of dollars","vn_4ych_pc_Personal interest income, Millions of dollars",...,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,1,1980,1.601931,1.080036,0.526631,0.030243,1.813107,1.230491,0.705246,0.049431,...,False,False,False,False,False,False,False,False,False,False
1,2,1980,1.719135,1.251543,0.587139,0.147447,1.586129,1.120950,1.372447,-0.177547,...,False,False,False,False,False,False,False,False,False,False
2,4,1980,1.640471,1.100359,1.121440,0.068783,1.706662,1.202292,1.187883,-0.057015,...,False,False,False,False,False,False,False,False,False,False
3,5,1980,1.611600,1.088103,0.520506,0.039911,1.878198,1.254437,0.764273,0.114522,...,False,False,False,False,False,False,False,False,False,False
4,6,1980,1.588607,1.120344,1.103942,0.016919,1.745817,1.199444,1.228959,-0.017860,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,51,2020,1.148401,0.981176,0.946027,-0.032671,1.018769,0.958009,1.190980,-0.060321,...,False,False,False,False,False,True,False,False,False,False
557,53,2020,1.121917,0.977260,1.275358,-0.059155,0.977064,0.970938,1.106520,-0.102027,...,False,False,False,False,False,False,True,False,False,False
558,54,2020,1.227611,0.994075,0.475965,0.046539,1.025806,0.959354,0.582111,-0.053284,...,False,False,False,False,False,False,False,True,False,False
559,55,2020,1.175882,0.986840,1.019589,-0.005190,1.042272,0.977234,0.818045,-0.036819,...,False,False,False,False,False,False,False,False,True,False


In [94]:
# List every column of the encoded frame with its dtype.
df_final.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 341 columns):
 #    Column                                                                                                    Dtype  
---   ------                                                                                                    -----  
 0    fips                                                                                                      int32  
 1    year                                                                                                      int32  
 2    4ych_pc_Personal dividend income, Millions of dollars                                                     float64
 3    1ych_pc_Personal dividend income, Millions of dollars                                                     float64
 4    vn_pc_Personal dividend income, Millions of dollars                                                       float64
 5    vn_4ych_pc_Personal dividend income, Millions of

Checking the correctness of one-hot encoding.

In [152]:
# Each distinct (fips, state) pair, and the indicator column expected for it
# ('state_' + state name is how the one-hot columns are named).
states=df[['fips','state']].drop_duplicates()
dummy_cols=['state_'+name for name in states['state']]

# The check fails outright if any expected indicator column is missing.
flag=set(dummy_cols)<=set(df_final.columns)

if flag:
    for fips_value, state_name in states.itertuples(index=False):
        # All rows of this fips value, restricted to the indicator columns.
        block=df_final.loc[df_final.fips==fips_value, dummy_cols].to_numpy(dtype=bool)

        # Exactly the state's own column must be set, in every row:
        # no other column may contain a True and the own column no False.
        expected=np.array([col=='state_'+state_name for col in dummy_cols])

        # An empty block means no rows exist for this fips value, which is also a failure.
        if block.shape[0]==0 or not (block==expected).all():
            flag=False
            break

flag

True

Now the fips column can be dropped.

In [155]:
# Remove the fips column by rebinding df_final to the reduced frame.
df_final=df_final.drop(columns=['fips'])

Converting the one-hot encoded columns to int for easier use in the classifiers via a conversion dictionary

In [159]:
# Map each of the last 51 columns to the target dtype 'int32'.
dict.fromkeys(df_final.columns[-51:],'int32')

{'state_Alabama': 'int32',
 'state_Alaska': 'int32',
 'state_Arizona': 'int32',
 'state_Arkansas': 'int32',
 'state_California': 'int32',
 'state_Colorado': 'int32',
 'state_Connecticut': 'int32',
 'state_Delaware': 'int32',
 'state_District of Columbia': 'int32',
 'state_Florida': 'int32',
 'state_Georgia': 'int32',
 'state_Hawaii': 'int32',
 'state_Idaho': 'int32',
 'state_Illinois': 'int32',
 'state_Indiana': 'int32',
 'state_Iowa': 'int32',
 'state_Kansas': 'int32',
 'state_Kentucky': 'int32',
 'state_Louisiana': 'int32',
 'state_Maine': 'int32',
 'state_Maryland': 'int32',
 'state_Massachusetts': 'int32',
 'state_Michigan': 'int32',
 'state_Minnesota': 'int32',
 'state_Mississippi': 'int32',
 'state_Missouri': 'int32',
 'state_Montana': 'int32',
 'state_Nebraska': 'int32',
 'state_Nevada': 'int32',
 'state_New Hampshire': 'int32',
 'state_New Jersey': 'int32',
 'state_New Mexico': 'int32',
 'state_New York': 'int32',
 'state_North Carolina': 'int32',
 'state_North Dakota': 'int32'

In [160]:
# Cast the last 51 columns to int32; all other columns keep their dtype.
df_final=df_final.astype(dict.fromkeys(df_final.columns[-51:],'int32'))

In [161]:
# List every column with its dtype to confirm the result of the cast.
df_final.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 340 columns):
 #    Column                                                                                                    Dtype  
---   ------                                                                                                    -----  
 0    year                                                                                                      int32  
 1    4ych_pc_Personal dividend income, Millions of dollars                                                     float64
 2    1ych_pc_Personal dividend income, Millions of dollars                                                     float64
 3    vn_pc_Personal dividend income, Millions of dollars                                                       float64
 4    vn_4ych_pc_Personal dividend income, Millions of dollars                                                  float64
 5    4ych_pc_Personal interest income, Millions of do

In [162]:
# Display the final frame before it is written to disk.
df_final

Unnamed: 0,year,"4ych_pc_Personal dividend income, Millions of dollars","1ych_pc_Personal dividend income, Millions of dollars","vn_pc_Personal dividend income, Millions of dollars","vn_4ych_pc_Personal dividend income, Millions of dollars","4ych_pc_Personal interest income, Millions of dollars","1ych_pc_Personal interest income, Millions of dollars","vn_pc_Personal interest income, Millions of dollars","vn_4ych_pc_Personal interest income, Millions of dollars","4ych_pc_Imputed interest receipts, Millions of dollars",...,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,1980,1.601931,1.080036,0.526631,0.030243,1.813107,1.230491,0.705246,0.049431,1.613327,...,0,0,0,0,0,0,0,0,0,0
1,1980,1.719135,1.251543,0.587139,0.147447,1.586129,1.120950,1.372447,-0.177547,1.469219,...,0,0,0,0,0,0,0,0,0,0
2,1980,1.640471,1.100359,1.121440,0.068783,1.706662,1.202292,1.187883,-0.057015,1.393620,...,0,0,0,0,0,0,0,0,0,0
3,1980,1.611600,1.088103,0.520506,0.039911,1.878198,1.254437,0.764273,0.114522,1.615074,...,0,0,0,0,0,0,0,0,0,0
4,1980,1.588607,1.120344,1.103942,0.016919,1.745817,1.199444,1.228959,-0.017860,1.449129,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,2020,1.148401,0.981176,0.946027,-0.032671,1.018769,0.958009,1.190980,-0.060321,0.987719,...,0,0,0,0,0,1,0,0,0,0
557,2020,1.121917,0.977260,1.275358,-0.059155,0.977064,0.970938,1.106520,-0.102027,0.978427,...,0,0,0,0,0,0,1,0,0,0
558,2020,1.227611,0.994075,0.475965,0.046539,1.025806,0.959354,0.582111,-0.053284,1.019990,...,0,0,0,0,0,0,0,1,0,0
559,2020,1.175882,0.986840,1.019589,-0.005190,1.042272,0.977234,0.818045,-0.036819,1.003871,...,0,0,0,0,0,0,0,0,1,0


Saving the data to be used for classification.

In [163]:
# index=False: the row index is not written to the CSV file.
df_final.to_csv('data/data_classification.csv',index=False)