In [1]:
import pandas as pd
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

## 1. Read in and Preprocess Datasets

### 1.1 Read in datasets

In [2]:
# Locations of the raw contribution data. Note that "cand_con.csv" is a
# directory that holds one CSV per office code, not a single file.
CAND_CON_DIR = '/project/data/cand_con.csv'
NON_CAND_CON_PATH = '/project/data/non_candidate_con.csv'

# Read in candidate-recipient contribution data (one file per office code).
# The office codes are listed in the same order as the names on the left so
# each DataFrame is bound to the matching variable.
(df_ag, df_ap, df_dc, df_gc, df_house,
 df_sa, df_sc, df_senate, df_ss, df_st) = (
    pd.read_csv(f'{CAND_CON_DIR}/{office}.csv')
    for office in ('AG', 'AP', 'DC', 'GC', 'House', 'SA', 'SC', 'Senate', 'SS', 'ST')
)

# Read in non-candidate-recipient contribution data
df_non_cand = pd.read_csv(NON_CAND_CON_PATH)

There are 10 datasets on candidate-recipient contributions and 1 dataset on non-candidate-recipient contributions. They are separate and not relational.

### 1.2 Check for DataFrames' column consistency

In [3]:
# First check how many columns each DataFrame has (one count per line,
# in the same order as df_lst; the non-candidate frame comes last)
df_lst = [df_ag, df_ap, df_dc, df_gc, df_house, df_sa, df_sc, df_ss, df_st, df_senate, df_non_cand]
column_counts = [frame.shape[1] for frame in df_lst]
print(*column_counts, sep='\n')

13
13
13
13
13
13
13
13
13
13
11


In [4]:
from utils.MN_util import datasets_col_consistent

# Only the ten candidate-recipient frames are compared: the slice drops the
# last entry of df_lst (df_non_cand), which has 11 columns instead of 13.
datasets_col_consistent(df_lst[:-1])

All dfs have consistent columns


### 1.3 Adjust for DataFrames' column consistency

In [5]:
# Show the raw column names of one candidate frame next to those of the
# non-candidate frame to see where the two schemas differ.
df_ag.columns, df_non_cand.columns

(Index(['OfficeSought', 'Party', 'District', 'CandRegNumb', 'CandFirstName',
        'CandLastName', 'CommitteeName', 'DonationDate', 'DonorType',
        'DonorName', 'DonationAmount', 'InKindDonAmount',
        'InKindDescriptionText'],
       dtype='object'),
 Index(['PCFRegNumb', 'Committee', 'ETType', 'ETSubType', 'DonationDate',
        'DonorType', 'DonorRegNumb', 'DonorName', 'DonationAmount',
        'InKindDonAmount', 'InKindDescriptionText'],
       dtype='object'))

Based on the project need and dataset consistency, use these columns: RegNumb, RecipientType, OfficeSought, CandFirstName,  CandLastName, Committee, DonationDate, DonorType, DonorName, DonationAmount, InKindDonAmount, InKindDescriptionText

In [6]:
from utils.MN_util import standardize_cand_df

# Run the same standardization over every candidate-recipient frame.
# The two tuples list the frames in identical order, so each name is
# rebound to the standardized version of its own data.
(df_ag, df_ap, df_dc, df_gc, df_house,
 df_sa, df_sc, df_ss, df_st, df_senate) = map(
    standardize_cand_df,
    (df_ag, df_ap, df_dc, df_gc, df_house,
     df_sa, df_sc, df_ss, df_st, df_senate),
)

In [7]:
from utils.MN_util import standardize_noncand_df
# Rebind df_non_cand to the helper's result.
# NOTE(review): presumably this maps the non-candidate columns onto the shared
# column set used for the candidate frames — confirm in utils/MN_util.
df_non_cand = standardize_noncand_df(df_non_cand)

In [8]:
from utils.MN_util import preprocess_contribution_df

# All eleven standardized frames: ten candidate-recipient frames followed by
# the non-candidate frame.
new_df_lst = [df_ag, df_ap, df_dc, df_gc, df_house, df_sa, df_sc, df_ss, df_st, 
              df_senate, df_non_cand]

# NOTE(review): presumably the helper combines the frames into one and derives
# the DonationYear / TotalAmount columns seen in later cells — confirm in
# utils/MN_util.
contribution_df = preprocess_contribution_df(new_df_lst)

In [9]:
# Distinct donor-type codes present in the combined data (NaN shows up as its own entry)
contribution_df['DonorType'].unique()

array(['I', 'F', 'C', 'O', 'L', 'P', 'H', 'U', 'S', nan, 'B'],
      dtype=object)

#### Donor Types:
1. C: Candidate Committee (limited to state-level candidates who had a principal campaign committee registered with the Board from which the contribution was made)
2. I: Non-lobbyist individual 
3. L: Lobbyist  
4. F: Political Committee/Fund  
5. S: Supporting association of a political fund registered with the Board that donates to its own political fund
6. P: Political party unit
7. H: Local candidate committee (limited to candidates within Hennepin County who satisfy the definition of local candidate, did not exist until 2022)
8. O: Other (catch-all category that in some cases includes businesses, supporting associations of political funds registered with the Board that donate to their own political fund, associations that are not registered with the Board, and any entity that does not fall within one of the other categories)
9. U: Association not registered with the Board (may include a committee registered with the FEC or a regulatory committee in another state, a 501(c)(4), 501(c)(6), or 527 nonprofit organization, the campaign committee of a candidate for local office (excluding certain Hennepin County candidates from 2022 onward), etc.)
10. B: Business (company & corporation)

In [10]:
# Distinct recipient-type labels present in the combined data
contribution_df['RecipientType'].unique()

array(['Candidate', 'PCF', 'PTU'], dtype=object)

In [11]:
# Distinct office codes present in the combined data (NaN shows up as its own entry)
contribution_df['OfficeSought'].unique()

array(['AG', nan, 'GC', 'House', 'Senate', 'SA', 'SS', 'SC', 'DC', 'AP',
       'ST'], dtype=object)

#### Recipient Types:
- Candidate
- PCF: Political committee or fund
- PTU: Political party unit

#### Office Types (within candidate recipient):
- AG = Attorney General
- AP = State Appeals Court Judge
- DC = State District Court Judge
- GC = Governor
- House = State Representative
- SA = State Auditor
- SC = State Supreme Court Justice
- SS = Secretary of State
- ST = State Treasurer (this office was abolished in 2003 and no longer exists)
- Senate = State Senator

### 1.4 Check column types

In [12]:
# Data type of every column in the combined frame
contribution_df.dtypes

OfficeSought                     object
RegNumb                         float64
CandFirstName                    object
CandLastName                     object
Committee                        object
DonationDate             datetime64[ns]
DonorType                        object
DonorName                        object
DonationAmount                  float64
InKindDonAmount                 float64
InKindDescriptionText            object
RecipientType                    object
DonationYear                    float64
TotalAmount                     float64
dtype: object

### 1.4 Check Missing Values

In [13]:
# Number of missing values in each column
contribution_df.isna().sum()

OfficeSought              483861
RegNumb                      467
CandFirstName             483861
CandLastName              483861
Committee                    467
DonationDate                 536
DonorType                    580
DonorName                    477
DonationAmount                 0
InKindDonAmount                0
InKindDescriptionText    3508947
RecipientType                  0
DonationYear                 536
TotalAmount                    0
dtype: int64

In [14]:
# Size of the combined data and of each group of rows that cannot be classified:
# zero total amount, missing recipient registration number, missing donor name.
n_entries = len(contribution_df)
n_zero_amount = int(contribution_df['TotalAmount'].eq(0).sum())
n_missing_recipient = contribution_df['RegNumb'].isna().sum()
n_missing_donor = contribution_df['DonorName'].isna().sum()

print('Total number of contribution entries = ', n_entries)
print('Total number of nonclassifiable contribution amount = ', n_zero_amount)
print('Total number of nonclassifiable recipients = ', n_missing_recipient)
print('Total number of nonclassifiable donors = ', n_missing_donor)

Total number of contribution entries =  3548873
Total number of nonclassifiable contribution amount =  336051
Total number of nonclassifiable recipients =  467
Total number of nonclassifiable donors =  477


1. 467 of the 'nan' contributions belong to "Registration Fee for Netroots Event", which is a non-profit organization that helps progressive activists. Based on their column values, these contributions have no monetary amount, no donor, and no recipient.
2. The 483861 missing OfficeSought values mostly belong to contributions whose recipients are non-candidates.
3. Contribution entries with no contribution amount, recipient information, or donor information should be dropped

### 1.5 Drop Non-classifiable Contribution Data

In [15]:
# Keep only classifiable contributions: a non-zero total amount and both a
# recipient registration number and a donor name. The index is rebuilt so it
# runs 0..n-1 over the surviving rows.
contribution_df = (
    contribution_df
    .loc[contribution_df['TotalAmount'].ne(0)]
    .dropna(subset=['RegNumb', 'DonorName'], how='any')
    .reset_index(drop=True)
)

In [16]:
# 3548873 is the row count that was printed before the non-classifiable rows
# were dropped; the difference is the number of rows removed.
rows_remaining = len(contribution_df)
print('Contribution entries after dropping non-classifiable data = ', rows_remaining)
print('Number of non-classifiable contrinutions = ', 3548873 - rows_remaining)

Contribution entries after dropping non-classifiable data =  3212822
Number of non-classifiable contrinutions =  336051


## 2. Top 10
### 2.1 Top 10 Donors

In [17]:
# Group the cleaned contributions by calendar year of donation.
donation_by_year = contribution_df.groupby('DonationYear')
# Sanity check: total contribution amount for each year.
donation_by_year['TotalAmount'].sum()

DonationYear
1998.0    21973071.61
1999.0     2636233.26
2000.0    14373458.56
2001.0     5223945.02
2002.0    48261465.33
2003.0     4363080.36
2004.0    26942236.14
2005.0    16161501.54
2006.0    76698607.22
2007.0     8423719.54
2008.0    32462700.99
2009.0    17605765.69
2010.0    66311709.21
2011.0     6057089.02
2012.0    61005414.73
2013.0    20285780.15
2014.0    44521142.83
2015.0    21097676.32
2016.0    39298419.13
2017.0    20058310.89
2018.0    47781848.87
2019.0    25764905.60
2020.0    34614419.07
2021.0    26898778.21
2022.0    57692246.12
2023.0      264961.92
Name: TotalAmount, dtype: float64

In [18]:
# Total contributed by each donor in each year, keyed by
# (DonationYear, DonorName) and flattened back into ordinary columns
yearly_donor_totals = contribution_df.groupby(['DonationYear', 'DonorName'])['TotalAmount'].sum()
don_by_year_contributor = yearly_donor_totals.reset_index()


def _ten_largest_donors(year_rows):
    """Return the ten rows with the highest TotalAmount within one year's rows."""
    return year_rows.nlargest(10, 'TotalAmount')


# For every year keep only its ten largest donors, then discard the
# group index that apply() adds
top_10_contributors = (
    don_by_year_contributor
    .groupby('DonationYear')
    .apply(_ten_largest_donors)
    .reset_index(drop=True)
)

In [19]:
# Last ten rows of the per-year ranking, i.e. the top 10 contributors in 2023
# (the most recent year in the data)
top_10_contributors.tail(10)

Unnamed: 0,DonationYear,DonorName,TotalAmount
250,2023.0,IBEW Local 292,11400.11
251,2023.0,SEIU local 26,5194.45
252,2023.0,Teamsters Local 346,3776.37
253,2023.0,"Johnson, Erin",2899.0
254,2023.0,Teamsters Local 792,2897.77
255,2023.0,"Zarth, John",2700.0
256,2023.0,"Zarth, Kelly",2700.0
257,2023.0,Teamsters Local 970,2401.89
258,2023.0,Teamsters Local 974,2390.62
259,2023.0,"Wilbert, Michelle",2241.0


### 2.2 Top 10 Recipients

In [20]:
import numpy as np  # no longer needed by this cell; kept in case later cells reference `np`

# Columns that identify one recipient within one year.
recipient_keys = ['DonationYear', 'RegNumb', 'CandLastName', 'CandFirstName',
                  'Committee', 'RecipientType']

# Total received by each recipient in each year.
# dropna=False matters: CandLastName/CandFirstName are missing for
# non-candidate recipients, and groupby's default (dropna=True) silently
# discards every row whose key contains NaN, which would remove all of those
# recipients from the ranking. Rows without a DonationYear are dropped up
# front because they cannot be assigned to a year.
don_by_year_recipients = (
    contribution_df
    .dropna(subset=['DonationYear'])
    .groupby(recipient_keys, dropna=False)['TotalAmount']
    .sum()
    .reset_index()
)

# The candidate name columns come straight out of the groupby keys, so they
# are left untouched here (they stay NaN for non-candidate recipients).

# For every year keep the ten recipients with the largest totals.
top_10_recipients = don_by_year_recipients.groupby('DonationYear').apply(
    lambda group: group.nlargest(10, 'TotalAmount')).reset_index(drop=True)


In [21]:
# Build a separate frame in which missing candidate names are replaced by the
# placeholder 'NA', so that non-candidate recipients survive the groupby below
# (groupby drops rows whose key is NaN by default).
# fillna with a per-column dict returns a new DataFrame and leaves
# contribution_df untouched. Calling fillna(..., inplace=True) on a selected
# column instead is chained in-place assignment, which pandas deprecates and
# which does not modify the parent frame under Copy-on-Write.
contribution_df1 = contribution_df.fillna({'CandLastName': 'NA', 'CandFirstName': 'NA'})

# Total received by each recipient in each year.
don_by_year_recipients = contribution_df1.groupby(
    ['DonationYear', 'RegNumb', 'RecipientType', 'CandLastName', 'CandFirstName'])['TotalAmount'].sum().reset_index()

# For every year keep the ten recipients with the largest totals.
top_10_recipient = don_by_year_recipients.groupby('DonationYear').apply(
    lambda group: group.nlargest(10, 'TotalAmount')).reset_index(drop=True)

In [22]:
# Last ten rows of the per-year ranking, i.e. the top 10 recipients in 2023
# (the most recent year in the data)
top_10_recipient.tail(10)

Unnamed: 0,DonationYear,RegNumb,RecipientType,CandLastName,CandFirstName,TotalAmount
250,2023.0,41291.0,PCF,,,32650.0
251,2023.0,18690.0,Candidate,Jensen,Scott,32505.0
252,2023.0,30689.0,PCF,,,29900.0
253,2023.0,41345.0,PCF,,,28371.96
254,2023.0,30726.0,PCF,,,28000.0
255,2023.0,30013.0,PCF,,,18072.03
256,2023.0,30345.0,PCF,,,12000.0
257,2023.0,41281.0,PCF,,,11832.0
258,2023.0,41262.0,PCF,,,11805.0
259,2023.0,30119.0,PCF,,,11400.11


## 3. Compare donation by donor and recipient types

### 3.1 Compare donation by donor types

In [29]:
# Display label for every single-letter DonorType code, in code order.
# NOTE(review): the Donor Types list in this notebook describes 'S' as a
# supporting association of a political fund, while the label here is 'Self' —
# confirm which one is intended.
_donor_type_labels = [
    ('B', 'Business'),
    ('C', 'Candidate committee'),
    ('F', 'Political committee or fund'),
    ('H', 'Local candidate committee registered with Hennepin County'),
    ('I', 'Non-lobbyist individual'),
    ('L', 'Lobbyist'),
    ('O', 'Other'),
    ('P', 'Political party unit'),
    ('S', 'Self'),
    ('U', 'Association not registered with the Board'),
]
donor_type_mapping = dict(_donor_type_labels)

In [31]:
# Total contributions per year and donor type, with a readable label for
# each single-letter donor-type code.
grouped = contribution_df.groupby(['DonationYear', 'DonorType'])['TotalAmount'].sum().reset_index()
grouped['FullDonorType'] = grouped['DonorType'].map(donor_type_mapping)

# Take the year range for the title from the data being plotted: the frame is
# not filtered by year, so a hard-coded "1998 to 2022" goes stale as soon as
# the data contains any other year (it already includes 2023).
first_year = int(grouped['DonationYear'].min())
last_year = int(grouped['DonationYear'].max())

fig = px.bar(
    grouped,
    x='DonationYear',
    y='TotalAmount',
    color='FullDonorType',
    title=f'Donations by Donor Type from {first_year} to {last_year}',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions", "FullDonorType": "Donor Type"},
    # Fixed alphabetical legend order so colors are comparable across charts
    category_orders={"FullDonorType": sorted(donor_type_mapping.values())}
)

fig.show()



In [33]:
# Restrict to donations made in 2018-2022 (both end years included)
in_recent_window = contribution_df['DonationYear'].between(2018, 2022)
filtered_df = contribution_df[in_recent_window]

# Total contributions per year and donor type, with a readable label for
# each single-letter donor-type code
grouped2 = (
    filtered_df
    .groupby(['DonationYear', 'DonorType'])['TotalAmount']
    .sum()
    .reset_index()
)
grouped2['FullDonorType'] = grouped2['DonorType'].map(donor_type_mapping)

donor_axis_labels = {"DonationYear": "Year", "TotalAmount": "Total Contributions", "FullDonorType": "Donor Type"}
donor_legend_order = {"FullDonorType": sorted(donor_type_mapping.values())}

fig = px.bar(
    grouped2,
    x='DonationYear',
    y='TotalAmount',
    color='FullDonorType',
    title='Donations by Donor Type from 2018 to 2022',
    labels=donor_axis_labels,
    category_orders=donor_legend_order,
)

fig.show()

#### Observations and Interpretations
1. Individuals, excluding lobbyists, constitute the largest share of contributions in the MN dataset.
2. The second most substantial contributor category is General Purpose Political Committee or Fund, followed by lobbyists.
3. Contributions from other donor types are notably lower throughout the years.
4. Analyzing a sample from 2018 to 2022, we observe a cyclical pattern with a major increase in contributions, followed by three years of reduced contribution totals. This cycle aligns with the four-year election cycle.
5. From 1998 to 2023, several years have significantly lower contribution amounts: 1999, 2001, 2003, 2007, and 2011.

### 3.2 Compare donation by recipient types

In [35]:
# Total contributions per year and recipient type.
grouped3 = contribution_df.groupby(['DonationYear', 'RecipientType'])['TotalAmount'].sum().reset_index()

# Take the year range for the title from the data being plotted: the frame is
# not filtered by year, so a hard-coded "1998 to 2022" goes stale as soon as
# the data contains any other year (it already includes 2023).
first_year = int(grouped3['DonationYear'].min())
last_year = int(grouped3['DonationYear'].max())

fig = px.bar(
    grouped3,
    x='DonationYear',
    y='TotalAmount',
    color='RecipientType',
    title=f'Donations by Recipient Type from {first_year} to {last_year}',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions"},
)

fig.show()

In [27]:
# Total contributions per year and recipient type, limited to the rows
# already selected in filtered_df
grouped4 = (
    filtered_df
    .groupby(['DonationYear', 'RecipientType'])['TotalAmount']
    .sum()
    .reset_index()
)

recipient_axis_labels = {"DonationYear": "Year", "TotalAmount": "Total Contributions"}

fig = px.bar(
    grouped4,
    x='DonationYear',
    y='TotalAmount',
    color='RecipientType',
    title='Donations by Recipient Type for the Last 5 Years',
    labels=recipient_axis_labels,
)

fig.show()

#### Observations and Interpretations
1. Candidates, as the recipients, make up the overwhelming majority of contributions.
2. Examining the period from 1998 to 2023, a distinct cyclical pattern emerges, characterized by alternating years of increased and decreased contributions, which may correspond to congressional elections or MN state house representatives elections which take place every two years.
3. Starting in 2012, recipient types "Political Committee or Fund" and "Political Party Unit" began receiving a larger share of contributions compared to prior years.

In [37]:
# Display label for every OfficeSought code, following the "Office Types"
# list in this notebook. Several entries previously carried donor-type labels
# (e.g. 'House' -> 'Lobbyist'), which mislabeled the chart legend, and 'ST'
# was missing, so any State Treasurer rows would have mapped to NaN.
race_type_mapping = {
    'AG': 'Attorney General',
    'AP': 'State Appeals Court Judge',
    'DC': 'State District Court Judge',
    'GC': 'Governor',
    'House': 'State Representative',
    'SA': 'State Auditor',
    'SC': 'State Supreme Court Justice',
    'SS': 'Secretary of State',
    'ST': 'State Treasurer',
    'Senate': 'State Senator',
}

In [41]:
# Total contributions per year and office sought, limited to the rows already
# selected in filtered_df, plus a readable label for each office code
grouped5 = (
    filtered_df
    .groupby(['DonationYear', 'OfficeSought'])['TotalAmount']
    .sum()
    .reset_index()
    .assign(FullRaceType=lambda totals: totals['OfficeSought'].map(race_type_mapping))
)

race_axis_labels = {"DonationYear": "Year", "TotalAmount": "Total Contributions"}
race_legend_order = {"FullRaceType": sorted(race_type_mapping.values())}

fig = px.bar(
    grouped5,
    x='DonationYear',
    y='TotalAmount',
    color='FullRaceType',
    title='Donations by Candidate Recipient Race for the Last 5 Years',
    labels=race_axis_labels,
    category_orders=race_legend_order,
)

fig.show()

Clearly, state senators and house representatives receive the most contributions.