In [1]:
import pandas as pd
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

## 1. Read in and Preprocess Datasets

### 1.1 Read in datasets

In [2]:
# Read in candidate-recipient contribution data
df_ag = pd.read_csv('/project/data/cand_con.csv/AG.csv')
df_ap = pd.read_csv('/project/data/cand_con.csv/AP.csv')
df_dc = pd.read_csv('/project/data/cand_con.csv/DC.csv')
df_gc = pd.read_csv('/project/data/cand_con.csv/GC.csv')
df_house = pd.read_csv('/project/data/cand_con.csv/House.csv')
df_sa = pd.read_csv('/project/data/cand_con.csv/SA.csv')
df_sc = pd.read_csv('/project/data/cand_con.csv/SC.csv')
df_senate = pd.read_csv('/project/data/cand_con.csv/Senate.csv')
df_ss = pd.read_csv('/project/data/cand_con.csv/SS.csv')
df_st = pd.read_csv('/project/data/cand_con.csv/ST.csv')

# Read in non-candidate-recipient contribution data
df_non_cand = pd.read_csv('/project/data/non_candidate_con.csv')

10 datasets on candidate-recipient contributions and 1 dataset on non-candidate-recipient contributions. They are seperate and not relational

### 1.2 Check for DataFrames' column consistency

In [3]:
# First check for DataFrames' column numbers
df_lst = [df_ag, df_ap, df_dc, df_gc, df_house, df_sa, df_sc, df_ss, df_st, df_senate, df_non_cand]
for df in df_lst:
    print(df.shape[1])

13
13
13
13
13
13
13
13
13
13
11


In [4]:
from utils.MN_util import datasets_col_consistent

datasets_col_consistent(df_lst[:-1])

All dfs have consistent columns


### 1.3 Adjust for DataFrames' column consistency

In [5]:
df_ag.columns, df_non_cand.columns

(Index(['OfficeSought', 'Party', 'District', 'CandRegNumb', 'CandFirstName',
        'CandLastName', 'CommitteeName', 'DonationDate', 'DonorType',
        'DonorName', 'DonationAmount', 'InKindDonAmount',
        'InKindDescriptionText'],
       dtype='object'),
 Index(['PCFRegNumb', 'Committee', 'ETType', 'ETSubType', 'DonationDate',
        'DonorType', 'DonorRegNumb', 'DonorName', 'DonationAmount',
        'InKindDonAmount', 'InKindDescriptionText'],
       dtype='object'))

Based on the project need and dataset consistency, use these columns: RegNumb, RecipientType, OfficeSought, CandFirstName,  CandLastName, Committee, DonationDate, DonorType, DonorName, DonationAmount, InKindDonAmount, InKindDescriptionText

In [6]:
from utils.MN_util import standardize_cand_df

df_ag = standardize_cand_df(df_ag)
df_ap = standardize_cand_df(df_ap)
df_dc = standardize_cand_df(df_dc)
df_gc = standardize_cand_df(df_gc)
df_house = standardize_cand_df(df_house)
df_sa = standardize_cand_df(df_sa)
df_sc = standardize_cand_df(df_sc)
df_ss = standardize_cand_df(df_ss)
df_st = standardize_cand_df(df_st)
df_senate = standardize_cand_df(df_senate)

In [7]:
from utils.MN_util import standardize_noncand_df
df_non_cand = standardize_noncand_df(df_non_cand)

In [8]:
from utils.MN_util import preprocess_contribution_df

new_df_lst = [df_ag, df_ap, df_dc, df_gc, df_house, df_sa, df_sc, df_ss, df_st, 
              df_senate, df_non_cand]

contribution_df = preprocess_contribution_df(new_df_lst)

In [9]:
contribution_df['DonorType'].unique()

array(['I', 'F', 'C', 'O', 'L', 'P', 'H', 'U', 'S', nan, 'B'],
      dtype=object)

#### Donor Types:
- C: Candidate Committee (limited to state-level candidates who had a principal campaign committee registered with the Board from which the contribution was made)
- I: Non-lobbyist individual 
- L: Lobbyist  
- F: Political Committee/Fund  
- S: Supporting association of a political fund registered with the Board that donates to its own political fund
- P: Political party unit
- H: Local candidate committee (limited to candidates within Hennepin County who satisfy the definition of local candidate, did not exist until 2022)
- O: Other (catch-all category that in some cases includes businesses, supporting associations of political funds registered with the Board that donate to their own political fund, associations that are not registered with the Board, and any entity that does not fall within one of the other categories)
- U: Association not registered with the Board (may include a committee registered with the FEC or a regulatory committee in another state, a 501(c)(4), 501(c)(6), or 527 nonprofit organization, the campaign committee of a candidate for local office (excluding certain Hennepin County candidates from 2022 onward), etc.)
- B: Business (company & corporation)

In [10]:
contribution_df['OfficeSought'].unique()

array(['AG', nan, 'GC', 'House', 'Senate', 'SA', 'SS', 'SC', 'DC', 'AP',
       'ST'], dtype=object)

#### Recipient Types:
- Candidate
- PCF: Political committee or fund
- PTU: Political party unit

#### Office Types (within candidate recipient):
- AG = Attorney General
- AP = State Appeals Court Judge
- DC = State District Court Judge
- GC = Governor
- House = State Representative
- SA = State Auditor
- SC = State Supreme Court Justice
- SS = Secretary of State
- ST = State Treasurer (this office was abolished in 2003 and no longer exists)
- Senate = State Senator

### 1.4 Check column types

In [13]:
contribution_df.dtypes

OfficeSought                     object
RegNumb                         float64
CandFirstName                    object
CandLastName                     object
Committee                        object
DonationDate             datetime64[ns]
DonorType                        object
DonorName                        object
DonationAmount                  float64
InKindDonAmount                 float64
InKindDescriptionText            object
RecipientType                    object
DonationYear                    float64
TotalAmount                     float64
dtype: object

### 1.4 Check Missing Values

In [14]:
contribution_df.isna().sum()

OfficeSought              483861
RegNumb                      467
CandFirstName             483861
CandLastName              483861
Committee                    467
DonationDate                 536
DonorType                    580
DonorName                    477
DonationAmount            333217
InKindDonAmount           674347
InKindDescriptionText    3508947
RecipientType                  0
DonationYear                 536
TotalAmount               922715
dtype: int64

1. 467 of the 'nan' contributions belong to "Registration Fee for Netroots Event", which is a non-profit organization that provides trainings, resources and connection opportunities to help progressive activists. Based on their column values, these contributions have no monetary amount, no donor, and no recipient.
2. 483861 nan values are mostly those recipients are non-candidates

## 2. Top 10
### 2.1 Top 10 Donors

In [12]:
donation_by_year = contribution_df.groupby('DonationYear') 
donation_by_year['TotalAmount'].sum() # check the total contribution across years

DonationYear
1998.0    20678745.70
1999.0     1545675.89
2000.0    12322727.60
2001.0     4163753.21
2002.0    46600539.55
2003.0     2926595.94
2004.0    24702549.81
2005.0    14655970.48
2006.0    74537059.95
2007.0     7312400.10
2008.0    30794346.71
2009.0    16390499.70
2010.0    64319478.61
2011.0     4859395.88
2012.0    55252164.36
2013.0    16482255.60
2014.0    38667352.15
2015.0    19816206.40
2016.0    37338776.71
2017.0    15943601.05
2018.0    35105013.46
2019.0    22417142.37
2020.0    31573129.27
2021.0    19326521.95
2022.0    36476756.74
2023.0      160433.71
Name: TotalAmount, dtype: float64

In [16]:
# Group by 'Year' and 'Contributor' to calculate the total contribution for each contributor in each year
don_by_year_contributor = contribution_df.groupby(
    ['DonationYear', 'DonorName'])['TotalAmount'].sum().reset_index()

# Find the top 10 contributors
top_10_contributors = don_by_year_contributor.groupby('DonationYear').apply(
    lambda group: group.nlargest(10, 'TotalAmount')).reset_index(drop=True)

In [20]:
top_10_contributors[-10:] # Top 10 contributors in 2023

Unnamed: 0,DonationYear,DonorName,TotalAmount
250,2023.0,IBEW Local 292,11400.11
251,2023.0,SEIU local 26,5194.45
252,2023.0,"Zarth, John",2700.0
253,2023.0,"Zarth, Kelly",2700.0
254,2023.0,"Krech, Kathy",1800.0
255,2023.0,"Restemayer, Douglas",1585.93
256,2023.0,"Carlson, Jessica",1500.0
257,2023.0,"Collins, Greg",1500.0
258,2023.0,"Collins, Jane",1500.0
259,2023.0,"Doherty, John",1500.0


### 2.2 Top 10 Recipients

In [22]:
don_by_year_recipients = contribution_df.groupby(['DonationYear', 'RegNumb', 
                        'CandFirstName', 'CandLastName', 'Committee', 
                        'RecipientType'])['TotalAmount'].sum().reset_index()
top_10_recipients = don_by_year_recipients.groupby('DonationYear').apply(
        lambda group: group.nlargest(10, 'TotalAmount')).reset_index(drop=True)

In [23]:
top_10_recipients[-10:]

Unnamed: 0,DonationYear,RegNumb,CandFirstName,CandLastName,Committee,RecipientType,TotalAmount
244,2022.0,13262.0,Warren,Limmer,Limmer (Warren) for Senate Committee,Candidate,454275.0
245,2022.0,10601.0,Mary,Murphy,Mary Murphy Volunteer Committee,Candidate,452742.21
246,2022.0,12604.0,Gregory,Davids,People for (Gregory) Davids Committee,Candidate,402975.0
247,2022.0,17105.0,Carla,Nelson,Nelson (Carla) for Senate,Candidate,400540.0
248,2022.0,18732.0,Kim,Crockett,Kim Crockett for Secretary of State Committee,Candidate,362551.26
249,2022.0,17373.0,Torrey,Westrom,Westrom (Torrey) for Senate Committee,Candidate,358200.0
250,2023.0,18690.0,Scott,Jensen,Dr. Scott Jensen for Governor,Candidate,32505.0
251,2023.0,18708.0,Leslie,Lienemann,Leslie for Minnesota,Candidate,5400.0
252,2023.0,18731.0,James,Schultz,Jim Schultz For Minnesota Attorney General,Candidate,1500.0
253,2023.0,18973.0,Beth,Beebe,Beth Beebe for State Rep,Candidate,1000.0


## 3. Compare donation by donor and recipient types

### 3.1 Compare donation by donor types

In [29]:
grouped = contribution_df.groupby(['DonationYear', 'DonorType'])['TotalAmount'].sum().reset_index()

fig = px.bar(
    grouped,
    x='DonationYear',
    y='TotalAmount',
    color='DonorType',
    title='Donations by Donor Type from 1998 to 2023',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions"},
)

fig.show()

In [28]:
filtered_df = contribution_df[(contribution_df['DonationYear'] >= 2018) & (contribution_df['DonationYear'] <= 2022)]
grouped2 = filtered_df.groupby(['DonationYear', 'DonorType'])['TotalAmount'].sum().reset_index()
fig = px.bar(
    grouped2,
    x='DonationYear',
    y='TotalAmount',
    color='DonorType',
    title='Donations by Donor Type for the Last 5 Years',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions"},
)

fig.show()

#### Observations and Interpretations
1. Individuals, excluding lobbyists, constitute the largest share of contributions in the MN dataset.
2. The second most substantial contributor category is General Purpose Political Committee or Fund, followed by lobbyists.
3. Contributions from other donor types are notably lower throughout the years.
4. Analyzing a sample from 2018 to 2022, we observe a cyclical pattern with a major increase in contributions, followed by three years of reduced contribution totals. This cycle aligns with the four-year election cycle.
5. From 1998 to 2023, there are several years with significantly lower contribution amount: 1999, 2001, 2003, 2007, 2011.

### 3.2 Compare donation by recipient types

In [30]:
grouped3 = contribution_df.groupby(['DonationYear', 'RecipientType'])['TotalAmount'].sum().reset_index()

fig = px.bar(
    grouped3,
    x='DonationYear',
    y='TotalAmount',
    color='RecipientType',
    title='Donations by Recipient Type from 1998 to 2023',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions"},
)

fig.show()

In [32]:
grouped4 = filtered_df.groupby(['DonationYear', 'RecipientType'])['TotalAmount'].sum().reset_index()
fig = px.bar(
    grouped4,
    x='DonationYear',
    y='TotalAmount',
    color='RecipientType',
    title='Donations by Recipient Type for the Last 5 Years',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions"},
)

fig.show()

#### Observations and Interpretations
1. Candidates, as the recipients, make up the overwhelming majority of contributions.
2. Examining the period from 1998 to 2023, a distinct cyclical pattern emerges, characterized by alternating years of increased and decreased contributions, which may correspond to congressional elections or MN state house representatives elections which take place every two years.
3. Starting in 2012, recipient types "Political Committee or Fund" and "Political Party Unit" began receiving a larger share of contributions compared to prior years.

In [33]:
grouped5 = filtered_df.groupby(['DonationYear', 'OfficeSought'])['TotalAmount'].sum().reset_index()
fig = px.bar(
    grouped5,
    x='DonationYear',
    y='TotalAmount',
    color='OfficeSought',
    title='Donations by Candidate Recipient Race for the Last 5 Years',
    labels={"DonationYear": "Year", "TotalAmount": "Total Contributions"},
)

fig.show()

Clearly, state senators and house representatives recieve the most contributions