In [1]:
import pandas as pd
import os
import sys
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path

ORIGINAL_DIRECTORY = os.getcwd()

BASE_PATH = Path.cwd().parent
FULL_PATH = str(BASE_PATH / "utils")

sys.path.append(FULL_PATH)


from preprocess_mi_campaign_data import (
    read_expenditure_data,
    read_contribution_data,
    create_all_plots,
)
from constants import (
    MI_EXP_FILEPATH,
    MI_CON_FILEPATH,
    MI_EXPENDITURE_COLUMNS,
    MI_CONTRIBUTION_COLUMNS,
)

os.chdir(ORIGINAL_DIRECTORY)

### Michigan Campaign Expenditure & Contribution Data 2018 - 2023 Exploratory Data Analysis

#### Task 1: Read in the Datasets and merge into one Pandas DataFrame

In [2]:
campaign_expenditure_dataframe_lst = []
campaign_contribution_dataframe_lst = []

for file in os.listdir(MI_EXP_FILEPATH):
    filepath = str(MI_EXP_FILEPATH) + "/" + file
    campaign_expenditure_dataframe_lst.append(
        read_expenditure_data(filepath, MI_EXPENDITURE_COLUMNS)
    )

for file in os.listdir(MI_CON_FILEPATH):
    filepath = str(MI_CON_FILEPATH) + "/" + file
    campaign_contribution_dataframe_lst.append(
        read_contribution_data(filepath, MI_CONTRIBUTION_COLUMNS)
    )

In [3]:
pd.options.display.max_columns = 100
campaign_expenditure_dataframe_lst[0].head(5)

Unnamed: 0,doc_seq_no,expenditure_type,gub_account_type,gub_elec_type,page_no,expense_id,detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,schedule_desc,exp_desc,purpose,extra_desc,f_name,lname_or_org,address,city,state,zip,exp_date,amount,state_loc,supp_opp,can_or_ballot,county,debt_payment,vend_name,vend_addr,vend_city,vend_state,vend_zip,gotv_ink_ind,fundraiser
0,488507,1B,,,0,4100,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,COMPUTER COSTS,CHUNIQ INPOWER GOSQ.COM,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,11/15/2019,354.28,,,,,,,,,,,,
1,488507,1B,,,0,4101,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,PRINT ADVERTISING,SAWICKI & SON 313-962-2725,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,12/02/2019,500.0,,,,,,,,,,,,
2,488507,1B,,,0,4108,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,"MAILING,POSTAGE,BULK RATE",STAPLES,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,11/26/2019,101.61,,,,,,,,,,,,
3,488507,1B,,,0,4109,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,"MAILING,POSTAGE,BULK RATE",STAPLES,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,11/26/2019,131.18,,,,,,,,,,,,X
4,488507,1B,,,0,4110,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,"CONSULTATION, RESEARCH",FIELD OPERATIONS,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,12/03/2019,100.65,,,,,,,,,,,,


In [4]:
merged_contribution_df = pd.concat(campaign_contribution_dataframe_lst)
merged_expenditure_df = pd.concat(campaign_expenditure_dataframe_lst)

In [5]:
merged_contribution_df["amount"] = pd.to_numeric(
    merged_contribution_df["amount"], errors="coerce"
)
merged_expenditure_df["amount"] = pd.to_numeric(
    merged_expenditure_df["amount"], errors="coerce"
)

In [6]:
# removes the MENOMINEE COUNTY DEMOCRATIC PARTY columns that create errors
merged_contribution_df = merged_contribution_df[
    merged_contribution_df["com_type"] != "MENOMINEE COUNTY DEMOCRATIC PARTY"
]
merged_expenditure_df = merged_expenditure_df[
    merged_expenditure_df["com_type"] != "MENOMINEE COUNTY DEMOCRATIC PARTY"
]

#### Task 2: Answer the Following Questions
- For each column, what are the contents of it? How many blanks or nulls are there? What is the format? If there it is one of several types, what are those types?
    - Percentage of nulls/blanks
- Is the dataset relational (are there multiple tables that relate to each other)?
- Who are the top 10 contributors in your data? The top 10 recipients?
- Make a bar chart with plotly comparing contributions by donor type or recipient type (PAC, individual, etc) and one comparing recipients by the office type they are running for
- If you have multiple years, are they all similar? If not, is the difference explicable (maybe by election schedules)
    - Have an option to toggle at the top of the notebook to use different years.
    - Utility functions to import into the jupyter notebook, should be generalizable

----

##### Question 1: 
- For each column, what are the contents of it? How many blanks or nulls are there? What is the format? If there it is one of several types, what are those types?

In [7]:
merged_contribution_df.dtypes

doc_seq_no           int64
page_no              int64
contribution_id      int64
cont_detail_id       int64
doc_stmnt_year       int64
doc_type_desc       object
com_legal_name      object
common_name         object
cfr_com_id         float64
com_type            object
can_first_name      object
can_last_name       object
contribtype         object
f_name              object
l_name_or_org       object
address             object
city                object
state               object
zip                 object
occupation          object
employer            object
received_date       object
amount             float64
aggregate           object
extra_desc         float64
dtype: object

In [8]:
merged_expenditure_df.dtypes

doc_seq_no            int64
expenditure_type     object
gub_account_type     object
gub_elec_type        object
page_no               int64
expense_id            int64
detail_id             int64
doc_stmnt_year        int64
doc_type_desc        object
com_legal_name       object
common_name          object
cfr_com_id          float64
com_type             object
schedule_desc        object
exp_desc             object
purpose              object
extra_desc           object
f_name               object
lname_or_org         object
address              object
city                 object
state                object
zip                  object
exp_date             object
amount              float64
state_loc            object
supp_opp            float64
can_or_ballot        object
county               object
debt_payment         object
vend_name            object
vend_addr            object
vend_city            object
vend_state           object
vend_zip             object
gotv_ink_ind        

In [9]:
contritbution_null_percentage = (
    merged_contribution_df.isna().mean() * 100
).reset_index()
contritbution_null_percentage.columns = ["Column Name", "Missing Percentage"]

# The percentage of null values in each column of the contribution data is as follows below
display(contritbution_null_percentage)

Unnamed: 0,Column Name,Missing Percentage
0,doc_seq_no,0.0
1,page_no,0.0
2,contribution_id,0.0
3,cont_detail_id,0.0
4,doc_stmnt_year,0.0
5,doc_type_desc,0.0
6,com_legal_name,0.0
7,common_name,0.0
8,cfr_com_id,0.0
9,com_type,0.0


In [10]:
expenditure_null_percentage = (merged_expenditure_df.isna().mean() * 100).reset_index()
expenditure_null_percentage.columns = ["Column Name", "Missing Percentage"]

# The percentage of null values in each column of the expenditure data is as follows below
display(expenditure_null_percentage)

Unnamed: 0,Column Name,Missing Percentage
0,doc_seq_no,0.0
1,expenditure_type,0.0
2,gub_account_type,0.0
3,gub_elec_type,0.0
4,page_no,0.0
5,expense_id,0.0
6,detail_id,0.0
7,doc_stmnt_year,0.0
8,doc_type_desc,0.0
9,com_legal_name,0.0


##### Question 2: Are commmittee's relational in the different dataset based on the contribution_id or cfr_com_id ?
- Both the contribution and expenditure datasets are relational based upon the cfr_com_id, as shown below. According to the Secretary of State provided README cfr_com_id is the unique committee ID# of the committee in the Bureau of Elections database.

In [11]:
merged_contribution_df[merged_contribution_df["cfr_com_id"] == 508347.0].head(2)

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
0,472254,0,464000,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,,,DIRECT,TERRY,LOZANO,2034 EDWARD LANE WEST,KIMBALL,MI,48074-0000,FACTORY WORKER,FIAT CHRYSLER AUTOMOBILES N.V.,10/01/2018,15.0,165.0,
1,472254,0,464001,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,,,DIRECT,PATRICK,CLAERHOUT,121 MURPHY DR.,ST. CLAIR,MI,48079-0000,,,10/01/2018,5.0,55.0,


In [12]:
merged_expenditure_df[merged_expenditure_df["cfr_com_id"] == 508347.0].tail(2)

Unnamed: 0,doc_seq_no,expenditure_type,gub_account_type,gub_elec_type,page_no,expense_id,detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,schedule_desc,exp_desc,purpose,extra_desc,f_name,lname_or_org,address,city,state,zip,exp_date,amount,state_loc,supp_opp,can_or_ballot,county,debt_payment,vend_name,vend_addr,vend_city,vend_state,vend_zip,gotv_ink_ind,fundraiser
107636,472254,1B,,,0,716150,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,DIRECT,DIRECT CONTRIBUTIONS,CONTRIBUTION,,,CTE BRIAN PICKELL JUDGE,727 SHADY BROOK LANE,FLUSHING,MI,48433-0000,10/18/2018,2000.0,,,BRIAN PICKELL,.,,,,,,,,
107637,472254,1B,,,0,716153,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,DIRECT,DIRECT CONTRIBUTIONS,CONTRIBUTION,,,JUSTICE FOR ALL - MICHIGAN,6639 CENTURION SUITE 120,LANSING,MI,48917-0000,10/15/2018,100000.0,,,,,,,,,,,,


#### Question 3: Who are the top 10 contributors in your data? The top 10 recipients?

The top individuals and organizations that donate to committees are listed below as well as the top committees receiving contributions. 

In [13]:
# display the top 10 individual contributors from the contribution data
top_10_individual_contrubutors = (
    merged_contribution_df.groupby(["f_name", "l_name_or_org"])["amount"]
    .sum()
    .reset_index()
)
top_10_individual_contrubutors = top_10_individual_contrubutors.sort_values(
    by="amount", ascending=False
).head(10)
display(top_10_individual_contrubutors)

Unnamed: 0,f_name,l_name_or_org,amount
590986,KEVIN,RINKE,10007210.0
836138,PERRY,JOHNSON,7946035.0
971464,SHRI,THANEDAR,7158151.01
911332,RONALD,WEISER,4914475.5
744554,MICHAEL,BLOOMBERG,4020750.0
800616,NISHAD,SINGH,4007150.0
267918,DICK,DEVOS,3947900.0
90962,BETSY,DEVOS,3583875.0
685216,MARIA,DEVOS,2937570.0
877470,RICHARD,UIHLEIN,2755000.0


In [14]:
# display the top 10 organizational contributors from the contribution data
top_10_organizational_contributors = merged_contribution_df[
    merged_contribution_df["f_name"].isnull()
]
top_10_organizational_contributors = (
    top_10_organizational_contributors.groupby("l_name_or_org")["amount"]
    .sum()
    .reset_index()
)
top_10_organizational_contributors = top_10_organizational_contributors.sort_values(
    by="amount", ascending=False
).head(10)
display(top_10_organizational_contributors)

Unnamed: 0,l_name_or_org,amount
76532,SIXTEEN THIRTY FUND,33139000.0
70070,RIGHT TO LIFE OF MICHIGAN,12662350.0
57520,MICHIGAN CATHOLIC CONFERENCE,7616820.0
46988,LEAGUE OF CONSERVATION VOTERS,6399000.0
2209,AMERICAN CIVIL LIBERTIES UNION,6038599.0
78290,STATE VICTORY ACTION,5367500.0
638,ACTION NOW INITIATIVE,5002580.59
63353,OPEN SOCIETY POLICY CENTER,4600000.0
61787,NEXTGEN CLIMATE ACTION,4373238.22
31093,GRETCHEN WHITMER FOR GOVERNOR,3683177.36


In [15]:
# display the 10 recipients (commmittees) from the contribution data
top_10_commiitee_recipients = (
    merged_contribution_df.groupby("com_legal_name")["amount"].sum().reset_index()
)
top_10_commiitee_recipients = top_10_commiitee_recipients.sort_values(
    by="amount", ascending=False
).head(10)
display(top_10_commiitee_recipients)

Unnamed: 0,com_legal_name,amount
1571,GRETCHEN WHITMER FOR GOVERNOR,47376905.79
2423,REPRODUCTIVE FREEDOM FOR ALL,45250239.65
321,CITIZENS TO SUPPORT MI WOMEN AND CHILDREN,21183456.61
1617,HOUSE REPUBLICAN CAMPAIGN COMMITTEE,20985558.66
2375,PROMOTE THE VOTE 2022,20688994.54
2154,MICHIGAN REPUBLICAN PARTY,19398821.24
2519,SENATE REPUBLICAN CAMPAIGN COMMITTEE,16325446.25
2097,MICHIGAN HOUSE DEMOCRATIC FUND,16045083.8
2797,VOTERS NOT POLITICIANS BALLOT COMMITTEE,15986109.71
1106,DEMOCRATIC STATE CENTRAL COMMITTEE,13335520.27


In [16]:
# display the top 10 recipients (candidates) from the contribution data

top_10_recipients = (
    merged_contribution_df.groupby(["can_first_name", "can_last_name"])["amount"]
    .sum()
    .reset_index()
)
top_10_recipients = top_10_recipients.sort_values(by="amount", ascending=False).head(10)
display(top_10_recipients)

Unnamed: 0,can_first_name,can_last_name,amount
471,GRETCHEN,WHITMER,47376905.79
760,KEVIN,RINKE,10493219.07
1344,TUDOR,DIXON,8786024.47
1072,PERRY,JOHNSON,7964951.42
274,DANA,NESSEL,7540998.06
129,BILL,SCHUETTE,7351185.62
596,JOCELYN,BENSON,7222008.32
1231,SHRI,THANEDAR,7144039.22
155,BRIAN,CALLEY,3918020.27
513,JAMES,CRAIG,3259128.25


- The top 10 expenditures for individuals and organizations are shown below, as well as top expenditures supporting or opposing a candidate or ballot issue

In [17]:
# Top Expenditures supporting a Candidate or Ballot Issue

top_10_individual_recipients = merged_expenditure_df[
    (merged_expenditure_df["lname_or_org"].notnull())
    & (merged_expenditure_df["f_name"].notnull())
]
top_10_individual_recipients = (
    top_10_individual_recipients.groupby(["f_name", "lname_or_org", "supp_opp"])[
        "amount"
    ]
    .sum()
    .reset_index()
)
top_10_individual_recipients = top_10_individual_recipients.sort_values(
    by="amount", ascending=False
).head(10)
display(top_10_individual_recipients)

Unnamed: 0,f_name,lname_or_org,supp_opp,amount
508,HEATHER,RICKETTS,1.0,78985.66
341,DENNIS,LENNOX,1.0,72217.11
721,KATHRYN,FAHEY,1.0,60235.15
534,JACK,JENNINGS,1.0,58293.37
171,BRIAN,CALLEY ^,1.0,58000.0
1183,SCOTT,DREXEL,1.0,51488.27
561,JAMES,LANCASTER,1.0,45817.0
812,LENORE,GOLDMAN,1.0,39219.42
702,KAI,PAIGE,1.0,37540.74
120,AURELIUS,CHRISTIAN,1.0,36250.0


In [18]:
# Top Expenditures opposing an Office or Ballot Issue


top_10_individual_recipients_opposing = merged_expenditure_df[
    (merged_expenditure_df["lname_or_org"].notnull())
    & (merged_expenditure_df["f_name"].notnull())
    & (merged_expenditure_df["supp_opp"] == 2.0)
]
top_10_individual_recipients_opposing = (
    top_10_individual_recipients_opposing.groupby(
        ["f_name", "lname_or_org", "supp_opp"]
    )["amount"]
    .sum()
    .reset_index()
)
top_10_individual_recipients_opposing = (
    top_10_individual_recipients_opposing.sort_values(
        by="amount", ascending=False
    ).head(10)
)
display(top_10_individual_recipients_opposing)

Unnamed: 0,f_name,lname_or_org,supp_opp,amount
35,MICHAEL,GILMORE ESQ.,2.0,18532.26
32,LINDA,KOJIRO,2.0,5550.0
10,CLAUDIA,RODRIGUEZ,2.0,2556.9
11,CODY,WETHERILL,2.0,1597.74
19,GRETCHEN,WHITMER,2.0,1197.18
25,JENNIFER G,BARKER,2.0,950.0
12,DALE,KILDEE,2.0,579.14
21,HILLARY,SCHOLTEN,2.0,502.27
7,BRADLEY,OCONNER,2.0,500.0
28,JONATHAN,MOY,2.0,458.8


In [19]:
# Top recipients of expenditures for organizations

top_10_org_recipients = merged_expenditure_df[merged_expenditure_df["f_name"].isnull()]
top_10_org_recipients = (
    top_10_org_recipients.groupby(["lname_or_org", "purpose"])["amount"]
    .sum()
    .reset_index()
)
top_10_org_recipients = top_10_org_recipients.sort_values(
    by="amount", ascending=False
).head(10)
display(top_10_org_recipients)

Unnamed: 0,lname_or_org,purpose,amount
68303,KELLY SCOTT & MADISON INC,MEDIA AD PLACEMENT,17453108.0
54063,GMMB,MEDIA BUY,16954706.0
86686,NATIONAL PETITION MANAGEMENT,PETITIONS; SIGNATURE GATHERING,9901099.24
82390,MICHIGAN DEMOCRATIC STATE CENTRAL CO,CONTRIBUTION,8157143.18
54071,GMMB,TV ADVERTISING,7477029.48
46123,FIELD WORKS LLC,PETITION GATHERING EXPENSES,7340198.76
69753,KNOW-HOW STRATEGIES,MEDIA BUY,6671420.08
111062,TARGETED PLATFORM MEDIA LLC,ADVERTISING,6551748.27
109018,STRATEGIC MEDIA PLACEMENT INC,PLACED MEDIA,5224802.0
46151,FIELDWORKS LLC,SIGNATURE GATHERING,5209286.84


#### Question 4:
- Make a bar chart with plotly comparing contributions by donor type or recipient type (PAC, individual, etc) and one comparing recipients by the office type they are running for

MI Schedule Types (Schedule types to categorize different types of expenditures)
- Direct (Itimized Direct Expenditures)
- SUPP (Supplemental Expenditures)
- Independent (Independent Expenditures)
- Office (Office Expense Disbursements)
- INKIND (In-Kind Expenditure)
- GOTV (Get-Out-The-Vote Activity)

MI Committee Tyoes
- DIS (District Party Committee)
- STA (State Party Committee)
- BAL (Ballot Question Commmittee)
- COU (County Part Committee)
- POL (Political Action Committee)
- GUB (Gubernatorial Commmittee)
- CAN (Candidate Committee)
- IND (Independent Political Action Committee)
    

In [20]:
schedule_type_count = (
    merged_expenditure_df["schedule_desc"].value_counts().reset_index()
)
schedule_type_count.columns = ["Schedule_Type", "Count"]

In [21]:
fig = px.bar(
    schedule_type_count,
    x="Schedule_Type",
    y="Count",
    title="Michigan Expenditures by Schedule Type 2018-2023",
    text="Count",
)
fig.update_layout(
    xaxis_title="Schedule Types",
    yaxis_title="2018-2023 Count",
    xaxis={"categoryorder": "total ascending"},
)
fig.show()

As shown in the visual above, a large majority of the expenditure schedule types are direct as opposedd to GOTV and INKIND. This visual coveres 2018 to 2023.

In [22]:
com_type_count = merged_expenditure_df["com_type"].value_counts().reset_index()
com_type_count.columns = ["Committee_Type", "Count"]

In [23]:
fig = px.bar(
    com_type_count,
    x="Committee_Type",
    y="Count",
    title="Michigan Contributons by Committee Type 2018-2023",
    text="Count",
)
fig.update_layout(
    xaxis_title="Committee Types",
    yaxis_title="1999-2023 Count",
    xaxis={"categoryorder": "total ascending"},
)
fig.show()

As shown in the visual above, a large majority of campaign expenditures by committee type are Candidate Committees and Indpendent Expenditure Committees. This visual covers 2018 to 2023.

In [25]:
# bar chart with contribution type
contribution_type_count = (
    merged_contribution_df["contribtype"].value_counts().reset_index()
)
contribution_type_count.columns = ["Cont_Type", "Count"]

In [26]:
# Plot 2018-2023 Contributions by Type
fig = px.bar(
    contribution_type_count,
    x="Cont_Type",
    y="Count",
    title="Michigan Committee Contributions Type",
    text="Count",
)
fig.update_layout(
    xaxis_title="Contribution Types",
    yaxis_title="2018-2023 Count",
    xaxis={"categoryorder": "total ascending"},
)
fig.show()

Plots for Contribution 2018 - 2023

In [28]:
create_all_plots(merged_expenditure_df, merged_contribution_df)