In [7]:
import pandas as pd
import os
import sys
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path


# Parent of the notebook's working directory; it is added to the import search
# path, presumably so the `utils` imports below resolve.
BASE_PATH = Path.cwd().parent
# sys.path entries must be plain strings: the import system ignores entries of
# any other type, so appending the Path object itself has no effect.
sys.path.append(str(BASE_PATH))

from utils.preprocess_mi_campaign_data import (
    read_expenditure_data,
    read_contribution_data,
    update_expenditure_plots,
    update_contribution_plots,
)
from utils.constants import (
    MI_EXP_FILEPATH,
    MI_CON_FILEPATH,
    MI_EXPENDITURE_COLUMNS,
    MI_CONTRIBUTION_COLUMNS,
)

### Michigan Campaign Expenditure & Contribution Data 2018 - 2023 Exploratory Data Analysis

#### Task 1: Read in the Datasets and merge into one Pandas DataFrame

In [2]:
# Read every file in the expenditure and contribution data directories into
# its own DataFrame; the per-file frames are concatenated in a later cell.
#
# - The directory listing is sorted because os.listdir() returns entries in
#   arbitrary order, which would make positional lookups such as
#   `campaign_expenditure_dataframe_lst[0]` differ between runs and machines.
# - os.path.join() replaces string `+` so the path is built correctly whether
#   or not the directory constant ends with a path separator.
campaign_expenditure_dataframe_lst = []
campaign_contribution_dataframe_lst = []

for file in sorted(os.listdir(MI_EXP_FILEPATH)):
    filepath = os.path.join(MI_EXP_FILEPATH, file)
    campaign_expenditure_dataframe_lst.append(
        read_expenditure_data(filepath, MI_EXPENDITURE_COLUMNS)
    )

for file in sorted(os.listdir(MI_CON_FILEPATH)):
    filepath = os.path.join(MI_CON_FILEPATH, file)
    campaign_contribution_dataframe_lst.append(
        read_contribution_data(filepath, MI_CONTRIBUTION_COLUMNS)
    )

In [3]:
# Let wide frames show up to 100 columns, then preview the first five rows of
# the first expenditure file that was loaded.
pd.set_option("display.max_columns", 100)
campaign_expenditure_dataframe_lst[0].head(n=5)

Unnamed: 0,doc_seq_no,expenditure_type,gub_account_type,gub_elec_type,page_no,expense_id,detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,schedule_desc,exp_desc,purpose,extra_desc,f_name,lname_or_org,address,city,state,zip,exp_date,amount,state_loc,supp_opp,can_or_ballot,county,debt_payment,vend_name,vend_addr,vend_city,vend_state,vend_zip,gotv_ink_ind,fundraiser
0,488507,1B,,,0,4100,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,COMPUTER COSTS,CHUNIQ INPOWER GOSQ.COM,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,11/15/2019,354.28,,,,,,,,,,,,
1,488507,1B,,,0,4101,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,PRINT ADVERTISING,SAWICKI & SON 313-962-2725,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,12/02/2019,500.0,,,,,,,,,,,,
2,488507,1B,,,0,4108,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,"MAILING,POSTAGE,BULK RATE",STAPLES,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,11/26/2019,101.61,,,,,,,,,,,,
3,488507,1B,,,0,4109,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,"MAILING,POSTAGE,BULK RATE",STAPLES,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,11/26/2019,131.18,,,,,,,,,,,,X
4,488507,1B,,,0,4110,0,2020,ANNUAL CS,COMMITTEE TO ELECT CHARIS LEE FOR STATE REPRES...,COMMITTEE TO ELECT CHARIS LEE FOR ST,519354.0,CAN,DIRECT,"CONSULTATION, RESEARCH",FIELD OPERATIONS,,,COMMITTEE TO ELECT CHARIS LEE,1133 FAIRFAX ST,FLINT,MI,48505-0000,12/03/2019,100.65,,,,,,,,,,,,


In [4]:
# Stack the per-file frames row-wise into one frame per dataset. Each file's
# original row labels are kept (ignore_index=False), so index values can repeat
# across files.
merged_expenditure_df = pd.concat(
    objs=campaign_expenditure_dataframe_lst, axis=0, ignore_index=False
)
merged_contribution_df = pd.concat(
    objs=campaign_contribution_dataframe_lst, axis=0, ignore_index=False
)

In [None]:
# Convert each dataset's `amount` column to a numeric dtype. Values that cannot
# be parsed become NaN (errors="coerce") instead of raising.
merged_contribution_df["amount"] = pd.to_numeric(
    merged_contribution_df["amount"], errors="coerce"
)
# Fix: the expenditure amounts must be derived from the expenditure frame's own
# column. Previously they were overwritten with the contribution frame's
# amounts, a copy-paste slip that corrupted every expenditure total downstream.
merged_expenditure_df["amount"] = pd.to_numeric(
    merged_expenditure_df["amount"], errors="coerce"
)

In [None]:
# Drop the rows (not columns) whose `com_type` holds the value
# "MENOMINEE COUNTY DEMOCRATIC PARTY"; these records create errors downstream.
# NOTE(review): presumably these are misaligned records where a committee name
# landed in the type column - confirm against the raw files.
malformed_com_type = "MENOMINEE COUNTY DEMOCRATIC PARTY"

merged_contribution_df = merged_contribution_df.loc[
    merged_contribution_df["com_type"].ne(malformed_com_type)
]
merged_expenditure_df = merged_expenditure_df.loc[
    merged_expenditure_df["com_type"].ne(malformed_com_type)
]

#### Task 2: Answer the Following Questions
- For each column, what are the contents of it? How many blanks or nulls are there? What is the format? If it is one of several types, what are those types?
    - Percentage of nulls/blanks
- Is the dataset relational (are there multiple tables that relate to each other)?
- Who are the top 10 contributors in your data? The top 10 recipients?
- Make a bar chart with plotly comparing contributions by donor type or recipient type (PAC, individual, etc) and one comparing recipients by the office type they are running for
- If you have multiple years, are they all similar? If not, is the difference explicable (maybe by election schedules)
    - Have an option to toggle at the top of the notebook to use different years.
    - Utility functions to import into the jupyter notebook, should be generalizable

----

##### Question 1: 
- For each column, what are the contents of it? How many blanks or nulls are there? What is the format? If it is one of several types, what are those types?

In [5]:
# Show the dtype pandas assigned to each column of the merged contribution data.
merged_contribution_df.dtypes

doc_seq_no            int64
expenditure_type     object
gub_account_type     object
gub_elec_type        object
page_no               int64
expense_id            int64
detail_id             int64
doc_stmnt_year        int64
doc_type_desc        object
com_legal_name       object
common_name          object
cfr_com_id          float64
com_type             object
schedule_desc        object
exp_desc             object
purpose              object
extra_desc           object
f_name               object
lname_or_org         object
address              object
city                 object
state                object
zip                  object
exp_date             object
amount               object
state_loc            object
supp_opp            float64
can_or_ballot        object
county               object
debt_payment         object
vend_name            object
vend_addr            object
vend_city            object
vend_state           object
vend_zip             object
gotv_ink_ind        

In [None]:
# Show the dtype pandas assigned to each column of the merged expenditure data.
merged_expenditure_df.dtypes

In [8]:
# Share of missing values per contribution column, expressed as a percentage.
# isna() only flags NaN/None, so empty strings are not counted as missing here.
contritbution_null_percentage = (
    merged_contribution_df.isna()
    .mean()
    .mul(100)
    .rename_axis("Column Name")
    .reset_index(name="Missing Percentage")
)

# The percentage of null values in each column of the contribution data is as follows below
display(contritbution_null_percentage)

Unnamed: 0,Column Name,Missing Percentage
0,doc_seq_no,0.0
1,expenditure_type,0.0
2,gub_account_type,0.0
3,gub_elec_type,0.0
4,page_no,0.0
5,expense_id,0.0
6,detail_id,0.0
7,doc_stmnt_year,0.0
8,doc_type_desc,0.0
9,com_legal_name,0.010585


In [None]:
# Share of missing values per expenditure column, expressed as a percentage.
# isna() only flags NaN/None, so empty strings are not counted as missing here.
expenditure_null_percentage = (
    merged_expenditure_df.isna()
    .mean()
    .mul(100)
    .rename_axis("Column Name")
    .reset_index(name="Missing Percentage")
)

# The percentage of null values in each column of the expenditure data is as follows below
display(expenditure_null_percentage)

##### Question 2: Are committees related across the different datasets based on the contribution_id or cfr_com_id?
- Both the contribution and expenditure datasets are relational based upon the cfr_com_id, as shown below. According to the README provided by the Secretary of State, cfr_com_id is the unique committee ID number of the committee in the Bureau of Elections database.

In [20]:
# First two contribution rows filed under committee ID 508347, used to show that
# the same cfr_com_id appears in both datasets.
merged_contribution_df.loc[merged_contribution_df["cfr_com_id"].eq(508347.0)].head(2)

Unnamed: 0,doc_seq_no,expenditure_type,gub_account_type,gub_elec_type,page_no,expense_id,detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,schedule_desc,exp_desc,purpose,extra_desc,f_name,lname_or_org,address,city,state,zip,exp_date,amount,state_loc,supp_opp,can_or_ballot,county,debt_payment,vend_name,vend_addr,vend_city,vend_state,vend_zip,gotv_ink_ind,fundraiser
8915,491133,1B,,,0,1138926,0,2020,JANUARY QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,DIRECT,DIRECT CONTRIBUTIONS,CONTRIBUTION,,,FRIENDS OF MAUREEN BROSNAN,11320 ARDEN,LIVONIA,MI,48150-0000,10/24/2019,6000.0,,,MAUREEN BROSNAN,WAYNE,,,,,,,,
8916,491133,1B,,,0,1138931,0,2020,JANUARY QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,DIRECT,DIRECT CONTRIBUTIONS,CONTRIBUTION,,,CTE DENNARD SHAW,1041 HARRISON,INKSTER,MI,48141-0000,10/28/2019,500.0,,,DENNARD SHAW,WAYNE,,,,,,,,


In [19]:
# Last two expenditure rows filed under committee ID 508347, used to show that
# the same cfr_com_id appears in both datasets.
merged_expenditure_df.loc[merged_expenditure_df["cfr_com_id"].eq(508347.0)].tail(2)

Unnamed: 0,doc_seq_no,expenditure_type,gub_account_type,gub_elec_type,page_no,expense_id,detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,schedule_desc,exp_desc,purpose,extra_desc,f_name,lname_or_org,address,city,state,zip,exp_date,amount,state_loc,supp_opp,can_or_ballot,county,debt_payment,vend_name,vend_addr,vend_city,vend_state,vend_zip,gotv_ink_ind,fundraiser
107636,472254,1B,,,0,716150,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,DIRECT,DIRECT CONTRIBUTIONS,CONTRIBUTION,,,CTE BRIAN PICKELL JUDGE,727 SHADY BROOK LANE,FLUSHING,MI,48433-0000,10/18/2018,2000.0,,,BRIAN PICKELL,.,,,,,,,,
107637,472254,1B,,,0,716153,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,DIRECT,DIRECT CONTRIBUTIONS,CONTRIBUTION,,,JUSTICE FOR ALL - MICHIGAN,6639 CENTURION SUITE 120,LANSING,MI,48917-0000,10/15/2018,100000.0,,,,,,,,,,,,


#### Question 3: Who are the top 10 contributors in your data? The top 10 recipients?

The top individuals and organizations that donate to committees are listed below as well as the top committees receiving contributions. 

In [None]:
# display the top 10 individual contributors from the contribution data
# Amounts are totalled per (first name, last name) pair; rows missing either
# name are left out by groupby's default handling of NaN keys.
top_10_individual_contrubutors = (
    merged_contribution_df.groupby(["f_name", "l_name_or_org"])["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
    .head(10)
)
display(top_10_individual_contrubutors)

In [None]:
# display the top 10 organizational contributors from the contribution data
# A contributor is treated as an organization when no first name is recorded.
top_10_organizational_contributors = (
    merged_contribution_df.loc[merged_contribution_df["f_name"].isna()]
    .groupby("l_name_or_org")["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
    .head(10)
)
display(top_10_organizational_contributors)

In [None]:
# display the top 10 recipients (committees) from the contribution data
# Contribution amounts are totalled per committee legal name, largest first.
top_10_commiitee_recipients = (
    merged_contribution_df.groupby("com_legal_name")["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
    .head(10)
)
display(top_10_commiitee_recipients)

In [None]:
# display the top 10 recipients (candidates) from the contribution data
# Amounts are totalled per candidate (first + last name); rows with a missing
# candidate name are left out by groupby's default handling of NaN keys.
top_10_recipients = (
    merged_contribution_df.groupby(["can_first_name", "can_last_name"])["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
    .head(10)
)
display(top_10_recipients)

- The top 10 expenditures for individuals and organizations are shown below, as well as top expenditures supporting or opposing a candidate or ballot issue

In [9]:
# Top Expenditures supporting a Candidate or Ballot Issue
#
# Keep only rows that name an individual (first and last name both present) and
# are flagged as supporting. Without the supp_opp filter this ranking mixed
# supporting and opposing rows, unlike the companion "opposing" cell.
# NOTE(review): supp_opp == 1.0 is taken to mean "support" because the opposing
# cell filters on 2.0 - confirm against the Secretary of State README.
top_10_individual_recipients = merged_expenditure_df[
    (merged_expenditure_df["lname_or_org"].notnull())
    & (merged_expenditure_df["f_name"].notnull())
    & (merged_expenditure_df["supp_opp"] == 1.0)
]
# Total the amounts per person, then keep the ten largest totals.
top_10_individual_recipients = (
    top_10_individual_recipients.groupby(["f_name", "lname_or_org", "supp_opp"])[
        "amount"
    ]
    .sum()
    .reset_index()
)
top_10_individual_recipients = top_10_individual_recipients.sort_values(
    by="amount", ascending=False
).head(10)
display(top_10_individual_recipients)

Unnamed: 0,f_name,lname_or_org,supp_opp,amount
508,HEATHER,RICKETTS,1.0,78985.66
341,DENNIS,LENNOX,1.0,72217.11
721,KATHRYN,FAHEY,1.0,60235.15
534,JACK,JENNINGS,1.0,58293.37
171,BRIAN,CALLEY ^,1.0,58000.0
1183,SCOTT,DREXEL,1.0,51488.27
561,JAMES,LANCASTER,1.0,45817.0
812,LENORE,GOLDMAN,1.0,39219.42
702,KAI,PAIGE,1.0,37540.74
120,AURELIUS,CHRISTIAN,1.0,36250.0


In [10]:
# Top Expenditures opposing an Office or Ballot Issue
# Rows must name an individual (first and last name present) and carry
# supp_opp == 2.0; amounts are then totalled per person, largest first.
names_an_individual = (
    merged_expenditure_df["lname_or_org"].notna()
    & merged_expenditure_df["f_name"].notna()
)
flagged_opposing = merged_expenditure_df["supp_opp"].eq(2.0)

top_10_individual_recipients_opposing = (
    merged_expenditure_df.loc[names_an_individual & flagged_opposing]
    .groupby(["f_name", "lname_or_org", "supp_opp"])["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
    .head(10)
)
display(top_10_individual_recipients_opposing)

Unnamed: 0,f_name,lname_or_org,supp_opp,amount
35,MICHAEL,GILMORE ESQ.,2.0,18532.26
32,LINDA,KOJIRO,2.0,5550.0
10,CLAUDIA,RODRIGUEZ,2.0,2556.9
11,CODY,WETHERILL,2.0,1597.74
19,GRETCHEN,WHITMER,2.0,1197.18
25,JENNIFER G,BARKER,2.0,950.0
12,DALE,KILDEE,2.0,579.14
21,HILLARY,SCHOLTEN,2.0,502.27
7,BRADLEY,OCONNER,2.0,500.0
28,JONATHAN,MOY,2.0,458.8


In [11]:
# Top recipients of expenditures for organizations
# A payee is treated as an organization when no first name is recorded; totals
# are computed per (organization, purpose) pair, largest first.
top_10_org_recipients = (
    merged_expenditure_df.loc[merged_expenditure_df["f_name"].isna()]
    .groupby(["lname_or_org", "purpose"])["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
    .head(10)
)
display(top_10_org_recipients)

Unnamed: 0,lname_or_org,purpose,amount
68303,KELLY SCOTT & MADISON INC,MEDIA AD PLACEMENT,17453108.0
54063,GMMB,MEDIA BUY,16954706.0
86686,NATIONAL PETITION MANAGEMENT,PETITIONS; SIGNATURE GATHERING,9901099.24
82390,MICHIGAN DEMOCRATIC STATE CENTRAL CO,CONTRIBUTION,8157143.18
54071,GMMB,TV ADVERTISING,7477029.48
46123,FIELD WORKS LLC,PETITION GATHERING EXPENSES,7340198.76
69753,KNOW-HOW STRATEGIES,MEDIA BUY,6671420.08
111062,TARGETED PLATFORM MEDIA LLC,ADVERTISING,6551748.27
109018,STRATEGIC MEDIA PLACEMENT INC,PLACED MEDIA,5224802.0
46151,FIELDWORKS LLC,SIGNATURE GATHERING,5209286.84


#### Question 4:
- Make a bar chart with plotly comparing contributions by donor type or recipient type (PAC, individual, etc) and one comparing recipients by the office type they are running for

MI Schedule Types (Schedule types to categorize different types of expenditures)
- Direct (Itemized Direct Expenditures)
- SUPP (Supplemental Expenditures)
- Independent (Independent Expenditures)
- Office (Office Expense Disbursements)
- INKIND (In-Kind Expenditure)
- GOTV (Get-Out-The-Vote Activity)

MI Committee Types
- DIS (District Party Committee)
- STA (State Party Committee)
- BAL (Ballot Question Committee)
- COU (County Party Committee)
- POL (Political Action Committee)
- GUB (Gubernatorial Committee)
- CAN (Candidate Committee)
- IND (Independent Political Action Committee)
    

In [12]:
# Number of expenditure rows per schedule type, as a two-column table
# (Schedule_Type, Count) ready for plotting.
schedule_type_count = (
    merged_expenditure_df["schedule_desc"]
    .value_counts()
    .rename_axis("Schedule_Type")
    .reset_index(name="Count")
)

In [13]:
# Bar chart of expenditure row counts per schedule type; bars are ordered from
# the smallest to the largest total and labelled with their count.
fig = px.bar(
    data_frame=schedule_type_count,
    x="Schedule_Type",
    y="Count",
    text="Count",
    title="Michigan Expenditures by Schedule Type 2018-2023",
)
fig.update_layout(
    xaxis={
        "title": {"text": "Schedule Types"},
        "categoryorder": "total ascending",
    },
    yaxis={"title": {"text": "2018-2023 Count"}},
)
fig.show()

As shown in the visual above, a large majority of the expenditure schedule types are direct as opposed to GOTV and INKIND. This visual covers 2018 to 2023.

In [14]:
# Number of expenditure rows per committee type, as a two-column table
# (Committee_Type, Count) ready for plotting.
com_type_count = (
    merged_expenditure_df["com_type"]
    .value_counts()
    .rename_axis("Committee_Type")
    .reset_index(name="Count")
)

In [15]:
# Bar chart of expenditure row counts per committee type; bars are ordered from
# the smallest to the largest total and labelled with their count.
# The labels were corrected: the counts come from the expenditure data, so the
# title now says "Expenditures" (it read "Contributons"), and the y-axis range
# now matches the 2018-2023 period used in the title (it read "1999-2023").
fig = px.bar(
    com_type_count,
    x="Committee_Type",
    y="Count",
    title="Michigan Expenditures by Committee Type 2018-2023",
    text="Count",
)
fig.update_layout(
    xaxis_title="Committee Types",
    yaxis_title="2018-2023 Count",
    xaxis={"categoryorder": "total ascending"},
)
fig.show()

As shown in the visual above, a large majority of campaign expenditures by committee type are Candidate Committees and Independent Expenditure Committees. This visual covers 2018 to 2023.

##### Plotting Contribution Type and Amounts by Year

In [16]:
# Create the dropdown

# Statement years present in the expenditure data, in ascending order.
years = sorted(merged_expenditure_df["doc_stmnt_year"].unique())

# Dropdown widget that starts on the earliest year.
# `button_style` was removed: it is not a Dropdown trait (it belongs to the
# button-style widgets), so it was never applied and only exercised the
# "unrecognized arguments" path in traitlets.
expenditure_year_selector = widgets.Dropdown(
    options=years,
    value=years[0],
    description="Select Year: ",
    disabled=False,
)

#### Select a year below and rerun the cells

In [17]:
# Select a year
# Renders the expenditure year dropdown as this cell's output.
display(expenditure_year_selector)

Dropdown(description='Select Year: ', options=(2018, 2019, 2020, 2021, 2022, 2023), value=2018)

In [18]:
# Draw the per-year expenditure plots using the helper imported from
# utils.preprocess_mi_campaign_data.
# NOTE(review): presumably the helper reads the dropdown's current value to
# filter merged_expenditure_df - confirm against its definition.
update_expenditure_plots(expenditure_year_selector, merged_expenditure_df)

---

In [None]:
# bar chart with contribution type
# Number of contribution rows per contribution type, as a two-column table
# (Cont_Type, Count) ready for plotting.
contribution_type_count = (
    merged_contribution_df["contribtype"]
    .value_counts()
    .rename_axis("Cont_Type")
    .reset_index(name="Count")
)

In [None]:
# Plot 2018-2023 Contributions by Type
# Bars are ordered from the smallest to the largest total and labelled with
# their count.
fig = px.bar(
    data_frame=contribution_type_count,
    x="Cont_Type",
    y="Count",
    text="Count",
    title="Michigan Committee Contributions Type",
)
fig.update_layout(
    xaxis={
        "title": {"text": "Contribution Types"},
        "categoryorder": "total ascending",
    },
    yaxis={"title": {"text": "2018-2023 Count"}},
)
fig.show()

#### Select a year below and rerun the cells

In [None]:
# Create the dropdown

# Statement years present in the contribution data, in ascending order.
years = sorted(merged_contribution_df["doc_stmnt_year"].unique())

# Dropdown widget that starts on the earliest year.
# `button_style` was removed: it is not a Dropdown trait (it belongs to the
# button-style widgets), so it was never applied and only exercised the
# "unrecognized arguments" path in traitlets.
contribution_year_selector = widgets.Dropdown(
    options=years,
    value=years[0],
    description="Select Year: ",
    disabled=False,
)

In [None]:
# Select a year
# Renders the contribution year dropdown as this cell's output.
display(contribution_year_selector)

In [None]:
# Graph by Year
# Draws the per-year contribution plots using the helper imported from
# utils.preprocess_mi_campaign_data.
# NOTE(review): presumably the helper reads the dropdown's current value to
# filter merged_contribution_df - confirm against its definition.
update_contribution_plots(contribution_year_selector, merged_contribution_df)