In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import json

In [2]:
filebase = 'data/fbpac-ads-en-US'

print(f'Reading file {filebase}.csv...', end='', flush=True)
df = pd.read_csv(filebase + '.csv')
print(' done')

Reading file data/fbpac-ads-en-US.csv... done


In [3]:
# Columns to keep
columns = ['political',
           'not_political',
           'title',
           'message',
           'created_at',
           'updated_at',
           'impressions',
           'political_probability',
           'targets',
           'advertiser',
           'entities',
           'lower_page',
           'paid_for_by',
           'targetedness',
           'listbuilding_fundraising_proba']

print('Keeping columns:')
for column in columns:
    print('\t', column)

Keeping columns:
	 political
	 not_political
	 title
	 message
	 created_at
	 updated_at
	 impressions
	 political_probability
	 targets
	 advertiser
	 entities
	 lower_page
	 paid_for_by
	 targetedness
	 listbuilding_fundraising_proba


In [4]:
# Reduce dataset
df = df[columns]

print("Removing HTML from 'message'")
# Remove HTML from 'message'
df['message'] = df['message'].str.replace('<[^<]+?>', '')

Removing HTML from 'message'


In [5]:
n = df.shape[0]

print("Parsing 'targets' column...", end='', flush=True)
# Parse the 'targets' column
targets = {
    'Gender': [''] * n,
    'Age': [''] * n,
    'Retargeting': [''] * n,
    'Interest': [''] * n,
    'Segment': [''] * n,
    'State': [''] * n,
    'List': [''] * n,
    'Engaged with Content': [''] * n,
    'Language': [''] * n,
    'Website': [''] * n,
    'City': [''] * n,
    'Activity on the Facebook Family': [''] * n,
    'MaxAge': [''] * n,
    'Like': [''] * n,
    'MinAge': [''] * n,
    'RegionTarget': [''] * n,
    'Agency': [''] * n
}

for i, target in enumerate(df['targets']):
    if pd.isna(target):
        continue
    data = json.loads(target)
    for datum in data:
        col = datum['target']
        # Region exists in both targets and entities
        if col == 'Region':
            col = 'RegionTarget'
        if 'segment' in datum:
            val = datum['segment']
        else:
            val = '1'
        targets[col][i] = val

target_df = pd.DataFrame.from_dict(targets)
print(' done')

Parsing 'targets' column... done


In [6]:
print("Parsing 'entities' column...", end='', flush=True)
# Parse the 'entities' column
entity_types = {
    'Organization': [''] * n,
    'Event': [''] * n,
    'Law': [''] * n,
    'RegionEntity': [''] * n,
    'Group': [''] * n,
    'Location': [''] * n,
    'Facility': [''] * n,
    'Person': [''] * n
}

for i, entity in enumerate(df['entities']):
    if pd.isna(entity):
        continue
    data = json.loads(entity)
    for datum in data:
        col = datum['entity_type']
        if col == 'Region':
            col = 'RegionEntity'
        val = datum['entity']
        entity_types[col][i] = val

entity_df = pd.DataFrame.from_dict(entity_types)
print(' done')

Parsing 'entities' column... done


In [7]:
age_bins = {
    '13-17': [0] * n,
    '18-34': [0] * n,
    '35-49': [0] * n,
    '50-64': [0] * n,
    '65+': [0] * n
}

In [8]:
# Combine original dataframe with newly created, parsed dataframes
df = pd.concat([df, target_df, entity_df], axis=1, sort=False)

age_ranges = {
    '13-17': {'min': 13, 'max': 17},
    '18-34': {'min': 18, 'max': 34},
    '35-49': {'min': 35, 'max': 49},
    '50-64': {'min': 50, 'max': 64},
    '65+': {'min': 65, 'max': 1000},
}

print('Moving ages into bins...', end='', flush=True)
for i, (min_age, max_age) in enumerate(zip(df['MinAge'], df['MaxAge'])):
    if pd.isna(min_age):
        min_age = 0
    if pd.isna(max_age):
        max_age = 1000
    try:
    	min_age = float(min_age)
    except ValueError:
        min_age = 0.0
    try:
    	max_age = float(max_age)
    except ValueError:
        max_age = 1000
    for key, val in age_ranges.items():
        if min_age <= val['max'] and max_age >= val['min']:
            age_bins[key][i] = 1
print(' Done')

Moving ages into bins... Done


In [9]:
df = pd.concat([df, pd.DataFrame.from_dict(age_bins)], axis=1, sort=False)

# Remove parsed and unneeded columns
df.drop('targets', axis=1, inplace=True)
df.drop('entities', axis=1, inplace=True)
df.drop('List', axis=1, inplace=True)
df.drop('Engaged with Content', axis=1, inplace=True)
df.drop('Age', axis=1, inplace=True)

In [10]:
Created_AT_Year = [int(x[0:4]) for x in df.created_at]
df['Created_At_Year'] = Created_AT_Year

Created_AT_Month = [int(x[5:7]) for x in df.created_at]
df['Created_At_Month'] = Created_AT_Month

Updated_AT_Year = [int(x[0:4]) for x in df.updated_at]
df['Updated_At_Year'] = Updated_AT_Year

Updated_AT_Month = [int(x[5:7]) for x in df.updated_at]
df['Updated_At_Month'] = Updated_AT_Month

#df.drop(columns=['created_at', 'updated_at'], inplace=True)

df['lower_page'] = df['lower_page'].str.replace('https://www.facebook.com/', '')
df['lower_page'] = df['lower_page'].str.replace('/', '')

In [11]:
df['political_probability'].value_counts(bins=4)

(0.75, 1.0]          148206
(0.5, 0.75]            7502
(-0.001921, 0.25]      3614
(0.25, 0.5]            3002
Name: political_probability, dtype: int64

In [12]:
political_probability = []
for prob in df['political_probability']:
    if pd.isna(prob):
        political_probability.append(prob)
    elif prob > 0.95:
        political_probability.append(4)
    elif prob > 0.85:
        political_probability.append(3)
    elif prob > 0.75:
        political_probability.append(2)
    elif prob > 0.50:
        political_probability.append(1)
    else:
        political_probability.append(0)

        
df['political_probability_int'] = political_probability

In [13]:
df['listbuilding_fundraising_proba'].value_counts(bins=4)

(-0.00214, 0.308]    49731
(0.308, 0.615]       47516
(0.923, 1.231]       46911
(0.615, 0.923]        8236
Name: listbuilding_fundraising_proba, dtype: int64

In [14]:
political_probability = []
for prob in df['listbuilding_fundraising_proba']:
    if pd.isna(prob):
        political_probability.append(prob)
    elif prob > 0.9:
        political_probability.append(3)
    elif prob > 0.60:
        political_probability.append(2)
    elif prob > 0.30:
        political_probability.append(1)
    else:
        political_probability.append(0)
        
df['fundraising_proba_int'] = political_probability
df.drop('listbuilding_fundraising_proba', axis=1, inplace=True)

In [15]:
is_political = []
is_not_political = []
for i in range(df.shape[0]):
    if df.loc[i, 'political'] > df.loc[i, 'not_political']:
        is_political.append(1)
        is_not_political.append(0)
    elif df.loc[i, 'political'] < df.loc[i, 'not_political']:
        is_political.append(0)
        is_not_political.append(1)
    else:
        is_political.append(0)
        is_not_political.append(0)
    """
    if df.loc[i, 'political'] > df.loc[i, 'not_political']:
        is_political.append(1)
    elif df.loc[i, 'political'] < df.loc[i, 'not_political']:
        is_political.append(0)
    else:
        is_political.append(-1)
    """
        
df['is_political'] = is_political
df['is_not_political'] = is_not_political

In [16]:
df['is_impressions'] = [1 if df.loc[i, 'impressions'] > 0 else 0 for i in range(df.shape[0])]

In [17]:
title_names = df['title'].value_counts()
#list(title_names)
#title_cutoff = sum(1 if x > 200 else 0 for x in list(title_names))
#title_cutoff
print(title_names)

International Rescue Committee    4150
Planned Parenthood Action         2363
Beto O'Rourke                     2272
Jay Inslee                        1935
Planned Parenthood                1849
                                  ... 
Victor Mitchell                      1
Actors' Equity Association           1
The Planet D                         1
Housing Action Illinois              1
Noel Kevin Breen                     1
Name: title, Length: 14087, dtype: int64


In [18]:
#df = df.dropna()
parsed_person = df['Person'].str.replace('’', "'")
parsed_person = parsed_person.str.replace("'s", "")
parsed_person = parsed_person.str.replace("Brett ", "")
parsed_person = parsed_person.str.replace("Kavanaugh", "Brett Kavanaugh")
parsed_person = parsed_person.str.replace(" Abrams", "")
parsed_person = parsed_person.str.replace("Stacey", "Stacey Abrams")
parsed_person = parsed_person.str.replace(" Evers", "")
parsed_person = parsed_person.str.replace("Tony", "Tony Evers")
parsed_person = parsed_person.str.replace("Cruz", "Ted Cruz")
parsed_person = parsed_person.str.replace("Ted Ted Cruz", "Ted Cruz")
parsed_person = parsed_person.str.replace("Jay", "Jay Inslee")
parsed_person = parsed_person.str.replace("Beto", "Beto O'Rourke")
parsed_person = parsed_person.str.replace("Biden", "Joe Biden")
parsed_person = parsed_person.str.replace("Obama", "Barack Obama")
parsed_person = parsed_person.str.replace("Donald J. Trump", "Donald Trump")
df['title'] = df['title'].str.replace("Donald J. Trump", "Donald Trump")
df['title'] = df['title'].str.replace("Planned Parenthood Action", "Planned Parenthood")
df['Organization'] = df['Organization'].str.replace("Planned Parenthood's", "Planned Parenthood")
df['Organization'] = df['Organization'].str.replace('the Democratic Party', 'The Democratic Party')
df['Organization'] = df['Organization'].str.replace('The Democratic Party’s', 'The Democratic Party')
df['Organization'] = df['Organization'].str.replace('Democrats’', 'The Democratic Party')
df['Organization'] = df['Organization'].str.replace("Trump administration's", 'The Trump Administration')
df['Organization'] = df['Organization'].str.replace('the Trump Administration', 'The Trump Administration')
df['Organization'] = df['Organization'].str.replace('The Trump Admin', 'The Trump Administration')
df['Organization'] = df['Organization'].str.replace('The Trump Administrationistration', 'The Trump Administration')
df['Organization'] = df['Organization'].str.replace('the U.S. Supreme Court', 'The Supreme Court')
df['Organization'] = df['Organization'].str.replace('the Supreme Court', 'The Supreme Court')
df['Organization'] = df['Organization'].str.replace('the U.S. Senate', 'Senate')
df['Organization'] = df['Organization'].str.replace('U.S. Senate', 'Senate')
df['Organization'] = df['Organization'].str.replace('the Republican Party', 'GOP')
df['Organization'] = df['Organization'].str.replace('Congressional', 'Congress')
df['Organization'] = df['Organization'].str.replace('House', 'Congress')
df['Organization'] = df['Organization'].str.replace('the U.S. Congress', 'Congress')
df['Organization'] = df['Organization'].str.replace('the Congress of Representatives', 'Congress')

parsed_person.value_counts() # NOTE: SOME OF THE COUNTS CHANGE ACCROSS RUNS

                         95297
Donald Trump             11418
Brett Kavanaugh           1595
Jay Inslee                1310
Paul Ryan                  854
                         ...  
Mario Cilento                1
Barbot                       1
Ellis                        1
Mar 24 - We March Nov        1
http://bit.ly/2FGpRa3        1
Name: Person, Length: 10987, dtype: int64

In [19]:
def examine_col(col_name):
    print(df[col_name].value_counts())
    names_list = df[col_name].value_counts().index.tolist()
    print()
    print(names_list)

In [20]:
def get_person(name, df_col_1, df_col_2):
    person = []
    for i in range(df.shape[0]):
        if df_col_1[i] == name or df_col_2[i] == name:
            person.append(1)
        else:
            person.append(0)
    return person

In [21]:
def get_one_person(name, df_col_1):
    person = []
    for i in range(df.shape[0]):
        if df_col_1[i] == name:
            person.append(1)
        else:
            person.append(0)
    return person

In [22]:
names = ['Donald Trump', 'Brett Kavanaugh', 'Jay Inslee', 'Paul Ryan', 'Barack Obama', 'Beto O\'Rourke', 
         'Ted Cruz', 'Tony Evers', 'Stacey Abrams', 'Joe Biden', 'Kirsten Gillibrand', 'Elizabeth Warren', 
         'Kamala Harris', 'Amy Klobuchar']
for name in names:
    col_name = name.replace(' ', '_')
    df[col_name] = get_person(name, parsed_person, df['title'])


In [23]:
title_names_list = df['title'].value_counts().index.tolist()
title_names_list

['Planned Parenthood',
 'International Rescue Committee',
 "Beto O'Rourke",
 'Jay Inslee',
 'ACLU',
 'Elizabeth Warren',
 'Kirsten Gillibrand',
 'Kamala Harris',
 'Donald Trump',
 'Indivisible Guide',
 'Sierra Club',
 'USA for UNHCR',
 'Doctors Without Borders/ Médecins Sans Frontières (MSF)',
 'Penzeys Spices',
 'Amy Klobuchar',
 'Care2',
 'Everytown for Gun Safety',
 'Michael Bennet',
 'Cathy Myers',
 'MoveOn',
 'Feeding America',
 'World Wildlife Fund',
 'Ocean Conservancy',
 'The Wilderness Society',
 'NARAL Pro-Choice America',
 'Environmental Defense Fund',
 'Tom Steyer',
 'Amnesty International USA',
 'The Nature Conservancy',
 'Greenpeace USA',
 'Julián Castro',
 'MJ for Texas',
 'Defenders of Wildlife',
 'Democratic Party',
 'Cory Booker',
 'CARE',
 'Bernie Sanders',
 'End Citizens United',
 'MoveOn.org',
 'Swing Left',
 'Alexandria Ocasio-Cortez',
 'NRDC',
 'Human Rights Watch',
 'Andrew Janz',
 'National Democratic Redistricting Committee',
 'Randy Bryce',
 'League of Conser

In [24]:
examine_col('Organization')

                      90356
Congress               8285
Senate                 3024
Planned Parenthood     1303
The Supreme Court      1279
                      ...  
Brackets for Good         1
a Matching Grant          1
Adriane                   1
Rise for Climate          1
Solar Panels              1
Name: Organization, Length: 12613, dtype: int64



In [25]:
names = ['International Rescue Committee', 'Planned Parenthood', 'ACLU', 'Congress', 'Senate', 'the Supreme Court', 
         'GOP', 'NRA', 'EPA', 'FBI', 'Citizens United', 'FEC']
for name in names:
    col_name = name.replace(' ', '_')
    df[col_name] = get_person(name, df['title'], df['Organization'])


#df['filtered_title'] = [df.loc[x, 'title'].count() for x in range(df.shape[0])]
#df['filtered_title']
#df['title'].value_counts()
#df = df.dropna()
#titles = df['title']
#titles = titles.dropna()
import numpy as np
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(df[['title']]).toarray()
feature_labels = ohe.categories_

feature_labels = np.array(feature_labels).ravel()

features = pd.DataFrame(feature_arr, columns=feature_labels)
features.shape

In [26]:
examine_col('Group')

                                             129654
Democrats                                      9351
Republicans                                    5590
Americans                                      3695
American                                       2872
                                              ...  
susanamendoza.com/letmeknowwhatyouthink           1
Shalala                                           1
MS-13                                             1
Crowdfund                                         1
Gestapo                                           1
Name: Group, Length: 1055, dtype: int64

['', 'Democrats', 'Republicans', 'Americans', 'American', 'Indivisible', 'Mueller', 'Texans', 'Syrian', 'Californians', 'Floridians', 'Jewish', 'TONIGHT', 'New Yorkers', 'Russian', 'Minnesotans', 'Muslim', 'African', 'Christian', 'Georgians', 'PredictIt', 'Marine', 'Pitch', 'Missourians', 'Coloradans', 'Christians', 'Arizonans', 'Shirt', 'Texan', 'Alaskans', 'Ohioans', 'Dems', 'Conser

In [27]:
df['Group'] = df['Group'].str.replace('American', 'Americans')

names = ['Democrats', 'Republicans', 'Americans']
for name in names:
    col_name = name.replace(' ', '_')
    df[col_name] = get_one_person(name, df['Group'])

In [28]:
df['targetedness'] = [0 if pd.isna(x) else x for x in df['targetedness']]
examine_col('targetedness')

0.0     49825
4.0     48095
3.0     17477
2.0     12301
7.0     10364
1.0      8998
5.0      8801
8.0      3364
6.0      2094
9.0       494
10.0      333
11.0      160
12.0       18
Name: targetedness, dtype: int64

[0.0, 4.0, 3.0, 2.0, 7.0, 1.0, 5.0, 8.0, 6.0, 9.0, 10.0, 11.0, 12.0]


In [29]:
df['targetedness'].value_counts(bins=8)

(-0.013000000000000001, 1.5]    58823
(3.0, 4.5]                      48095
(1.5, 3.0]                      29778
(4.5, 6.0]                      10895
(6.0, 7.5]                      10364
(7.5, 9.0]                       3858
(9.0, 10.5]                       333
(10.5, 12.0]                      178
Name: targetedness, dtype: int64

In [30]:
political_probability = []
for prob in df['targetedness']:
    if prob <= 0:
        political_probability.append(0)
    elif prob < 3:
        political_probability.append(1)
    elif prob < 4.5:
        political_probability.append(2)
    else:
        political_probability.append(3)

        
df['targetedness_parsed'] = political_probability
df.drop('targetedness', axis=1, inplace=True)

In [31]:
df['Website'] = [0 if x == '' else 1 for x in df['Website']]
df['Activity on the Facebook Family'] = [0 if x == '' else 1 for x in df['Activity on the Facebook Family']]
df['Like'] = [0 if x == '' else 1 for x in df['Like']]
df['is_liberal'] = [1 if 'liberal' in x else 0 for x in df['Segment']]
df['is_conservative'] = [1 if 'conservative' in x else 0 for x in df['Segment']]

In [32]:
quarter = []
for item in df['Updated_At_Month']:
    if item < 4:
        quarter.append(0)
    elif item < 7:
        quarter.append(1)
    elif item < 10:
        quarter.append(2)
    else:
        quarter.append(3)
        
df['quarter'] = quarter

In [33]:
for col_name in df.columns:
    examine_col(col_name)
    print('\n\n\n')

0      64822
1      26679
2      16591
3      11460
4       7960
       ...  
416        1
475        1
417        1
474        1
383        1
Name: political, Length: 165, dtype: int64

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 14, 17, 18, 26, 25, 20, 19, 23, 24, 27, 30, 21, 28, 22, 29, 31, 32, 33, 34, 36, 35, 37, 38, 40, 39, 41, 42, 43, 44, 46, 47, 45, 48, 49, 50, 53, 51, 54, 52, 55, 56, 63, 66, 65, 69, 71, 82, 62, 57, 73, 72, 86, 64, 61, 87, 89, 58, 67, 77, 68, 75, 85, 60, 70, 88, 91, 74, 90, 80, 94, 360, 100, 84, 114, 83, 81, 265, 115, 436, 79, 59, 117, 126, 78, 103, 257, 488, 401, 105, 364, 272, 362, 399, 125, 271, 252, 107, 237, 269, 119, 374, 389, 366, 139, 113, 394, 371, 135, 138, 243, 405, 277, 93, 483, 98, 300, 301, 339, 335, 333, 437, 76, 440, 254, 314, 443, 319, 192, 456, 449, 298, 215, 343, 95, 353, 97, 96, 410, 351, 411, 156, 471, 478, 349, 416, 475, 417, 474, 383]




0      119582
1       21398
2        7945
3        4083
4        2289
        ...  
51     

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



internationalrescuecommittee    4147
plannedparenthoodaction         2357
betoorourke                     2269
jayinslee                       1935
plannedparenthood               1844
                                ... 
joebloomspianoproject              1
jeffrey4texas                      1
samovensfan                        1
teresabarrett4petaluma             1
jobsfirstpac                       1
Name: lower_page, Length: 12609, dtype: int64

['internationalrescuecommittee', 'plannedparenthoodaction', 'betoorourke', 'jayinslee', 'plannedparenthood', 'aclu', 'kirstengillibrand', 'elizabethwarren', 'moveon', 'kamalaharris', 'donaldtrump', 'indivisibleguide', 'sierraclub', 'unrefugees', 'msf.english', 'penzeys', 'amyklobuchar', 'care2', 'nrdc.org', 'everytown', 'cathymyersforcongress', 'feedingamerica', 'michaelbennet', 'worldwildlifefund', 'oceanconservancy', 'thewildernesssociety', 'envdefensefund', 'naralprochoiceamerica', 'officialtomsteyer', 'amnestyusa', 'thenatureconservancy


['', 'English (US)', 'Spanish (Spain)', 'Spanish', 'English (UK)', 'Traditional Chinese (Taiwan)', 'Landlubber Speak (English) (US)']




0    155991
1      6333
Name: Website, dtype: int64

[0, 1]




                    142728
New York               732
Washington             624
San Francisco          376
Bellingham             337
                     ...  
Elder                    1
Nazareth                 1
Poland                   1
Kewaskum                 1
University Place         1
Name: City, Length: 1716, dtype: int64

['', 'New York', 'Washington', 'San Francisco', 'Bellingham', 'Seattle', 'Austin', 'Houston', 'Chicago', 'Los Angeles', 'Minneapolis', 'Oakland', 'Dallas', 'Portland', 'Eden Prairie', 'Baltimore', 'Columbus', 'Tucson', 'San Diego', 'Milwaukee', 'Sarasota', 'St. Louis', 'Bradenton', 'Phoenix', 'Albuquerque', 'Philadelphia', 'Troy', 'Raleigh', 'Potomac', 'Denver', 'Tampa', 'Ann Arbor', 'Saint Paul', 'Kennesaw', 'Fargo', 'Las Vegas', 'Lansing', 'Sacramento', 






                                   159821
Get Out the Vote                      163
Holocaust                             122
the Endangered Species Act            103
The Great Pacific Garbage Patch        83
                                    ...  
Voting Rights Act                       1
Texas Law                               1
Iraq “WMD                               1
The Women in Black                      1
Nurses Week                             1
Name: Event, Length: 625, dtype: int64

['', 'Get Out the Vote', 'Holocaust', 'the Endangered Species Act', 'The Great Pacific Garbage Patch', 'Get Out The Vote', 'Thanksgiving', 'Hurricane Maria', 'Black Pepper', 'International Refugee Day', 'this November 6', 'the WHOLE YEAR', 'Twitter', 'Watergate', 'Super PACK', 'The Endangered Species Act', 'Hurricane Harvey', 'Lean Republican', 'the New Year', 'America Great', 'Mueller', 'the Watergate tapes', 'Our Revolution', 'the Civil War', 'Amendment', 'NY', 'New Year', 'Primary Day'






0    111582
1     50742
Name: 13-17, dtype: int64

[0, 1]




1    150465
0     11859
Name: 18-34, dtype: int64

[1, 0]




1    157544
0      4780
Name: 35-49, dtype: int64

[1, 0]




1    158463
0      3861
Name: 50-64, dtype: int64

[1, 0]




1    156123
0      6201
Name: 65+, dtype: int64

[1, 0]




2018    109029
2019     40004
2017     13291
Name: Created_At_Year, dtype: int64

[2018, 2019, 2017]




10    28294
5     23936
11    19796
4     13832
12    13761
3     13352
8     11120
9      9763
7      8031
6      7657
2      6807
1      5975
Name: Created_At_Month, dtype: int64

[10, 5, 11, 4, 12, 3, 8, 9, 7, 6, 2, 1]




2018    104980
2019     45268
2017     12076
Name: Updated_At_Year, dtype: int64

[2018, 2019, 2017]




5     24510
11    24332
10    22247
4     14830
12    13893
3     13449
8     10752
9      9017
1      7748
6      7281
7      7260
2      7005
Name: Updated_At_Month, dtype: int64

[5, 11, 10, 4, 12, 3, 8, 9, 1, 6, 7, 2]




4    122269
3     15966
2

In [34]:
df.drop('political', axis=1, inplace=True)
df.drop('not_political', axis=1, inplace=True)
df.drop('title', axis=1, inplace=True)
df.drop('Agency', axis=1, inplace=True)
df.drop('Interest', axis=1, inplace=True)
df.drop('Segment', axis=1, inplace=True)
df.drop('Language', axis=1, inplace=True)
df.drop('City', axis=1, inplace=True)
df.drop('MaxAge', axis=1, inplace=True)
df.drop('MinAge', axis=1, inplace=True)
df.drop('RegionTarget', axis=1, inplace=True)
df.drop('Event', axis=1, inplace=True)
df.drop('Law', axis=1, inplace=True)
df.drop('RegionEntity', axis=1, inplace=True)
df.drop('Group', axis=1, inplace=True)
df.drop('Location', axis=1, inplace=True)
df.drop('Facility', axis=1, inplace=True)
df.drop('Person', axis=1, inplace=True)
df.drop('Created_At_Year', axis=1, inplace=True)
df.drop('Created_At_Month', axis=1, inplace=True)
df.drop('is_impressions', axis=1, inplace=True)
df.drop('Organization', axis=1, inplace=True)
df.drop('created_at', axis=1, inplace=True)
df.drop('updated_at', axis=1, inplace=True)

In [35]:
examine_col('Gender')
df.drop('Gender', axis=1, inplace=True)

         156121
women      4471
men        1732
Name: Gender, dtype: int64

['', 'women', 'men']


In [36]:
examine_col('Retargeting')
df['retarget'] = [1 if 'similar' in x else 0 for x in df['Retargeting']]
df.drop('Retargeting', axis=1, inplace=True)

                                                136958
people who may be similar to their customers     24379
recently near their business                       987
Name: Retargeting, dtype: int64

['', 'people who may be similar to their customers', 'recently near their business']


In [37]:
examine_col('paid_for_by')
df.drop('paid_for_by', axis=1, inplace=True)

International Rescue Committee                                       2944
INSLEE FOR AMERICA                                                   1904
Planned Parenthood Action Fund.                                      1781
Beto for Texas                                                       1558
Planned Parenthood Federation of America                             1495
                                                                     ... 
SMU                                                                     1
AARP Pennsylvania                                                       1
the Committee to Recall Judge Persky                                    1
One Arizona                                                             1
David Adkins for State Representative. Lisa Contreras, Treasurer.       1
Name: paid_for_by, Length: 8786, dtype: int64

['International Rescue Committee', 'INSLEE FOR AMERICA', 'Planned Parenthood Action Fund.', 'Beto for Texas', 'Planned Parenthood Federatio

In [38]:
examine_col('advertiser')
df.drop('advertiser', axis=1, inplace=True)

Beto O'Rourke                           2064
ACLU                                    1577
International Rescue Committee          1476
Donald J. Trump                         1443
Planned Parenthood Action               1230
                                        ... 
North Country SPCA                         1
New York Society for Ethical Culture       1
Elizabeth Moro for Congress                1
Andrew Janz for Congress                   1
The Nature Conservancy in Virginia         1
Name: advertiser, Length: 12068, dtype: int64

["Beto O'Rourke", 'ACLU', 'International Rescue Committee', 'Donald J. Trump', 'Planned Parenthood Action', 'Sierra Club', 'Planned Parenthood', 'Penzeys Spices', 'Care2', 'Cathy Myers', 'MoveOn', 'Feeding America', 'Everytown for Gun Safety', 'Elizabeth Warren', 'Environmental Defense Fund', 'Tom Steyer', 'The Wilderness Society', 'Ocean Conservancy', 'World Wildlife Fund', 'The Nature Conservancy', 'Amnesty International USA', 'NARAL Pro-Choice America

In [39]:
examine_col('lower_page')
df.drop('lower_page', axis=1, inplace=True)

internationalrescuecommittee    4147
plannedparenthoodaction         2357
betoorourke                     2269
jayinslee                       1935
plannedparenthood               1844
                                ... 
joebloomspianoproject              1
jeffrey4texas                      1
samovensfan                        1
teresabarrett4petaluma             1
jobsfirstpac                       1
Name: lower_page, Length: 12609, dtype: int64

['internationalrescuecommittee', 'plannedparenthoodaction', 'betoorourke', 'jayinslee', 'plannedparenthood', 'aclu', 'kirstengillibrand', 'elizabethwarren', 'moveon', 'kamalaharris', 'donaldtrump', 'indivisibleguide', 'sierraclub', 'unrefugees', 'msf.english', 'penzeys', 'amyklobuchar', 'care2', 'nrdc.org', 'everytown', 'cathymyersforcongress', 'feedingamerica', 'michaelbennet', 'worldwildlifefund', 'oceanconservancy', 'thewildernesssociety', 'envdefensefund', 'naralprochoiceamerica', 'officialtomsteyer', 'amnestyusa', 'thenatureconservancy

In [40]:
# see what the remaining data looks like
for col_name in df.columns:
    examine_col(col_name)
    print('\n\n\n')

It’s this simple: To get Jay to the debate stage in June, he needs 65,000 donors. Will you be one of them?                                                                                                                                                                                                                                                                                                                                                                                                    441
Zimbabwe is reeling from the impact of Cyclone Idai! Families stranded, hospitals destroyed and landslides threaten to take more lives. Our teams are on the ground providing lifesaving aid, medical care and critical supplies.                                                                                                                                                                                                                                                                             420
I’ll

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



1    85551
0    76773
Name: is_political, dtype: int64

[1, 0]




0    145749
1     16575
Name: is_not_political, dtype: int64

[0, 1]




0    149494
1     12830
Name: Donald_Trump, dtype: int64

[0, 1]




0    160729
1      1595
Name: Brett_Kavanaugh, dtype: int64

[0, 1]




0    160385
1      1939
Name: Jay_Inslee, dtype: int64

[0, 1]




0    161470
1       854
Name: Paul_Ryan, dtype: int64

[0, 1]




0    161830
1       494
Name: Barack_Obama, dtype: int64

[0, 1]




0    160001
1      2323
Name: Beto_O'Rourke, dtype: int64

[0, 1]




0    161525
1       799
Name: Ted_Cruz, dtype: int64

[0, 1]




0    161853
1       471
Name: Tony_Evers, dtype: int64

[0, 1]




0    161655
1       669
Name: Stacey_Abrams, dtype: int64

[0, 1]




0    161871
1       453
Name: Joe_Biden, dtype: int64

[0, 1]




0    160633
1      1691
Name: Kirsten_Gillibrand, dtype: int64

[0, 1]




0    160524
1      1800
Name: Elizabeth_Warren, dtype: int64

[0, 1]




0    160767
1      1557
Name: K

In [41]:
list(df)

['message',
 'impressions',
 'political_probability',
 'State',
 'Website',
 'Activity on the Facebook Family',
 'Like',
 '13-17',
 '18-34',
 '35-49',
 '50-64',
 '65+',
 'Updated_At_Year',
 'Updated_At_Month',
 'political_probability_int',
 'fundraising_proba_int',
 'is_political',
 'is_not_political',
 'Donald_Trump',
 'Brett_Kavanaugh',
 'Jay_Inslee',
 'Paul_Ryan',
 'Barack_Obama',
 "Beto_O'Rourke",
 'Ted_Cruz',
 'Tony_Evers',
 'Stacey_Abrams',
 'Joe_Biden',
 'Kirsten_Gillibrand',
 'Elizabeth_Warren',
 'Kamala_Harris',
 'Amy_Klobuchar',
 'International_Rescue_Committee',
 'Planned_Parenthood',
 'ACLU',
 'Congress',
 'Senate',
 'the_Supreme_Court',
 'GOP',
 'NRA',
 'EPA',
 'FBI',
 'Citizens_United',
 'FEC',
 'Democrats',
 'Republicans',
 'Americans',
 'targetedness_parsed',
 'is_liberal',
 'is_conservative',
 'quarter',
 'retarget']

In [42]:
print(df.shape)
df.head()

(162324, 52)


Unnamed: 0,message,impressions,political_probability,State,Website,Activity on the Facebook Family,Like,13-17,18-34,35-49,...,Citizens_United,FEC,Democrats,Republicans,Americans,targetedness_parsed,is_liberal,is_conservative,quarter,retarget
0,BREAKING: Trump’s Department of the Interior p...,1,0.999992,,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,The Mueller investigation is over. Special Cou...,1,0.999997,,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,Zimbabwe is reeling from the impact of Cyclone...,1,0.97757,,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,What more can you do in the final hours of 201...,1,0.360711,,0,1,0,0,1,1,...,0,0,0,0,0,3,0,0,3,0
4,"Say it loud, say it proud: Our rights, our hea...",1,0.999998,,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,1,0


In [43]:
print('Saving cleaned data')
# Save cleaned data
df.to_csv(filebase + '-cleaned.csv', index=False)

Saving cleaned data
