In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import json

In [2]:
filebase = 'data/fbpac-ads-en-US'

print(f'Reading file {filebase}.csv...', end='', flush=True)
df = pd.read_csv(filebase + '.csv')
print(' done')

Reading file data/fbpac-ads-en-US.csv... done


In [3]:
# Columns to keep
columns = ['political',
           'not_political',
           'title',
           'message',
           'created_at',
           'updated_at',
           'impressions',
           'political_probability',
           'targets',
           'advertiser',
           'entities',
           'lower_page',
           'paid_for_by',
           'targetedness',
           'listbuilding_fundraising_proba']

print('Keeping columns:')
for column in columns:
    print('\t', column)

Keeping columns:
	 political
	 not_political
	 title
	 message
	 created_at
	 updated_at
	 impressions
	 political_probability
	 targets
	 advertiser
	 entities
	 lower_page
	 paid_for_by
	 targetedness
	 listbuilding_fundraising_proba


In [4]:
# Reduce dataset
df = df[columns]

print("Removing HTML from 'message'")
# Remove HTML from 'message'
df['message'] = df['message'].str.replace('<[^<]+?>', '')

Removing HTML from 'message'


In [5]:
n = df.shape[0]

print("Parsing 'targets' column...", end='', flush=True)
# Parse the 'targets' column
targets = {
    'Gender': [''] * n,
    'Age': [''] * n,
    'Retargeting': [''] * n,
    'Interest': [''] * n,
    'Segment': [''] * n,
    'State': [''] * n,
    'List': [''] * n,
    'Engaged with Content': [''] * n,
    'Language': [''] * n,
    'Website': [''] * n,
    'City': [''] * n,
    'Activity on the Facebook Family': [''] * n,
    'MaxAge': [''] * n,
    'Like': [''] * n,
    'MinAge': [''] * n,
    'RegionTarget': [''] * n,
    'Agency': [''] * n
}

for i, target in enumerate(df['targets']):
    if pd.isna(target):
        continue
    data = json.loads(target)
    for datum in data:
        col = datum['target']
        # Region exists in both targets and entities
        if col == 'Region':
            col = 'RegionTarget'
        if 'segment' in datum:
            val = datum['segment']
        else:
            val = '1'
        targets[col][i] = val

target_df = pd.DataFrame.from_dict(targets)
print(' done')

Parsing 'targets' column... done


In [6]:
print("Parsing 'entities' column...", end='', flush=True)
# Parse the 'entities' column
entity_types = {
    'Organization': [''] * n,
    'Event': [''] * n,
    'Law': [''] * n,
    'RegionEntity': [''] * n,
    'Group': [''] * n,
    'Location': [''] * n,
    'Facility': [''] * n,
    'Person': [''] * n
}

for i, entity in enumerate(df['entities']):
    if pd.isna(entity):
        continue
    data = json.loads(entity)
    for datum in data:
        col = datum['entity_type']
        if col == 'Region':
            col = 'RegionEntity'
        val = datum['entity']
        entity_types[col][i] = val

entity_df = pd.DataFrame.from_dict(entity_types)
print(' done')

Parsing 'entities' column... done


In [7]:
age_bins = {
    '13-17': [0] * n,
    '18-34': [0] * n,
    '35-49': [0] * n,
    '50-64': [0] * n,
    '65+': [0] * n
}

In [8]:
# Combine original dataframe with newly created, parsed dataframes
df = pd.concat([df, target_df, entity_df], axis=1, sort=False)

age_ranges = {
    '13-17': {'min': 13, 'max': 17},
    '18-34': {'min': 18, 'max': 34},
    '35-49': {'min': 35, 'max': 49},
    '50-64': {'min': 50, 'max': 64},
    '65+': {'min': 65, 'max': 1000},
}

print('Moving ages into bins...', end='', flush=True)
for i, (min_age, max_age) in enumerate(zip(df['MinAge'], df['MaxAge'])):
    if pd.isna(min_age):
        min_age = 0
    if pd.isna(max_age):
        max_age = 1000
    try:
    	min_age = float(min_age)
    except ValueError:
        min_age = 0.0
    try:
    	max_age = float(max_age)
    except ValueError:
        max_age = 1000
    for key, val in age_ranges.items():
        if min_age <= val['max'] and max_age >= val['min']:
            age_bins[key][i] = 1
print(' Done')

Moving ages into bins... Done


In [9]:
df = pd.concat([df, pd.DataFrame.from_dict(age_bins)], axis=1, sort=False)

# Remove parsed and unneeded columns
df.drop('targets', axis=1, inplace=True)
df.drop('entities', axis=1, inplace=True)
df.drop('List', axis=1, inplace=True)
df.drop('Engaged with Content', axis=1, inplace=True)
df.drop('Age', axis=1, inplace=True)

In [10]:
Created_AT_Year = [int(x[0:4]) for x in df.created_at]
df['Created_At_Year'] = Created_AT_Year

Created_AT_Month = [int(x[5:7]) for x in df.created_at]
df['Created_At_Month'] = Created_AT_Month

Updated_AT_Year = [int(x[0:4]) for x in df.updated_at]
df['Updated_At_Year'] = Updated_AT_Year

Updated_AT_Month = [int(x[5:7]) for x in df.updated_at]
df['Updated_At_Month'] = Updated_AT_Month

#df.drop(columns=['created_at', 'updated_at'], inplace=True)

df['lower_page'] = df['lower_page'].str.replace('https://www.facebook.com/', '')
df['lower_page'] = df['lower_page'].str.replace('/', '')

In [11]:
political_probability = []
for prob in df['political_probability']:
    if pd.isna(prob):
        political_probability.append(prob)
    elif prob > 0.9:
        political_probability.append(8)
    elif prob > 0.80:
        political_probability.append(7)
    elif prob > 0.70:
        political_probability.append(6)
    elif prob > 0.60:
        political_probability.append(5)
    elif prob > 0.50:
        political_probability.append(4)
    elif prob > 0.40:
        political_probability.append(3)
    elif prob > 0.30:
        political_probability.append(2)
    elif prob > 0.20:
        political_probability.append(1)
    else:
        political_probability.append(0)
        
df['political_probability_int'] = political_probability

In [12]:
political_probability = []
for prob in df['listbuilding_fundraising_proba']:
    if pd.isna(prob):
        political_probability.append(prob)
    elif prob > 0.9:
        political_probability.append(8)
    elif prob > 0.80:
        political_probability.append(7)
    elif prob > 0.70:
        political_probability.append(6)
    elif prob > 0.60:
        political_probability.append(5)
    elif prob > 0.50:
        political_probability.append(4)
    elif prob > 0.40:
        political_probability.append(3)
    elif prob > 0.30:
        political_probability.append(2)
    elif prob > 0.20:
        political_probability.append(1)
    else:
        political_probability.append(0)
        
df['fundraising_proba_int'] = political_probability

In [13]:
is_political = []
is_not_political = []
for i in range(df.shape[0]):
    if df.loc[i, 'political'] > df.loc[i, 'not_political']:
        is_political.append(1)
        is_not_political.append(0)
    elif df.loc[i, 'political'] < df.loc[i, 'not_political']:
        is_political.append(0)
        is_not_political.append(1)
    else:
        is_political.append(0)
        is_not_political.append(0)
    """
    if df.loc[i, 'political'] > df.loc[i, 'not_political']:
        is_political.append(1)
    elif df.loc[i, 'political'] < df.loc[i, 'not_political']:
        is_political.append(0)
    else:
        is_political.append(-1)
    """
        
df['is_political'] = is_political
df['is_not_political'] = is_not_political

In [14]:
df['is_impressions'] = [1 if df.loc[i, 'impressions'] > 0 else 0 for i in range(df.shape[0])]

In [15]:
title_names = df['title'].value_counts()
#list(title_names)
#title_cutoff = sum(1 if x > 200 else 0 for x in list(title_names))
#title_cutoff
title_names

International Rescue Committee                                                          4150
Planned Parenthood Action                                                               2363
Beto O'Rourke                                                                           2272
Jay Inslee                                                                              1935
Planned Parenthood                                                                      1849
ACLU                                                                                    1711
Elizabeth Warren                                                                        1688
Kirsten Gillibrand                                                                      1681
Kamala Harris                                                                           1498
Donald J. Trump                                                                         1478
Indivisible Guide                                                     

In [16]:
#df = df.dropna()
parsed_person = df['Person'].str.replace('’', "'")
parsed_person = parsed_person.str.replace("'s", "")
parsed_person = parsed_person.str.replace("Brett ", "")
parsed_person = parsed_person.str.replace("Kavanaugh", "Brett Kavanaugh")
parsed_person = parsed_person.str.replace(" Abrams", "")
parsed_person = parsed_person.str.replace("Stacey", "Stacey Abrams")
parsed_person = parsed_person.str.replace(" Evers", "")
parsed_person = parsed_person.str.replace("Tony", "Tony Evers")
parsed_person = parsed_person.str.replace("Cruz", "Ted Cruz")
parsed_person = parsed_person.str.replace("Ted Ted Cruz", "Ted Cruz")
parsed_person = parsed_person.str.replace("Jay", "Jay Inslee")
parsed_person = parsed_person.str.replace("Beto", "Beto O'Rourke")
parsed_person = parsed_person.str.replace("Biden", "Joe Biden")
parsed_person = parsed_person.str.replace("Obama", "Barack Obama")
parsed_person = parsed_person.str.replace("Donald J. Trump", "Donald Trump")



#df['Person'].value_counts()
parsed_person.value_counts() # NOTE: SOME OF THE COUNTS CHANGE ACCROSS RUNS


                         95297
Donald Trump             11418
Brett Kavanaugh           1595
Jay Inslee                1310
Paul Ryan                  854
Ted Cruz                   754
Mitch McConnell            717
Mike Pence                 620
Join                       563
Amy Klobuchar              548
Barack Obama               494
Beto O'Rourke              475
Stacey Abrams              420
Scott Pruitt               412
Ocasio-Cortez              410
Rick Scott                 404
Devin Nunes                378
Robert Mueller             372
Learn                      298
Michael                    287
Rescue Gifts               268
Tony Evers                 246
Barr                       242
Joe Biden                  215
Eric Holder                207
Nancy Pelosi               206
Scott Walker               201
Stand                      200
Elizabeth Warren           198
VOTE                       186
                         ...  
Elliott                      1
Join Mac

In [17]:
Donald_Trump = []
Brett_Kavanaugh = []
Jay_Inslee = []
Paul_Ryan = []
Ted_Cruz = []
Mitch_McConnell = []
Mike_Pence = []
    

#df['filtered_title'] = [df.loc[x, 'title'].count() for x in range(df.shape[0])]
#df['filtered_title']
#df['title'].value_counts()
#df = df.dropna()
#titles = df['title']
#titles = titles.dropna()
import numpy as np
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(df[['title']]).toarray()
feature_labels = ohe.categories_

feature_labels = np.array(feature_labels).ravel()

features = pd.DataFrame(feature_arr, columns=feature_labels)
features.shape

In [18]:
print(df.shape)
df.head()

(162324, 49)


Unnamed: 0,political,not_political,title,message,created_at,updated_at,impressions,political_probability,advertiser,lower_page,...,65+,Created_At_Year,Created_At_Month,Updated_At_Year,Updated_At_Month,political_probability_int,fundraising_proba_int,is_political,is_not_political,is_impressions
0,0,0,League of Conservation Voters,BREAKING: Trump’s Department of the Interior p...,2019-03-27 16:52:25.625455+00,2019-03-27 16:52:25.625455+00,1,0.999992,,lcvoters,...,1,2019,3,2019,3,8,5.0,0,0,1
1,0,0,Indivisible Guide,The Mueller investigation is over. Special Cou...,2019-03-27 17:28:14.096849+00,2019-03-27 17:28:14.096849+00,1,0.999997,,indivisibleguide,...,1,2019,3,2019,3,8,2.0,0,0,1
2,0,0,International Rescue Committee,Zimbabwe is reeling from the impact of Cyclone...,2019-03-27 17:38:23.101377+00,2019-03-27 17:38:23.101377+00,1,0.97757,,internationalrescuecommittee,...,1,2019,3,2019,3,8,8.0,0,0,1
3,0,0,Covenant House International,What more can you do in the final hours of 201...,2018-12-30 20:59:13.879124+00,2018-12-30 20:59:13.879124+00,1,0.360711,Covenant House International,covenanthouse,...,1,2018,12,2018,12,2,,0,0,1
4,0,1,Planned Parenthood,"Say it loud, say it proud: Our rights, our hea...",2019-03-27 17:18:29.764002+00,2019-04-11 15:02:58.081112+00,1,0.999998,,plannedparenthood,...,1,2019,3,2019,4,8,8.0,0,1,1


In [19]:
print('Saving cleaned data')
# Save cleaned data
df.to_csv(filebase + '-cleaned.csv', index=False)

Saving cleaned data
