# Data For Hate Methodology
Each header in this notebook corresponds to a header in the methodology.

In [1]:
import os
import pandas as pd
from utils import value_counts

In [2]:
# inputs: keyword-status and suggestion CSVs read by the cells below
fn_policy = '../data/output/placements_api_keyword_status/policy.csv'
fn_hate = '../data/output/placements_api_keyword_status/hate.csv'
fn_adhoc = '../data/output/placements_api_keyword_status/adhoc.csv'
fn_basewords = '../data/output/placements_api_keyword_status/basewords.csv'
fn_youtube_videos = '../data/output/placements_api_suggestions/videos_for_hate_terms.csv'
fn_youtube_channels = '../data/output/placements_api_suggestions/channels_for_hate_terms.csv'

# outputs: every table path is derived from table_dir, so changing the
# directory here moves all tables and keeps os.makedirs in sync with them.
table_dir = '../data/output/tables/hate'
fn_table1 = table_dir + '/table1.csv'
fn_table2 = table_dir + '/table2.csv'
fn_table3a = table_dir + '/table3a.csv'
fn_table3b = table_dir + '/table3b.csv'
fn_table3a_full = table_dir + '/table3a_full.csv'
fn_table3b_full = table_dir + '/table3b_full.csv'
os.makedirs(table_dir, exist_ok=True)

In [3]:
# Load the four keyword-status CSVs in one pass (same read order as the paths).
df_policy, df_hate, df_basewords, df_adhoc = (
    pd.read_csv(path)
    for path in (fn_policy, fn_hate, fn_basewords, fn_adhoc)
)

In [4]:
# Columns kept when the lookup cells below display their results.
display_cols = [
    'search_term',
    'status',
]

## Google Ads API for Ad Placements

In [5]:
# Fraction of policy search terms falling under each status value.
df_policy['status'].value_counts(normalize=True)

Blocked          0.753333
Full             0.226667
Partial Block    0.020000
Name: status, dtype: float64

## What's on the blocklist?

In [6]:
# Maps df_hate column names to the human-readable headers used in table 1.
# Order matters: it is also the column order of the exported table.
col2hate_table_col = dict([
    ('search_term', "Search Term"),
    ('status', "Status for Ad Placement"),
    ('n_youtube_videos', "N YouTube Videos"),
    ('n_youtube_channels', "N YouTube Channels"),
    ("additional_info_link", "Additional Info"),
])

In [7]:
# Table 1: select the mapped df_hate columns (in mapping order) and give them
# their presentation headers. rename() returns a new frame, so nothing is
# assigned onto a slice of df_hate.
hate_table = df_hate[list(col2hate_table_col)].rename(columns=col2hate_table_col)
hate_table.to_csv(fn_table1, index=False)
hate_table.head(5)

Unnamed: 0,Search Term,Status for Ad Placement,N YouTube Videos,N YouTube Channels,Additional Info
0,it's okay to be white,Full,118249642.0,417519.0,https://www.adl.org/education/references/hate-...
1,white power,Full,169420977.0,377159.0,https://www.adl.org/education/references/hate-...
2,red ice tv,Full,93482132.0,326308.0,https://www.splcenter.org/hatewatch/2019/10/21...
3,black sun,Full,99052454.0,256751.0,https://www.adl.org/education/references/hate-...
4,14 words,Full,61142590.0,240367.0,https://www.adl.org/education/references/hate-...


In [8]:
# Tally hate terms by ad-placement status using the project helper from utils;
# the rendered result has a count and a percentage column per status value.
value_counts(df_hate, col='status')

Unnamed: 0,count,percentage
Full,59.0,0.678161
Blocked,28.0,0.321839


In [9]:
# Restrict to search terms that contain more than one space-separated token,
# then tally the status recorded in the status_no_spaces column.
has_several_words = df_hate.search_term.str.split(' ').str.len() > 1
value_counts(df_hate[has_several_words], col='status_no_spaces')

Unnamed: 0,count,percentage
Full,10.0,0.588235
Blocked,3.0,0.176471
Partial Block,3.0,0.176471
Empty,1.0,0.058824


In [10]:
# Multi-word terms whose status_no_spaces value is still "Blocked".
is_multi_word = df_hate.search_term.str.split(' ').str.len() > 1
blocked_without_spaces = df_hate.status_no_spaces == "Blocked"
still_blocked = df_hate.loc[is_multi_word & blocked_without_spaces, 'search_term']

still_blocked

59    white pride worldwide
60         holocaust denial
61      american nazi party
Name: search_term, dtype: object

In [11]:
# Distinct space-separated words across all still-blocked phrases.
unique_basewords = set().union(
    *(phrase.split(' ') for phrase in still_blocked.tolist())
)

In [12]:
# Status of each individual word from the still-blocked phrases, followed by
# the 'white pride' row from df_hate for comparison.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# original row labels are preserved, as they were with append.
pd.concat(
    [
        df_basewords[df_basewords.search_term.isin(unique_basewords)],
        df_hate[df_hate.search_term == 'white pride'],
    ],
    sort=False,
)[display_cols]

Unnamed: 0,search_term,status
14,american,Full
36,white,Full
38,party,Full
71,worldwide,Full
95,pride,Full
124,denial,Full
196,nazi,Blocked
219,holocaust,Blocked
62,white pride,Blocked


In [13]:
# Every hate and ad hoc search term containing 'white national', with status.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# original row labels are preserved, as they were with append.
pd.concat(
    [
        df_hate[df_hate.search_term.str.contains('white national')],
        df_adhoc[df_adhoc.search_term.str.contains('white national')],
    ],
    sort=False,
)[display_cols]

Unnamed: 0,search_term,status
26,white nationalism,Full
64,white nationalist,Blocked
20,white nationalists,Full


In [14]:
# Every baseword, policy and ad hoc search term containing 'terrorist', in
# that order, with status.
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0); original row labels are preserved, as they were with append.
pd.concat(
    [
        df_basewords[df_basewords.search_term.str.contains('terrorist')],
        df_policy[df_policy.search_term.str.contains('terrorist')],
        df_adhoc[df_adhoc.search_term.str.contains('terrorist')],
    ],
    sort=False,
)[display_cols]

Unnamed: 0,search_term,status
161,terrorist,Blocked
35,recruit terrorists,Full
36,praise terrorists,Full
66,video game terrorist mod,Blocked
69,terrorist recruitment,Blocked
70,terrorist ideology,Blocked
71,terrorist hostages,Blocked
72,terrorist attack,Blocked
73,terrorist acts,Blocked
4,terrorists,Full


In [15]:
# Table 2: blocked terms which return full responses after spaces are removed.
# Built as one non-mutating chain: the original added a column to, and sorted
# in place, a chained slice of df_hate, which can raise SettingWithCopyWarning.
bandaids = (
    df_hate.loc[
        (df_hate.status == 'Blocked') & (df_hate.status_no_spaces == "Full"),
        ['search_term'],
    ]
    .rename(columns={'search_term': 'blocked response'})
    .assign(**{
        # Literal space removal; regex=False makes that explicit on every
        # pandas version.
        'full response': lambda frame: frame['blocked response'].str.replace(
            ' ', '', regex=False
        ),
    })
    .sort_values(by='blocked response', ascending=False)
)
bandaids.to_csv(fn_table2, index=False)
bandaids

Unnamed: 0,blocked response,full response
62,white pride,whitepride
63,white pill,whitepill
64,white nationalist,whitenationalist
65,white genocide,whitegenocide
67,sieg heil,siegheil
68,radical islamic terror,radicalislamicterror
69,jewish question,jewishquestion
70,heil hitler,heilhitler
71,globalist jews,globalistjews
74,dual seedline,dualseedline


## Hate phrases vs hate content

In [16]:
# n_youtube_videos value recorded for the "white power" search term.
df_hate.loc[df_hate.search_term == "white power", 'n_youtube_videos']

1    169420977.0
Name: n_youtube_videos, dtype: float64

In [17]:
# Load the video and channel suggestion CSVs (videos first, then channels).
df_hate_video, df_hate_channels = (
    pd.read_csv(path)
    for path in (fn_youtube_videos, fn_youtube_channels)
)

In [18]:
# Table 3a: how many rows of the video suggestions each channel accounts for.
# The unfiltered counts are exported first.
channels_from_video = df_hate_video['youtube_video_channel'].value_counts()
channels_from_video.to_csv(fn_table3a_full)

# The published table keeps only channels that appear at least 8 times.
channels_from_video = channels_from_video.loc[lambda counts: counts >= 8]
channels_from_video.to_csv(fn_table3a)
channels_from_video

CNN                    31
Ruptly                 29
AP Archive             23
The Young Turks        13
The F/S Effect         12
Dystopia Now           12
Newsy                  12
Destiny                12
Global News            11
CharlesFockaert        11
VICE News              10
Associated Press        9
Soap - Sim Racer        9
Journeyman Pictures     9
NowThis News            8
PBS NewsHour            8
Name: youtube_video_channel, dtype: int64

In [19]:
# Table 3b: how many rows of the channel suggestions each channel name
# accounts for. The unfiltered counts are exported first.
channels_from_channels = df_hate_channels['youtube_channel_name'].value_counts()
channels_from_channels.to_csv(fn_table3b_full)

# The published table keeps only channel names that appear at least 3 times.
channels_from_channels = channels_from_channels.loc[lambda counts: counts >= 3]
channels_from_channels.to_csv(fn_table3b)
channels_from_channels

act.tv                      4
Beau of the Fifth Column    4
VICE                        4
Democracy Now!              4
CinemaSins                  4
PragerU                     4
ContraPoints                3
Pixel_Hipster               3
StevenCrowder               3
Trae Crowder                3
Shaun                       3
Dia Beltran                 3
The Officer Tatum           3
Name: youtube_channel_name, dtype: int64