# Data For Hate Methodology
Each header in this notebook corresponds to a header in the methodology.

In [1]:
import os
import pandas as pd
from utils import value_counts

In [2]:
# --- input CSVs -------------------------------------------------------------
keyword_status_dir = '../data/output/placements_api_keyword_status'
suggestions_dir = '../data/output/placements_api_suggestions'

fn_policy = keyword_status_dir + '/policy.csv'
fn_hate = keyword_status_dir + '/hate.csv'
fn_adhoc = keyword_status_dir + '/adhoc.csv'
fn_basewords = keyword_status_dir + '/basewords.csv'
fn_youtube_videos = suggestions_dir + '/videos_for_hate_terms.csv'
fn_youtube_channels = suggestions_dir + '/channels_for_hate_terms.csv'

# --- output tables ----------------------------------------------------------
table_dir = '../data/output/tables/hate'
fn_table1 = table_dir + '/table1.csv'
fn_table2 = table_dir + '/table2.csv'
fn_table3a = table_dir + '/table3a.csv'
fn_table3b = table_dir + '/table3b.csv'
fn_table3a_full = table_dir + '/table3a_full.csv'
fn_table3b_full = table_dir + '/table3b_full.csv'

# Make sure the table directory exists before any cell writes into it.
os.makedirs(table_dir, exist_ok=True)

In [3]:
# Columns shown when a frame is displayed in the notebook.
# Kept as a list: it is used to select several columns at once (df[display_cols]).
display_cols = [
    'search_term',
    'status',
]

In [4]:
# Load the four keyword-status CSVs (read in the same order as they are named).
df_policy, df_hate, df_basewords, df_adhoc = (
    pd.read_csv(path)
    for path in (fn_policy, fn_hate, fn_basewords, fn_adhoc)
)

## Google Ads API for Ad Placements

In [5]:
# What share of the terms that YouTube says are not advertiser-friendly
# comes back as "Blocked"? (normalize=True reports proportions, not counts.)
df_policy['status'].value_counts(normalize=True)

Blocked          0.753333
Full             0.226667
Partial Block    0.020000
Name: status, dtype: float64

Examples of `Full`, `Blocked`, and `Partial Block` terms:

In [6]:
# Three reproducible example terms whose status is 'Full'.
full_terms = df_policy[df_policy['status'] == 'Full']
full_terms.sample(n=3, random_state=303)['search_term'].tolist()

['domestic violence', 'praise terrorists', 'pipe bomb']

In [7]:
# Three reproducible example terms whose status is 'Blocked'.
blocked_terms = df_policy[df_policy['status'] == 'Blocked']
blocked_terms.sample(n=3, random_state=303)['search_term'].tolist()

['3d print guns', 'dramatized rape scenes', 'make high capacity ammunition']

In [8]:
# Three reproducible example terms whose status is 'Partial Block'.
partial_block_terms = df_policy[df_policy['status'] == 'Partial Block']
partial_block_terms.sample(n=3, random_state=303)['search_term'].tolist()

['disparaging', 'dogfighting', 'cannibalism']

## What's on the blocklist?

In [9]:
# Maps df_hate column names to the human-readable headers used in table 1.
# Insertion order is the column order of the exported table.
col2hate_table_col = dict(
    search_term="Search Term",
    status="Status for Ad Placement",
    n_youtube_videos="N YouTube Videos",
    n_youtube_channels="N YouTube Channels",
    background_info_link="Background Info",
)

In [10]:
# Table 1: keep only the mapped columns (in mapping order) and give them
# their display headers, then save the table and preview the first rows.
hate_table = (
    df_hate[list(col2hate_table_col)]
    .rename(columns=col2hate_table_col)
)
hate_table.to_csv(fn_table1, index=False)
hate_table.head(5)

Unnamed: 0,Search Term,Status for Ad Placement,N YouTube Videos,N YouTube Channels,Background Info
0,14 words,Full,61142590.0,240367.0,https://www.adl.org/education/references/hate-...
1,2083: a european declaration of independence,Full,32.0,0.0,https://www.splcenter.org/hatewatch/2011/07/24...
2,alt-lite,Full,15269138.0,150.0,https://www.adl.org/resources/backgrounders/fr...
3,alt-right,Full,15276072.0,21880.0,https://www.adl.org/resources/backgrounders/fr...
4,american front,Full,67677338.0,83682.0,https://www.adl.org/education/references/hate-...


In [11]:
# What are the statuses of the un-altered hate terms (spaces left in place)?
# `value_counts` is the project helper imported from utils; judging by the
# displayed output it reports a count and a share per status -- TODO confirm
# against utils.
value_counts(df_hate, col='status')

Unnamed: 0,count,percentage
Full,59,0.678161
Blocked,28,0.321839


In [12]:
# What statuses do multi-word terms get once their spaces are removed?
# A term counts as multi-word when splitting on a space yields more than one piece.
is_multiword = df_hate['search_term'].str.split(' ').str.len() > 1
df_hate_phrases = df_hate[is_multiword]
value_counts(df_hate_phrases, col='status_no_spaces')

Unnamed: 0,count,percentage
Full,10,0.588235
Blocked,3,0.176471
Partial Block,3,0.176471
Empty,1,0.058824


In [13]:
# Which multi-word terms are still blocked after their spaces are removed?
has_multiple_words = df_hate['search_term'].str.split(' ').str.len() > 1
blocked_without_spaces = df_hate['status_no_spaces'] == "Blocked"
still_blocked = df_hate.loc[
    has_multiple_words & blocked_without_spaces, 'search_term'
].tolist()

still_blocked

['american nazi party', 'holocaust denial', 'white pride worldwide']

In [14]:
# Among those three terms, which base words are blocked?
# Collect every distinct word that appears in the still-blocked phrases.
unique_basewords = {
    word for sent in still_blocked for word in sent.split(' ')
}

# Stack the matching base-word rows with the "white pride" row from the hate
# list. pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; row order is the same as with append (base words first,
# then the hate-list row).
pd.concat([
    df_basewords[df_basewords.search_term.isin(unique_basewords)],
    df_hate[df_hate.search_term == 'white pride'],
])[display_cols].set_index('search_term')

Unnamed: 0_level_0,status
search_term,Unnamed: 1_level_1
american,Full
denial,Full
party,Full
pride,Full
white,Full
worldwide,Full
holocaust,Blocked
nazi,Blocked
white pride,Blocked


In [15]:
# "white nationalist" vs the -ists and -ism variants.
# Rows come from the hate list first, then the ad-hoc list. pd.concat is used
# because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
(pd.concat([
    df_hate[df_hate.search_term.str.contains('white national')],
    df_adhoc[df_adhoc.search_term.str.contains('white national')],
])
    .sort_values(by='status')[display_cols]
    .set_index('search_term'))

Unnamed: 0_level_0,status
search_term,Unnamed: 1_level_1
white nationalist,Blocked
white nationalism,Full
white nationalists,Full


In [16]:
# "terrorists" vs "terrorist"
# Rows are stacked in the order base words, policy terms, ad-hoc terms.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; a single concat also avoids building an intermediate frame.
(pd.concat([
    df_basewords[df_basewords.search_term.str.contains('terrorist')],
    df_policy[df_policy.search_term.str.contains('terrorist')],
    df_adhoc[df_adhoc.search_term.str.contains('terrorist')],
])
    .sort_values(by='status')[display_cols]
    .set_index('search_term'))

Unnamed: 0_level_0,status
search_term,Unnamed: 1_level_1
terrorist,Blocked
terrorist acts,Blocked
terrorist attack,Blocked
terrorist hostages,Blocked
terrorist ideology,Blocked
terrorist recruitment,Blocked
video game terrorist mod,Blocked
praise terrorist,Blocked
recruit terrorist,Blocked
praise terrorists,Full


In [17]:
# Table 2: blocked terms that return a full response once spaces are removed.
is_bandaid = (
    (df_hate.status == 'Blocked') &
    (df_hate.status_no_spaces == "Full")
)

# Built as one chain off .loc so the result is an independent frame: adding a
# column to a chained-indexing slice of df_hate can raise SettingWithCopyWarning,
# and inplace sorting makes the cell harder to re-run safely.
bandaids = (
    df_hate.loc[is_bandaid, ['search_term']]
    .rename(columns={'search_term': 'blocked response'})
    .assign(**{
        # regex=False: the space is a literal character, not a pattern
        'full response': lambda frame: frame['blocked response']
            .str.replace(' ', '', regex=False)
    })
    .sort_values(by='blocked response', ascending=False)
)
bandaids.to_csv(fn_table2, index=False)
bandaids

Unnamed: 0,blocked response,full response
68,white pride,whitepride
67,white pill,whitepill
66,white nationalist,whitenationalist
65,white genocide,whitegenocide
64,sieg heil,siegheil
63,radical islamic terror,radicalislamicterror
62,jewish question,jewishquestion
61,heil hitler,heilhitler
60,globalist jews,globalistjews
59,dual seedline,dualseedline


## Hate phrases vs hate content

In [18]:
# How many YouTube videos are reported for the term "white power"?
df_hate.loc[df_hate['search_term'] == "white power", 'n_youtube_videos']

54    169420977.0
Name: n_youtube_videos, dtype: float64

In [19]:
# Load the suggested videos and suggested channels for the hate terms.
df_hate_video, df_hate_channels = (
    pd.read_csv(path)
    for path in (fn_youtube_videos, fn_youtube_channels)
)

What are the most suggested channels?

In [20]:
# Top channels among the suggested videos: count how often each channel appears
# and save the complete ranking.
video_channel_counts = df_hate_video['youtube_video_channel'].value_counts()
video_channel_counts.to_csv(fn_table3a_full)

# The methodology displays only channels that appear at least 8 times.
channels_from_video = video_channel_counts[video_channel_counts >= 8]
channels_from_video.to_csv(fn_table3a)
channels_from_video

CNN                    27
Ruptly                 24
AP Archive             18
The F/S Effect         12
Dystopia Now           12
Destiny                12
Newsy                  11
Global News            10
The Young Turks        10
Soap - Sim Racer        9
VICE News               8
NowThis News            8
Journeyman Pictures     8
Name: youtube_video_channel, dtype: int64

In [21]:
# Top channels among the suggested channels: count how often each channel name
# appears and save the complete ranking.
suggested_channel_counts = df_hate_channels['youtube_channel_name'].value_counts()
suggested_channel_counts.to_csv(fn_table3b_full)

# The methodology displays only channels that appear at least 3 times.
channels_from_channels = suggested_channel_counts[suggested_channel_counts >= 3]
channels_from_channels.to_csv(fn_table3b)
channels_from_channels

Democracy Now!              4
PragerU                     4
VICE                        4
Beau of the Fifth Column    4
CinemaSins                  4
Pixel_Hipster               3
Dia Beltran                 3
The Officer Tatum           3
ContraPoints                3
act.tv                      3
Trae Crowder                3
StevenCrowder               3
Name: youtube_channel_name, dtype: int64