# Data For Hate Methodology
Each header in this notebook corresponds to a header in the methodology.

In [1]:
import os
import pandas as pd
from utils import value_counts

In [2]:
# inputs
# Base directories are named once so every path below stays in sync with them.
keyword_status_dir = '../data/output/placements_api_keyword_status'
suggestions_dir = '../data/output/placements_api_suggestions'
fn_policy = keyword_status_dir + '/policy.csv'
fn_hate = keyword_status_dir + '/hate.csv'
fn_adhoc = keyword_status_dir + '/adhoc.csv'
fn_basewords = keyword_status_dir + '/basewords.csv'
fn_youtube_videos = suggestions_dir + '/videos_for_hate_terms.csv'
fn_youtube_channels = suggestions_dir + '/channels_for_hate_terms.csv'

# outputs
# Every table path is derived from table_dir, the directory created below,
# so changing table_dir cannot leave the table paths pointing elsewhere.
table_dir = '../data/output/tables/hate'
fn_table1 = table_dir + '/table1.csv'
fn_table2 = table_dir + '/table2.csv'
fn_table3a = table_dir + '/table3a.csv'
fn_table3b = table_dir + '/table3b.csv'
fn_table3a_full = table_dir + '/table3a_full.csv'
fn_table3b_full = table_dir + '/table3b_full.csv'
os.makedirs(table_dir, exist_ok=True)

In [3]:
# Load the four keyword-status CSVs in one pass, in the same order as before.
df_policy, df_hate, df_basewords, df_adhoc = (
    pd.read_csv(path)
    for path in (fn_policy, fn_hate, fn_basewords, fn_adhoc)
)

In [4]:
# columns shown when comparing the status of individual search terms
display_cols = [
    'search_term',
    'status',
]

## Google Ads API for Ad Placements

In [5]:
# What share of the terms that YouTube says are not advertiser-friendly are "blocked"?
df_policy['status'].value_counts(normalize=True)

Blocked          0.753333
Full             0.226667
Partial Block    0.020000
Name: status, dtype: float64

## What's on the blocklist?

In [6]:
# Maps raw column names to the headers used in the published table;
# the insertion order is also the column order of that table.
col2hate_table_col = dict(
    search_term="Search Term",
    status="Status for Ad Placement",
    n_youtube_videos="N YouTube Videos",
    n_youtube_channels="N YouTube Channels",
    additional_info_link="Additional Info",
)

In [7]:
# Keep only the mapped columns, give them their table headers, and save as table 1.
hate_table = df_hate[list(col2hate_table_col)].rename(columns=col2hate_table_col)
hate_table.to_csv(fn_table1, index=False)
hate_table.head(5)

Unnamed: 0,Search Term,Status for Ad Placement,N YouTube Videos,N YouTube Channels,Additional Info
0,14 words,Full,61142590.0,240367.0,https://www.adl.org/education/references/hate-...
1,2083: a european declaration of independence,Full,32.0,0.0,https://www.splcenter.org/hatewatch/2011/07/24...
2,alt-lite,Full,15269138.0,150.0,https://www.adl.org/resources/backgrounders/fr...
3,alt-right,Full,15276072.0,21880.0,https://www.adl.org/resources/backgrounders/fr...
4,american front,Full,67677338.0,83682.0,https://www.adl.org/education/references/hate-...


In [8]:
# what are the statuses of un-altered hate terms?
# `value_counts` is the helper imported from `utils`; its rendered output here
# has a `count` and a `percentage` column per distinct status.
value_counts(df_hate, col='status')

Unnamed: 0,count,percentage
Full,59,0.678161
Blocked,28,0.321839


In [9]:
# what are the statuses from multi-word terms with spaces removed?
# a term counts as multi-word when splitting on a single space yields more than one piece
is_multi_word = df_hate['search_term'].str.split(' ').str.len() > 1
value_counts(df_hate[is_multi_word], col='status_no_spaces')

Unnamed: 0,count,percentage
Full,10,0.588235
Partial Block,3,0.176471
Blocked,3,0.176471
Empty,1,0.058824


In [10]:
# which multi-word terms are still blocked once their spaces are removed?
word_counts = df_hate['search_term'].str.split(' ').str.len()
blocked_without_spaces = df_hate['status_no_spaces'] == "Blocked"
still_blocked = df_hate.loc[(word_counts > 1) & blocked_without_spaces, 'search_term']

still_blocked

59      american nazi party
68         holocaust denial
86    white pride worldwide
Name: search_term, dtype: object

In [11]:
# collect every distinct space-separated word from the still-blocked phrases
unique_basewords = set()
for phrase in still_blocked.tolist():
    unique_basewords.update(phrase.split(' '))

In [12]:
# among those three terms, which base words are blocked?
# The single words are looked up in df_basewords; the two-word term
# 'white pride' is looked up in df_hate and stacked underneath.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# row order and the original index labels are preserved.
pd.concat([
    df_basewords[df_basewords.search_term.isin(unique_basewords)],
    df_hate[df_hate.search_term == 'white pride'],
])[display_cols]

Unnamed: 0,search_term,status
4,american,Full
27,denial,Full
92,party,Full
100,pride,Full
141,white,Full
144,worldwide,Full
196,holocaust,Blocked
219,nazi,Blocked
85,white pride,Blocked


In [13]:
# white nationalist vs -ists and -ism
# Matching rows from the hate list come first, then the ad hoc list.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# row order and the original index labels are preserved.
pd.concat([
    df_hate[df_hate.search_term.str.contains('white national')],
    df_adhoc[df_adhoc.search_term.str.contains('white national')],
])[display_cols]

Unnamed: 0,search_term,status
53,white nationalism,Full
83,white nationalist,Blocked
25,white nationalists,Full


In [14]:
# "terrorists" vs "terrorist"
# Stack the matching rows from the base-word, policy and ad hoc lists, in that order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# row order and the original index labels are preserved.
pd.concat([
    df_basewords[df_basewords.search_term.str.contains('terrorist')],
    df_policy[df_policy.search_term.str.contains('terrorist')],
    df_adhoc[df_adhoc.search_term.str.contains('terrorist')],
])[display_cols]

Unnamed: 0,search_term,status
254,terrorist,Blocked
24,praise terrorists,Full
26,recruit terrorists,Full
136,terrorist acts,Blocked
137,terrorist attack,Blocked
138,terrorist hostages,Blocked
139,terrorist ideology,Blocked
140,terrorist recruitment,Blocked
147,video game terrorist mod,Blocked
18,terrorists,Full


In [15]:
# multi-word blocked terms, which return full responses after spaces are removed.
# Built as one chain from a .loc selection so that no column is assigned on a
# slice of df_hate (which raises SettingWithCopyWarning) and nothing is
# mutated in place; re-running the cell gives the same frame.
blocked_then_full = (
    (df_hate.status == 'Blocked') &
    (df_hate.status_no_spaces == "Full")
)

bandaids = (
    df_hate.loc[blocked_then_full, ['search_term']]
    .rename(columns={'search_term': 'blocked response'})
    # the "full response" column is the same term with its spaces stripped
    .assign(**{
        'full response': lambda frame: frame['blocked response'].str.replace(' ', '')
    })
    .sort_values(by='blocked response', ascending=False)
)
bandaids.to_csv(fn_table2, index=False)
bandaids

Unnamed: 0,blocked response,full response
85,white pride,whitepride
84,white pill,whitepill
83,white nationalist,whitenationalist
82,white genocide,whitegenocide
78,sieg heil,siegheil
77,radical islamic terror,radicalislamicterror
70,jewish question,jewishquestion
67,heil hitler,heilhitler
65,globalist jews,globalistjews
62,dual seedline,dualseedline


## Hate phrases vs hate content

In [16]:
# number of YouTube videos recorded for the term "white power"
df_hate.loc[df_hate['search_term'] == "white power", 'n_youtube_videos']

54    169420977.0
Name: n_youtube_videos, dtype: float64

In [17]:
# Load the suggested videos and suggested channels for the hate terms.
df_hate_video, df_hate_channels = map(
    pd.read_csv, (fn_youtube_videos, fn_youtube_channels)
)

In [18]:
# top channels from suggested videos
# full tally of how often each channel appears, saved before any filtering
video_channel_counts = df_hate_video['youtube_video_channel'].value_counts()
video_channel_counts.to_csv(fn_table3a_full)

# the abridged table keeps only channels that appear at least this many times
min_video_appearances = 8
channels_from_video = video_channel_counts.loc[
    video_channel_counts >= min_video_appearances
]
channels_from_video.to_csv(fn_table3a)
channels_from_video

CNN                    27
Ruptly                 24
AP Archive             18
Dystopia Now           12
The F/S Effect         12
Destiny                12
Newsy                  11
The Young Turks        10
Global News            10
Soap - Sim Racer        9
Journeyman Pictures     8
VICE News               8
NowThis News            8
Name: youtube_video_channel, dtype: int64

In [19]:
# top channels from suggested channels
# full tally of how often each channel name appears, saved before any filtering
suggested_channel_counts = df_hate_channels['youtube_channel_name'].value_counts()
suggested_channel_counts.to_csv(fn_table3b_full)

# the abridged table keeps only channels that appear at least this many times
min_channel_appearances = 3
channels_from_channels = suggested_channel_counts.loc[
    suggested_channel_counts >= min_channel_appearances
]
channels_from_channels.to_csv(fn_table3b)
channels_from_channels

CinemaSins                  4
PragerU                     4
Beau of the Fifth Column    4
VICE                        4
Democracy Now!              4
act.tv                      3
Trae Crowder                3
Pixel_Hipster               3
StevenCrowder               3
Dia Beltran                 3
The Officer Tatum           3
ContraPoints                3
Name: youtube_channel_name, dtype: int64