# Re-run hate terms
After we shared our findings with Google and got their statement, we re-collected the data and checked what changed.

This notebook processes the API responses for the key terms and compares the new API statuses with the original API statuses.

In [1]:
import os
import json
import glob

from tqdm import tqdm
import pandas as pd
from utils import value_counts, determine_status, process_api_response

In [2]:
# inputs: re-collected API responses, plus the keyword statuses saved from
# the original data collection
data_in = '../data/z_data_rerun/input/placements_api/'
fn_hate_og = '../data/output/placements_api_keyword_status/hate.csv'
fn_social_justice_og = '../data/output/placements_api_keyword_status/social_justice.csv'


# outputs: keyword statuses computed from the re-run
data_dir = '../data/z_data_rerun/output/placements_api_keyword_status/'
os.makedirs(data_dir, exist_ok=True)
fn_hate = os.path.join(data_dir, "hate.csv")
fn_social_justice = os.path.join(data_dir, "social_justice.csv")
fn_policy = os.path.join(data_dir, "policy.csv")
fn_basewords = os.path.join(data_dir, "basewords.csv")
fn_adhoc = os.path.join(data_dir, "adhoc.csv")

# output: side-by-side table of still-available vs. newly blocked hate terms
fn_table4 = '../data/output/tables/hate/table4.csv'
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the table's directory exists before anything is written to it
os.makedirs(os.path.dirname(fn_table4), exist_ok=True)

In [3]:
# Columns kept when a keyword list's status table is saved. The `_no_spaces`
# columns are the ones added by merging in the '/blocked/' responses.
display_cols = [
    'search_term',
    # status of the term itself and of its `_no_spaces` counterpart
    'status', 'status_no_spaces',
    # counts for the term itself
    'n_youtube_videos', 'n_youtube_channels',
    # counts for the `_no_spaces` counterpart
    'n_youtube_videos_no_spaces', 'n_youtube_channels_no_spaces',
]

In [4]:
# Collect every re-run API response (one JSON file per term, one sub-directory
# level below `data_in`). `glob.glob` returns paths in a file-system-dependent
# order, so sort them to keep the row order of the resulting table reproducible.
files = sorted(glob.glob(data_in + '*/*.json'))
len(files)

421

## Checking the API responses
Process the re-collected API responses with the same `utils` functions, `process_api_response` and `determine_status`, that were used on the original API responses in `1-data-preprocessing.ipynb`.

In [5]:
# parse each response file into one record (with a progress bar), then build
# a table holding one row per file
dataset = [process_api_response(path) for path in tqdm(files)]

df = pd.DataFrame(dataset)

100%|██████████| 421/421 [00:00<00:00, 24726.27it/s]


In [6]:
# label every response by passing each row of the table to `determine_status`
df['status'] = df.apply(determine_status, axis='columns')

In [7]:
# columns taken from the '/blocked/' responses; everything except the join key
# ends up with a '_no_spaces' suffix after the merge
merge_cols = ['search_term', 'is_blocked', 'status', 'n_youtube_videos', 'n_youtube_channels']

# responses whose file path contains a '/blocked/' directory
blocked = df[df['fn'].str.contains('/blocked/')]

# left join: every row of `df` is kept, and rows without a '/blocked/'
# counterpart get missing values in the '_no_spaces' columns
df = pd.merge(
    df,
    blocked[merge_cols],
    how='left',
    on='search_term',
    suffixes=('', '_no_spaces'),
)

In [8]:
# order rows from largest to smallest on each key, in this priority
sort_keys = ['n_youtube_channels', 'is_blocked_no_spaces', 'search_term']
df = df.sort_values(by=sort_keys, ascending=False)

In [9]:
# split the combined table into one frame per keyword list, using the
# directory name found in each response's file path
hate = df.loc[df['fn'].str.contains('/hate/')]
social_justice = df.loc[df['fn'].str.contains('/social_justice/')]
policy = df.loc[df['fn'].str.contains('/policy/')]
word = df.loc[df['fn'].str.contains('/blocked_basewords/')]
# NOTE(review): unlike the patterns above, this one has no leading slash — confirm that is intended
adhoc = df.loc[df['fn'].str.contains('adhoc/')]

In [10]:
# write the status table of each keyword list to its own CSV (display columns
# only, without the index)
for _frame, _path in [
    (hate, fn_hate),
    (social_justice, fn_social_justice),
    (policy, fn_policy),
    (word, fn_basewords),
    (adhoc, fn_adhoc),
]:
    _frame[display_cols].to_csv(_path, index=False)

## Analyze Hate terms

In [11]:
# load the hate-term statuses that were saved from the original data collection
hate_og = pd.read_csv(fn_hate_og)

In [12]:
# before: distribution of `status` among the hate terms in the original collection
value_counts(hate_og, col='status')

Unnamed: 0,count,percentage
Full,59,0.678161
Blocked,28,0.321839


Half of the terms are now blocked with `Empty` responses. Note that this use of the `Empty` response is entirely new compared with the original data collection. It prevents any further studies from differentiating between a blocked search and an obscure search.

In [13]:
# after: distribution of `status` among the same keyword list in the re-run
value_counts(hate, col='status')

Unnamed: 0,count,percentage
Empty,44,0.505747
Blocked,28,0.321839
Full,15,0.172414


In [14]:
# line up the original and re-run statuses of each hate term side by side;
# original columns get the '_before' suffix, re-run columns get '_after'
hate_status_cols = ['search_term', 'status', 'status_no_spaces']
hate_delta = pd.merge(
    hate_og[hate_status_cols],
    hate[hate_status_cols],
    on='search_term',
    suffixes=('_before', '_after'),
)

In [15]:
# hate terms whose status differs between the original collection and the
# re-run (i.e. terms that are now blocked with an "empty" response)
hate_status_changed = hate_delta['status_after'].ne(hate_delta['status_before'])
hate_delta.loc[hate_status_changed, ['search_term', 'status_before', 'status_after']]

Unnamed: 0,search_term,status_before,status_after
0,14 words,Full,Empty
1,2083: a european declaration of independence,Full,Empty
2,alt-lite,Full,Empty
3,alt-right,Full,Empty
4,american front,Full,Empty
5,american identity movement,Full,Empty
7,american vanguard,Full,Empty
10,blood and soil,Full,Empty
12,civilization jihad,Full,Empty
14,council of conservative citizens,Full,Empty


In [16]:
# space-removal bandaids whose status changed in the re-run (now blocked with
# an "empty" response); terms without an earlier no-spaces status are skipped
no_spaces_changed = (
    hate_delta['status_no_spaces_before'].notna()
    & hate_delta['status_no_spaces_after'].ne(hate_delta['status_no_spaces_before'])
)
hate_delta.loc[
    no_spaces_changed,
    ['search_term', 'status_no_spaces_before', 'status_no_spaces_after'],
]

Unnamed: 0,search_term,status_no_spaces_before,status_no_spaces_after
62,dual seedline,Full,Empty
65,globalist jews,Full,Empty
67,heil hitler,Full,Empty
70,jewish question,Full,Empty
77,radical islamic terror,Full,Empty
78,sieg heil,Full,Empty
82,white genocide,Full,Empty
83,white nationalist,Full,Empty
85,white pride,Full,Empty


In [17]:
# space-removal bandaid terms that were 'Full' before and have the same
# status after the re-run; listed in their space-removed form
no_spaces_still_full = (
    hate_delta['status_no_spaces_after'].eq(hate_delta['status_no_spaces_before'])
    & hate_delta['status_no_spaces_before'].eq('Full')
)
still_available = [
    term.replace(' ', '')
    for term in hate_delta.loc[no_spaces_still_full, 'search_term']
]

# unaltered terms whose status after the re-run is 'Full'
# NOTE(review): this filter does not look at the earlier status — confirm that is intended
still_available.extend(
    hate_delta.loc[hate_delta['status_after'].eq('Full'), 'search_term']
)

still_available.sort()

In [18]:
# space-removal terms that had a status before and whose status changed in
# the re-run; listed in their space-removed form
no_spaces_now_different = (
    hate_delta['status_no_spaces_before'].notna()
    & hate_delta['status_no_spaces_after'].ne(hate_delta['status_no_spaces_before'])
)
newly_blocked = [
    term.replace(' ', '')
    for term in hate_delta.loc[no_spaces_now_different, 'search_term']
]

# unaltered terms whose status changed in the re-run
newly_blocked.extend(
    hate_delta.loc[hate_delta['status_after'].ne(hate_delta['status_before']), 'search_term']
)

newly_blocked.sort()

In [19]:
# how many terms (space-removed variants included) changed vs. stayed available
len(newly_blocked), len(still_available)

(53, 16)

Let's display these changed and unchanged terms in a side-by-side table:

In [20]:
# Pad the shorter list with None: the two lists become columns of one table,
# and DataFrame columns built from lists must all have the same length.
# Originally only `still_available` was padded, so the table could not be
# built if it ever turned out to be the longer list; now either side is padded.
n_pad = len(newly_blocked) - len(still_available)
still_available.extend([None] * max(n_pad, 0))
newly_blocked.extend([None] * max(-n_pad, 0))

In [21]:
# Two independent, alphabetically sorted columns shown side by side; a row
# does not pair the two terms it contains with each other.
table4_columns = {
    'still allowed for placements': still_available,
    'now blocked from placements': newly_blocked,
}
changed_terms = pd.DataFrame(table4_columns)

# save the table, then show it as the cell output
changed_terms.to_csv(fn_table4, index=False)
changed_terms

Unnamed: 0,still allowed for placements,now blocked from placements
0,american renaissance,14 words
1,amerimutt,2083: a european declaration of independence
2,black sun,alt-lite
3,boogaloo,alt-right
4,color of crime,american front
5,diversity is a code word for anti-white,american identity movement
6,free helocopter rides,american vanguard
7,hammerskin,blood and soil
8,national socialism,civilization jihad
9,open borders for israel,council of conservative citizens


## Analyze Social Justice terms

In [22]:
# load the social justice term statuses that were saved from the original data collection
social_justice_og = pd.read_csv(fn_social_justice_og)

In [23]:
# before: distribution of `status` among the social justice terms in the original collection
value_counts(social_justice_og, col='status')

Unnamed: 0,count,percentage
Full,42,0.677419
Blocked,17,0.274194
Partial Block,3,0.048387


In [24]:
# after (re-run): half the terms are now blocked with an "empty" response
value_counts(social_justice, col='status')

Unnamed: 0,count,percentage
Empty,32,0.516129
Blocked,17,0.274194
Full,10,0.16129
Partial Block,3,0.048387


In [25]:
# line up the original and re-run status of each social justice term;
# original columns get the '_before' suffix, re-run columns get '_after'
social_justice_status_cols = ['search_term', 'status']
social_justice_delta = pd.merge(
    social_justice_og[social_justice_status_cols],
    social_justice[social_justice_status_cols],
    on='search_term',
    suffixes=('_before', '_after'),
)

In [26]:
# Which terms got blocked? Keep the rows whose status differs between the
# original collection and the re-run.
social_justice_changed = social_justice_delta['status_before'].ne(
    social_justice_delta['status_after']
)
social_justice_delta.loc[social_justice_changed]

Unnamed: 0,search_term,status_before,status_after
0,abolish ice,Full,Empty
2,anti-black,Full,Empty
3,anti-fascist,Full,Empty
4,antiracism,Full,Empty
5,believe black women,Full,Empty
6,bipoc,Full,Empty
7,black august,Full,Empty
9,black excellence,Full,Empty
10,black girls matter,Full,Empty
12,black identity extremists,Full,Empty
