# Data Preprocessing
This notebook parses and processes the API responses from the "PlacementSuggestionService".

In [1]:
import os
import json
import glob

from tqdm import tqdm
import pandas as pd

from utils import determine_status, process_api_response

In [2]:
# --- inputs: raw API responses and the background-info sheet for hate terms ---
data_in = '../data/input/placements_api/'
fn_hate_info = '../data/input/hate_terms_background_info.csv'

# --- output directories, created up front if they do not exist yet ---
data_dir = '../data/output/placements_api_keyword_status/'
data_dir_2 = '../data/output/placements_api_suggestions/'
for _out_dir in (data_dir, data_dir_2):
    os.makedirs(_out_dir, exist_ok=True)

# keyword-status tables, one CSV per keyword list (under `data_dir`)
fn_hate = os.path.join(data_dir, "hate.csv")
fn_social_justice = os.path.join(data_dir, "social_justice.csv")
fn_policy = os.path.join(data_dir, "policy.csv")
fn_basewords = os.path.join(data_dir, "basewords.csv")
fn_adhoc = os.path.join(data_dir, "adhoc.csv")

# parsed video / channel suggestions (under `data_dir_2`)
fn_hate_videos = os.path.join(data_dir_2, 'videos_for_hate_terms.csv')
fn_social_justice_videos = os.path.join(data_dir_2, 'videos_for_social_justice_terms.csv')
fn_hate_channels = os.path.join(data_dir_2, 'channels_for_hate_terms.csv')
fn_social_justice_channels = os.path.join(data_dir_2, 'channels_for_social_justice_terms.csv')

In [14]:
# Columns written to every exported CSV: the term and its two status columns
# first, followed by the YouTube suggestion counts and their `_no_spaces`
# counterparts.
_term_and_status = ['search_term', 'status', 'status_no_spaces']
_suggestion_counts = [
    'n_youtube_videos',
    'n_youtube_channels',
    'n_youtube_videos_no_spaces',
    'n_youtube_channels_no_spaces',
]
display_cols = _term_and_status + _suggestion_counts

In [3]:
# These are the files we're going to process: every `*.json` file found one
# directory level below `data_in`.
# `glob.glob` returns matches in arbitrary, filesystem-dependent order, so the
# list is sorted to keep the order of `dataset` (and of everything derived
# from it) reproducible across machines and re-runs.
files = sorted(glob.glob(data_in + '*/*.json'))
len(files)

802

## Checking the API responses
Refer to `../data/reference/placements_api_example_responses/full.json` for an example of what a typical API response looks like.

We use the following function to parse each API response:

In [4]:
??process_api_response

[0;31mSignature:[0m [0mprocess_api_response[0m[0;34m([0m[0mfn[0m[0;34m)[0m [0;34m->[0m [0mdict[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mprocess_api_response[0m[0;34m([0m[0mfn[0m[0;34m)[0m [0;34m->[0m [0mdict[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Reads the JSON returned from the API, and parses metadata for [0m
[0;34m    YouTube video and channel suggestions.[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m    [0mdata[0m [0;34m=[0m [0mjson[0m[0;34m.[0m[0mload[0m[0;34m([0m[0mopen[0m[0;34m([0m[0mfn[0m[0;34m)[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0msearch_term[0m [0;34m=[0m [0mfn[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m'/'[0m[0;34m)[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0mreplace[0m[0;34m([0m[0;34m'.json'[0m[0;34m,[0m [0;34m''[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0mrecord[0m [0;34m=[0m [0;34m{[0m[0;34m'fn'[0m [0;34m:[0m 

In [5]:
# Parse each response file into a record (one dict per file, with a progress
# bar), then gather all records into a single DataFrame.
dataset = [process_api_response(path) for path in tqdm(files)]
df = pd.DataFrame(dataset)

100%|██████████| 802/802 [00:00<00:00, 11033.41it/s]


We use the following function to decipher the API statuses:

In [6]:
??determine_status

[0;31mSignature:[0m [0mdetermine_status[0m[0;34m([0m[0mresp[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mdetermine_status[0m[0;34m([0m[0mresp[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Determines the status (`Full`, `Blocked`, `Partial Block` or `Empty`) [0m
[0;34m    of or a given keyword's API response (`resp`).[0m
[0;34m    [0m
[0;34m    Examples for each kind of response in [0m
[0;34m    `../data/reference/placements_api_response_examples`. [0m
[0;34m    [0m
[0;34m    Please read the methodology for more detail.[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m    [0;32mif[0m [0mresp[0m[0;34m[[0m[0;34m'is_blocked'[0m[0;34m][0m [0;34m==[0m [0;32mTrue[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0;34m'Blocked'[0m[0;34m[0m
[0;34m[0m    [0;32melif[0m [

In [7]:
# Label every row with the status that `determine_status` derives from it
# (the function receives one full row at a time).
df = df.assign(status=df.apply(determine_status, axis=1))

In [8]:
# Rows whose source file path contains a `/blocked/` directory component.
_from_blocked_dir = df['fn'].str.contains('/blocked/')
blocked = df.loc[_from_blocked_dir]

In [9]:
# Columns taken from the `blocked` rows and attached to the main table.
merge_cols = ['search_term', 'is_blocked', 'status',
              'n_youtube_videos', 'n_youtube_channels']

# Left join on the search term, so every row of `df` is kept. Columns that
# exist on both sides keep their name for `df` and get a `_no_spaces` suffix
# for the values coming from `blocked`.
df = pd.merge(
    df,
    blocked[merge_cols],
    how='left',
    on=['search_term'],
    suffixes=('', '_no_spaces'),
)

In [10]:
# Turn both status columns into categoricals whose category order is the
# order we want when sorting.
status_order = ["Full", "Empty", "Partial Block", "Blocked"]
df = df.assign(**{
    status_col: pd.Categorical(df[status_col], categories=status_order)
    for status_col in ('status', 'status_no_spaces')
})

In [11]:
# Sort by status, then by search term, then by the no-spaces status.
_sort_keys = ['status', 'search_term', 'status_no_spaces']
df = df.sort_values(by=_sort_keys, ascending=True)

In [12]:
# Split the combined table by the directory each response file was read from.
_source_path = df['fn'].str
hate = df[_source_path.contains('/hate/')]
social_justice = df[_source_path.contains('/social_justice/')]
policy = df[_source_path.contains('/policy/')]
word = df[_source_path.contains('/blocked_basewords/')]
adhoc = df[_source_path.contains('adhoc/')]

Save the terms and responses:

In [15]:
# Add links for background info on each hate term. This is an inner join on
# `search_term`, so only hate terms listed in `fn_hate_info` are kept.
hate = hate.merge(pd.read_csv(fn_hate_info), on='search_term')

# Columns exported for the hate list: the shared columns plus the link.
hate_cols = display_cols + ['background_info_link']

# Save the status of terms from each keyword list.
hate[hate_cols].to_csv(fn_hate, index=False)
social_justice[display_cols].to_csv(fn_social_justice, index=False)
policy[display_cols].to_csv(fn_policy, index=False)
word[display_cols].to_csv(fn_basewords, index=False)
adhoc[display_cols].to_csv(fn_adhoc, index=False)


# Here we write everything to a spreadsheet, one sheet per keyword list.
# The `with` block saves and closes the workbook on exit, even if one of the
# writes raises. `ExcelWriter.save()` was removed in pandas 2.0, and
# `sheet_name` is passed by keyword because the positional form is deprecated.
with pd.ExcelWriter('../data/reference/what_is_blocked.xlsx') as writer:
    hate[hate_cols].to_excel(writer, sheet_name='hate', index=False)
    social_justice[display_cols].to_excel(writer, sheet_name='social_justice', index=False)
    policy[display_cols].to_excel(writer, sheet_name='policy', index=False)
    adhoc[display_cols].to_excel(writer, sheet_name='adhoc', index=False)
    word[display_cols].to_excel(writer, sheet_name='basewords', index=False)

## What YouTube channels and videos are suggested?
`Full` API responses contain suggestions to parse and analyze; the cells below show how we accessed that information.

In [16]:
# Flatten the nested suggestions into one row per suggested channel / video.
# A search term with no suggestions of a given kind still contributes a single
# placeholder row holding only `search_term`.

# Build the membership set once instead of calling `df['fn'].tolist()` for
# every record, which made the filter quadratic.
known_fns = set(df['fn'])

youtube_channels = []
youtube_videos = []
for results in dataset:
    if results['fn'] not in known_fns:
        continue
    _row = {'search_term': results['search_term']}

    # check channel suggestions
    channel_suggestions = results.get('youtube_channels')
    if channel_suggestions:
        for channel_meta in channel_suggestions:
            # merge the metadata over the base row, then add a URL
            row = {**_row, **channel_meta}
            row['channel_url'] = ('www.youtube.com/channel/'
                                  f'{channel_meta["youtube_channel_id"]}')
            youtube_channels.append(row)
    else:
        youtube_channels.append(_row)

    # check video suggestions
    video_suggestions = results.get('youtube_videos')
    if video_suggestions:
        for video_meta in video_suggestions:
            row = {**_row, **video_meta}
            row['video_url'] = ('www.youtube.com/watch/?v='
                                f'{video_meta["youtube_video_id"]}')
            youtube_videos.append(row)
    else:
        youtube_videos.append(_row)

In [17]:
# Build one DataFrame per suggestion type from the collected row dicts.
df_channel, df_vids = map(pd.DataFrame, (youtube_channels, youtube_videos))

In [18]:
# Keep only rows that carry a suggested video (rows without a
# `youtube_video_id` are dropped).
# `.copy()` makes the filtered result an independent frame, so the later
# in-place assignment to its `search_term` column does not trigger pandas'
# SettingWithCopyWarning.
df_vids = df_vids[df_vids['youtube_video_id'].notna()].copy()

# How many suggested-video rows belong to hate terms?
len(df_vids[df_vids['search_term'].isin(hate['search_term'])])

1385

In [19]:
def get_bandaid_search_term(search_term, frame=None):
    """Make clear if the search_term was a space-removal term.

    A term counts as a space-removal term when at least one row of `frame`
    with that `search_term` has a non-null `status_no_spaces`. Such terms are
    returned with all spaces removed; every other term is returned unchanged.

    Parameters
    ----------
    search_term : str
        The term to look up.
    frame : pandas.DataFrame, optional
        Table with `search_term` and `status_no_spaces` columns. Defaults to
        the notebook-level `df`, which keeps existing one-argument calls
        (e.g. `Series.apply(get_bandaid_search_term)`) working unchanged.

    Returns
    -------
    str
        `search_term` without spaces if it is a space-removal term,
        otherwise `search_term` as given.
    """
    if frame is None:
        frame = df
    has_no_space_status = frame['status_no_spaces'].notna()
    # set membership instead of scanning a freshly built list
    no_space_terms = set(frame.loc[has_no_space_status, 'search_term'])
    if search_term in no_space_terms:
        return search_term.replace(' ', '')
    return search_term

In [20]:
# Rewrite `search_term` in both suggestion tables so that space-removal terms
# appear without their spaces.
for _suggestions in (df_vids, df_channel):
    _suggestions.loc[:, "search_term"] = (
        _suggestions["search_term"].apply(get_bandaid_search_term)
    )

In [21]:
# Number of distinct videos suggested for the hate terms.
_from_hate_term = df_vids['search_term'].isin(hate['search_term'])
df_vids.loc[_from_hate_term, 'youtube_video_id'].nunique()

1140

In [22]:
# Write the parsed video suggestions: one CSV for the hate terms and one for
# the social-justice terms.
for _terms, _csv_path in (
    (hate, fn_hate_videos),
    (social_justice, fn_social_justice_videos),
):
    _selected = df_vids[df_vids['search_term'].isin(_terms['search_term'])]
    _selected.to_csv(_csv_path, index=False)

In [23]:
# Write the parsed channel suggestions: one CSV for the hate terms and one for
# the social-justice terms.
for _terms, _csv_path in (
    (hate, fn_hate_channels),
    (social_justice, fn_social_justice_channels),
):
    _selected = df_channel[df_channel['search_term'].isin(_terms['search_term'])]
    _selected.to_csv(_csv_path, index=False)