# Analyzing Suggested Videos and Channels

This section follows the "Hate phrases vs hate content" section of the methodology.

In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# inputs
fn_adl = '../data/input/channel_lists/adl_extremist_alternative_channels_overlap.csv'
fn_acm = '../data/input/channel_lists/ribeiro_sources.csv'

fn_topline = '../data/input/video_metadata/topline_hate_videos.csv'
fn_deeper_catalog = '../data/input/video_metadata/deep_catalog_wwk_wg_we.csv'
fn_youtube_videos = '../data/output/placements_api_suggestions/videos_for_hate_terms.csv'
fn_youtube_channels = '../data/output/placements_api_suggestions/channels_for_hate_terms.csv'

# outputs
fn_table = '../data/output/channel_overlap.csv'

We cross-reference suggestions from the "PlacementSuggestionService" API against channels that researchers from [EPFL and UFMG identified](https://dl.acm.org/doi/abs/10.1145/3351095.3372879) as "alt-right" and "alt-lite" in their 2020 paper, "Auditing radicalization pathways on YouTube," as well as channels a [recent report](https://www.adl.org/resources/reports/exposure-to-alternative-extremist-content-on-youtube) conducted by researchers at Dartmouth College, Northeastern Univerisity, and Univeristy of Exeter identified as "extremist" and "alternative" in the ADL-published report, "Exposure to Alternative & Extremist Content on YouTube".

The ADL list is not publically available. Contact the authors of the ADL study and add it to this repo where `fn_adl` should be. This notebook will run even without the file (`adl_file_present` will be `False`).

In [3]:
adl_file_present = os.path.exists(fn_adl)
adl_file_present

True

In [4]:
if adl_file_present:
    df_adl = pd.read_csv(fn_adl)
    extremist_channels = df_adl[df_adl.type == 'extremist'].channel_id.tolist()
    alt_channels = df_adl[df_adl.type == 'alternative'].channel_id.tolist()

In [5]:
df_acm = pd.read_csv(fn_acm)
altright_channels = df_acm[df_acm.Category == 'Alt-right'].Id.tolist()
altlite_channels = df_acm[df_acm.Category == 'Alt-lite'].Id.tolist()

In [6]:
len(altright_channels), len(altlite_channels)

(88, 114)

## Getting YouTube Metadata
We used `channel_id` to match videos we were suggested with channels listed in two other reports. This is not available in the suggested videos, so we collect the video metadata using the YouTube Data API (v3) Python client.

This code is just an example of what we did for all `video_id`'s:

```python
from youtube_api import YouTubeDataAPI

def custom_parser(resp : dict) -> dict:
    """
    Parses the JSON response from the YouTube Data API.
    Mostly descriptive fields from the snippet.
    """
    return {
        "video_id" : resp['id'],
        "video_title": resp['snippet']['title'],
        "video_description": resp['snippet']['description'],
        "video_tags": "|".join(resp['snippet'].get('tags', [])),
        "video_published_at": resp['snippet']['publishedAt'],
        "channel_title": resp['snippet']['channelTitle'],
        "channel_id": resp['snippet']['channelId'],
    }

# you need an API key for the YouTube Data API v3
YT_KEY = "AKAI- ..."
yt = YouTubeDataAPI(YT_KEY)

video_id = "5qap5aO4i9A"
video_metadata = yt.get_video_metadata(video_id, parser=custom_parser)
```

## How many Extremist and Alternative channels were we suggested for Topline suggestions?

In [7]:
df_topline_channels = pd.read_csv(fn_youtube_channels)
df_topline_channels = df_topline_channels[~df_topline_channels.youtube_channel_id.isnull()]

In [8]:
df_topline = pd.read_csv(fn_topline)
print(f"N unique videos: {len(df_topline)}")
df_topline.iloc[0]

N unique videos: 1311


video_id                                                    04axDDRVy_o
video_title           The “ethnic cleansing” of Myanmar’s Rohingya M...
video_description     The Rohingya have been systematically driven o...
video_tags            vox.com|vox|explain|Rohingya|Myanmer|Bangledes...
video_published_at                             2017-09-25T12:00:01.000Z
channel_title                                                       Vox
channel_id                                     UCLXo7UDZvByw2ixzpQCufnA
Name: 0, dtype: object

combine unique channels from videos and channels

In [9]:
## ADL
if adl_file_present:
    # video suggestions
    topline_extreme = df_topline[df_topline.channel_id.isin(extremist_channels)]
    topline_alt = df_topline[df_topline.channel_id.isin(alt_channels)]

    # channel suggestions
    topline_extreme_channel_suggestions = df_topline_channels[
        df_topline_channels.youtube_channel_id.isin(extremist_channels)
    ]
    topline_alt_channel_suggestions = df_topline_channels[
    df_topline_channels.youtube_channel_id.isin(alt_channels)]
    
    topline_extreme_channels = list(set(
        topline_extreme_channel_suggestions.youtube_channel_name.tolist() + 
        topline_extreme.channel_title.tolist()
    ))
    
    topline_alt_channels = list(set(
        topline_alt_channel_suggestions.youtube_channel_name.tolist() + 
        topline_alt.channel_title.tolist()
    ))
    
else:    
    topline_extreme = pd.DataFrame({'video_id': []})
    topline_alt = pd.DataFrame({'video_id': []})
    
    topline_extreme_channels = list()
    topline_alt_channels = list()

In [10]:
# ACM
# video suggestions
topline_altright = df_topline[df_topline.channel_id.isin(altright_channels)]
topline_altlite = df_topline[df_topline.channel_id.isin(altlite_channels)]

# channel suggestions
topline_altright_channel_suggestions = df_topline_channels[
    df_topline_channels.youtube_channel_id.isin(altright_channels)]

topline_altlite_channel_suggestions = df_topline_channels[
    df_topline_channels.youtube_channel_id.isin(altlite_channels)]

# combine unique channels from videos and channels
topline_altright_channels= list(set(
    topline_altright_channel_suggestions.youtube_channel_name.tolist() + 
    topline_altright.channel_title.tolist()
))

topline_altlite_channels= list(set(
    topline_altlite_channel_suggestions.youtube_channel_name.tolist() + 
    topline_altlite.channel_title.tolist()
))

## How many Extremist and Alternative channels were we suggested for the deeper catalog of three terms?

In [11]:
df_deeper_catalog = pd.read_csv(fn_deeper_catalog)
print(f"N unique videos: {len(df_deeper_catalog)}")
df_deeper_catalog.iloc[0]

N unique videos: 1635


video_id                                                    puJ-arJgkZU
video_title           Gary Younge interviews Richard Spencer: 'Afric...
video_description     In a dramatic interview, the Guardian's Gary Y...
video_tags            Richard Spencer|Gary Younge|white supremacy|ra...
video_published_at                             2017-11-07T09:28:00.000Z
channel_title                                              The Guardian
channel_id                                     UCHpw8xwDNhU9gdohEcJu4aA
Name: 0, dtype: object

In [12]:
if adl_file_present:
    deep_extreme = df_deeper_catalog[df_deeper_catalog.channel_id.isin(extremist_channels)]
    deep_alt = df_deeper_catalog[df_deeper_catalog.channel_id.isin(alt_channels)]
else:
    deep_extreme = pd.DataFrame({'video_id': [], 'channel_title': []})
    deep_alt = pd.DataFrame({'video_id': [], 'channel_title': []})

In [13]:
deep_altright = df_deeper_catalog[df_deeper_catalog.channel_id.isin(altright_channels)]
deep_altlite = df_deeper_catalog[df_deeper_catalog.channel_id.isin(altlite_channels)]

In [14]:
result = pd.DataFrame([
    {
        "data source": "topline",
        "category": "extremist",
        "n_videos": topline_extreme.video_id.nunique(),
        "perc_videos" : topline_extreme.video_id.nunique() / df_topline.video_id.nunique(),
        "n_channels": len(topline_extreme_channels),
        "channels" : None
    },
    {
        "data source": "topline",
        "category": "alt-right",
        "n_videos": topline_altright.video_id.nunique(),
        "perc_videos" : topline_altright.video_id.nunique() / df_topline.video_id.nunique(),
        "n_channels": len(topline_altright_channels),
        "channels" : " | ".join(set(topline_altright_channels))
    },
    {
        "data source": "topline",
        "category": "alternative",
        "n_videos": topline_alt.video_id.nunique(),
        "perc_videos" : topline_alt.video_id.nunique() / df_topline.video_id.nunique(),
        "n_channels": len(topline_alt_channels),
        "channels" : None
    },
    {
        "data source": "topline",
        "category": "alt-lite",
        "n_videos": topline_altlite.video_id.nunique(),
        "perc_videos" : topline_altlite.video_id.nunique() / df_topline.video_id.nunique(),
        "n_channels": len(topline_altlite_channels),
        "channels" : " | ".join(set(topline_altlite_channels))
    },
    
    
    {
        "data source": "deep3",
        "category": "extremist",
        "n_videos": deep_extreme.video_id.nunique(),
        "perc_videos" : deep_extreme.video_id.nunique() / df_deeper_catalog.video_id.nunique(),
        "n_channels": deep_extreme.channel_title.nunique(),
        "channels" : None
    },
    {
        "data source": "deep3",
        "category": "alt-right",
        "n_videos": deep_altright.video_id.nunique(),
        "perc_videos" : deep_altright.video_id.nunique() / df_deeper_catalog.video_id.nunique(),
        "n_channels": deep_altright.channel_title.nunique(),
        "channels" : " | ".join(set(deep_altright.channel_title.unique()))
    },
    {
        "data source": "deep3",
        "category": "alternative",
        "n_videos": deep_alt.video_id.nunique(),
        "perc_videos" : deep_alt.video_id.nunique() / df_deeper_catalog.video_id.nunique(),
        "n_channels": deep_alt.channel_title.nunique(),
        "channels" : None
    },
    {
        "data source": "deep3",
        "category": "alt-lite",
        "n_videos": deep_altlite.video_id.nunique(),
        "perc_videos" : deep_altlite.video_id.nunique() / df_deeper_catalog.video_id.nunique(),
        "n_channels": deep_altlite.channel_title.nunique(),
        "channels" : " | ".join(set(deep_altlite.channel_title.unique()))
    }
])

result

Unnamed: 0,data source,category,n_videos,perc_videos,n_channels,channels
0,topline,extremist,32,0.024409,21,
1,topline,alt-right,15,0.011442,7,Real McGoy | Nightmare Fuel | The Golden One |...
2,topline,alternative,53,0.040427,41,
3,topline,alt-lite,18,0.01373,15,Lauren Chen | Liberty Hangout | Liberty Doll |...
4,deep3,extremist,23,0.021375,14,
5,deep3,alt-right,14,0.013011,6,Nightmare Fuel | Real McGoy | Laura Towler | a...
6,deep3,alternative,219,0.203532,38,
7,deep3,alt-lite,30,0.027881,16,RobinHoodUKIP | Tree Of Logic | Sargon of Akka...


In [15]:
result.to_csv(fn_table, index=False)

## Do suggested videos contain metadata that matches the query?

In [16]:
# these are suggested videos, we'll merge the video metadata to make this search.
df_hate_video = pd.read_csv(fn_youtube_videos)

In [17]:
def search_metadata(row : dict) -> bool:
    """
    Check for metadata that corresponds with the `search_term`
    """
    term = row['search_term'].replace(' ', '')
    if term in row['youtube_video_title'].lower().replace(' ', ''):
        return True
    elif term in str(row['video_tags']).replace(' ', ''):
        return True
    elif term in str(row['video_description']).lower().replace(' ', ''):
        return True
    return False

In [18]:
df_hate_video.merge(
    df_topline, 
    left_on='youtube_video_id',
    right_on='video_id'
).apply(search_metadata, axis=1).value_counts(normalize=True)

True     0.565254
False    0.434746
dtype: float64