# INCA Scrapers: Test with Sample of Media Cloud URLs
**Purpose**: 
- Test scrapers in INCA using Kibana's Discover and Dashboard

In [1]:
import os
import pandas as pd
import datetime

In [2]:
from inca import Inca



In [3]:
from usrightmedia.shared.loggers import get_logger
# Notebook-wide logger; scrape() below writes its per-URL progress messages to it.
# NOTE(review): `filename` presumably names the log file and `logger_type` selects a
# handler configuration -- confirm in usrightmedia.shared.loggers.
LOGGER = get_logger(filename = '02-mediacloud-sample', logger_type='main')

## Reference: show outlets

In [4]:
from usrightmedia.shared.media_references import get_mediacloud_outlet_ids

In [5]:
# Display the reference table of outlets and their Media Cloud `media_id` values.
# The `outlet` strings are the values scrape() matches against url_dict['outlet'].
get_mediacloud_outlet_ids()

Unnamed: 0,outlet,media_id
0,American Renaissance,26186
1,Breitbart,19334
2,Daily Caller,18775
3,Daily Stormer,113988
4,Fox News,1092
5,Gateway Pundit,25444
6,InfoWars,18515
7,Newsmax,25349
8,One America News,127733
9,Rush Limbaugh,24669


## Instantiate INCA

In [6]:
# Notebook-level INCA instance.
# NOTE(review): scrape(url_dict), as called in the fetch loop below, constructs its own
# Inca() and does not read this variable -- confirm whether any other cell needs it.
myinca = Inca()

## Load sample data

In [7]:
# Directory holding the sampled Media Cloud URLs. The path is relative, so it resolves
# against the kernel's current working directory (two levels above it, then data/...).
dir_sample = os.path.join('..', '..', 'data', '02-intermediate', '03-mediacloud-sample')

In [8]:
# Load the sampled URLs (one row per URL) from the project's intermediate data.
# SECURITY: pd.read_pickle unpickles arbitrary Python objects and can execute code;
# only load this file if it was produced by a trusted step of this pipeline.
df_sample = pd.read_pickle(os.path.join(dir_sample, 'mediacloud_urls_sampled_with_alt_url.pkl'))

In [9]:
# Convert the frame to a list of dicts (one per row, keyed by column name); each dict
# is the `url_dict` argument that scrape() expects.
urls_to_fetch = df_sample.to_dict("records")

In [10]:
# Show the first record to confirm the keys available to scrape()
# (it reads 'url_id' and 'outlet' and passes the whole dict on to the scraper).
urls_to_fetch[0]

{'url_id': '590133932',
 'outlet': 'Daily Caller',
 'publish_date': Timestamp('2017-03-06 22:59:43+0000', tz='UTC'),
 'title': 'Is Conservative Criticism Of GOP Obamacare Proposal Putting Repeal At Risk?',
 'url': 'http://dailycaller.com/2017/03/06/is-conservative-criticism-of-gop-obamacare-proposal-putting-repeal-at-risk/',
 'alt_url': '',
 'ap_syndicated': False,
 'themes': ''}

## Define function for scraping a URL based on its outlet

In [11]:
# Maps an outlet name (the value of url_dict['outlet']) to the name of the scraper
# method on `<Inca instance>.usmedia_scrapers` that collects URLs for that outlet.
_OUTLET_SCRAPERS = {
    "American Renaissance": "americanrenaissance",
    "Breitbart": "breitbart",
    "Daily Caller": "dailycaller",
    "Daily Stormer": "dailystormer",
    "Fox News": "foxnews",
    "Gateway Pundit": "gatewaypundit",
    "InfoWars": "infowars",
    "Newsmax": "newsmax",
    "One America News": "oneamericanews",
    "Rush Limbaugh": "rushlimbaugh",
    "Sean Hannity": "seanhannity",
    "VDARE": "vdare",
    "Washington Examiner": "washingtonexaminer",
}


def scrape(url_dict, inca=None):
    """Collect a URL using the appropriate scraper in INCA.

    The document ID is built as ``"<outlet without spaces>_<url_id>"``. If a document
    with that ID already exists, the URL is skipped. If the outlet has no registered
    scraper, a warning is logged and nothing is collected.

    Args:
        url_dict (dict): Information about the URL. Must contain the keys
            'url_id' and 'outlet'; the whole dict is passed to the scraper
            as ``url_info``.

            Example:
            {'url_id': '1565840471',
             'outlet': 'Gateway Pundit',
             'publish_date': Timestamp('2020-04-01 17:00:59+0000', tz='UTC'),
             'title': 'Joe Biden Appears To Be Reading From Note Cards During Media Spot (VIDEO)',
             'url': 'https://www.thegatewaypundit.com/2020/04/joe-biden-appears-to-be-reading-from-note-cards-during-media-spot-video/',
             'alt_url': '',
             'ap_syndicated': False,
             'themes': ''}

        inca (Inca, optional): An existing Inca instance to reuse. When omitted,
            a new ``Inca()`` is constructed for this call (the original behavior).

    Returns:
        None
        *URL's info is stored as a document in Elasticsearch

    Raises:
        KeyError: If 'url_id' or 'outlet' is missing from ``url_dict``.
    """
    # Reuse the caller's instance when given; otherwise build one per call as before.
    myinca = Inca() if inca is None else inca

    d = url_dict
    url_id = d['url_id']
    outlet = d['outlet']

    es_id = f"{outlet.replace(' ', '')}_{url_id}"

    # check_exists returns a sequence whose first element is truthy when the ID exists.
    if myinca.database.check_exists(es_id)[0]:
        LOGGER.info(f"URL with es_id {es_id} already exists; skip.")
        return

    scraper_name = _OUTLET_SCRAPERS.get(outlet)
    if scraper_name is None:
        # Previously an unrecognized outlet fell through every branch and was still
        # logged as "Finished collecting", hiding the fact that nothing was scraped.
        LOGGER.warning(f"No scraper registered for outlet {outlet!r}; {es_id} was not collected.")
        return

    LOGGER.info(f"Collecting {es_id}...")
    getattr(myinca.usmedia_scrapers, scraper_name)(url_info=d)
    LOGGER.info(f"Finished collecting {es_id}.")

In [13]:
# Scrape every sampled URL in order, printing wall-clock start/end times and one
# progress line per URL. scrape() skips URLs whose document already exists, so this
# cell can be re-run after an interruption. There is no error handling here: an
# exception raised inside scrape() propagates and stops the loop at that URL.
print(f"starting fetch: {datetime.datetime.now()}")
for n, url in enumerate(urls_to_fetch):
    # `url` is the full record dict for this row; url['url'] is the address itself.
    print(f"fetching url {n}: {url['url']}")
    scrape(url)
print(f"ending fetch: {datetime.datetime.now()}")