# Fraudulent Provider Dataset Construction (PART 1)
## STEP 1
* Build Collection of Text Scraped from DOJ Press Release Site

In [1]:
# Import Python libraries
import scrapy
from scrapy.crawler import CrawlerProcess
import re
from urllib.parse import urlparse

In [2]:
# Define Scrapy Spider to scrape the DOJ Press Release listing (filtered to the Drugs -> Opioids topic)
class DOJPressReleaseSpider(scrapy.Spider):
    """Crawl the DOJ press release listing and emit one item per press release.

    The crawl is seeded by ``start_requests``. ``parse`` handles listing pages:
    it follows every press release link it finds and then the "next"
    pagination link. ``parse_release`` handles the detail pages and yields the
    scraped fields as a plain dict.
    """

    name = "DOJPressRelease"

    def start_requests(self):
        """Yield one request per seed listing URL, each handled by ``parse``."""
        seed_urls = [
            # DOJ Press Releases, filtered for category: Drugs -> Opioids
            'https://www.justice.gov/news?f%5B0%5D=type%3Apress_release&f%5B1%5D=field_pr_topic%3A34596',
            # Alternative seed (disabled): DOJ Opioid Awareness News Feed
            #'https://www.justice.gov/opioidawareness/news',
        ]
        yield from (
            scrapy.Request(url=seed_url, callback=self.parse)
            for seed_url in seed_urls
        )

    def parse(self, response):
        """Handle a listing page: queue its press releases, then the next page."""
        # href attribute of the press release link in each listing row
        release_hrefs = response.css('div.views-row span.field-content a::attr(href)')
        yield from response.follow_all(release_hrefs, callback=self.parse_release)

        # anchor inside the "next" pager item; re-enters this same callback
        next_page_anchors = response.css('li.pager__item--next a')
        yield from response.follow_all(next_page_anchors, callback=self.parse)

    def parse_release(self, response):
        """Handle a press release detail page and yield its scraped fields.

        Each CSS-selected field holds the list of *all* matches (``getall()``),
        not a single stripped string; ``release_url`` is the response URL.
        """
        select = response.css
        release_item = {
            # agency / office name
            'release_agency': select('div.pr-header div.agency::text').getall(),
            # release date, read from the element's ``content`` attribute
            'release_date': select('div.pr-info span.date-display-single::attr(content)').getall(),
            # release title
            'release_title': select('div.node__content h1::text').getall(),
            # URL the details were scraped from
            'release_url': response.url,
            # release text, taken from the text nodes of the selected paragraphs
            'release_text': select('div.field__item p::text').getall(),
        }
        yield release_item

In [3]:
# Configure the crawler process: browser-like user agent plus a CSV feed export
process = CrawlerProcess(settings=dict(
    USER_AGENT='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
    # Alternative feed (disabled): JSON Lines written to 'items.jsonl'
    #FEED_FORMAT='jsonlines',
    #FEED_URI='items.jsonl',
    FEED_FORMAT='csv',
    FEED_URI='DOJ_Press_Releases.csv',
    # Fields (and their order) to include in the exported feed
    FEED_EXPORT_FIELDS=[
        "release_date",
        "release_agency",
        "release_title",
        "release_url",
        "release_text",
    ],
))

2020-03-11 15:50:08 [scrapy.utils.log] INFO: Scrapy 2.0.0 started (bot: scrapybot)
2020-03-11 15:50:08 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.20.0, Twisted 19.10.0, Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Windows-10-10.0.17763-SP0
2020-03-11 15:50:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor


In [4]:
# Schedule the spider, then run the crawl; start() blocks until all crawling jobs finish.
# NOTE(review): a Twisted reactor presumably cannot be restarted in the same kernel,
# so re-running this cell likely needs a kernel restart -- confirm.
process.crawl(DOJPressReleaseSpider)
process.start(stop_after_crawl=True)  # stop_after_crawl=True spells out the default

2020-03-11 15:50:12 [scrapy.crawler] INFO: Overridden settings:
{'FEED_EXPORT_FIELDS': ['release_date',
                        'release_agency',
                        'release_title',
                        'release_url',
                        'release_text'],
 'FEED_FORMAT': 'csv',
 'FEED_URI': 'DOJ_Press_Releases.csv',
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 '
               'Firefox/40.1'}
2020-03-11 15:50:12 [scrapy.extensions.telnet] INFO: Telnet Password: 1431e55338024cf2
2020-03-11 15:50:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2020-03-11 15:50:12 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddl