## This notebook illustrates the use of BeautifulSoup in scraping information from www.drugwatch.com

In [1]:
## Import packages
from bs4 import BeautifulSoup
import json
import requests
import pandas as pd
import re

In [2]:
## Let's assume we want to grab all articles related to keyword "opioid". We first grab all the text with html.parser
keyword = 'opioid'
search_url = 'https://www.drugwatch.com/search/?query={}'.format(keyword)
search_page = requests.get(search_url)
search_soup = BeautifulSoup(search_page.content, 'html.parser')
search_soup

<!DOCTYPE html>

<!--[if lt IE 8]>      <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9 ie9 ie8" lang="en"> <![endif]-->
<!--[if IE 9]>         <html class="no-js ie9" lang="en"> <![endif]-->
<!--[if gt IE 9]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="yes" name="apple-mobile-web-app-capable">
<meta content="SKYPE_TOOLBAR_PARSER_COMPATIBLE" name="SKYPE_TOOLBAR">
<link href="//www.googletagmanager.com" rel="preconnect"/>
<link href="//www.google-analytics.com" rel="preconnect"/>
<link href="//api.heirial.com" rel="dns-prefetch"/>
<link href="//client.heirial.com" rel="dns-prefetch"/>
<link href="//connect.facebook.net" rel="dns-prefetch"/>
<link href="//fast.wistia.com" rel="dns-pr

In [3]:
## After looking at the html, we identify the tag that contains the url for each article listed. We grab all such instances for the urls of all articles on the page.
urls = [search.get_text() for search in search_soup.find_all('small', class_="c-search__result-url")]
urls

['https://www.drugwatch.com/opioids/',
 'https://www.drugwatch.com/opioids/lawsuits/',
 'https://www.drugwatch.com/featured/opioid-crisis-big-pharma/',
 'https://www.drugwatch.com/news/2016/06/09/opioid-use-and-hip-replacement-revision-risk/',
 'https://www.drugwatch.com/news/2017/08/23/big-pharma-paid-doctors-millions-opioid-campaign-study-says/',
 'https://www.drugwatch.com/news/2018/01/17/opioid-overdose-deaths-continue-to-rise-in-us/',
 'https://www.drugwatch.com/news/2020/03/24/fda-drugs-gabapentin-and-pregabalin-linked-to-serious-breathing-problems/',
 'https://www.drugwatch.com/news/2015/08/10/worldwide-prescription-drug-abuse/',
 'https://www.drugwatch.com/news/2017/07/06/powerful-painkiller-opana-yanked-from-market/']

In [4]:
## Let's also identify the link for the "next" button which gets us to the next page of results
search_soup.find('a', class_="c-button c-button--link")

<a class="c-button c-button--link" href="/search/?query=opioid&amp;page_num=2">
                    Next
                    <i class="o-icon o-icon--arrow-long-right"></i>
</a>

In [5]:
## Now we are ready to construct a scraper. We will loop through all pages of the search results, and for each page, 
## setup another loop to collect the text for each article. Finally, we will tabulate the info on a pandas dataframe.

def get_search_info(keyword, pagecount=1, max_pages=10, postcount=1, df=pd.DataFrame()):
    """Iterates through every article on every page of the search results and collects relevant information"""
    ## Define search_url, grab url links for all articles
    search_url = 'https://www.drugwatch.com/search/?query={kw}&page_num={pg}'.format(kw=keyword, pg=pagecount)
    search_page = requests.get(search_url)
    search_soup = BeautifulSoup(search_page.content, 'html.parser')
    urls = [search.get_text() for search in search_soup.find_all('small', class_="c-search__result-url")]
    
    ## loop through all article urls
    for post_url in urls:
        post_dict = {}
        ## get url
        post_dict['url'] = post_url
        ## make page bs4 inquiry
        post_page = requests.get(post_url)
        post_soup = BeautifulSoup(post_page.content, 'html.parser')
        ## get title
        try:
            if post_soup.find('h1', class_="hero__title") is not None:
                title = post_soup.find('h1', class_="hero__title").get_text()
            else:
                title = post_soup.find(attrs={"data-location" : "page-header"}).find('h1').get_text()
            post_dict['title'] = title
        except:
            post_dict['title'] = ''
        ## get publish time
        try:
            pubtime = post_soup.find('div', class_='post-meta__publish').find('span').find('time').get_text()
            post_dict['publish_time'] = pubtime
        except:
            post_dict['publish_time'] = ''
        ## get abstract
        try:
            abstract = post_soup.find('div', class_="hero__intro").get_text()
            post_dict['abstract'] = abstract
        except:
            post_dict['abstract'] = ''        
        ## get text
        try:
            regex = re.compile('.*wysiwyg*')
            all_div = post_soup.find_all("div", {"class" : regex})
            text_list = [div.get_text() for div in all_div]
            text = ''.join(text_list)
            post_dict['text'] = text
        except:
            continue
            print('continued')
            
        ## get ID
        post_dict['id'] = postcount    
        postcount += 1
        
        ## Write to df
        row = pd.Series()
        for k,v in post_dict.items():
            row[k] = v
        df = df.append(row, ignore_index=True)
        
    print("{} articles collected thus far".format(len(df)))

    ## if exists, flip to the next page
    if pagecount < max_pages:
        if search_soup.find('a', class_="c-button c-button--link") is not None:
            pagecount += 1
            print("flipping to page {}".format(pagecount))
            df = get_search_info(keyword, pagecount = pagecount, max_pages=max_pages, postcount=postcount, df = df)
    
    return df

In [6]:
## Let's run our function!
df = get_search_info(keyword)
df.head()

  row = pd.Series()


10 articles collected thus far
flipping to page 2
20 articles collected thus far
flipping to page 3
30 articles collected thus far
flipping to page 4
40 articles collected thus far
flipping to page 5
50 articles collected thus far
flipping to page 6
60 articles collected thus far
flipping to page 7
70 articles collected thus far
flipping to page 8
80 articles collected thus far
flipping to page 9
90 articles collected thus far
flipping to page 10
100 articles collected thus far


Unnamed: 0,abstract,id,publish_time,text,title,url
0,,1.0,,"\nOpioids are powerful painkillers, and health...",,https://www.drugwatch.com/opioids/
1,,2.0,,\nThe opioid crisis began in the late 1990s. I...,,https://www.drugwatch.com/opioids/lawsuits/
2,,3.0,,,,https://www.drugwatch.com/featured/opioid-cris...
3,,4.0,,\nA new medical study showed that patients who...,Opioid Use After Hip Replacement Linked to Rev...,https://www.drugwatch.com/news/2016/06/09/opio...
4,,5.0,,\nPharmaceutical companies promoting prescript...,Big Pharma Paid Doctors Millions in Opioid Cam...,https://www.drugwatch.com/news/2017/08/23/big-...


In [7]:
## We can also convert to JSON format if needed
result = df.loc[0].to_json()
parsed = json.loads(result)
parsed

{'abstract': '',
 'id': 1.0,
 'publish_time': '',
 'text': '\nOpioids are powerful painkillers, and health care providers typically prescribe them to patients to treat moderate-to-severe pain. Some uses include controlling pain from surgery, pain from an injury such as a broken bone or pain from cancer. But there has been a recent increase in opioid use for chronic, non-cancer pain including headaches, back pain and arthritis, despite serious risks and lack of evidence for long-term use.\n\nWhile opioids can help people control severe, acute pain when taken for short durations, they are highly addictive and can be misused. Because of this, the U.S. Food and Drug Administration and Centers for Disease Control and Prevention discourage opioids for long-term use.\nIn 2018, health care providers prescribed about 51 opioid prescriptions for every 100 people in the United States, according to the most recent data from the Center for Disease Control and Prevention. That breaks down to about 1