## Web Scrape All Pharmacies and Add Data to Data Frame


In [1]:
import pandas as pd
import shutil
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pdb
from urllib.request import urlopen, Request
import requests

In [2]:
#load csv file of non-accredited pharmacy urls
df_unaccredited = pd.read_csv('fake-pharmacies.csv', header=None)

#convert rows to strings, remove spaces, rename column
df_unaccredited['url'] = df_unaccredited[0].str.strip()
#drop old column
df_unaccredited = df_unaccredited.drop([0], axis=1)

# strip https:// bc they only occur for some of the urls
df_unaccredited['url'] = df_unaccredited['url'].str.strip('https://')
#make sure entire strings are lowercase
df_unaccredited['url'] = df_unaccredited['url'].str.lower()

# strip 'www.' bc they only occur for some urls
#df['url'] = df['url'].str.strip('www.')

# add the https:// part back in for all of them
df_unaccredited['url'] = 'https://' + df_unaccredited['url'].astype(str)
df_unaccredited.head(20)


Unnamed: 0,url
0,https://inhousepharmacy.vu
1,https://www.wellerectile.com
2,https://kubapharm.com
3,https://remiumlightsupplier.com
4,https://ivermectin24h.com
5,https://rxshopmd.com
6,https://extrapharmacy.ru
7,https://rxmedkart.com
8,https://247rxpill.in
9,https://armacygeoff.md


In [3]:
#get a list of all the raw links
raw_links_unaccredited = df_unaccredited['url'].tolist()
#raw_links_unaccredited

In [4]:
#remove clearly problematic strings: 'https://euphoria healthcare pvt ltd.'
#'https://icenetworks ltd./www.mega-pillspharmacy.com' etc 
# initializing K
A = 'https://www.medzbrand.com'
B = 'https://www.onlinepharmacydrug.com'
C = 'https://multiproductphshop.com'
D = 'https://mdedrx.com'
E = 'https://md-q.com'
F = 'https://mallofmedicine.com'   
G = 'https://euphoria healthcare pvt ltd.'
H = 'https://icenetworks ltd./www.mega-pillspharmacy.com'

# using list comprehension to Remove K String from String List

raw_links_unaccredited = [i for i in raw_links_unaccredited if i != A]
raw_links_unaccredited = [i for i in raw_links_unaccredited if i != B]
raw_links_unaccredited = [i for i in raw_links_unaccredited if i != C]
raw_links_unaccredited = [i for i in raw_links_unaccredited if i != D]
raw_links_unaccredited = [i for i in raw_links_unaccredited if i != F]
raw_links_unaccredited = [i for i in raw_links_unaccredited if i != G]
raw_links_unaccredited = [i for i in raw_links_unaccredited if i != H]

In [5]:
#raw_links_unaccredited

### List of URLs

In [6]:
#build a list of urls that can be accessed without error and scrapped 
#build a url list of the bad urls 
def sort_urls(raw_links_list):
    url_list = []
    bad_urls = []
    for each in raw_links_list:
        try:
            headers = {'user-agent': 'ds6050 (vkb6bn@virginia.edu)'}
            results = requests.get(each, headers=headers, timeout=2.0)
        except requests.exceptions.RequestException as e:
            #print('Bad URL: ' + each)
            bad_urls.append(each)
            continue
        #print(results)
        #print(each) 
        if results.status_code == 200:
            url_list.append(each)
    return url_list, bad_urls

url_list_unaccredited, bad_urls_unaccredited = sort_urls(raw_links_unaccredited)

In [7]:
len(bad_urls_unaccredited)

469

In [8]:
len(url_list_unaccredited)

99

In [9]:
#url_list_unaccredited

### Loop through url_list to scrape text & images

In [10]:
#find and extract image urls (4 tag possibilies)
def get_image_urls(images, url):
    image_url_list = []
    if len(images) != 0: #make sure there are images
        for i, image in enumerate(images):
            try:
                #search for 'srcset'
                image_url = image['scrset']
                if image_url.startswith('https'):
                    image_url_list.append(image_url)
                else:
                    image_url_list.append(url+image_url)
            except:
                try:
                    #search for 'src'
                    image_url = image['src']
                    if image_url.startswith('https'):
                        image_url_list.append(image_url)
                    else:
                        image_url_list.append(url+image_url)
                except:
                    pass
    return image_url_list       

In [11]:
#scrape web data and add to dataframe
def web_data_to_df(url_list, is_accredited):
    df = pd.DataFrame()
    for url in url_list:
        #print('\n' + url +'\n')
        #html = urlopen(url)
        headers = {'user-agent': 'Mozilla/5.0'}
        try:
            response = requests.get(url, headers, allow_redirects=False, timeout=5) #request html from url
            #print(response)
        except requests.exceptions.RequestException as e:
            print('\n' + url +'\n')
            print(e)
            continue
        html = response.content #raw, unformatted html from site 
        #print(html)
        soup = BeautifulSoup(html,'html.parser')

        #remove style and script tags
        for data in soup(['style', 'script']):
            #remove tags, so all is left is raw html
            data.decompose()
            
        #scrape body, header, footer, image data
        body = soup.body
        header = soup.header
        footer = soup.footer
        images = soup.findAll('img')
        image_urls = get_image_urls(images, url) #use function defined above

        if body != None: 
            body= ' '.join(body.stripped_strings)
            #print(body)
        if header != None:
            header = ' '.join(header.stripped_strings)
            #print(header)
        if footer != None:
            footer = ' '.join(footer.stripped_strings)
            #print(footer)
            
        #turn scraped pharmacy website data into dataframe, for each url 
        d = {'Website': url, 'Body': body, 'Header': header, 'Footer': footer, 
         'Image Urls': image_urls, 'Accredited': 0 if is_accredited else 1}
        df = df.append(d, ignore_index=True)
    
    return df        

In [12]:
df_unaccredited = web_data_to_df(url_list_unaccredited, is_accredited=False)
df_unaccredited

  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignor


https://md-q.com

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignore_index=True)
  df = df.append(d, ignor

Unnamed: 0,Website,Body,Header,Footer,Image Urls,Accredited
0,https://inhousepharmacy.vu,Object moved to here .,,,[],1
1,https://www.wellerectile.com,Home Shop Men's Health Anti Viral Smart pills ...,Home Shop Men's Health Anti Viral Smart pills ...,“We Own healing wands. we are healers. We lend...,[https://static.wixstatic.com/media/3e04e6_80c...,1
2,https://kubapharm.com,Skip to navigation Skip to content Welcome to ...,Home About FAQ REVIEW REFUND POLICY Shop Produ...,Featured Products Buy Hydrocodone Online Now $...,[https://kubapharm.com/wp-content/uploads/2020...,1
3,https://ivermectin24h.com,Buy Ivermectin Online Ivermectin is applied in...,{{selectedSymbol}} {{selectedCur}} {{cur}} {{s...,{{'Please Call to'|translate}} US +1 (855) 42...,"[https://ivermectin24h.comimages/en_us.png, ht...",1
4,https://rxshopmd.com,Moved Permanently The document has moved here .,,,[],1
...,...,...,...,...,...,...
93,https://buckadaypharmacy.com,,,,[],1
94,https://canamericaglobal.com,,,,[],1
95,https://worldwidedrugplan.com,,,,[],1
96,https://medcentercanada.com,,,,[],1


In [14]:
df_unaccredited.to_csv('pharmacy_dataset_unaccredited.csv', index=False)