In [1]:
# Imports
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
import re
from splinter import Browser
import time

In [2]:
# Create base url
base_url = 'https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122'

In [3]:
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser=Browser("chrome", **executable_path, headless=False)

def get_html(url): 
    browser.visit(url)
    SCROLL_PAUSE_TIME=0.5
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    time.sleep(3)
    html=browser.html
    return html

In [9]:
def scrape_one_page(html):
    all_items=[]
    base_path='http://www.walgreens.com'
    soup=BeautifulSoup(html, 'html.parser')
    results=soup.find_all('div', id=re.compile('productcardprod'))
    for each_result in results: 
        item_dict={}
        #finding name, link, and upc
        name_and_link=each_result.find('a')
        item_dict['name']=name_and_link['title'].strip()
        item_dict['link']=base_path+name_and_link['href']
        new_path=item_dict['link']
        response=requests.get(new_path)
        item_soup=BeautifulSoup(response.text, 'html.parser')
        try: 
            item_dict['upc']=item_soup.find('p', class_='universal-Item-code').text.replace('UPC:', '')
        except: 
            pass

        # finding price
        try: 
            first_span=each_result.find('span', class_='product__price')
            second_span=first_span.find('span', class_='product__price')
            price=float(second_span.text.replace('$', ''))/100
            item_dict['price']=price
        except: 
            pass

        # finding rating and reviews
        try: 
            rating_and_reviews=each_result.find('div', class_='wag-prod-ratings').find('img')['alt']
            rating, reviews=rating_and_reviews.split(', ')
            item_dict['rating']=rating.replace(' out of 5', '')
            item_dict['reviews']=reviews.replace('total ', '').replace('reviews', '')
    #     print(each_result.text)
        except: 
            pass
        # print(item_dict)
        all_items.append(item_dict)
    return all_items

In [10]:
# Iterate through each page
max_number_items=194
all_pages=[]
page_numbers=[str(i*24) for i in range(0, int(max_number_items/24))]
for each_page in page_numbers[:8]:
    url=base_url+'&No='+each_page
    # print(url)
    html=get_html(url)
    one_page = scrape_one_page(html)
    for item in one_page:
        all_pages.append(item)
    print('page: \n' + url + '\nhas ' + str(len(one_page)) + ' items')
    time.sleep(2)

page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=0
has 24 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=24
has 24 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=48
has 24 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=72
has 24 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=96
has 24 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=120
has 22 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=144
has 12 items
page: 
https://www.walgreens.com/store/store/category/productlist.jsp?Erp=72&N=362122&Eon=362122&No=168
has 16 items


In [11]:
df=pd.DataFrame(all_pages)
df.to_csv('data_walgreens.csv')
df


Unnamed: 0,name,link,upc,price,rating,reviews
0,Tide Liquid Laundry Detergent Plus Downy April...,http://www.walgreens.com/store/c/tide-liquid-l...,03700073975,4.99,4.2,280
1,Glade Carpet & Room Odor Eliminator Powder (32...,http://www.walgreens.com/store/c/glade-carpet-...,04650015474,3.29,4.8,39
2,Scrubbing Bubbles Foaming Disinfectant Citrus ...,http://www.walgreens.com/store/c/scrubbing-bub...,02570070755,3.99,4.6,188
3,Tide Ultra Stain Release High Efficiency Liqui...,http://www.walgreens.com/store/c/tide-ultra-st...,03700087589,4.99,4.3,246
4,Palmolive Oxy Dish Soap (20 oz ),http://www.walgreens.com/store/c/palmolive-oxy...,03500045041,,5.0,2
...,...,...,...,...,...,...
165,Contour Products Bio Pillow (1 ea ),http://www.walgreens.com/store/c/contour-produ...,73770900300,9.99,,
166,Libman Tile and Grout Brush (1 ea ),http://www.walgreens.com/store/c/libman-tile-a...,07173600018,,,
167,Goo Gone Sticker Lifter (2 fl oz ),http://www.walgreens.com/store/c/goo-gone-stic...,07004877191,,,
168,Personal Care Toilet Bowl Tabs Blue (3 ea ),http://www.walgreens.com/store/c/personal-care...,04815592544,,,
