# Import required Python modules

In [232]:
# os is used to read optional settings (such as the chromedriver location) from environment variables
import os

# time can be used to slow down the scraper when necessary
import time

# Selenium is the main scraping software used here
# It is used to automate web browsers (i.e. open URLs, click, scroll, etc without human interaction)
from selenium import webdriver

# beautifulsoup converts the scraped HTML into a structure you can easily retrieve information from
from bs4 import BeautifulSoup

# 'pandas' will be used to manipulate the data and construct the output CSV file
import pandas as pd

# Scrape HTML from single product page

In [357]:
# set desired URL for scraping
url = 'https://www.parknshop.com/en/honey-crunch-corn-flakes/p/BP_151582'

# location of the chromedriver executable: taken from the CHROMEDRIVER_PATH
# environment variable when it is set, otherwise the original local path is used.
# This lets the notebook run on another machine without editing the code.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', "/Users/Tazman/Desktop/chromedriver 2")

# open browser using scraping software
driver = webdriver.Chrome(chromedriver_path)

# go to the desired URL on the driver
driver.get(url)

# convert the webpage's HTML (as currently held by the browser) into a structured format
soup = BeautifulSoup(driver.page_source, 'lxml')

# Collect desired information from scraped and formatted HTML

## Product name

In [358]:
# find the first 'div' tag whose class is 'itemName'
# (the cell displays the whole tag, including the brand, name and size tags nested inside it)
soup.find('div', {"class":"itemName"})

<div class="itemName"><a href="/en/brandlist/KELLOGGS/b/KELLOGGS"><span class="brandName" itemprop="brand">KELLOGG'S</span></a> <h1 class="productName productNameForH1" itemprop="name">HONEY CRUNCH CORN FLAKES</h1> <span class="sizeUnitColor">400G</span></div>

In [359]:
# extract all text: .text joins the text of the 'itemName' div and of every tag nested
# inside it, so brand, product name and package size come back as one string
soup.find('div', {"class":"itemName"}).text

"KELLOGG'S HONEY CRUNCH CORN FLAKES 400G"

## Brand name

In [360]:
# step 1: locate the 'div' tag with class name 'itemName'
item_name_div = soup.find('div', {"class": "itemName"})

# step 2: inside that div, locate the 'span' tag with class name 'brandName' and show its text
brand_span = item_name_div.find('span', {"class": "brandName"})
brand_span.text

"KELLOGG'S"

## Package size

In [361]:
# find the 'div' tag that contains class name 'itemName',
# then find 'span' tag that contains class name 'sizeUnitColor'
# and show its text (the package size)

soup.find('div', {"class":"itemName"}).find('span', {"class":"sizeUnitColor"}).text

'400G'

## NIP photo

In [362]:
# find product images section: the 'div' tag with class 'newShowGalleryImagesContainer'
# (the image paths sit in the 'data-image' attributes of the 'li' tags inside it)
soup.find('div', {"class":'newShowGalleryImagesContainer'})

<div class="newShowGalleryImagesContainer">
<meta content="http://www.parknshop.com/medias/sys_master/front/prd/9059087187998.jpg" itemprop="image"/>
<div class="largePhoto">
<div class="photo">
<div class="photo-container" data-zoom-image="/medias/sys_master/front/zoom/9059100917790.jpg" id="large-photo" style="background-image:url(/medias/sys_master/front/prd/9059100622878.jpg);" title="KELLOGG'S HONEY CRUNCH CORN FLAKES"></div>
<div class="zoom-box" id="zoom-box"></div>
<div class="hidden-content"><img alt="HONEY CRUNCH CORN FLAKES" itemprop="image" src="/medias/sys_master/front/prd/9059100622878.jpg"/></div>
</div>
<ul class="mobile-thumb-point">
<li class="selected" data-image="/medias/sys_master/front/prdthumb/9059100688414.jpg" data-index="0" data-zoom-image="/medias/sys_master/front/zoom/9059100917790.jpg"></li>
<li class="" data-image="/medias/sys_master/back/prdthumb/9059079454750.jpg" data-index="0" data-zoom-image="/medias/sys_master/front/zoom/9059100917790.jpg"></li>
<li 

In [363]:
# then find all 'li' tags inside the images section, as these contain the image URLs
# (find_all returns a list with one entry per matching tag)
soup.find('div', {"class":'newShowGalleryImagesContainer'}).find_all('li')

[<li class="selected" data-image="/medias/sys_master/front/prdthumb/9059100688414.jpg" data-index="0" data-zoom-image="/medias/sys_master/front/zoom/9059100917790.jpg"></li>,
 <li class="" data-image="/medias/sys_master/back/prdthumb/9059079454750.jpg" data-index="0" data-zoom-image="/medias/sys_master/front/zoom/9059100917790.jpg"></li>,
 <li class="" data-image="/medias/sys_master/nutrition/prdthumb/9059078537246.jpg" data-index="0" data-zoom-image="/medias/sys_master/front/zoom/9059100917790.jpg"></li>]

In [364]:
# take the first 'li' tag of the images section ...
first_image_tag = soup.find('div', {"class": 'newShowGalleryImagesContainer'}).find_all('li')[0]

# ... and read the value of its 'data-image' attribute using 'get'
first_image_tag.get('data-image')

'/medias/sys_master/front/prdthumb/9059100688414.jpg'

In [365]:
# Side lesson - learn about list comprehensions
list_a = [1, 2, 3, 4]

# a list comprehension builds a new list by applying an operation (here: squaring)
# to each individual element of an existing list

[number ** 2 for number in list_a]

[1, 4, 9, 16]

In [366]:
# collect every 'li' tag of the images section ...
image_url_section = (
    soup
    .find('div', {"class": 'newShowGalleryImagesContainer'})
    .find_all('li')
)

# ... then use list comprehension syntax to pull the 'data-image' value out of each one
image_urls = [li_tag.get('data-image') for li_tag in image_url_section]
image_urls

['/medias/sys_master/front/prdthumb/9059100688414.jpg',
 '/medias/sys_master/back/prdthumb/9059079454750.jpg',
 '/medias/sys_master/nutrition/prdthumb/9059078537246.jpg']

In [367]:
# use a for loop to look for the word 'nutrition' in each image URL

# start from the assumption that there is no NIP image: empty string
NIP_url = ''

for image_url in image_urls:
    # not the nutrition image - move on to the next URL
    if 'nutrition' not in image_url:
        continue

    # found it: store the full address by putting the website URL in front of the path
    NIP_url = 'https://www.parknshop.com' + image_url

    # no need to look at the remaining URLs, so leave the loop here
    break
NIP_url

'https://www.parknshop.com/medias/sys_master/nutrition/prdthumb/9059078537246.jpg'

## Ingredients list

In [368]:
# find product info section: the 'div' tag whose class attribute is
# 'tabpage product-info selected' (it holds the 'h3' headings and 'p' paragraphs used below)
soup.find('div', {"class":"tabpage product-info selected"})

<div class="tabpage product-info selected" id="product-info">
<div class="tab-button"><span>Product Information</span><div class="icon-arrow-up"></div></div>
<div class="tab-content" itemprop="description">
<h3>Product Details:</h3>
<p>HONEY CRUNCH CORN FLAKES</p>
<br/>
<!-- <h3>Ingredient:</h3>
			<table>
				<tr class="header">
					<td>Typical values</td>
					<td>Per 100g</td>
					<td>Approx. 6 bottle (1000ml)</td>
					<td>Column for icons</td>
				</tr>
				<tr><td>Energy</td><td>2185kJ</td><td>365kJ</td><td><span class="icon-circle-tick"></span></td></tr>
				<tr><td>Energy</td><td>522kcal</td><td>87kcal</td><td><span class="icon-circle-tick"></span></td></tr>
				<tr><td>Fat</td><td>27.0g</td><td>4.5g</td><td></td></tr>
				<tr><td>Of which</td><td></td><td></td><td></td></tr>
				<tr><td>- Saturates</td><td>15.8g</td><td>2.6g</td><td><span class="icon-circle-tick"></span></td></tr>
				<tr><td>Carbohydrate</td><td>63.5g</td><td>10.6g</td><td><span class="icon-circle-tick"></s

In [369]:
# find all 'h3' tags in product info section
# (each 'h3' is the heading of one block of product information, e.g. 'Ingredients:')
soup.find('div', {"class":"tabpage product-info selected"}).find_all('h3')

[<h3>Product Details:</h3>,
 <h3>Origin:</h3>,
 <h3>Product Usage:</h3>,
 <h3>Storage Condition:</h3>,
 <h3>Ingredients:</h3>]

In [370]:
# grab the 'h3' heading tags of the product info section
product_info_section = soup.find('div', {"class": "tabpage product-info selected"})
h3_headings = product_info_section.find_all('h3')

# list comprehension again: take the text of every heading, lower-cased so that
# later comparisons do not depend on capitalisation
h3_headings_text = [heading.text.lower() for heading in h3_headings]
h3_headings_text

['product details:',
 'origin:',
 'product usage:',
 'storage condition:',
 'ingredients:']

In [371]:
# side lesson: accessing elements using indexes
# (positions are counted from 0, so index 3 is the fourth element)
list_b = list('abcde')
list_b[3]

'd'

In [372]:
# determine the index, among the 'p' tags of the product info section, of the
# paragraph that holds the ingredients.
#
# The position of the 'Ingredients:' heading among the 'h3' tags cannot be used
# for this: on this page the section has 5 'h3' headings but 6 'p' tags, so the
# two lists do not line up. Instead, take the first 'p' tag that follows the
# heading and look up where that paragraph sits in the list of 'p' tags.

# None means "no ingredients paragraph found"
ingredients_index = None

product_info_paragraphs = soup.find('div', {"class": "tabpage product-info selected"}).find_all('p')

for heading in h3_headings:
    if 'ingredients' in heading.text.lower():
        # first 'p' tag that comes after the heading in the page
        ingredients_paragraph = heading.find_next('p')

        # compare with 'is' so that the exact same tag is matched, not merely a
        # paragraph that happens to have the same content
        for index, paragraph in enumerate(product_info_paragraphs):
            if paragraph is ingredients_paragraph:
                ingredients_index = index
                break

        # only the first heading that mentions 'ingredients' is used
        break

In [373]:
# display the index that was determined in the previous cell
ingredients_index

5

In [374]:
# list every 'p' tag in the product info section;
# the full ingredients list is the text of one of these paragraphs
soup.find('div', {"class":"tabpage product-info selected"}).find_all('p')

[<p>HONEY CRUNCH CORN FLAKES</p>,
 <p>United States</p>,
 <p>Contains gluten and soy bean. This product is manufactured in the facility also produces traces of peanuts and almond (tree nuts).</p>,
 <p>Refer to Chinese version</p>,
 <p>請置於陰涼乾燥處◦開封後,請將產品置入密封罐中或用密封夾夾住,以防產品受潮變質◦ (建議開袋後盡快食用完畢)</p>,
 <p>Corn, Sugar, Peanut, Honey, Malt Extract, Salt, Vitamin C (Ascorbic Acid), Antioxidant (⍺-Tocopherol), Nicotinamide, Zinc Oxide, Reduced Iron, Vitamin A Palmitate, Vitamin B6 (Pyridoxine Hydrochloride), Vitamin B1 (Thiamine Hydrochloride),  Vitamin B2 (Riboflavin), Vitamin B12 (Cyanocobalamin), Folic Acid.</p>]

In [375]:
# pick the 'p' tag at position ingredients_index and show its text
soup.find('div', {"class":"tabpage product-info selected"}).find_all('p')[ingredients_index].text

'Corn, Sugar, Peanut, Honey, Malt Extract, Salt, Vitamin C (Ascorbic Acid), Antioxidant (⍺-Tocopherol), Nicotinamide, Zinc Oxide, Reduced Iron, Vitamin A Palmitate, Vitamin B6 (Pyridoxine Hydrochloride), Vitamin B1 (Thiamine Hydrochloride),  Vitamin B2 (Riboflavin), Vitamin B12 (Cyanocobalamin), Folic Acid.'

# Scrape bakery category

In [287]:
# set desired category URL for scraping
url = 'https://www.parknshop.com/en/breakfast-bakery/bakery/c/010100'

# Reuse the browser that is already open (from the single product section) rather
# than launching a second Chrome window, which would leave the first one running
# with nothing referring to it. A new browser is only started when no 'driver'
# exists yet, e.g. when this section is run on its own.
try:
    driver
except NameError:
    driver = webdriver.Chrome("/Users/Tazman/Desktop/chromedriver 2")

# go to the desired URL on the driver
driver.get(url)

In [277]:
# notice some products aren't visible until you click 'Show More'.
# we need to keep clicking this button until we see everything

# upper limit on the number of clicks, so the loop always ends even if the
# button never disappears
max_show_more_clicks = 100

for _ in range(max_show_more_clicks):
    # pause before each attempt to slow the scraper down between clicks
    time.sleep(3)
    try:
        # find_element('xpath', ...) is used instead of find_element_by_xpath(...),
        # because the find_element_by_* helpers were removed in newer Selenium 4 releases
        driver.find_element("xpath", "//*[@class='button highlight']").click()
    except Exception:
        # the button is gone (or can no longer be clicked): stop clicking.
        # 'Exception' rather than a bare 'except' so that interrupting the kernel
        # is not mistaken for "no more button"
        break

In [288]:
# convert the webpage's HTML into a structured format
# (page_source is read at this moment, so run this cell after the 'Show More'
# clicking is finished for the extra products to be part of 'soup')
soup=BeautifulSoup(driver.page_source, 'lxml')

In [289]:
# isolate all product URLs for the category
# every product tile has a 'div' with class 'name'; the link inside it carries the
# product's path in its 'data-link' attribute
all_names_section = soup.find_all('div', {"class":"name"})

product_urls = []
for name_div in all_names_section:
    link_tag = name_div.find('a')

    # skip a tile that has no link or no 'data-link' value instead of letting one
    # odd tile stop the collection of all the other product URLs
    data_link = link_tag.get('data-link') if link_tag is not None else None
    if not data_link:
        continue

    # 'data-link' is a path, so put the website URL in front of it
    product_urls.append('https://www.parknshop.com' + data_link)

In [381]:
# side lesson - how to store data in a pandas dataframe, by progressively adding rows

# each row is a dictionary: column name -> value
row_1 = dict(Animal='Cat', n=2)
row_2 = dict(Animal='Dog', n=3)

# collect the rows one at a time in a list ...
example = []
for record in (row_1, row_2):
    example.append(record)

# ... and turn the finished list into a dataframe in a single step
pd.DataFrame(example)

Unnamed: 0,Animal,n
0,Cat,2
1,Dog,3


In [339]:
# let's store all product data in the list 'data'
#
# Every product page becomes one dictionary ('row') with the keys
# 'URL', 'Product name', 'Brand Name', 'Package size', 'NIP url' and 'Ingredients'.
# A field that cannot be found on the page is stored as an empty string.
# Missing tags are handled by explicit 'is None' checks instead of catching every
# possible error, so genuine problems are not silently hidden.

def _text_or_blank(tag):
    """Return the text of a BeautifulSoup tag, or '' when the tag is None (nothing was found)."""
    return '' if tag is None else tag.text


data = []
for product_url in product_urls:
    driver.get(product_url)
    # pause before reading the HTML, to slow the scraper down between pages
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    row = {}

    row['URL'] = product_url

    # product name, brand and package size all sit inside the 'itemName' div
    item_name = soup.find('div', {"class": "itemName"})
    row['Product name'] = _text_or_blank(item_name)
    if item_name is None:
        row['Brand Name'] = ''
        row['Package size'] = ''
    else:
        row['Brand Name'] = _text_or_blank(item_name.find('span', {"class": "brandName"}))
        row['Package size'] = _text_or_blank(item_name.find('span', {"class": "sizeUnitColor"}))

    # NIP photo: the first gallery image whose path contains the word 'nutrition'
    NIP_url = ''
    gallery = soup.find('div', {"class": 'newShowGalleryImagesContainer'})
    if gallery is not None:
        # an 'li' tag without a 'data-image' attribute counts as an empty path, so it
        # is simply passed over instead of discarding the NIP url for the whole product
        image_urls = [li_tag.get('data-image') or '' for li_tag in gallery.find_all('li')]
        for image_url in image_urls:
            if 'nutrition' in image_url:
                NIP_url = 'https://www.parknshop.com' + image_url
                break
    row['NIP url'] = NIP_url

    # Ingredients: the first 'p' tag that follows the heading mentioning 'ingredients'.
    # The heading's position among the 'h3' tags is not used as an index into the
    # 'p' tags, because a page can have a different number of headings and
    # paragraphs (the example product page has 5 headings and 6 paragraphs).
    ingredients = ''
    product_info = soup.find('div', {"class": "tabpage product-info selected"})
    if product_info is not None:
        for heading in product_info.find_all('h3'):
            if 'ingredients' in heading.text.lower():
                ingredients = _text_or_blank(heading.find_next('p'))
                # only the first matching heading is used
                break
    row['Ingredients'] = ingredients

    data.append(row)
    
    

In [340]:
# turn the list of row dictionaries into a dataframe in one step:
# one row per product, one column per dictionary key
df = pd.DataFrame(data)

In [382]:
# preview the first rows only, rather than displaying the whole dataframe
df.head()

Unnamed: 0,Brand Name,Ingredients,NIP url,Package size,Product name,URL
0,HANG HEUNG,"Lotus seed paste[lotus seed, sugar, peanut oil...",,4PCS,HANG HEUNG WHITE LOTUS SEED 2 YOLKS MOONCAKES ...,https://www.parknshop.com/en/white-lotus-seed-...
1,HANG HEUNG,"Almond, Glutinous rice flour, Walnut, Sugar, W...",,4PCS,HANG HEUNG MOONCAKE WITH ASSORTED NUTS 4PCS,https://www.parknshop.com/en/mooncake-with-ass...
2,MESTEMACHER,"Organic whole kernel rye, water, organic whole...",https://www.parknshop.com/medias/sys_master/nu...,500G,MESTEMACHER ORGANIC THREE GRAIN BREAD 500G,https://www.parknshop.com/en/organic-three-gra...
3,IMP. BANQUET,"Wheat Flour, Sugar, Margarine, Shortening, Egg...",https://www.parknshop.com/medias/sys_master/nu...,150G,IMP. BANQUET EGG ROLL 150G,https://www.parknshop.com/en/egg-roll/p/BP_423500
4,IMP. BANQUET,"Wheat Flour, Sugar, Shortening (Refined Palm O...",https://www.parknshop.com/medias/sys_master/nu...,140G,IMP. BANQUET PHOENIX ROLL 140G,https://www.parknshop.com/en/phoenix-roll/p/BP...


In [356]:
# write the scraped data to 'bakery.csv' in the current working directory
# (index=False is not passed, so the row index is written as the first, unnamed column)
df.to_csv('bakery.csv')