# **Web scraping Shopify - easily download all products**
**Tutorial link**: https://youtu.be/jPjxWC7zV2s<br>
**Site being scraped**: https://helmboots.com/<br>
> **JSON endpoints**: 
- https://helmboots.com/products.json
- https://helmboots.com/products.json?limit=250&page=1

In [None]:
import pandas as pd
import numpy as np
import requests
import json

from urllib.request import urlopen

In [None]:
# Python web scraping:JSON - No.1
# Fetch one page of the store's products.json feed with `requests` and
# flatten it into one dict per variant, collected in `product_list`.

url = 'https://helmboots.com/products.json?limit=100&page=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of trying to parse an error body as JSON.
r.raise_for_status()

data = r.json()

product_list = []

for item in data['products']:
  title = item['title']
  handle = item['handle']
  product_type = item['product_type']

  # Reset for every product: without this, a product with an empty image list
  # would raise NameError (first product) or silently reuse the previous
  # product's image URL.
  image = 'None'
  # The loop keeps the src of the last image entry, as before.
  for image_entry in item['images']:
    try:
      image = image_entry['src']
    except (KeyError, TypeError):
      # Entry has no usable 'src' field; fall back to the placeholder string.
      image = 'None'

  # One output row per variant; product-level fields are repeated on each row.
  for variant in item['variants']:
    position = variant['position']
    product_id = variant['product_id']
    sku = variant['sku']
    price = variant['price']
    grams = variant['grams']
    available = variant['available']
    created_at = variant['created_at']
    updated_at = variant['updated_at']

    product = {
        'sku':sku,
        'product_id':product_id,
        'image':image,
        'title':title,
        # NOTE(review): 'hanle' looks like a typo for 'handle'; kept as-is so
        # the resulting column name does not change.
        'hanle':handle,
        'product_type':product_type,
        'position':position,
        'price':price,
        'grams':grams,
        'available':available,
        'created_at':created_at,
        'updated_at':updated_at
    }

    product_list.append(product)

In [None]:
# Build a DataFrame from the collected variant dicts
helmboots_shopify = pd.DataFrame(data=product_list)

# Show each distinct title once; this only displays a Series,
# the DataFrame itself is left unchanged
helmboots_shopify.loc[:, 'title'].drop_duplicates()

0                                The Railroad
27                           The Muller Brown
54                           The Marion Olive
81                           The Muller Black
108                           The Muller Teak
                        ...                  
1774                          The Pablo Black
1801                                Gift Card
1809    Otter Wax Saddle Soap Leather Cleaner
1810     Otter Wax Boot Wax - Leather Sealant
1811      Otter Wax Leather Salve Conditioner
Name: title, Length: 99, dtype: object

In [None]:
# Show the resulting table; each row is one dict from product_list
helmboots_shopify

Unnamed: 0,sku,product_id,image,title,hanle,product_type,position,price,grams,available,created_at,updated_at
0,H002022006,99414168,https://cdn.shopify.com/s/files/1/0048/2912/pr...,The Railroad,railroad-blucher-boot,Boots,1,350.00,1814,True,2018-02-19T12:13:37-06:00,2021-11-18T16:56:45-06:00
1,H002022007,99414168,https://cdn.shopify.com/s/files/1/0048/2912/pr...,The Railroad,railroad-blucher-boot,Boots,2,350.00,1814,False,2012-08-08T18:38:20-05:00,2021-05-13T07:11:55-05:00
2,H002022075,99414168,https://cdn.shopify.com/s/files/1/0048/2912/pr...,The Railroad,railroad-blucher-boot,Boots,3,350.00,1814,False,2012-08-08T18:41:38-05:00,2021-04-09T05:19:35-05:00
3,H002022008,99414168,https://cdn.shopify.com/s/files/1/0048/2912/pr...,The Railroad,railroad-blucher-boot,Boots,4,350.00,1814,False,2012-08-08T18:41:58-05:00,2021-05-02T22:16:25-05:00
4,H002022085,99414168,https://cdn.shopify.com/s/files/1/0048/2912/pr...,The Railroad,railroad-blucher-boot,Boots,5,350.00,1814,True,2012-08-08T18:42:39-05:00,2021-11-18T16:56:45-06:00
...,...,...,...,...,...,...,...,...,...,...,...,...
1807,,384430588,https://cdn.shopify.com/s/files/1/0048/2912/pr...,Gift Card,gift-card-1,Gift Card,7,400.00,0,True,2018-06-04T09:24:17-05:00,2021-12-22T12:17:01-06:00
1808,,384430588,https://cdn.shopify.com/s/files/1/0048/2912/pr...,Gift Card,gift-card-1,Gift Card,8,500.00,27,True,2014-11-19T15:26:46-06:00,2021-12-22T22:00:20-06:00
1809,10253,322741061,https://cdn.shopify.com/s/files/1/0048/2912/pr...,Otter Wax Saddle Soap Leather Cleaner,otter-wax-saddle-soap,Boot Care,1,13.00,57,True,2014-05-30T16:50:08-05:00,2021-12-31T22:18:46-06:00
1810,OTTRWX_006,2532788869,https://cdn.shopify.com/s/files/1/0048/2912/pr...,Otter Wax Boot Wax - Leather Sealant,otter-wax-boot-wax,Boot Care,1,13.00,57,True,2015-10-21T21:29:47-05:00,2022-01-07T12:37:21-06:00


In [None]:
# Python web scraping:JSON - No.2
# Same feed, fetched with the standard library and turned into a table by pandas.
# The context manager closes the HTTP response once the body has been read;
# the timeout keeps a stalled connection from blocking the cell indefinitely.
with urlopen("https://helmboots.com/products.json?limit=100&page=1", timeout=30) as response:
  # 'replace' substitutes undecodable bytes instead of raising UnicodeDecodeError.
  json_data = response.read().decode('utf-8', 'replace')
d = json.loads(json_data)
# One row per entry of the 'products' list.
df = pd.json_normalize(d['products'])

df.head(2)

Unnamed: 0,id,title,handle,body_html,published_at,created_at,updated_at,vendor,product_type,tags,variants,images,options
0,99414168,The Railroad,railroad-blucher-boot,"<meta charset=""utf-8"">\n<p><span style=""font-w...",2022-01-10T14:22:25-06:00,2012-08-08T18:38:20-05:00,2022-01-10T14:22:26-06:00,HELM Boots,Boots,[],"[{'id': 6829619085374, 'title': '6 / D', 'opti...","[{'id': 28562308530238, 'created_at': '2021-08...","[{'name': 'Size', 'position': 1, 'values': ['6..."
1,334557149,The Muller Brown,muller-brown-blucher-boot,"<meta charset=""utf-8"">\n<div></div>\n<div>\n<m...",2022-01-10T14:20:57-06:00,2014-06-24T12:43:35-05:00,2022-01-11T11:48:26-06:00,HELM,Boots,"[Boots, cf-color-Dark Browns, cf-size-06, cf-s...","[{'id': 6973665902654, 'title': '6 / D', 'opti...","[{'id': 28562521358398, 'created_at': '2021-08...","[{'name': 'Size', 'position': 1, 'values': ['6..."


In [None]:
# Python web scraping:JSON - No.3
df = pd.read_json('https://helmboots.com/products.json?limit=100&page=1')
df_helmboots = pd.DataFrame(df['products'].values.tolist())
df_helmboots.head(2)

Unnamed: 0,id,title,handle,body_html,published_at,created_at,updated_at,vendor,product_type,tags,variants,images,options
0,99414168,The Railroad,railroad-blucher-boot,"<meta charset=""utf-8"">\n<p><span style=""font-w...",2022-01-10T14:22:25-06:00,2012-08-08T18:38:20-05:00,2022-01-10T14:22:26-06:00,HELM Boots,Boots,[],"[{'id': 6829619085374, 'title': '6 / D', 'opti...","[{'id': 28562308530238, 'created_at': '2021-08...","[{'name': 'Size', 'position': 1, 'values': ['6..."
1,334557149,The Muller Brown,muller-brown-blucher-boot,"<meta charset=""utf-8"">\n<div></div>\n<div>\n<m...",2022-01-10T14:20:57-06:00,2014-06-24T12:43:35-05:00,2022-01-11T11:48:26-06:00,HELM,Boots,"[Boots, cf-color-Dark Browns, cf-size-06, cf-s...","[{'id': 6973665902654, 'title': '6 / D', 'opti...","[{'id': 28562521358398, 'created_at': '2021-08...","[{'name': 'Size', 'position': 1, 'values': ['6..."
