In [1]:
# Webscrape Verve website, the datasource, for all Verve products currently listed into a pandas dataframe.
# Using this url: https://www.vervecoffee.com/collections/all-coffee?view=products

In [2]:
# Import necessary python packages
import pandas as pd
import requests
import json
import re

In [3]:
# Send HTTP GET request to the Verve url
source_url = 'https://www.vervecoffee.com/collections/all-coffee?view=products'
source = requests.get(source_url)

In [4]:
# Convert extract into a string
source_text = source.content.decode('utf-8')

In [5]:
# Define starting and ending indices of desired substring
start_index = source_text.find('"title":"All Coffee","productVariants":[')
end_index = source_text.find(']', start_index)

In [6]:
# Extract substring between the start and end indices
substring = source_text[start_index + len('"title":"All Coffee","productVariants":['):end_index + 1]

In [7]:
# View substring output for validation
substring

'{"id":"40006386810963","image":{"src":"https:\\/\\/cdn.shopify.com\\/s\\/files\\/1\\/0035\\/9372\\/products\\/Bronson_1.png?v=1658421910"},"price":{"amount":16.0,"currencyCode":"USD"},"product":{"id":"6777135988819","title":"Bronson French Roast Craft Instant Coffee","untranslatedTitle":"Bronson French Roast Craft Instant Coffee","vendor":"Verve Coffee","type":"Coffee"},"sku":"4210621-0902100","title":"Default Title","untranslatedTitle":"Default Title"},{"id":"40006386876499","image":{"src":"https:\\/\\/cdn.shopify.com\\/s\\/files\\/1\\/0035\\/9372\\/products\\/BuenaVista_1.png?v=1658421973"},"price":{"amount":16.0,"currencyCode":"USD"},"product":{"id":"6777136054355","title":"Buena Vista Dark Roast Craft Instant Coffee","untranslatedTitle":"Buena Vista Dark Roast Craft Instant Coffee","vendor":"Verve Coffee","type":"Coffee"},"sku":"SM-0099","title":"Default Title","untranslatedTitle":"Default Title"},{"id":"40006387138643","image":{"src":"https:\\/\\/cdn.shopify.com\\/s\\/files\\/1\\

In [8]:
# Define starting and ending indices of the JSON data within substring
start_index = substring.index('{')
end_index = substring.rindex('}') + 1

# Extract JSON data from substring
json_data = substring[start_index:end_index]

In [9]:
# Parse JSON data
product_data = json.loads("[" + json_data + "]")

In [10]:
# Create empty list to store the Verve cofffe product information
products_list = []

In [11]:
# Iterate over each Verve cofffe product in product_data
for product in product_data:
    product_id = product['id']
    image_src = product['image']['src']
    price_amount = product['price']['amount']
    price_currency = product['price']['currencyCode']
    product_title = product['product']['title']
    vendor = product['product']['vendor']
    product_type = product['product']['type']
    sku = product['sku']

    # Define a dictionary with the product information for each listed Verve cofffe product
    product_info = {
        'Product_ID': product_id,
        'Image_Source': image_src,
        'Price_Amount': price_amount,
        'Price_Currency': price_currency,
        'Product_Title': product_title,
        'Vendor': vendor,
        'Product_Type': product_type,
        'SKU': sku
    }

    # Append the dictionary to products_list
    products_list.append(product_info)

In [12]:
# Create pandas dataFrame from products_list
df_products = pd.DataFrame(products_list)

In [13]:
# View df_products output
df_products

Unnamed: 0,Product_ID,Image_Source,Price_Amount,Price_Currency,Product_Title,Vendor,Product_Type,SKU
0,40006386810963,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,Bronson French Roast Craft Instant Coffee,Verve Coffee,Coffee,4210621-0902100
1,40006386876499,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,Buena Vista Dark Roast Craft Instant Coffee,Verve Coffee,Coffee,SM-0099
2,40006387138643,https://cdn.shopify.com/s/files/1/0035/9372/pr...,18.0,USD,Craft Instant Coffee Variety Pack,Verve Coffee,Coffee,4210821-0902100
3,40006409322579,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,Seabright House Blend Craft Instant Coffee,Verve Coffee,Coffee,SM-0336
4,40006409355347,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,Sermon Craft Instant Coffee,Verve Coffee,Coffee,4210421-0902100
5,40006410338387,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,Streetlevel Craft Instant Coffee,Verve Coffee,Coffee,SM-0097
6,40006410403923,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,The 1950 Craft Instant Coffee,Verve Coffee,Coffee,4210521-0902100
7,40006410698835,https://cdn.shopify.com/s/files/1/0035/9372/pr...,16.0,USD,Vancouver Decaf Craft Instant Coffee,Verve Coffee,Coffee,SM-0100
8,39923041108051,https://cdn.shopify.com/s/files/1/0035/9372/pr...,18.75,USD,Streetlevel,Verve Coffee,Coffee,CO-0182
9,39923040944211,https://cdn.shopify.com/s/files/1/0035/9372/pr...,17.75,USD,Bronson French Roast,Verve Coffee,Coffee,CO-0020


In [14]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Product_ID      44 non-null     object 
 1   Image_Source    44 non-null     object 
 2   Price_Amount    44 non-null     float64
 3   Price_Currency  44 non-null     object 
 4   Product_Title   44 non-null     object 
 5   Vendor          44 non-null     object 
 6   Product_Type    44 non-null     object 
 7   SKU             44 non-null     object 
dtypes: float64(1), object(7)
memory usage: 2.9+ KB


# RAW HTML output reference:

In [15]:
# 'https://www.vervecoffee.com/collections/all-coffee?view=products'
source_text

'\n\n\n\n<!doctype html>\n<html class="no-js" lang="en">\n  <head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta name="theme-color" content="">\n    <link rel="canonical" href="https://www.vervecoffee.com/collections/all-coffee">\n    <link rel="preconnect" href="https://cdn.shopify.com" crossorigin>\n    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script><link rel="icon" type="image/png" href="//cdn.shopify.com/s/files/1/0035/9372/files/favicon_33d2d288-2905-49cf-86a5-8525204526bc_32x32.png?v=1650427325"><link rel="preconnect" href="https://fonts.shopifycdn.com" crossorigin><title>All Coffee</title><script>window.dataLayer = window.dataLayer || [];</script>\n      <!-- Google Tag Manager -->\n<script>\n  window.dataLayer = window.dataLayer || []; \n  dataLayer.push({\n      \'event\': \'bva_gtm_init\',\n      \'template\': {\