# Aim: identify shopping cart softwares within websites' HTML

Ideal scenario: Use the top 100 (or so) websites for each cart software (source: https://builtwith.com/ecommerce/new-zealand/). This would automate collecting a large sample of websites to test and generate "actual"/baseline labels. However, this requires non-free plan.

Solution: Use lists of websites and manually plug them into the BuiltWith website to see what "eCommerce" software they use. Because I am using lists of websites which use the given softwares, I may be biased in favour of testing websites which are easily classifiable (but this is unavoidable for now).

In [1]:
import os
import re
import urllib, requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

from fake_useragent import UserAgent

In [2]:
def make_url_name(url):
    """
    Return a name from the url, if it fails to make a nice name it will just return the url
    """
    netloc = urllib.parse.urlsplit(url).netloc
    netloc_pieces = netloc.split('.')
    
    if 'co' in netloc_pieces:
        i = netloc_pieces.index('co')
    elif 'com' in netloc_pieces:
        i = netloc_pieces.index('com')
    elif 'org' in netloc_pieces:
        i = netloc_pieces.index('org')
    elif 'nz' in netloc_pieces:
        i = netloc_pieces.index('nz')
    else:
        return netloc
    
    return netloc_pieces[i - 1]

In [3]:
# Source: BuiltWith.com
# '?' means that it was not showing on the Basic Technology View (detailed view may or may not show it[?])
# '!none' means that BuiltWith was unable to do lookups on the site

# There are multiple Squarespace plans that come up on BuiltWith, but I collated them under Squarespace
# Bold UpSell = Shopify, since it is a Shopify app
# Shopify Plus = Shopify, since it is a Shopify plan
# WooCommerce Checkout = WooCommerce, since I think they mean the same thing
# Magento Enterprise = Magento, since it is a Magento plan
actual = {
    'http://macpac.co.nz/': ['Demandware', 'Salesforce Commerce Cloud'],
    'https://www.huffer.co.nz/': ['Magento'],
    'https://www.allbirds.co.nz': ['Shopify'],
    'https://infinitedefinite.com/': ['Shopify'],
    'https://www.hallensteins.com/': ['?'],
    'https://www.glassons.com/nz': ['?'],
    'https://www.barkersonline.co.nz/': ['Magento'],
#     'https://www.maxshop.com/',  # no SSL certificate
    'https://shop.countdown.co.nz/': ['custom'],
    'https://www.barkers.co.nz/': ['WooCommerce'],
    'https://www.picspeanutbutter.com/nz/': ['?'],
    'https://rainbowconfectionery.co.nz/': ['Magento'],
    'https://www.mountainwarehouse.com/nz/': ['?'],
    'https://www.kathmandu.co.nz/': ['Magento'],
    'https://cottonon.com/NZ/': ['Salesforce Commerce Cloud', 'Demandware'],
    'https://www.platypusshoes.co.nz/': ['Magento'],
    
    # https://www.moneyhub.co.nz/online-shopping.html
    # > clothing
    'https://designerwardrobe.co.nz/': ['?'],
    'https://www.princesspolly.com.au/?currency=NZD': ['Shopify'],
    'https://www.surfstitch.com/nz/': ['Salesforce Commerce Cloud', 'Demandware'],
    'https://www.missguidedau.com/': ['Magento', 'Amplience'],
    'https://www.bonds.com.au/': ['Magento'],
    'https://www.ezibuy.com/shop/nz/': ['SAP Commerce Cloud'],
    'https://www.onceit.co.nz/': ['custom'],
    'https://www.merchant1948.co.nz/': ['Magento'],
    'https://www.justjeans.co.nz/': ['Kibo', 'HCL Commerce'],
    'https://allaboutagirl.co.nz/': ['Shopify'],
    'https://www.kidrepublic.co.nz/': ['custom'],
    'https://nz.boohoo.com/': ['Salesforce Commerce Cloud', 'Amplience', 'Demandware'],
    'https://www.superette.co.nz/': ['?'],
    'https://www.mode.co.nz/': ['CS Cart'],
    'https://www.prettylittlething.com.au/': ['Magento', 'Kibo'],
    'https://www.showpo.com/nz/': ['Salesforce Commerce Cloud', 'Demandware'],
    'https://www.farfetch.com/nz/': ['?'],
    'http://www.asos.com/': ['custom'],
    'https://www.zara.com/nz/': ['?'],
    # > beauty products
    'https://strawberrynet.com/en-us/': ['?'],
    'https://www.lovemymakeup.co.nz/': ['Shopify'],
    'https://nz.iherb.com/': ['?'],
    'https://beautybliss.co.nz/': ['?'],
    'https://www.sephora.nz/': ['custom'],
    # > sports
    'https://www.torpedo7.co.nz/': ['custom'],
    'https://www.rebelsport.co.nz/': ['?'],
    'https://www.nike.com/nz/': ['?'],
    'https://www.kathmandu.co.nz/': ['Magento'],
    # > general homeware/department store
    'https://www.thewarehouse.co.nz/': ['Demandware', 'Salesforce Commerce Cloud'],
    'https://www.kmart.co.nz/': ['HCL Commerce'],
    'https://www.farmers.co.nz/': ['Intershop'],
    'https://www.harveynorman.co.nz/': ['CS Cart'],
    'https://www.budgetbeds.co.nz/': ['OpenCart'],
    'https://www.catch.co.nz/': ['custom'],
    'https://www.toyco.co.nz/': ['custom'],
    # > computer & tech
    'https://www.noelleeming.co.nz/': ['Powerfront'],
    'https://www.pbtech.co.nz/': ['custom'],
    'https://www.jbhifi.co.nz/': ['custom'],
    'https://www.ebgames.co.nz/': ['custom'],
    'https://www.dicksmith.co.nz/': ['?'],
    # > auction/marketplace
    'https://www.etsy.com/': ['custom'],
    'https://www.aliexpress.com/': ['custom'],
#     'https://www.ebay.com/': ['!none'],  # VERY strict policy on bots
#     'https://www.ebay.com.au/': ['custom'],
    'https://www.amazon.com/': ['Amazon Webstore'],
    'https://www.wish.com/': ['?'],
    'https://www.1-day.co.nz/': ['custom'],
    'https://themarket.com/nz/': ['?'],
    'https://www.trademe.co.nz/': ['custom'],
    'https://www.mightyape.co.nz/': ['custom'],
    # > misc
    'https://www.skechers.co.nz/': ['Magento'],
    'https://www.notsocks.co.nz/': ['BigCommerce'],
    'https://www.supercheapauto.co.nz/': ['Demandware', 'Salesforce Commerce Cloud'],
    'https://www.repco.co.nz/': ['SAP Commerce Cloud'],
    'https://www.bunnings.co.nz/': ['Powerfront', 'Sitecore Experience Commerce'],
    'https://www.mitre10.co.nz/': ['SAP Commerce Cloud'],
    'https://www.warehousestationery.co.nz/': ['Demandware', 'Salesforce Commerce Cloud'],
    'https://www.fishpond.co.nz/': ['osCommerce'],
    'https://www.vistaprint.co.nz/': ['custom'],
    'https://www.plasticmodels.co.nz/': ['custom'],
    'https://www.diecastmodels.co.nz/': ['custom'],
    
    # https://woocommerce.com/posts/12-great-examples-woocommerce-stores/ - fairly out-of-date but oh well
    'https://www.jhornig.com/': ['WooCommerce'],
    'https://porterandyork.com/': ['WooCommerce'],
    'https://cupcake.com.ua/': ['?'],
    'https://www.dineamic.com.au/': ['Shopify'],
    'http://buddhibaby.ca/': ['?'],
    'https://ecokitty.co.uk/': ['WooCommerce'],
    'https://pickybars.com/': ['Shopify'],
    'https://fitgersbrewhouse.com/': ['Squarespace'],
    'https://bestfive.com.au/': ['!none'],
    'https://overclothing.com/': ['WooCommerce'],
    
    # https://blog.hubspot.com/marketing/best-shopify-stores
    'https://ugmonk.com/': ['Shopify'],
    'https://www.pipsnacks.com/': ['Shopify'],
    'https://www.taylorstitch.com/': ['Shopify'],
    'https://www.happinessabscissa.com/': ['Shopify'],
    'https://www.burga.com/': ['Shopify'],
    'https://skinny-teatox.com/': ['Shopify'],
    'https://helmboots.com/': ['Shopify'],
    'https://www.bioliteenergy.com/': ['Shopify'],
    'https://www.popchartlab.com/': ['Shopify'],
    'https://lucadanni.com/': ['Shopify'],
    'https://www.harrisfarm.com.au/': ['Shopify'],
    'https://www.greatgeorgewatches.com/': ['Shopify'],
    'http://www.choosemuse.com/': ['Shopify'],
    'https://brilliant.org/': ['?'],
    'https://www.holstee.com/': ['Shopify'],
    'https://www.kyliecosmetics.com/': ['Shopify'],
    
    # https://www.zfort.com/blog/top-10-magento-ecommerce-websites
    'https://www.joelandsonfabrics.com/': ['Magento'],
    'https://www.sigmabeauty.com/': ['Magento'],
    'https://www.coxandcox.co.uk/': ['Magento'],
    'https://www.boodles.com/': ['Magento'],
    'https://www.thenewcraftsmen.com/': ['Magento'],
    'https://www.warbyparker.com/': ['?'],
    'https://www.swellbottle.com/': ['WooCommerce'],
    'https://www.fredperry.com/': ['Magento'],
    'https://www.hellyhansen.com/': ['Magento', 'Kibo'],
    
    # https://winningwp.com/examples-of-websites-using-squarespace/
    'http://www.newinc.org/': ['Squarespace'],
    'http://www.aplusi.com/': ['?'],
    'https://fightingeel.com/': ['Squarespace'],
    'http://darrenbooth.com/': ['Squarespace'],
    'http://www.picathingswelove.com/': ['Squarespace'],
    'http://corp.bandsintown.com/': ['Squarespace'],
    'http://thehatchinstitute.org/': ['?'],  # for these sites, squarespace was under 'Content Management System'
    'http://aldernyc.com/': ['?'],
    'https://thechicks.com/': ['?'],
    'http://www.tianstudios.com/': ['Squarespace'],
    'http://www.corcellars.com/': ['?'],
    'http://www.liftedlandscape.com/': ['Squarespace'],
    'http://betatakaki.com/': ['?'],
    'http://bluedognyc.com/': ['?'],
    
    # https://www.wix.com/blog/ecommerce/2018/10/ecommerce-website-design-examples
    'https://www.coalandcanary.com/': ['Wix Stores', 'Ecwid'],
    'https://www.ultasmile.com/': ['Wix Stores'],
    'https://www.evolveclothinggallery.com/': ['Wix Stores'],
    'https://www.thespicesuite.com/': ['Wix Stores'],
    'https://www.kaekoo.com/': ['Wix Stores'],
    'https://www.rubylove.com/': ['Wix Stores'],
    'https://www.celiab.com/': ['Wix Stores'],
    # https://superbwebsitebuilders.com/wix-ecommerce-examples-online-stores-shops/
    'https://www.drysteppers.com/': ['Shopify'],
    'http://www.piece-of-history.com/': ['Wix Stores'],
    'https://www.vintique-watch.com/': ['?'],
    'https://www.originalteatowel.co.uk/': ['Wix Stores'],
    'http://www.bensimonboutique.com/': ['Wix Stores'],
    'https://www.amoodz.com/': ['Wix Stores'],
    'http://www.smallchangefinery.com/': ['Wix Stores'],
    'https://www.andreamirandasalas.com/': ['Wix Stores'],
#     'https://www.lefayeproducts.com/': ['Wix Stores', 'Ecwid'],  # domain for sale
    'https://www.claycrocks.com/': ['?'],
    'https://www.kerripom.com/': ['Wix Stores'],
    'https://www.kimweitkamp.com/': ['?'],
    'https://www.tarotbyseven.com/': ['Wix Stores', 'Ecwid'],
    'https://www.theguitarhanger.com/': ['Wix Stores'],
}

print(f"Recheck with BuiltWith detailed view:\n{[make_url_name(site) for site, cart_software in actual.items() if cart_software == ['?']]}")

Recheck with BuiltWith detailed view:
['hallensteins', 'glassons', 'picspeanutbutter', 'mountainwarehouse', 'designerwardrobe', 'superette', 'farfetch', 'zara', 'strawberrynet', 'iherb', 'beautybliss', 'rebelsport', 'nike', 'dicksmith', 'wish', 'themarket', 'cupcake', 'buddhibaby.ca', 'brilliant', 'warbyparker', 'aplusi', 'thehatchinstitute', 'aldernyc', 'thechicks', 'corcellars', 'betatakaki', 'bluedognyc', 'vintique-watch', 'claycrocks', 'kimweitkamp']


In [4]:
def cart_software_detector(html):
    """
    The commented-out detectors are old versions that worked very well, but 
    not quite as well as their replacement.
    """
    bs = BeautifulSoup(html)
    ecom = []
        
    # Demandware is a subsidiary of Salesforce and was renamed to Salesforce Commerce Cloud
    if any('demandware' in tag['src'] for tag in bs.find_all('img', {'src': True})):
        ecom.append('Demandware')
        
    if bs.find('script', attrs={'type': 'text/x-magento-init'}) is not None:
        ecom.append('Magento')
        
    if bs.find('span', attrs={'class': 'nosto_cart'}) is not None:
        ecom.append('nosto')
        
#     if any('var Shopify =' in script.string for script in bs.find_all('script') if script.string]):
#         ecom.append('Shopify')
    if any('shopify' in tag['href'] for tag in bs.find_all('link', {'rel': 'stylesheet', 'href': True})):
        ecom.append('Shopify')
        
    sitecore_detector = any('sitecore-link-wrapper' in tag['class'] for tag in bs.find_all('div', {'class': True}))
    if sitecore_detector or 'SITECORE_APIKEY' in str(bs):
        ecom.append('Sitecore Experience Commerce')
        
    if bs.find('link', {'rel': 'preconnect', 'href': 'https://images.squarespace-cdn.com'}) is not None:
        ecom.append('Squarespace')
        
    # These two Wix detections work perfectly/identically on the testing websites, so I left just the computationally cheapest
    if bs.find('meta', {'name': 'generator', 'content': 'Wix.com Website Builder'}) is not None:
        ecom.append('Wix Stores')
#     elif any('static.parastorage.com' in tag['src'] for tag in bs.find_all('script', {'src': True})):
#         ecom.append('Wix Stores')
        
#     woocommerce_script_detector = ['woocommerce' in tag['src'] for tag in bs.find_all('script', {'src': True})]
#     if True in woocommerce_script_detector:
#         ecom.append('WooCommerce')
#     woocommerce_link_detector = ['woocommerce' in tag['id'] for tag in bs.find_all('link', {'rel': 'stylesheet', 'id': True})]
#     if True in woocommerce_link_detector:
#         ecom.append('WooCommerce')
    if bs.find('style', {'id': 'woocommerce-inline-inline-css', 'type': 'text/css'}) is not None:
        ecom.append('WooCommerce')
        
    return ecom

In [5]:
html_folder = "test_scraping_html"
site_to_html = dict()
scrape_errors = dict()

OVERWRITE_HTML_FILES = False

ua = UserAgent() # pretend to be Chrome user

for url in actual.keys():
    try:
        fpath = os.path.join(html_folder, f"{make_url_name(url)}.html")
        
        if not os.path.exists(fpath) or OVERWRITE_HTML_FILES:   
            r = requests.get(url, timeout=20, headers={'User-Agent': ua.chrome})
            html = r.content.decode()
            with open(fpath, 'w') as f:
                f.write(html)
        else: 
            with open(fpath, 'r') as f:
                html = f.read()
            
        site_to_html[url] = html
        
    except Exception as e:
        error_name = type(e).__name__
        print(f"{error_name}: {url}")
        scrape_errors[url] = error_name

UnicodeDecodeError: https://www.pbtech.co.nz/
SSLError: https://www.vintique-watch.com/


In [6]:
pred = {
    site: cart_software_detector(html) for site, html in site_to_html.items()
}

robotblocks = [
    'https://www.repco.co.nz/'
    'https://kmart.co.nz/',
    'https://www.ebgames.co.nz/',
    'https://www.dicksmith.co.nz/',
    'https://www.amazon.com/',
]
for site in robotblocks:
    scrape_errors[site] = "robotblock"

results = pd.DataFrame(
    [(make_url_name(site), site, actual[site], pred.get(site, "")) for site in actual.keys()],
    columns=['name', 'url', 'BuiltWith', 'pred']
)

results['Correct'] = [actual == pred if actual != ['?'] else "" 
                      for actual, pred in zip(results['BuiltWith'], results['pred'])]
results['Partial_correct'] = [len(set(actual).intersection(set(pred))) > 0 if actual != ['?'] else ""
                              for actual, pred, in zip(results['BuiltWith'], results['pred'])]  # check for any overlap
results['Exception'] = [scrape_errors.get(site, "") for site in results['url']]

results.to_csv("results/test_scraping_results.csv")
results

Unnamed: 0,name,url,BuiltWith,pred,Correct,Partial_correct,Exception
0,macpac,http://macpac.co.nz/,"[Demandware, Salesforce Commerce Cloud]",[Demandware],False,True,
1,huffer,https://www.huffer.co.nz/,[Magento],[Magento],True,True,
2,allbirds,https://www.allbirds.co.nz,[Shopify],[Shopify],True,True,
3,infinitedefinite,https://infinitedefinite.com/,[Shopify],[Shopify],True,True,
4,hallensteins,https://www.hallensteins.com/,[?],[],,,
...,...,...,...,...,...,...,...
137,claycrocks,https://www.claycrocks.com/,[?],[Wix Stores],,,
138,kerripom,https://www.kerripom.com/,[Wix Stores],[Wix Stores],True,True,
139,kimweitkamp,https://www.kimweitkamp.com/,[?],[Wix Stores],,,
140,tarotbyseven,https://www.tarotbyseven.com/,"[Wix Stores, Ecwid]",[Wix Stores],False,True,


## Check Results

Note that these results do not consider *implied* cart software. For example:

- I know that 'Salesforce Commerce Cloud' is a different name for 'Demandware' but I have not coded this logic.
- I am 80% certain that the presence of 'nosto' implies 'Magento' but I have not coded this logic (because I'm not 100% sure).
- I think 'WooCommerce Checkout' is redundant, and 'WooCommerce' would suffice.
- 'Shopify Plus' is a more expensive plan of 'Shopify', I don't know if it's necessary to distinguish these.

In [7]:
performance = pd.DataFrame({'software': sorted(set(results['BuiltWith'].sum()))})
performance['BuiltWith'] = [sum([software in x for x in results['BuiltWith']]) for software in performance['software']]
performance['true_pos'] = [sum([software in actual and software in pred for actual, pred in zip(results['BuiltWith'], results['pred'])]) 
                           for software in performance['software']]
performance['false_pos'] = [sum([software not in actual and software in pred for actual, pred in zip(results['BuiltWith'], results['pred'])]) 
                            for software in performance['software']]

performance.to_csv("results/test_scraping_performance.csv")
performance[performance['BuiltWith'] > 1]  # filter so it's not as cluttered

Unnamed: 0,software,BuiltWith,true_pos,false_pos
1,?,30,0,0
3,Amplience,2,0,0
5,CS Cart,2,0,0
6,Demandware,8,8,0
7,Ecwid,2,0,0
8,HCL Commerce,2,0,0
10,Kibo,3,0,0
11,Magento,17,11,0
13,Powerfront,2,0,0
14,SAP Commerce Cloud,3,0,0


## Testing

In [8]:
# See sites which have Wix Stores as actual OR predicted
wix_mask = [
    'Wix Stores' in actual + pred if pred != ""
    else 'Wix Stores' in actual
    for actual, pred in zip(results['BuiltWith'], results['pred'])
]
results[wix_mask]

Unnamed: 0,name,url,BuiltWith,pred,Correct,Partial_correct,Exception
122,coalandcanary,https://www.coalandcanary.com/,"[Wix Stores, Ecwid]",[Wix Stores],False,True,
123,ultasmile,https://www.ultasmile.com/,[Wix Stores],[Wix Stores],True,True,
124,evolveclothinggallery,https://www.evolveclothinggallery.com/,[Wix Stores],[Wix Stores],True,True,
125,thespicesuite,https://www.thespicesuite.com/,[Wix Stores],[Wix Stores],True,True,
126,kaekoo,https://www.kaekoo.com/,[Wix Stores],[Wix Stores],True,True,
127,rubylove,https://www.rubylove.com/,[Wix Stores],[Wix Stores],True,True,
128,celiab,https://www.celiab.com/,[Wix Stores],[Wix Stores],True,True,
130,piece-of-history,http://www.piece-of-history.com/,[Wix Stores],[Wix Stores],True,True,
132,originalteatowel,https://www.originalteatowel.co.uk/,[Wix Stores],[Wix Stores],True,True,
133,bensimonboutique,http://www.bensimonboutique.com/,[Wix Stores],[Wix Stores],True,True,


In [9]:
for site in results[['Wix Stores' in x for x in results['BuiltWith']]]['url']:
    print(site)
    bs = BeautifulSoup(site_to_html[site])
    
    print(bs.find('meta', {'name': 'generator', 'content': 'Wix.com Website Builder'}) is not None)
#     print('static.parastorage.com' in site_to_html[site].lower())
    print(any('static.parastorage.com' in tag['src'] for tag in bs.find_all('script', {'src': True})))
    print('wix.com' in site_to_html[site].lower())

https://www.coalandcanary.com/
True
True
True
https://www.ultasmile.com/
True
True
True
https://www.evolveclothinggallery.com/
True
True
True
https://www.thespicesuite.com/
True
True
True
https://www.kaekoo.com/
True
True
True
https://www.rubylove.com/
True
True
True
https://www.celiab.com/
True
True
True
http://www.piece-of-history.com/
True
True
True
https://www.originalteatowel.co.uk/
True
True
True
http://www.bensimonboutique.com/
True
True
True
https://www.amoodz.com/
True
True
True
http://www.smallchangefinery.com/
True
True
True
https://www.andreamirandasalas.com/
True
True
True
https://www.kerripom.com/
True
True
True
https://www.tarotbyseven.com/
True
True
True
https://www.theguitarhanger.com/
True
True
True


In [10]:
# See which sites have custom cart software
results[['custom' in x for x in results['BuiltWith']]]

Unnamed: 0,name,url,BuiltWith,pred,Correct,Partial_correct,Exception
7,countdown,https://shop.countdown.co.nz/,[custom],[],False,False,
21,onceit,https://www.onceit.co.nz/,[custom],[],False,False,
25,kidrepublic,https://www.kidrepublic.co.nz/,[custom],[],False,False,
32,asos,http://www.asos.com/,[custom],[],False,False,
38,sephora,https://www.sephora.nz/,[custom],[],False,False,
39,torpedo7,https://www.torpedo7.co.nz/,[custom],[],False,False,
47,catch,https://www.catch.co.nz/,[custom],[],False,False,
48,toyco,https://www.toyco.co.nz/,[custom],[],False,False,
50,pbtech,https://www.pbtech.co.nz/,[custom],,False,False,UnicodeDecodeError
51,jbhifi,https://www.jbhifi.co.nz/,[custom],[],False,False,


In [11]:
for site in results[['custom' in x for x in results['BuiltWith']]]['url']:
    if site in scrape_errors:
        continue
    
    print(site)
    bs = BeautifulSoup(site_to_html[site])
    
    print([tag['title'] for tag in bs.find_all('a', {'title': True}) if 'cart' in tag['title'].lower()])
    print([tag_class for tag in bs.find_all('div', {'class': True}) for tag_class in tag['class'] if 'cart' in tag_class])
    print()
    
    print([tag['id'] for tag in bs.find_all('div', {'id': True}) ])#if 'minicart' in tag['id'].lower()])
    print()

https://shop.countdown.co.nz/
[]
[]

[]

https://www.onceit.co.nz/
['Link to shop the The Others, Cartel and More - New Restock! sale']
[]

[]

https://www.kidrepublic.co.nz/
['Shopping Cart']
['cart_qty_img']

['bodywrapper', 'ctl00_divHeader', 'ctl00_HeaderTop_pnlContentContainerData', 'ctl00_HeaderTop_Content_162_pnlContentData', 'ctl00_HeaderTop_Content_180_pnlContentData', 'ctl00_HeaderTop_Content_163_pnlContentData', 'ctl00_HeaderTop_Content_3_pnlContentContainerData', 'ctl00_HeaderTop_Content_3_Content_25_pnlContentData', 'ctl00_HeaderMiddle_pnlContentContainerData', 'ctl00_HeaderMiddle_Content_192_pnlContentData', 'ctl00_HeaderMiddle_Content_23_pnlContentContainerData', 'ctl00_HeaderMiddle_Content_23_Content_5_pnlContentData', 'ctl00_HeaderMiddle_Content_23_Content_5_dvMenu', 'mainMenuSearchBar', 'mainMenuNavigationBar', 'ctl00_HeaderBottom_pnlContentContainerData', 'ctl00_tdCentre', 'ctl00_MainCentre_container_pnlContentContainerData', 'ctl00_MainCentre_container_Content_183_p

## Detect card payment options - Visa, Mastercard etc.

In [13]:
def has_card_detector(html):
    bs = BeautifulSoup(html)
    
    if 'addtocart' in html.lower():
        return True
    elif any(tag['href'] for tag in bs.find_all('a', {'href': True}) if 'payment' in tag['href']):
        return True
    elif any('payment' in tag_class for tag in bs.find_all('li', {'class': True}) for tag_class in tag['class']):
        return True
    
    return False

def payment_system_detector(html):
    payment_names = ['visa','mastercard','amex','applepay','afterpay','zippay',
                     'alipay','klarna']
    
    html_lower = html.lower()
    detected_cards = [
        payment_name for payment_name in payment_names
        if payment_name in html_lower
    ]
    
    return detected_cards


card = pd.DataFrame(
    [(make_url_name(site), site) for site in actual.keys()],
    columns=['name', 'url']
)
card['has_card'] = [has_card_detector(site_to_html[site]) if site not in scrape_errors else False for site in card['url']]
card['payment_names'] = [payment_system_detector(site_to_html[site]) if site not in scrape_errors else [] for site in card['url']]

card[card['has_card']]

Unnamed: 0,name,url,has_card,payment_names
1,huffer,https://www.huffer.co.nz/,True,[]
2,allbirds,https://www.allbirds.co.nz,True,[afterpay]
3,infinitedefinite,https://infinitedefinite.com/,True,[]
4,hallensteins,https://www.hallensteins.com/,True,"[visa, mastercard, amex, applepay, afterpay, a..."
5,glassons,https://www.glassons.com/nz,True,"[visa, mastercard, amex, afterpay]"
...,...,...,...,...
134,amoodz,https://www.amoodz.com/,True,"[visa, mastercard]"
135,smallchangefinery,http://www.smallchangefinery.com/,True,[]
138,kerripom,https://www.kerripom.com/,True,[]
140,tarotbyseven,https://www.tarotbyseven.com/,True,[]


In [14]:
sum(['addtocart' in site_to_html[site].lower() for site in card['url'] if site not in scrape_errors])

58

In [15]:
sum([any(tag['href'] for tag in BeautifulSoup(site_to_html[site]).find_all('a', {'href': True}) if 'payment' in tag['href']) 
     for site in card['url'] if site not in scrape_errors])

12

In [16]:
sum([any('payment' in tag_class for tag in BeautifulSoup(site_to_html[site]).find_all('li', {'class': True}) for tag_class in tag['class']) 
     for site in card['url'] if site not in scrape_errors])

9

In [17]:
# See which sites do not have any payment info. detected (ie. no cart software, custom cart or payment system detected)
test = results.merge(card, how='inner', left_on=['name','url'], right_on=['name','url'])
test[~np.logical_or(np.logical_or(test['pred'].apply(len) > 0, test['has_card']), test['payment_names'].apply(len) > 0)]

Unnamed: 0,name,url,BuiltWith,pred,Correct,Partial_correct,Exception,has_card,payment_names
7,countdown,https://shop.countdown.co.nz/,[custom],[],False,False,,False,[]
9,picspeanutbutter,https://www.picspeanutbutter.com/nz/,[?],[],,,,False,[]
10,rainbowconfectionery,https://rainbowconfectionery.co.nz/,[Magento],[],False,False,,False,[]
23,justjeans,https://www.justjeans.co.nz/,"[Kibo, HCL Commerce]",[],False,False,,False,[]
41,nike,https://www.nike.com/nz/,[?],[],,,,False,[]
43,kmart,https://www.kmart.co.nz/,[HCL Commerce],[],False,False,,False,[]
44,farmers,https://www.farmers.co.nz/,[Intershop],[],False,False,,False,[]
50,pbtech,https://www.pbtech.co.nz/,[custom],,False,False,UnicodeDecodeError,False,[]
52,ebgames,https://www.ebgames.co.nz/,[custom],[],False,False,robotblock,False,[]
53,dicksmith,https://www.dicksmith.co.nz/,[?],[],,,robotblock,False,[]


In [19]:
test[test['Exception'] != ""]

Unnamed: 0,name,url,BuiltWith,pred,Correct,Partial_correct,Exception,has_card,payment_names
50,pbtech,https://www.pbtech.co.nz/,[custom],,False,False,UnicodeDecodeError,False,[]
52,ebgames,https://www.ebgames.co.nz/,[custom],[],False,False,robotblock,False,[]
53,dicksmith,https://www.dicksmith.co.nz/,[?],[],,,robotblock,False,[]
131,vintique-watch,https://www.vintique-watch.com/,[?],,,,SSLError,False,[]
