# Error Analysis of Products
This looks at how often products' default sellers and shippers switch from Amazon to a third-party.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import glob
import math
import json
from collections import Counter
from multiprocessing import Pool

from tqdm import tqdm
from lxml import html
import numpy as np
import pandas as pd

import parsers as P
from parsers import process_search_result
from utils import value_counts

In [22]:
# inputs
fn_prod = '../data/output/datasets/products.csv.xz'
pattern_spot_check = '../data/input/spotcheck/selenium-products/*/*/*/2021/05/*/webpage_product.html'

In [116]:
df_prod = pd.read_csv(fn_prod, compression='xz')
len(df_prod)

157405

## check product pages

In [117]:
value_counts(df_prod, "is_page_gone")

Unnamed: 0,count,percentage
False,156209,0.992402
True,1196,0.007598


In [118]:
value_counts(df_prod, "no_buybox_winner")

Unnamed: 0,count,percentage
False,151805,0.964423
True,5600,0.035577


In [159]:
len(df_prod[(df_prod.is_out_of_stock == True) |
            (df_prod.is_page_gone == True)]) / len(df_prod)

0.03895047806613513

In [119]:
df_prod.is_sold_by_amazon.value_counts(normalize=True, dropna=False)

False    0.615038
True     0.282456
True     0.102506
Name: is_sold_by_amazon, dtype: float64

In [120]:
len(df_prod[df_prod.sold_by.isnull()]) / len(df_prod)

0.10250627362536133

In [14]:
from parsers import parse_product_page

In [74]:
fn = '../data/input/selenium-products/5/5D/B075D9RDS1/2021/02/18/webpage_product.html'

In [72]:
fn = '../data/input/selenium-products/5/5D/B015DTI1OY/2021/02/17/webpage_product.html'

In [113]:
fn = '../data/input/selenium-products/S/SQ/B07SQBG2VG/2021/02/18/webpage_product.html'
parse_product_page(fn)

{'fn': '../data/input/selenium-products/S/SQ/B07SQBG2VG/2021/02/18/webpage_product.html',
 'title': 'L.O.L Surprise Box Officially Licensed L.O.L Surprise Mystery Subscription Box',
 'shipped_by': None,
 'sold_by': None,
 'has_third_party_sellers': False,
 'product_by_amazon': False,
 'our_brands_carousel': False,
 'ads': [],
 'no_buybox_winner': False,
 'is_out_of_stock': False,
 'is_page_gone': False,
 'is_custom': False,
 'suggestions': ['B07SQBG2VG', 'B07SQBG2VG']}

In [87]:
data = []
for fn in tqdm(to_test):
    data.append(parse_product_page(fn))

100%|██████████| 1344/1344 [00:50<00:00, 26.38it/s]


In [107]:
df = pd.DataFrame(data)

In [108]:
df = df[df.is_out_of_stock == False]

In [110]:
df['is_custom'] = df.title.apply(lambda x: any(y in x.lower() for y in ["personalized", "custom"]))

In [111]:
len(df[df.sold_by.isnull()]) / len(df), len(df[(df.sold_by.isnull()) & (~df.is_custom ==True)]) / len(df)

(0.3064113238967527, 0.09825145711906745)

In [121]:
# df[(df.sold_by.isnull()) & (~df.is_custom ==True)].sample(100, random_state=100).fn.tolist()

In [41]:
df_prod[(df_prod.is_sold_by_amazon.isnull()) & 
        (~df_prod.title.isnull()) &
        (df_prod.is_out_of_stock == False) &
        (df_prod.no_buybox_winner == False)]

Unnamed: 0,fn,title,shipped_by,sold_by,has_third_party_sellers,product_by_amazon,our_brands_carousel,ads,no_buybox_winner,is_out_of_stock,is_page_gone,suggestions,n_ads,asin,is_amazon,is_sold_by_amazon,is_shipped_by_amazon
159,../data/input/selenium-products/5/5D/B005D0DU7...,Nature Made Extra Strength Vitamin B12 3000 mc...,,,False,False,True,[],False,False,False,"['B005D0DU7M', 'B005D0DU7M', 'B005D0DU7M', 'B0...",0,B005D0DU7M,False,,
451,../data/input/selenium-products/5/5F/B075FCSCT...,"iHome 10"" x 13"" iCVBT10 Reflect PRO Portable, ...",,,True,False,False,[],False,False,False,"['B075FCSCTX', 'B075FCSCTX', 'B075FCSCTX', 'B0...",0,B075FCSCTX,False,,
455,../data/input/selenium-products/5/5F/B005FGQIL...,"Amazon.com $15 Gift Cards, Pack of 3 (Holiday ...",,,False,False,False,[],False,False,False,"['B005FGQIL4', 'B005FGQIL4', 'B005FGQIL4', 'B0...",0,B005FGQIL4,True,,
509,../data/input/selenium-products/5/5F/B075FBPT3...,PowerA Wired Controller for Nintendo Switch - ...,,,True,False,True,[],False,False,False,"['B075FBPT3W', 'B075FBPT3W', 'B075FBPT3W', 'B0...",0,B075FBPT3W,False,,
817,../data/input/selenium-products/5/5S/B075SBQN6...,Kindle Publishing: A Clear Guide to Making You...,,,True,False,False,[],False,False,False,"['B075SBQN6S', 'B075SBQN6S', 'B084SDV2ZW', 'B0...",0,B075SBQN6S,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156879,../data/input/selenium-products/0/0S/B000SRI5O...,Planters Nuts & Chocolate M&M's Trail Mix (6 o...,,,False,False,True,[],False,False,False,"['B000SRI5OS', 'B000SRI5OS', 'B000SRI5OS', 'B0...",0,B000SRI5OS,False,,
156935,../data/input/selenium-products/0/0S/B000SRKA6...,SKIPPY Peanut Butter Spread - Creamy - Natural...,,,False,False,True,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"['B000SRKA6E', 'B000SRKA6E', 'B000SRKA6E', 'B0...",4,B000SRKA6E,False,,
157080,../data/input/selenium-products/0/02/B0002YKBV...,3M - 8511PB1-A-PS 8511 Paint Sanding N95 Cool-...,,,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"['B0002YKBV2', 'B0002YKBV2', 'B0002YKBV2', 'B0...",5,B0002YKBV2,False,,
157220,../data/input/selenium-products/0/0O/B000O0GLS...,"Cascadian Farm Organic Cut Green Beans, Premiu...",,,False,False,True,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"['B000O0GLSQ', 'B000O0GLSQ', 'B000O0GLSQ', 'B0...",4,B000O0GLSQ,False,,


In [54]:
to_test = df_prod[(df_prod.is_sold_by_amazon.isnull()) & 
        (~df_prod.title.isnull()) &
        (df_prod.is_out_of_stock == False) &
        (df_prod.no_buybox_winner == False)].fn.to_list()

## Check re-collected product pages
We pulled a random sample of valid product pages, and re-collected them a few months later.

In [122]:
files_products = glob.glob(pattern_spot_check)
len(files_products)

3465

In [123]:
# parse the re-collected prodcuts.
product_data = []
with Pool(processes=32) as pool:
    for record in tqdm(pool.imap_unordered(P.parse_product_page, 
                                           files_products), 
                       total=len(files_products)):
        product_data.append(record)

df_prod_new = pd.DataFrame(product_data)

100%|██████████| 3465/3465 [00:34<00:00, 101.09it/s]


In [124]:
df_prod_new = pd.DataFrame(product_data)

In [125]:
df_prod_new["asin"] = df_prod_new.fn.apply(lambda x: x.split('/2021')[0].split('/')[-1])

In [127]:
valid_asins = df_prod[~df_prod.sold_by.isnull()].asin

In [11]:
# df_prod_new = df_prod_new[~df_prod_new.sold_by.isnull()]

In [128]:
df_prod_new = df_prod_new[df_prod_new.asin.isin(valid_asins)]

In [129]:
df_prod_new = df_prod_new.sample(2500, random_state=303)

In [132]:
value_counts(df_prod_new, "is_out_of_stock")

Unnamed: 0,count,percentage
False,2356,0.9424
True,144,0.0576


In [130]:
value_counts(df_prod_new, "is_page_gone")

Unnamed: 0,count,percentage
False,2460,0.984
True,40,0.016


In [131]:
value_counts(df_prod_new, "no_buybox_winner")

Unnamed: 0,count,percentage
False,2420,0.968
True,80,0.032


In [133]:
# what percentage of pages are out of stock or gone
len(df_prod_new[(df_prod_new.is_out_of_stock == True) | 
                (df_prod_new.is_page_gone == True)]) / len(df_prod_new)

0.0736

In [134]:
df_prod_new.fillna(value=np.nan, inplace=True)
df_prod.fillna(value=np.nan, inplace=True)

In [135]:
# make a dataframe of new and old product page info
df = df_prod[df_prod.asin.isin(df_prod_new.asin.unique())].merge(df_prod_new, on='asin', suffixes= ('_og', '_new'))

In [136]:
len(df)

2500

In [137]:
df.is_sold_by_amazon.value_counts(normalize=True)

False    0.6828
True     0.3172
Name: is_sold_by_amazon, dtype: float64

In [146]:
amazon_sellers = ['zappos', 'whole foods', 'amazon']

def who_switched(row, col1='sold_by_new', col2='sold_by_og'):
    sold_new = False
    sold_old = False
    if any(seller in str(row[col1]).lower() for seller in amazon_sellers):
        sold_new = True
    if any(seller in str(row[col2]).lower() for seller in amazon_sellers):
        sold_old = True
    
    if sold_new == False and sold_old == False:
        return 'third party'
    
    if sold_new == True and sold_old == False:
        return 'to amazon'
    
    if sold_new == False and sold_old == True:
        return 'to third party'
    
    if sold_new == True and sold_old == True:
        return "amazon"
    
def seller_switch(row):
    return who_switched(row, col1='shipped_by_new', col2='shipped_by_og')

In [23]:
# blank = df[(df.sold_by_new.isnull() | df.sold_by_og.isnull()) &
#            ((df.is_page_gone_new == False) & 
#             (df.no_buybox_winner_new == False) & 
#            (df.is_out_of_stock_new == False))]

In [24]:
# blank[['shipped_by_og', 'sold_by_og', 
#        'shipped_by_new', 'sold_by_new',
#        'fn_og', 'fn_new']]

In [139]:
df = df[~(df.sold_by_new.isnull() | df.sold_by_og.isnull())]
len(df)

2121

In [140]:
df.is_sold_by_amazon.value_counts(normalize=True)

False    0.694955
True     0.305045
Name: is_sold_by_amazon, dtype: float64

In [147]:
df['seller_delta'] = df.apply(who_switched, axis=1)
df['shipper_delta'] = df.apply(seller_switch, axis=1)

In [148]:
# how many changed?
df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.03488920320603489

In [149]:
value_counts(df[df.sold_by_new != df.sold_by_og], "seller_delta")

Unnamed: 0,count,percentage
third party,259,0.750725
to third party,37,0.107246
to amazon,37,0.107246
amazon,12,0.034783


In [150]:
value_counts(df, "seller_delta")

Unnamed: 0,count,percentage
third party,1437,0.677511
amazon,610,0.2876
to third party,37,0.017445
to amazon,37,0.017445


caclulate ranges using this site: https://www.surveysystem.com/sscalc.htm

In [151]:
def get_confidence_interval(perc = 1.5,
                            ss = 25,
                            pop = 100,
                            confidence_level = 95):
    """
    Calculates confidence interval given a percentage, sample size, and population size.
    taken from: https://www.surveysystem.com/sscalc.htm
    see: https://opentextbc.ca/introbusinessstatopenstax/chapter/a-confidence-interval-for-a-population-proportion/
    """
    conf2z = {
        95 : 3.8416,
        99 : 6.6564
    }
    
    zValC = conf2z.get(confidence_level)
    if pop == 0:
        pf = 1
    else:
        pf = (pop - ss) / (pop - 1)

    return math.sqrt(zValC * (perc / 100) * (1 - perc / 100) / ss * pf) * 100   

In [152]:
# seller changes to Amazon
perc_seller_to_amazon = df[df.seller_delta.isin(['to amazon', 'to amazon'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_seller_to_amazon, ss=len(df), pop=df_prod.asin.nunique())
perc_seller_to_amazon - CI, perc_seller_to_amazon + CI

(1.191046008750707, 2.297874311852782)

In [153]:
# seller changes to third-party
perc_seller_to_3p = df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_seller_to_3p, ss=len(df), pop=df_prod.asin.nunique())
perc_seller_to_3p - CI, perc_seller_to_3p + CI

(2.713253307929728, 4.26458733327725)

In [154]:
# shipper changes to Amazon
perc_shipper_to_amazon = df[df.shipper_delta.isin(['to amazon', 'to amazon'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_shipper_to_amazon, ss=len(df),pop=df_prod.asin.nunique())
perc_shipper_to_amazon - CI, perc_shipper_to_amazon + CI

(2.1695231921151406, 3.5824805796906114)

In [155]:
# shipper changes to third-party
perc_shipper_to_3p = df[df.shipper_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_shipper_to_3p, ss=len(df), pop=df_prod.asin.nunique())
perc_shipper_to_3p - CI, perc_shipper_to_3p + CI

(5.33297602724795, 7.3968683857647815)