# Error Analysis of Products
This looks at how often products' default sellers and shippers switch from Amazon to a third-party.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import glob
import math
import json
from collections import Counter
from multiprocessing import Pool

from tqdm import tqdm
from lxml import html
import numpy as np
import pandas as pd

import parsers as P
from parsers import process_search_result
from utils import value_counts

In [3]:
# inputs
fn_prod = '../data/intermediary/products.csv.xz'
pattern_spot_check = '../data/input/spotcheck_2/selenium-products/*/*/*/2021/05/*/webpage_product.html'

In [4]:
files_products = glob.glob(pattern_spot_check)
len(files_products)

3465

In [5]:
df_prod = pd.read_csv(fn_prod, compression='xz')
len(df_prod)

157405

In [21]:
valid_asins = df_prod[~df_prod.sold_by.isnull()].asin

In [17]:
# parse the re-collected prodcuts.
product_data = []
with Pool(processes=32) as pool:
    for record in tqdm(pool.imap_unordered(P.parse_product_page, 
                                           files_products), 
                       total=len(files_products)):
        product_data.append(record)

df_prod_new = pd.DataFrame(product_data)

100%|██████████| 3465/3465 [00:21<00:00, 160.07it/s]


In [22]:
df_prod_new = pd.DataFrame(product_data)

In [23]:
df_prod_new["asin"] = df_prod_new.fn.apply(lambda x: x.split('/2021')[0].split('/')[-1])

In [19]:
# df_prod_new = df_prod_new[~df_prod_new.sold_by.isnull()]

In [24]:
df_prod_new = df_prod_new[df_prod_new.asin.isin(valid_asins)]

In [25]:
df_prod_new = df_prod_new.sample(2500, random_state=303)

In [26]:
value_counts(df_prod_new, "is_page_gone")

Unnamed: 0,count,percentage
False,2462,0.9848
True,38,0.0152


In [27]:
value_counts(df_prod_new, "no_buybox_winner")

Unnamed: 0,count,percentage
False,2424,0.9696
True,76,0.0304


In [28]:
value_counts(df_prod_new, "is_out_of_stock")

Unnamed: 0,count,percentage
False,2353,0.9412
True,147,0.0588


In [29]:
# what percentage of pages are out of stock or gone
len(df_prod_new[(df_prod_new.is_out_of_stock == True) | 
                (df_prod_new.is_page_gone == True)]) / len(df_prod_new)

0.074

In [31]:
df_prod_new.fillna(value=np.nan, inplace=True)
df_prod.fillna(value=np.nan, inplace=True)

In [53]:
# make a dataframe of new and old product page info
df = df_prod[df_prod.asin.isin(df_prod_new.asin.unique())].merge(df_prod_new, on='asin', suffixes= ('_og', '_new'))

In [54]:
len(df)

2500

In [55]:
df.is_sold_by_amazon.value_counts(normalize=True)

False    0.6912
True     0.3088
Name: is_sold_by_amazon, dtype: float64

In [34]:
amazon_sellers = ['zappos', 'whole foods', 'amazon']

def who_switched(row, col1='sold_by_new', col2='sold_by_og'):
    sold_new = False
    sold_old = False
    if any(seller in row[col1].lower() for seller in amazon_sellers):
        sold_new = True
    if any(seller in row[col2].lower() for seller in amazon_sellers):
        sold_old = True
    
    if sold_new == False and sold_old == False:
        return 'third party'
    
    if sold_new == True and sold_old == False:
        return 'to amazon'
    
    if sold_new == False and sold_old == True:
        return 'to third party'
    
    if sold_new == True and sold_old == True:
        return "amazon"
    
def seller_switch(row):
    return who_switched(row, col1='shipped_by_new', col2='shipped_by_og')

In [None]:
# blank = df[(df.sold_by_new.isnull() | df.sold_by_og.isnull()) &
#            ((df.is_page_gone_new == False) & 
#             (df.no_buybox_winner_new == False) & 
#            (df.is_out_of_stock_new == False))]

In [None]:
# blank[['shipped_by_og', 'sold_by_og', 
#        'shipped_by_new', 'sold_by_new',
#        'fn_og', 'fn_new']]

In [56]:
df = df[~(df.sold_by_new.isnull() | df.sold_by_og.isnull())]
len(df)

2047

In [57]:
df.is_sold_by_amazon.value_counts(normalize=True)

False    0.69956
True     0.30044
Name: is_sold_by_amazon, dtype: float64

In [37]:
df['seller_delta'] = df.apply(who_switched, axis=1)
df['shipper_delta'] = df.apply(seller_switch, axis=1)

In [38]:
# how many changed?
df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.027357107962872496

In [39]:
value_counts(df[df.sold_by_new != df.sold_by_og], "seller_delta")

Unnamed: 0,count,percentage
third party,296,0.822222
to third party,32,0.088889
to amazon,24,0.066667
amazon,8,0.022222


In [40]:
value_counts(df, "seller_delta")

Unnamed: 0,count,percentage
third party,1408,0.687836
amazon,583,0.284807
to third party,32,0.015633
to amazon,24,0.011724


caclulate ranges using this site: https://www.surveysystem.com/sscalc.htm

In [41]:
def get_confidence_interval(perc = 1.5,
                            ss = 25,
                            pop = 100,
                            confidence_level = 95):
    """
    Calculates confidence interval given a percentage, sample size, and population size.
    taken from: https://www.surveysystem.com/sscalc.htm
    see: https://opentextbc.ca/introbusinessstatopenstax/chapter/a-confidence-interval-for-a-population-proportion/
    """
    conf2z = {
        95 : 3.8416,
        99 : 6.6564
    }
    
    zValC = conf2z.get(confidence_level)
    if pop == 0:
        pf = 1
    else:
        pf = (pop - ss) / (pop - 1)

    return math.sqrt(zValC * (perc / 100) * (1 - perc / 100) / ss * pf) * 100   

In [43]:
# seller changes to Amazon
perc_seller_to_amazon = df[df.seller_delta.isin(['to amazon', 'to amazon'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_seller_to_amazon, ss=len(df), pop=df_prod.asin.nunique())
perc_seller_to_amazon - CI, perc_seller_to_amazon + CI

(0.7091693549285787, 1.6357256133176352)

In [42]:
# seller changes to third-party
perc_seller_to_3p = df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_seller_to_3p, ss=len(df), pop=df_prod.asin.nunique())
perc_seller_to_3p - CI, perc_seller_to_3p + CI

(2.0336610617912205, 3.4377605307832786)

In [45]:
# shipper changes to Amazon
perc_shipper_to_amazon = df[df.shipper_delta.isin(['to amazon', 'to amazon'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_shipper_to_amazon, ss=len(df),pop=df_prod.asin.nunique())
perc_shipper_to_amazon - CI, perc_shipper_to_amazon + CI

(2.205158538635763, 3.6570788819797717)

In [44]:
# shipper changes to third-party
perc_shipper_to_3p = df[df.shipper_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique() * 100
CI = get_confidence_interval(perc_shipper_to_3p, ss=len(df), pop=df_prod.asin.nunique())
perc_shipper_to_3p - CI, perc_shipper_to_3p + CI

(5.1209524378938465, 7.189746145398777)