# Error Analysis of Products
This looks at how often products' default sellers and shippers switch from Amazon to a third-party.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import glob
from collections import Counter
import json
from multiprocessing import Pool


from tqdm import tqdm
from lxml import html
import numpy as np
import pandas as pd
# import s3

import parsers as P
from parsers import process_search_result
from utils import value_counts

In [65]:
# inputs
fn_prod = '../data/intermediary/products.csv.xz'
pattern_spot_check = '../data/input/spotcheck_2/selenium-products/*/*/*/2021/05/*/webpage_product.html'

In [66]:
files_products = glob.glob(pattern_spot_check)
len(files_products)

3465

In [16]:
df_prod = pd.read_csv(fn_prod, compression='xz')
len(df_prod)

In [64]:
# parse the re-collected prodcuts.
product_data = []
with Pool(processes=32) as pool:
    for record in tqdm(pool.imap_unordered(P.parse_product_page, 
                                           files_products), 
                       total=len(files_products)):
        product_data.append(record)

df_prod_new = pd.DataFrame(product_data)

100%|██████████| 3465/3465 [00:23<00:00, 147.06it/s]


In [67]:
df_prod_new["asin"] = df_prod_new.fn.apply(lambda x: x.split('/2021')[0].split('/')[-1])

In [68]:
value_counts(df_prod_new, "is_page_gone")

Unnamed: 0,count,percentage
False,3407,0.983261
True,58,0.016739


In [69]:
value_counts(df_prod_new, "no_buybox_winner")

Unnamed: 0,count,percentage
False,3353,0.967677
True,112,0.032323


In [70]:
value_counts(df_prod_new, "is_out_of_stock")

Unnamed: 0,count,percentage
False,3260,0.940837
True,205,0.059163


In [71]:
# what percentage of pages are out of stock or gone
len(df_prod_new[(df_prod_new.is_out_of_stock == True) | 
                (df_prod_new.is_page_gone == True)]) / len(df_prod_new)

0.0759018759018759

In [126]:
len(df_prod_new[(df_prod_new.is_out_of_stock != True) | 
            (df_prod_new.is_page_gone != True)])

3465

In [72]:
# this is how we sample the products
# to_check = df_prod.sample(2500, random_state=303).asin.tolist()

In [73]:
# df_prod[df_prod.asin.isin(to_check)].shipped_by.value_counts(normalize=True)

Amazon                0.560811
Amazon.com            0.307207
AmazonFresh           0.009910
Whole Foods Market    0.008108
Pharmapacks           0.002703
                        ...   
BA GROUP              0.000450
MarketHype            0.000450
iServe                0.000450
Home Sheets           0.000450
Xcess Limited         0.000450
Name: shipped_by, Length: 237, dtype: float64

In [26]:
# df_prod[df_prod.asin.isin(to_check)].sold_by.value_counts(normalize=True)

Amazon.com            0.304956
AmazonFresh           0.012245
Whole Foods Market    0.008746
shein online store    0.002915
Pharmapacks           0.002332
                        ...   
Awekris               0.000583
HUHETA-US             0.000583
ADREAMLY CLOTHING     0.000583
HeetaDirect           0.000583
Nordic Naturals       0.000583
Name: sold_by, Length: 1087, dtype: float64

In [82]:
df_prod_new.iloc[0]

fn                         ../data/input/spotcheck_2/selenium-products/5/...
title                      LD Remanufactured Ink Cartridge Replacement fo...
shipped_by                                                       LD Products
sold_by                                                          LD Products
has_third_party_sellers                                                 True
product_by_amazon                                                      False
our_brands_carousel                                                    False
ads                                                                       []
no_buybox_winner                                                       False
is_out_of_stock                                                        False
is_page_gone                                                           False
suggestions                [B015DPWXCY, B015DPWXCY, B015DPWXCY, B015DPWXC...
asin                                                              B015DPWXCY

In [83]:
df_prod_new.fillna(value=np.nan, inplace=True)

In [84]:
df_prod.fillna(value=np.nan, inplace=True)

In [85]:
df = df_prod[df_prod.asin.isin(df_prod_new.asin.unique())].merge(df_prod_new, on='asin', suffixes= ('_og', '_new'))

In [86]:
len(df)

3465

In [87]:
amazon_sellers = ['zappos', 'whole foods', 'amazon']

def who_switched(row, col1='sold_by_new', col2='sold_by_og'):
    sold_new = False
    sold_old = False
    if any(seller in row[col1].lower() for seller in amazon_sellers):
        sold_new = True
    if any(seller in row[col2].lower() for seller in amazon_sellers):
        sold_old = True
    
    if sold_new == False and sold_old == False:
        return 'third party'
    
    if sold_new == True and sold_old == False:
        return 'to amazon'
    
    if sold_new == False and sold_old == True:
        return 'to third party'
    
    if sold_new == True and sold_old == True:
        return "amazon"
    
def seller_switch(row):
    return who_switched(row, col1='shipped_by_new', col2='shipped_by_og')

In [None]:
# blank = df[(df.sold_by_new.isnull() | df.sold_by_og.isnull()) &
#            ((df.is_page_gone_new == False) & 
#             (df.no_buybox_winner_new == False) & 
#            (df.is_out_of_stock_new == False))]

In [None]:
# blank[['shipped_by_og', 'sold_by_og', 
#        'shipped_by_new', 'sold_by_new',
#        'fn_og', 'fn_new']]

In [88]:
df = df[~(df.sold_by_new.isnull() | df.sold_by_og.isnull())]

In [103]:
len(df)

2600

In [90]:
df['seller_delta'] = df.apply(who_switched, axis=1)
df['shipper_delta'] = df.apply(seller_switch, axis=1)

In [91]:
# how many changed?
df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.026153846153846153

In [92]:
value_counts(df[df.sold_by_new != df.sold_by_og], "seller_delta")

Unnamed: 0,count,percentage
third party,363,0.819413
to third party,41,0.092551
to amazon,27,0.060948
amazon,12,0.027088


In [93]:
value_counts(df, "seller_delta")

Unnamed: 0,count,percentage
third party,1789,0.688077
amazon,743,0.285769
to third party,41,0.015769
to amazon,27,0.010385


caclulate ranges using this site: https://www.surveysystem.com/sscalc.htm

In [104]:
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h

In [106]:
import scipy.stats as st

In [108]:
from statsmodels.stats.proportion import proportion_confint

ModuleNotFoundError: No module named 'statsmodels'

In [109]:
import math

In [114]:
p = 2600 / 157405
d = 95
sample_size = 16 * p * (100-p) / (d ^ 2)


In [115]:
sample_size

0.28413197893203257

In [107]:
st.t.interval(alpha=0.95, 
              df=len(data)-1, 
              loc=np.mean(data), 
              scale=st.sem(data)) 


NameError: name 'data' is not defined

In [94]:
df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.026153846153846153

In [95]:
df[df.shipper_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.05923076923076923

In [None]:
# len(df[df.shipped_by_new != df.shipped_by_og]) / len(df)

In [96]:
value_counts(df[df.shipped_by_new != df.shipped_by_og], "shipper_delta")

Unnamed: 0,count,percentage
to third party,78,0.278571
to amazon,76,0.271429
third party,74,0.264286
amazon,52,0.185714


In [97]:
# value_counts(df_prod, "is_out_of_stock")

Unnamed: 0,count,percentage
False,152613,0.969556
True,4792,0.030444


In [44]:
# OG amazon sold.
df_prod.sold_by.value_counts(normalize=True)

Amazon.com                 0.287229
AmazonFresh                0.011006
Whole Foods Market         0.007759
Pharmapacks                0.002296
Amazon.com Services LLC    0.001828
                             ...   
probebi                    0.000007
Allizom                    0.000007
Batianda                   0.000007
Phogary US                 0.000007
StanCo LTD.                0.000007
Name: sold_by, Length: 41906, dtype: float64

In [99]:
df_prod[df_prod.asin.isin(df_prod_new.asin)].is_amazon.value_counts(normalize=True)

False    0.935065
True     0.064935
Name: is_amazon, dtype: float64

In [100]:
df_prod[df_prod.asin.isin(df_prod_new.asin)].sold_by.value_counts(normalize=True)

Amazon.com                 0.285174
AmazonFresh                0.011356
Whole Foods Market         0.008202
Amazon.com Services LLC    0.002839
Pharmapacks                0.002524
                             ...   
BlessLiving Home           0.000315
Color Technik              0.000315
Trophikos, LLC.            0.000315
DRIPEX-US                  0.000315
Bronson Laboratories       0.000315
Name: sold_by, Length: 2008, dtype: float64

In [101]:
df_prod_new

Unnamed: 0,fn,title,shipped_by,sold_by,has_third_party_sellers,product_by_amazon,our_brands_carousel,ads,no_buybox_winner,is_out_of_stock,is_page_gone,suggestions,asin
0,../data/input/spotcheck_2/selenium-products/5/...,LD Remanufactured Ink Cartridge Replacement fo...,LD Products,LD Products,True,False,False,[],False,False,False,"[B015DPWXCY, B015DPWXCY, B015DPWXCY, B015DPWXC...",B015DPWXCY
1,../data/input/spotcheck_2/selenium-products/5/...,"Yoleo Indoor Basketball Arcade Game, Official ...",Weallnersse US,Weallnersse US,False,False,False,[],False,False,False,"[B085DF5126, B085DF5126, B085DF5126, B085DF512...",B085DF5126
2,../data/input/spotcheck_2/selenium-products/5/...,"Safety Goggles, Anti-Fog Protective Safety Gla...",Amazon,HOMESMG,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B085DSZ26J, B085DSZ26J, B085DSZ26J, B085DSZ26...",B085DSZ26J
3,../data/input/spotcheck_2/selenium-products/5/...,JVC HAFX5V Gumy Plus Inner Ear Headphones (Gra...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B005DHKRVC, B005DHKRVC, B005DHKRVC, B005DHKRV...",B005DHKRVC
4,../data/input/spotcheck_2/selenium-products/5/...,"LEATHERMAN, Wingman Multitool with Spring-Acti...",Amazon,NetRush,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B005DI0XM4, B005DI0XM4, B005DI0XM4, B005DI0XM...",B005DI0XM4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3460,../data/input/spotcheck_2/selenium-products/0/...,"Scotch Tape Double Sided, 1/2 in x 300 in (002...",Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B000XAJFC6, B000XAJFC6, B000XAJFC6, B000XAJFC...",B000XAJFC6
3461,../data/input/spotcheck_2/selenium-products/0/...,"Bush's Best Black Beans, 15 oz",AmazonFresh,AmazonFresh,False,False,False,[],False,False,False,"[B000Q5L3HI, B000Q5L3HI, B000Q5L3HI, B000Q5L3H...",B000Q5L3HI
3462,../data/input/spotcheck_2/selenium-products/0/...,Aurora AS420C Desktop-Style Cross-Cut Paper Sh...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B08L3XJXF6, B000QTAU7I, B000QTAU7I, B000QTAU7...",B000QTAU7I
3463,../data/input/spotcheck_2/selenium-products/0/...,TOMS Men's Classic Alpargata Slip-On Shoe,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B000XBM1L2, B000XBM1L2, B000XBM1L2, B000XBM1L...",B000XBM1L2


In [49]:
df_prod_new.sold_by.value_counts(normalize=True)

Amazon.com                 0.793380
Amazon.com Services LLC    0.017051
AmazonFresh                0.004012
Zappos                     0.003009
Best Buy                   0.003009
                             ...   
Manscaped                  0.001003
Skillmatics USA            0.001003
ICEPURE Store              0.001003
TEJATAN                    0.001003
Woot                       0.001003
Name: sold_by, Length: 182, dtype: float64

In [52]:
# missing data
df_prod[df_prod.asin.isin(to_check)].sold_by.value_counts(normalize=True)

Amazon.com            0.304956
AmazonFresh           0.012245
Whole Foods Market    0.008746
shein online store    0.002915
Pharmapacks           0.002332
                        ...   
Awekris               0.000583
HUHETA-US             0.000583
ADREAMLY CLOTHING     0.000583
HeetaDirect           0.000583
Nordic Naturals       0.000583
Name: sold_by, Length: 1087, dtype: float64

In [53]:
df_search = pd.read_csv('../data/intermediary/searches.csv.gz', compression='gzip')

In [55]:
df_amazon = pd.read_csv('../data/intermediary/amazon_private_label.csv.xz', compression='xz')

In [56]:
df_prod_new.merge(df_search[['asin', 'search_term']]).sold_by.value_counts()

Amazon.com                 4007
Amazon.com Services LLC     188
Hotodeal Official Store      40
Best Buy                     37
Govee US                     27
                           ... 
Dimuntec-US                   1
nubisheng-usa                 1
DEWVIE                        1
4uSports                      1
goodidea0113                  1
Name: sold_by, Length: 182, dtype: int64

In [57]:
df_prod_new['is_amazon'] = df_prod_new.asin.isin(df_amazon.asin)

In [58]:
df_prod_new.is_amazon.value_counts()

False    1009
True      159
Name: is_amazon, dtype: int64

In [59]:
value_counts(df_prod_new, 'sold_by')

Unnamed: 0,count,percentage
Amazon.com,791,0.793380
Amazon.com Services LLC,17,0.017051
AmazonFresh,4,0.004012
Zappos,3,0.003009
Best Buy,3,0.003009
...,...,...
Manscaped,1,0.001003
Skillmatics USA,1,0.001003
ICEPURE Store,1,0.001003
TEJATAN,1,0.001003


In [60]:
value_counts(df_prod, 'sold_by')

Unnamed: 0,count,percentage
Amazon.com,38649,0.287229
AmazonFresh,1481,0.011006
Whole Foods Market,1044,0.007759
Pharmapacks,309,0.002296
Amazon.com Services LLC,246,0.001828
...,...,...
probebi,1,0.000007
Allizom,1,0.000007
Batianda,1,0.000007
Phogary US,1,0.000007


In [61]:
df_prod_new

Unnamed: 0,fn,title,shipped_by,sold_by,has_third_party_sellers,product_by_amazon,our_brands_carousel,ads,no_buybox_winner,is_out_of_stock,is_page_gone,suggestions,asin,is_amazon
0,../data/input/spotcheck/selenium-products/5/5D...,Solar Charger for Ring Video Doorbell (2020 re...,Amazon.com Services LLC,Amazon.com Services LLC,False,False,True,[],False,False,False,"[B085DVTYHN, B085DVTYHN, B085DVTYHN, B085DVTYHN]",B085DVTYHN,True
1,../data/input/spotcheck/selenium-products/5/52...,GEL Brand Gourmet Italian Seasoning 5 oz Famil...,Amazon,UO Foods,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B0052Y6JSQ, B0052Y6JSQ, B0052Y6JSQ, B0052Y6JS...",B0052Y6JSQ,False
2,../data/input/spotcheck/selenium-products/5/5D...,Viper Hideaway Cabinet & Steel-Tip Dartboard R...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B075DJ8QKF, B075DJ8QKF, B075DJ8QKF, B075DJ8QK...",B075DJ8QKF,False
3,../data/input/spotcheck/selenium-products/5/5D...,Nautica Girls' Spaghetti Strap Fashion Dress,Amazon.com,Amazon.com,False,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B085D524JJ, B085D524JJ, B085D524JJ, B085D524J...",B085D524JJ,False
4,../data/input/spotcheck/selenium-products/5/5D...,Utz Old Fashioned Pretzel Rods 27 oz. Barrel T...,,,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",True,False,False,"[B005DCZHRQ, B005DCZHRQ, B005DCZHRQ, B005DCZHR...",B005DCZHRQ,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,../data/input/spotcheck/selenium-products/0/0X...,"Yankee Candle 00609032519490 Scented Candle, L...",Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000X457HO, B000X457HO, B000X457HO, B000X457H...",B000X457HO,False
1164,../data/input/spotcheck/selenium-products/0/04...,Car Sun Shades for Side and Rear Window (4 Pac...,Amazon,Unique Jems inc.,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B0104OFF7E, B0104OFF7E, B0104OFF7E, B0104OFF7...",B0104OFF7E,False
1165,../data/input/spotcheck/selenium-products/0/02...,Life Stages LS-1648DD Double Door Folding Crat...,,,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",True,False,False,"[B0002AT3MO, B0002AT3MO, B0002AT3MO, B0002AT3M...",B0002AT3MO,False
1166,../data/input/spotcheck/selenium-products/0/02...,Purina ONE Sensitive Skin & Stomach With Real ...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B0002MLAEQ, B0002MLAEQ, B0002MLAEQ, B0002MLAE...",B0002MLAEQ,False


In [None]:
n_0 = ((Z**2) * p * (1-p)) / (e**2)


In [None]:
e = 

In [119]:
import math
 
# SUPPORTED CONFIDENCE LEVELS: 50%, 68%, 90%, 95%, and 99%
confidence_level_constant = [50,.67], [68,.99], [90,1.64], [95,1.96], [99,2.57]
 
# CALCULATE THE SAMPLE SIZE
def sample_size(population_size, confidence_level, confidence_interval):
    Z = 0.0
    p = 0.5
    e = confidence_interval/100.0
    N = population_size
    n_0 = 0.0
    n = 0.0

    # LOOP THROUGH SUPPORTED CONFIDENCE LEVELS AND FIND THE NUM STD
    # DEVIATIONS FOR THAT CONFIDENCE LEVEL
    for i in confidence_level_constant:
        if i[0] == confidence_level:
            Z = i[1]

    if Z == 0.0:
        return -1

    # CALC SAMPLE SIZE
    n_0 = ((Z**2) * p * (1-p)) / (e**2)

    # ADJUST SAMPLE SIZE FOR FINITE POPULATION
    n = n_0 / (1 + ((n_0 - 1) / float(N)) )

    return int(math.ceil(n)) # THE SAMPLE SIZE

sample_sz = 0
population_sz = 100000
confidence_level = 95.0
confidence_interval = 2.0

sample_sz = sample_size(population_sz, confidence_level, confidence_interval)

sample_sz

2345

In [122]:
n_0 = 0
N = 20000
n = n_0 / (1 + ((n_0 - 1) / float(N)) )

In [123]:
n

0.0

In [120]:
def margin_of_error(population_size, confidence_level, sample_sz):
    Z = 0.0
    p = 0.5
    N = population_size
    n_0 = 0.0
    n = sample_sz

    # LOOP THROUGH SUPPORTED CONFIDENCE LEVELS AND FIND THE NUM STD
    # DEVIATIONS FOR THAT CONFIDENCE LEVEL
    for i in confidence_level_constant:
        if i[0] == confidence_level:
            Z = i[1]

    if Z == 0.0:
        return -1
    
    n_0 = 
    
    e = math.sqrt(((Z**2) * p * (1-p)) / n_0)

    confidence_interval = e / 100.0

    return confidence_interval
    

In [121]:
margin_of_error(2600, 95, 157405)

2.4701129573967186e-05

In [124]:
Z = 1.96