# Error Analysis of Products
This looks at how often products' default sellers and shippers switch from Amazon to a third-party.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import glob
from collections import Counter
import json
from multiprocessing import Pool


from tqdm import tqdm
from lxml import html
import numpy as np
import pandas as pd
# import s3

import parsers as P
from parsers import process_search_result
from utils import value_counts

In [3]:
# inputs
fn_prod = '../data/intermediary/products.csv.bz2'
pattern_spot_check = '../data/input/spotcheck/selenium-products/*/*/*/2021/03/*/webpage_product.html'

In [4]:
files_products = glob.glob(pattern_spot_check)
len(files_products)

1168

In [5]:
df_prod = pd.read_csv(fn_prod, compression='gzip')

In [8]:
product_data = []
with Pool(processes=32) as pool:
    for record in tqdm(pool.imap_unordered(P.parse_product_page, 
                                           files_products), 
                       total=len(files_products)):
        product_data.append(record)

df_prod_new = pd.DataFrame(product_data)

100%|██████████| 1168/1168 [00:29<00:00, 39.53it/s]


In [7]:
df_prod_new["asin"] = df_prod_new.fn.apply(lambda x: x.split('/2021')[0].split('/')[-1])

NameError: name 'df_prod_new' is not defined

In [7]:
value_counts(df_prod_new, "is_page_gone")

Unnamed: 0,count,percentage
False,1155,0.98887
True,13,0.01113


In [8]:
value_counts(df_prod_new, "no_buybox_winner")

Unnamed: 0,count,percentage
False,1132,0.969178
True,36,0.030822


In [9]:
value_counts(df_prod_new, "is_out_of_stock")

Unnamed: 0,count,percentage
False,1137,0.973459
True,31,0.026541


In [6]:
# what percentage of pages are out of stock or gone
len(df_prod_new[(df_prod_new.is_out_of_stock == True) | 
                (df_prod_new.is_page_gone == True)]) / len(df_prod_new)

NameError: name 'df_prod_new' is not defined

In [11]:
df_prod[df_prod.asin.isin(to_check)].shipped_by.value_counts(normalize=True)

NameError: name 'df_prod' is not defined

In [144]:
df_prod[df_prod.asin.isin(to_check)].sold_by.value_counts(normalize=True)

Amazon.com             0.621622
Happy IT.              0.027027
WEDOWORK               0.027027
ELFTUNE                0.027027
Number 1 In Service    0.027027
TOKO                   0.027027
Whole Foods Market     0.027027
HORIZON3 INC           0.027027
US Medical Pro         0.027027
JoyinDirect            0.027027
MSAAEX                 0.027027
Qmantoys               0.027027
CBQ                    0.027027
DRINKMATTERS           0.027027
beautychen             0.027027
Name: sold_by, dtype: float64

In [6]:
df_prod_new.iloc[0]

fn                         ../data/input/spotcheck/selenium-products/5/5D...
shipped_by                                           Amazon.com Services LLC
sold_by                                              Amazon.com Services LLC
has_third_party_sellers                                                False
product_by_amazon                                                      False
our_brands_carousel                                                     True
ads                                                                       []
no_buybox_winner                                                       False
is_page_gone                                                           False
suggestions                 [B085DVTYHN, B085DVTYHN, B085DVTYHN, B085DVTYHN]
Name: 0, dtype: object

In [57]:
df_prod_new.fillna(value=np.nan, inplace=True)

In [58]:
df_prod.fillna(value=np.nan, inplace=True)

In [59]:
df = df_prod[df_prod.asin.isin(df_prod_new.asin.unique())].merge(df_prod_new, on='asin', suffixes= ('_og', '_new'))

In [125]:
len(df)

927

In [111]:
amazon_sellers = ['zappos', 'whole foods', 'amazon']

def who_switched(row, col1='sold_by_new', col2='sold_by_og'):
    sold_new = False
    sold_old = False
    if any(seller in row[col1].lower() for seller in amazon_sellers):
        sold_new = True
    if any(seller in row[col2].lower() for seller in amazon_sellers):
        sold_old = True
    
    if sold_new == False and sold_old == False:
        return 'third party'
    
    if sold_new == True and sold_old == False:
        return 'to amazon'
    
    if sold_new == False and sold_old == True:
        return 'to third party'
    
    if sold_new == True and sold_old == True:
        return "amazon"
    
def seller_switch(row):
    return who_switched(row, col1='shipped_by_new', col2='shipped_by_og')

In [61]:
# blank = df[(df.sold_by_new.isnull() | df.sold_by_og.isnull()) &
#            ((df.is_page_gone_new == False) & 
#             (df.no_buybox_winner_new == False) & 
#            (df.is_out_of_stock_new == False))]

In [113]:
# blank[['shipped_by_og', 'sold_by_og', 
#        'shipped_by_new', 'sold_by_new',
#        'fn_og', 'fn_new']]

In [65]:
df = df[~(df.sold_by_new.isnull() | df.sold_by_og.isnull())]

In [114]:
len(df)

927

In [108]:
df['seller_delta'] = df.apply(who_switched, axis=1)
df['shipper_delta'] = df.apply(seller_switch, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['seller_delta'] = df.apply(who_switched, axis=1)


In [120]:
df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.022653721682847898

In [118]:
value_counts(df[df.sold_by_new != df.sold_by_og], "seller_delta")

Unnamed: 0,count,percentage
third party,20,0.425532
to third party,20,0.425532
amazon,6,0.12766
to amazon,1,0.021277


In [150]:
value_counts(df, "seller_delta")

Unnamed: 0,count,percentage
amazon,770,0.830636
third party,136,0.14671
to third party,20,0.021575
to amazon,1,0.001079


In [152]:
770 / 1168

0.6592465753424658

In [148]:
20 / 1168

0.017123287671232876

In [149]:
16 / 1168

0.0136986301369863

In [134]:
df[df.seller_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.022653721682847898

In [122]:
df[df.shipper_delta.isin(['to amazon', 'to third party'])].asin.nunique() / df.asin.nunique()

0.021574973031283712

In [132]:
# len(df[df.shipped_by_new != df.shipped_by_og]) / len(df)

0.039913700107874865

In [130]:
value_counts(df[df.shipped_by_new != df.shipped_by_og], "shipper_delta")

Unnamed: 0,count,percentage
amazon,16,0.432432
to third party,13,0.351351
to amazon,7,0.189189
third party,1,0.027027


In [70]:
value_counts(df_prod, "is_out_of_stock")

Unnamed: 0,count,percentage
False,152611,0.969544
True,4794,0.030456


In [153]:
# OG amazon sold.
df_prod.sold_by.value_counts(normalize=True)

Amazon.com                 0.287234
AmazonFresh                0.011007
Whole Foods Market         0.007766
Pharmapacks                0.002289
Amazon.com Services LLC    0.001828
                             ...   
Honey Badger LLC           0.000007
OnlineSellingFirm          0.000007
NEQUIO X                   0.000007
UnionPower                 0.000007
ATOPSTAR                   0.000007
Name: sold_by, Length: 41908, dtype: float64

In [165]:
df_prod[df_prod.asin.isin(df_prod_new.asin)].is_amazon.value_counts()

False    1009
True      159
Name: is_amazon, dtype: int64

In [164]:
df_prod[df_prod.asin.isin(df_prod_new.asin)].sold_by.value_counts(normalize=True)

Amazon.com                 0.830366
Amazon.com Services LLC    0.014058
Zappos                     0.003749
DR.MOXA                    0.001874
Pharmapacks                0.001874
                             ...   
iWenSheng                  0.000937
KORVOS                     0.000937
SnackBOX                   0.000937
Allopo                     0.000937
Saltverk                   0.000937
Name: sold_by, Length: 161, dtype: float64

In [166]:
df_prod_new

Unnamed: 0,fn,shipped_by,sold_by,has_third_party_sellers,product_by_amazon,our_brands_carousel,ads,no_buybox_winner,is_out_of_stock,is_page_gone,suggestions,asin
0,../data/input/spotcheck/selenium-products/5/5D...,Amazon.com Services LLC,Amazon.com Services LLC,False,False,True,[],False,False,False,"[B085DVTYHN, B085DVTYHN, B085DVTYHN, B085DVTYHN]",B085DVTYHN
1,../data/input/spotcheck/selenium-products/5/5D...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B075DJ8QKF, B075DJ8QKF, B075DJ8QKF, B075DJ8QK...",B075DJ8QKF
2,../data/input/spotcheck/selenium-products/5/58...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_desktop-detail-ilm_desk...",False,False,False,"[B075898MXN, B075898MXN, B075898MXN, B075898MX...",B075898MXN
3,../data/input/spotcheck/selenium-products/5/58...,Amazon.com,Amazon.com,True,False,True,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B0058TWFWE, B0058TWFWE, B0058TWFWE, B0058TWFW...",B0058TWFWE
4,../data/input/spotcheck/selenium-products/5/5S...,Amazon,Organic Verdana,False,False,True,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B085S6RCVH, B085S6RCVH, B085S6RCVH, B085S6RCV...",B085S6RCVH
...,...,...,...,...,...,...,...,...,...,...,...,...
1163,../data/input/spotcheck/selenium-products/0/0X...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000XSG300, B000XSG300, B000XSG300, B000XSG30...",B000XSG300
1164,../data/input/spotcheck/selenium-products/0/02...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B0002MLAEQ, B0002MLAEQ, B0002MLAEQ, B0002MLAE...",B0002MLAEQ
1165,../data/input/spotcheck/selenium-products/0/0X...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000X457HO, B000X457HO, B000X457HO, B000X457H...",B000X457HO
1166,../data/input/spotcheck/selenium-products/0/0Q...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000QTUJXS, B000QTUJXS, B000QTUJXS, B000QTUJX...",B000QTUJXS


In [159]:
len(df_prod_new)

1168

In [158]:
df_prod_new.sold_by.value_counts(normalize=True)

Amazon.com                 0.793380
Amazon.com Services LLC    0.017051
AmazonFresh                0.004012
Best Buy                   0.003009
Zappos                     0.003009
                             ...   
Another Day in Paradis     0.001003
EleQit Pty Ltd             0.001003
M. ROSENFELD               0.001003
HAVEN FURNITURE CO         0.001003
Organic Verdana            0.001003
Name: sold_by, Length: 182, dtype: float64

In [74]:
value_counts(df_prod, "is_page_gone")

Unnamed: 0,count,percentage
False,156209,0.992402
True,1196,0.007598


In [76]:
value_counts(df_prod, "no_buybox_winner")

Unnamed: 0,count,percentage
False,151806,0.964429
True,5599,0.035571


In [156]:
# missing data
df_prod[df_prod.asin.isin(to_check)].sold_by.value_counts(normalize=True)

Amazon.com             0.621622
Happy IT.              0.027027
WEDOWORK               0.027027
ELFTUNE                0.027027
Number 1 In Service    0.027027
TOKO                   0.027027
Whole Foods Market     0.027027
HORIZON3 INC           0.027027
US Medical Pro         0.027027
JoyinDirect            0.027027
MSAAEX                 0.027027
Qmantoys               0.027027
CBQ                    0.027027
DRINKMATTERS           0.027027
beautychen             0.027027
Name: sold_by, dtype: float64

In [167]:
df_search = pd.read_csv('../data/intermediary/searches.csv.gz', compression='gzip')

In [175]:
df_amazon = pd.read_csv('../data/intermediary/amazon_private_label.csv.gz')

In [174]:
df_prod_new.merge(df_search[['asin', 'search_term']]).sold_by.value_counts()

Amazon.com                 4007
Amazon.com Services LLC     188
Hotodeal Official Store      40
Best Buy                     37
Govee US                     27
                           ... 
YSYKR-BM                      1
Greenwald Brands              1
Supseller Store               1
Shopbop | East Dane (         1
Sneakers&More                 1
Name: sold_by, Length: 182, dtype: int64

In [179]:
df_prod_new['is_amazon'] =df_prod_new.asin.isin(df_amazon.asin)

In [180]:
df_prod_new.is_amazon.value_counts()

False    1009
True      159
Name: is_amazon, dtype: int64

In [182]:
value_counts(df_prod_new, 'sold_by')

Unnamed: 0,count,percentage
Amazon.com,791,0.793380
Amazon.com Services LLC,17,0.017051
AmazonFresh,4,0.004012
Best Buy,3,0.003009
Zappos,3,0.003009
...,...,...
Another Day in Paradis,1,0.001003
EleQit Pty Ltd,1,0.001003
M. ROSENFELD,1,0.001003
HAVEN FURNITURE CO,1,0.001003


In [181]:
value_counts(df_prod, 'sold_by')

Unnamed: 0,count,percentage
Amazon.com,38649,0.287234
AmazonFresh,1481,0.011007
Whole Foods Market,1045,0.007766
Pharmapacks,308,0.002289
Amazon.com Services LLC,246,0.001828
...,...,...
Honey Badger LLC,1,0.000007
OnlineSellingFirm,1,0.000007
NEQUIO X,1,0.000007
UnionPower,1,0.000007


In [187]:
# this is how we sample the products
to_check = df_prod.sample(2000, random_state=303).asin.tolist()

In [189]:
df_prod_new

Unnamed: 0,fn,shipped_by,sold_by,has_third_party_sellers,product_by_amazon,our_brands_carousel,ads,no_buybox_winner,is_out_of_stock,is_page_gone,suggestions,asin,is_amazon
0,../data/input/spotcheck/selenium-products/5/5D...,Amazon.com Services LLC,Amazon.com Services LLC,False,False,True,[],False,False,False,"[B085DVTYHN, B085DVTYHN, B085DVTYHN, B085DVTYHN]",B085DVTYHN,True
1,../data/input/spotcheck/selenium-products/5/5D...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B075DJ8QKF, B075DJ8QKF, B075DJ8QKF, B075DJ8QK...",B075DJ8QKF,False
2,../data/input/spotcheck/selenium-products/5/58...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_desktop-detail-ilm_desk...",False,False,False,"[B075898MXN, B075898MXN, B075898MXN, B075898MX...",B075898MXN,False
3,../data/input/spotcheck/selenium-products/5/58...,Amazon.com,Amazon.com,True,False,True,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B0058TWFWE, B0058TWFWE, B0058TWFWE, B0058TWFW...",B0058TWFWE,False
4,../data/input/spotcheck/selenium-products/5/5S...,Amazon,Organic Verdana,False,False,True,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B085S6RCVH, B085S6RCVH, B085S6RCVH, B085S6RCV...",B085S6RCVH,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,../data/input/spotcheck/selenium-products/0/0X...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000XSG300, B000XSG300, B000XSG300, B000XSG30...",B000XSG300,False
1164,../data/input/spotcheck/selenium-products/0/02...,Amazon.com,Amazon.com,True,False,False,[],False,False,False,"[B0002MLAEQ, B0002MLAEQ, B0002MLAEQ, B0002MLAE...",B0002MLAEQ,False
1165,../data/input/spotcheck/selenium-products/0/0X...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000X457HO, B000X457HO, B000X457HO, B000X457H...",B000X457HO,False
1166,../data/input/spotcheck/selenium-products/0/0Q...,Amazon.com,Amazon.com,True,False,False,"[b'<div id=""ape_Detail_ams-detail-right-v2_des...",False,False,False,"[B000QTUJXS, B000QTUJXS, B000QTUJXS, B000QTUJX...",B000QTUJXS,False
