# Data Preprocessing
This notebook will preprocess searchs, product pages, the our brands API, best sellers, and create our Amazon Database.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import glob
from collections import Counter
import json
from multiprocessing import Pool

import parsers as P
from tqdm import tqdm
from lxml import html
import pandas as pd

from utils import value_counts

In [3]:
# inputs
pattern_searches = '../data/input/search-selenium/*/*/*/2021/01/21/webpage_search.html'
pattern_our_brands_api = '../data/input/search-private-label/*/*/*/2021/*/*/response__*.json'
pattern_our_brands_html = '../data/input/search-selenium-our-brands-filter_/*/*/*/2021/*/*/webpage_search__*.html'
pattern_our_brands_search = '../data/input/search-private-label/*/*/*/2021/*/*/webpage_ourbrands.html'
pattern_best_sellers = '../data/input/best_sellers/*/*/*/*/page_*/2021/*/*/*.html'
pattern_products = '../data/input/selenium-products/*/*/*/2021/02/*/webpage_product.html'
fn_seller_central = '../data/input/seller_central/All_Q4_2020.csv.xz'

# outputs
data_dir = '../data/output/datasets'
fn_amazon = f'{data_dir}/amazon_private_label.csv.xz'
fn_search = f'{data_dir}/searches.csv.xz'
fn_products = f'{data_dir}/products.csv.xz'
fn_training_set = f'{data_dir}/training_set.csv.xz'

fn_brands_api = f'{data_dir}/non_essential/our_brands_api.csv.gz'
fn_brand_search = f'{data_dir}/non_essential/our_brands_search.csv.gz'
fn_brands_filter = f'{data_dir}/non_essential/our_brands_filter.csv.gz'
fn_best_sellers = f'{data_dir}/non_essential/best_sellers.csv.gz'

os.makedirs(f"{data_dir}/non_essential" , exist_ok=True)

## search
Parses raw search results

In [4]:
files_searches = glob.glob(pattern_searches)
len(files_searches)

12717

In [5]:
if not os.path.exists(fn_search):
    search_data = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_search_result, files_searches), 
                           total=len(files_searches)):
            search_data.extend(record)
            
    df_search = pd.DataFrame(search_data)
    df_search.to_csv(fn_search, index=False, compression='xz')

else:
    df_search = pd.read_csv(fn_search, compression='xz')

100%|██████████| 12717/12717 [01:09<00:00, 183.19it/s]


In [6]:
df_search.groupby('product_type').search_term.nunique() / df_search.search_term.nunique()

product_type
editorial_recs_carousel      0.352660
featured_brands_carousel     0.125199
misc_carousel                0.028034
misc_sponsored_carousel      0.020787
regular_placement            0.998168
regular_placement__missed    0.141048
sponsored_banner             0.686684
Name: search_term, dtype: float64

## our brands search
parse search results made in the "our brands" department.

In [7]:
files_our_brands_search = glob.glob(pattern_our_brands_search)
len(files_our_brands_search)

795

In [8]:
if not os.path.exists(fn_brand_search):
    files_our_brands_search = glob.glob(pattern_our_brands_search)
    print(len(files_our_brands_search))

    brands_search_data = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_search_result, 
                                               files_our_brands_search), 
                           total=len(files_our_brands_search)):
            brands_search_data.extend(record)

    df_brands_search = pd.DataFrame(brands_search_data)
    df_brands_search = df_brands_search[~df_brands_search.is_sponsored]
    df_brands_search.to_csv(fn_brand_search, index=False, compression='gzip')
    
else:
    df_brands_search = pd.read_csv(fn_brand_search, compression='gzip')

795


100%|██████████| 795/795 [00:00<00:00, 965.58it/s] 


## our brands API
parses API responses that mimic the "our brands" filter.

In [9]:
if not os.path.exists(fn_brands_api):
    files_our_brands_api = glob.glob(pattern_our_brands_api)
    print(len(files_our_brands_api))
    
    data_our_brands = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_our_brands_api, 
                                               files_our_brands_api), 
                           total=len(files_our_brands_api)):
            data_our_brands.extend(record)

    df_brands_api = pd.DataFrame(data_our_brands)
    df_brands_api.drop_duplicates(subset=['asin', 'search_term'], inplace=True)
    df_brands_api.to_csv(fn_brands_api, index=False, compression='gzip')
    
else:
    df_brands_api = pd.read_csv(fn_brands_api, compression='gzip')

110671


100%|██████████| 110671/110671 [01:35<00:00, 1163.51it/s]


## our brands HTML
parses HTML pages that are filtered to "our brands."

In [10]:
if not os.path.exists(fn_brands_filter):
    files_brands_filter = glob.glob(pattern_our_brands_html)
    print(len(files_brands_filter))
    
    brands_filter_data = []
    with Pool(processes=64) as pool:
        for record in tqdm(pool.imap_unordered(P.process_our_brands_filter, 
                                               files_brands_filter), 
                           total=len(files_brands_filter)):
            brands_filter_data.extend(record)
    df_brands_filter = pd.DataFrame(brands_filter_data)
    df_brands_filter.drop_duplicates(subset=['asin', 'search_term'], inplace=True)
    df_brands_filter.to_csv(fn_brands_filter, index=False, compression='gzip')
    
else:
    df_brands_filter = pd.read_csv(fn_brands_filter, compression='gzip')

48709


100%|██████████| 48709/48709 [02:12<00:00, 366.58it/s]


## Best Sellers
Find best-selling Amazon devices.

In [None]:
if not os.path.exists(fn_best_sellers):
    files_best_sellers = glob.glob(pattern_best_sellers)
    print(len(files_best_sellers))
    
    best_sellers_data = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_best_sellers, 
                                               files_best_sellers), 
                           total=len(files_best_sellers)):
            best_sellers_data.extend(record)
    df_best_sellers = pd.DataFrame(best_sellers_data)
    df_best_sellers.to_csv(fn_best_sellers, index=False, compression='gzip')

else:
    df_best_sellers = pd.read_csv(fn_best_sellers, compression='gzip')

15660


 21%|██        | 3256/15660 [00:05<00:12, 987.72it/s] 

## Amazon products database
Creating the database of Amazon ASINs.

In [None]:
amazon_missed = '|'.join([
    'amazon basics',
    'amazon essentials',
    'amazon exclusive',
    'amazon us exclusive',
    'amazon brand',
    'goodthreads',
    'solimo',
    'whole foods market',
    'amazon commercial',
    'amazon collection',
    'amazon fresh',
    'amazon elements',
    'amazonbasics',
    'pinzon by amazon',
    'simple joys by carter',
    'daily ritual',
    '365 everyday value',
    'lark & ro',
    'presto!'
])

In [None]:
if not os.path.exists(fn_amazon):
    # Amazon products from the datasets we collected
    df_best_seller_amazon = df_best_sellers[
        (df_best_sellers.path.str.contains('Amazon Devices')) &
        (~df_best_sellers.category.astype(str).str.contains('Kindle'))
    ]
    s1 = df_brands_api[['asin', 'product_name', 'product_url']]
    s1 = s1.assign(source="our brands API")
    s2 = df_best_seller_amazon[['asin', 'product_name', 'product_url']]
    s2 = s2.assign(source='best selling Amazon devices')
    s3 =  df_brands_search[['asin', 'product_name', 'product_url']]
    s3 = s3.assign(source='our brands searchbar')
    s4 = df_brands_filter[['asin', 'product_name', 'product_url']]
    s4 = s4.assign(source='our brands filtered search result')
    df_amazon = s1.append(s2).append(s3).append(s4)
    
    # missed from non-search listing products
    s5 = df_search[
        (df_search.is_featured_brand) & 
        (df_search.product_type == 'regular_placement') &
        (~df_search.asin.isin(df_amazon.asin))
    ][['asin', 'product_name', 'product_url']]
    s5 = s5.assign(source="featured from our brands tag")
    s6 = df_search[
        (df_search.product_type == 'featured_brands_carousel') &
        (~df_search.asin.isin(df_amazon.asin))
    ][['asin', 'product_name', 'product_url']]
    s6 = s6.assign(source="featured from our brands carousel")
    s7 =  df_search[
        (df_search.product_name.str.contains(amazon_missed, case=False)) &
        (~df_search.asin.isin(df_amazon.asin))
    ][['asin', 'product_name', 'product_url']]
    s7 = s7.assign(source="text search")
    df_amazon = df_amazon.append(s5).append(s6).append(s7)
    df_amazon.drop_duplicates(subset=['asin'], keep='first', inplace=True)
    
    # add urls in the dataframe
    df_amazon['product_url'] = df_amazon.product_url.apply(lambda x: 'https://www.amazon.com' + x)
    df_amazon.to_csv(fn_amazon, index=False, compression='xz')
else:
    df_amazon = pd.read_csv(fn_amazon, compression='xz')
amazon_asin = set(df_amazon.asin.unique())

In [None]:
value_counts(df_amazon, col='source')

## product pages

In [15]:
if not os.path.exists(fn_products):
    files_products = glob.glob(pattern_products)
    print(len(files_products))
    
    product_data = []
    with Pool(processes=64) as pool:
        for record in tqdm(pool.imap_unordered(P.parse_product_page, files_products), 
                           total=len(files_products)):
            product_data.append(record)
    df_product = pd.DataFrame(product_data)
    
    # create some columns for ads
    df_product['n_ads'] = df_product.ads.apply(lambda x: len(x))
    df_product["asin"] = df_product.fn.apply(lambda x: x.split('/2021')[0].split('/')[-1])
    
    # dedupe and filter to only products in the search.
    df_product.drop_duplicates(subset='asin', keep='last', inplace=True)
    df_product = df_product[df_product.asin.isin(df_search.asin)]
    
    # create boolean column if the asin is an Amazon product
    amazon_asin = set(df_amazon.asin.unique())
    df_product['is_amazon'] = df_product.asin.isin(amazon_asin)
    
    # create boolean column for Amazon's shippers and sellers
    df_product["is_sold_by_amazon"] = df_product.apply(P.is_sold_by_amazon, axis=1)
    df_product["is_shipped_by_amazon"] = df_product.apply(P.is_shipped_by_amazon, axis=1)
    df_product.to_csv(fn_products, index=False, compression='xz')
    
else:
    df_product = pd.read_csv(fn_products, compression='xz')

100%|██████████| 204855/204855 [20:51<00:00, 163.74it/s]


In [16]:
df_product.sold_by.value_counts().head(20)

Amazon.com                 38649
AmazonFresh                 1481
Whole Foods Market          1045
Pharmapacks                  309
Amazon.com Services LLC      246
shein online store           165
Zappos                       150
Songmics Direct              141
VM Express                   138
SweatyRocks                  131
EPFamily Direct              129
BestChoiceproducts           124
JoyinDirect                  122
Just Love Fashion            119
MYBATTERYSUPPLIER            108
PajamaGram                   107
iServe                       105
Mr. Pen                      104
TheNewMall                    96
Baleaf Sports                 92
Name: sold_by, dtype: int64

## Tranining set for regression

In [18]:
if not os.path.exists(fn_training_set): 
    # filter the searches to regular placements with at least 20 products
    amazon_searches = df_search[
        (df_search.product_type == 'regular_placement') &
        (df_search.asin.isin(df_amazon.asin)) 
    ].search_term.unique()
    qualified_searches = df_search[
        (df_search.product_order >= 4) &
        (df_search.search_term.isin(amazon_searches))
    ].search_term.unique()
    
    training_set = df_search[
        (df_search.product_type == 'regular_placement') & 
        (df_search.search_term.isin(qualified_searches))
    ]
    
    # create column
    df_seller_central = pd.read_csv(fn_seller_central, skiprows=1, index_col=0, compression='xz')
    top_clicked = list(set(
        df_seller_central['#1 Clicked ASIN'].unique().tolist() + 
        df_seller_central['#2 Clicked ASIN'].unique().tolist() +
        df_seller_central['#3 Clicked ASIN'].unique().tolist()
    ))
    training_set['top_clicked'] = training_set['asin'].isin(top_clicked)
    
    # merge product metadata
    training_set = training_set.merge(df_product[[
        'asin','is_amazon', 'is_sold_by_amazon', 
        'is_shipped_by_amazon', 'has_third_party_sellers'
    ]], how='left')
    training_set.to_csv(fn_training_set, index=False, compression='xz')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_set['top_clicked'] = training_set['asin'].isin(top_clicked)
