# Data Preprocessing
This notebook will preprocess searchs, product pages, the our brands API, best sellers, and create our Amazon Database.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import glob
from collections import Counter
import json
from multiprocessing import Pool

import parsers as P
from tqdm import tqdm
from lxml import html
import pandas as pd

from utils import value_counts

In [3]:
# inputs
pattern_searches = '../data/input/search-selenium/*/*/*/2021/01/21/webpage_search.html'
pattern_our_brands_api = '../data/input/search-private-label/*/*/*/2021/*/*/response__*.json'
pattern_our_brands_html = '../data/input/search-selenium-our-brands-filter_/*/*/*/2021/*/*/webpage_search__*.html'
pattern_our_brands_search = '../data/input/search-private-label/*/*/*/2021/*/*/webpage_ourbrands.html'
pattern_best_sellers = '../data/input/best_sellers/*/*/*/*/page_*/2021/*/*/*.html'
pattern_products = '../data/input/selenium-products/*/*/*/2021/02/*/webpage_product.html'
fn_seller_central = '../data/input/seller_central/All_Q4_2020.csv'

# outputs
data_dir = '../data/intermediary'
fn_brands_api = f'{data_dir}/our_brands_api.csv.gz'
fn_brand_search = f'{data_dir}/our_brands_api.csv.gz'
fn_brands_filter = f'{data_dir}/our_brands_filter.csv.gz'
fn_search = f'{data_dir}/searches.csv.gz'
fn_best_sellers = f'{data_dir}/best_sellers.csv.gz'
fn_queries = 'data/input/search_queries/combined_queries_with_source.csv'
fn_products = f'{data_dir}/products.csv.bz2'
fn_products_plus = f'{data_dir}/products_with_meta.csv.bz2'
fn_amazon = f'{data_dir}/amazon_private_label.csv.gz'
fn_training_set = f'{data_dir}/training_set.csv.gz'

os.makedirs(data_dir , exist_ok=True)

## search
Parses raw search results

In [4]:
files_searches = glob.glob(pattern_searches)
len(files_searches)

12717

In [5]:
if not os.path.exists(fn_search):
    search_data = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_search_result, files_searches), 
                           total=len(files_searches)):
            search_data.extend(record)
            
    df_search = pd.DataFrame(search_data)
    df_search.to_csv(fn_search, index=False, compression='gzip')

else:
    df_search = pd.read_csv(fn_search)

In [6]:
df_search.groupby('product_type').search_term.nunique() / df_search.search_term.nunique()

product_type
editorial_recs_carousel      0.352660
featured_brands_carousel     0.125199
misc_carousel                0.028034
misc_sponsored_carousel      0.020787
regular_placement            0.998168
regular_placement__missed    0.141048
sponsored_banner             0.686684
Name: search_term, dtype: float64

## our brands search
parse search results made in the "our brands" department.

In [7]:
files_our_brands_search = glob.glob(pattern_our_brands_search)
len(files_our_brands_search)

795

In [8]:
if not os.path.exists(fn_brand_search):
    files_our_brands_search = glob.glob(pattern_our_brands_search)
    print(len(files_our_brands_search))

    brands_search_data = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_search_result, 
                                               files_our_brands_search), 
                           total=len(files_our_brands_search)):
            brands_search_data.extend(record)

    df_brands_search = pd.DataFrame(brands_search_data)
    df_brands_search = df_brands_search[~df_brands_search.is_sponsored]
    df_brands_search.to_csv(fn_brand_search, index=False, compression='gzip')
    
else:
    df_brands_search = pd.read_csv(fn_brand_search, compression='gzip')

## our brands API
parses API responses that mimic the "our brands" filter.

In [9]:
if not os.path.exists(fn_brands_api):
    files_our_brands_api = glob.glob(pattern_our_brands_api)
    print(len(files_our_brands_api))
    
    data_our_brands = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_our_brands_api, 
                                               files_our_brands_api), 
                           total=len(files_our_brands_api)):
            data_our_brands.extend(record)

    df_brands_api = pd.DataFrame(data_our_brands)
    df_brands_api.drop_duplicates(subset=['asin', 'search_term'], inplace=True)
    df_brands_api.to_csv(fn_brands_api, index=False, compression='gzip')
    
else:
    df_brands_api = pd.read_csv(fn_brands_api, compression='gzip')

## our brands HTML
parses HTML pages that are filtered to "our brands."

In [10]:
if not os.path.exists(fn_brands_filter):
    files_brands_filter = glob.glob(pattern_our_brands_html)
    print(len(files_brands_filter))
    
    brands_filter_data = []
    with Pool(processes=64) as pool:
        for record in tqdm(pool.imap_unordered(P.process_our_brands_filter, 
                                               files_brands_filter), 
                           total=len(files_brands_filter)):
            brands_filter_data.extend(record)
    df_brands_filter = pd.DataFrame(brands_filter_data)
    df_brands_filter.drop_duplicates(subset=['asin', 'search_term'], inplace=True)
    df_brands_filter.to_csv(fn_brands_filter, index=False, compression='gzip')
    
else:
    df_brands_filter = pd.read_csv(fn_brands_filter, compression='gzip')

## Best Sellers
Find best-selling Amazon devices.

In [11]:
if not os.path.exists(fn_best_sellers):
    files_best_sellers = glob.glob(pattern_best_sellers)
    print(len(files_best_sellers))
    
    best_sellers_data = []
    with Pool(processes=32) as pool:
        for record in tqdm(pool.imap_unordered(P.process_best_sellers, 
                                               files_best_sellers), 
                           total=len(files_best_sellers)):
            best_sellers_data.extend(record)
    df_best_sellers = pd.DataFrame(best_sellers_data)
    df_best_sellers.to_csv(fn_best_sellers, index=False, compression='gzip')

else:
    df_best_sellers = pd.read_csv(fn_best_sellers, compression='gzip')

## Amazon products database
Creating the database of Amazon ASINs.

In [12]:
df_best_seller_amazon = df_best_sellers[
    (df_best_sellers.path.str.contains('Amazon Devices')) &
    (~df_best_sellers.category.astype(str).str.contains('Kindle'))
]

In [13]:
s1 = df_brands_api[['asin', 'product_name', 'product_url']]
s1 = s1.assign(source="our brands API")

s2 = df_best_seller_amazon[['asin', 'product_name', 'product_url']]
s2 = s2.assign(source='best selling Amazon devices')

s3 =  df_brands_search[['asin', 'product_name', 'product_url']]
s3 = s3.assign(source='our brands searchbar')

s4 = df_brands_filter[['asin', 'product_name', 'product_url']]
s4 = s4.assign(source='our brands filtered search result')

In [14]:
df_amazon = s1.append(s2).append(s3).append(s4)
df_amazon.asin.nunique()

135652

In [15]:
amazon_missed = '|'.join([
    'amazon basics',
    'amazon essentials',
    'amazon exclusive',
    'amazon us exclusive',
    'amazon brand',
    'goodthreads',
    'solimo',
    'whole foods market',
    'amazon commercial',
    'amazon collection',
    'amazon fresh',
    'amazon elements',
    'amazonbasics',
    'pinzon by amazon',
    'simple joys by carter',
    'daily ritual',
    '365 everyday value',
    'lark & ro',
    'presto!'
])

In [16]:
s5 = df_search[
    (df_search.is_featured_brand) & 
    (df_search.product_type == 'regular_placement') &
    (~df_search.asin.isin(df_amazon.asin))
][['asin', 'product_name', 'product_url']]
s5 = s5.assign(source="featured from our brands tag")

s6 = df_search[
    (df_search.product_type == 'featured_brands_carousel') &
    (~df_search.asin.isin(df_amazon.asin))
][['asin', 'product_name', 'product_url']]
s6 = s6.assign(source="featured from our brands carousel")

s7 =  df_search[
    (df_search.product_name.str.contains(amazon_missed, case=False)) &
    (~df_search.asin.isin(df_amazon.asin))
][['asin', 'product_name', 'product_url']]
s7 = s7.assign(source="text search")

In [17]:
df_amazon = df_amazon.append(s5).append(s6).append(s7)

In [18]:
df_amazon.drop_duplicates(subset=['asin'], keep='first', inplace=True)
len(df_amazon)

137420

In [19]:
df_amazon['product_url'] = df_amazon.product_url.apply(lambda x: 'https://www.amazon.com' + x)

In [20]:
df_amazon.to_csv(fn_amazon, index=False, compression='gzip')

In [21]:
value_counts(df_amazon, col='source')

Unnamed: 0,count,percentage
our brands filtered search result,68671,0.499716
our brands API,66476,0.483743
text search,780,0.005676
featured from our brands carousel,684,0.004977
best selling Amazon devices,505,0.003675
featured from our brands tag,304,0.002212


## Create YAML file for extension...

In [27]:
import yaml

In [22]:
s2 = s2.drop_duplicates(subset='asin')

In [23]:
with open('../data/output/asins.yaml', 'w') as f:
    f.write('---\n')
    for i, row in s2.iterrows():
        f.write(f"{row['asin']}: {row['product_name']}\n")

In [24]:
data = {row['asin']: row['product_name'] for i, row in s2.iterrows()}

In [25]:
to_add = {
    'B08CKHPP52' : "Introducing Ring Video Doorbell Wired – Convenient, essential features in a compact design, pair with Ring Chime to hear audio alerts in your home (existing doorbell wiring required) - 2021 release",
    'B0876GVJ3D': "Certified Refurbished Ring Video Doorbell – newest generation, 2020 release – 1080p HD video, improved motion detection, easy installation – Satin Nickel",
    'B08M125RNW': "Ring Video Doorbell Pro – Upgraded, with added security features and a sleek design (existing doorbell wiring required)",
    'B086Q54K53': "Introducing Ring Video Doorbell Pro 2 – Best-in-class with cutting-edge features (existing doorbell wiring required) – 2021 release",
    'B07WLP395R': "Ring Video Doorbell 3 Plus – enhanced wifi, improved motion detection, 4-second video previews, easy installation",
    'B08JNR77QY': "All-new Ring Video Doorbell 4 – improved 4-second color video previews plus easy installation, and enhanced wifi – 2021 release",
    'B0876PVDMC': "Certified Refurbished Ring Video Doorbell 3 – enhanced wifi, improved motion detection, easy installation",
    'B08SSDZ6R8': "Introducing Ring Video Doorbell Wired with Ring Chime",
    'B0849J7W5X': "Ring Video Doorbell 3 – enhanced wifi, improved motion detection, easy installation",
    'B0727XJQLD': "Ring Floodlight Camera Motion-Activated HD Security Cam Two-Way Talk and Siren Alarm, White",
    'B08P5499MZ': "Solar Charger for Ring Video Doorbell 2",
    'B08C1W5N87': "Fire TV Stick (3rd Gen) with Alexa Voice Remote (includes TV controls) | HD streaming device | 2021 release",
    'B08N5NQ69J': "Ring Video Doorbell – newest generation, 2020 release – 1080p HD video, improved motion detection, easy installation – Venetian Bronze",
    'B07PDHSLM6': "Echo Dot (3rd Gen) - Smart speaker with Alexa - Heather Gray",
    'B01MZEEFNX': "Amazon Smart Plug, works with Alexa – A Certified for Humans Device",
    'B084KYM1HH': "Certified Refurbished Echo Dot (4th Gen) | Smart speaker with Alexa | Glacier White",
    'B07XKF75B8': "Echo (4th Gen) | With premium sound, smart home hub, and Alexa | Glacier White",
    'B07Q6ZZFLS': "Ring Stick Up Cam Battery HD security camera with custom privacy controls, Simple setup, Works with Alexa - White",
    'B07FZ8S74R': "Echo Dot (3rd Gen) - Smart speaker with Alexa - Charcoal",
    'B0758L64L9': "Ring Spotlight Cam Battery HD Security Camera with Built Two-Way Talk and a Siren Alarm, White, Works with Alexa",
    'B07ZPMCW64': "Ring Alarm 8-piece kit (2nd Gen) – home security system with optional 24/7 professional monitoring – Works with Alexa",
    'B07Q9VBYV8': "Ring Indoor Cam, Compact Plug-In HD security camera with two-way talk, Works with Alexa - White",
    'B0781Z3FNX': "Ring Solar Panel White - Compatible with Ring Spotlight Cam Battery and Stick Up Cam Battery",
    'B084J4MZK8': "Echo Dot (newest generation - 2020 release) | Smart speaker with Alexa | Twilight Blue",
    'B07VHZ41L8': "Echo Show 10 (3rd Gen) | HD smart display with motion and Alexa | Charcoal",
    'B085M66LH1': "All-new Echo Dot (4th Gen) | Smart speaker with clock and Alexa | Twilight Blue",
    'B00X4WHP5E': "Amazon Echo - Black (1st Generation)",
    'B07SLHPDVZ': "Certified Refurbished Echo Show 8 -- HD smart display with Alexa – stay connected with video calling - Charcoal",
    'B084J4QQK1': "Echo Dot (4th Gen) Kids | Designed for kids, with parental controls | Tiger",
    'B08M1N852': "Ring A19 Smart LED Bulb, White, bundle with All-new Echo Show 10 (3rd Gen) - Charcoal"
}

In [26]:
data = {**data, **to_add}

In [28]:
with open('../data/output/asins.yaml', 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

## product pages

In [77]:
df_amazon = pd.read_csv(fn_amazon, compression='gzip')

In [78]:
amazon_asin = set(df_amazon.asin.unique())

In [79]:
def is_sold_by_amazon(row, col='sold_by'):
    amazon_sellers = ['zappos', 'whole foods', 'amazon']
    if row[col] == None:
        return None
    to_check = str(row[col]).lower()
    if any(seller in to_check for seller in amazon_sellers):
        return True
    elif row['product_by_amazon']:
        return True
    else:
        return False
    
def is_shipped_by_amazon(row, col='shipped_by'):
    amazon_sellers = ['zappos', 'whole foods', 'amazon']
    if row[col] == None:
        return None
    to_check = str(row[col]).lower()
    if any(seller in to_check for seller in amazon_sellers):
        return True
    elif row['product_by_amazon']:
        return True
    else:
        return False

In [80]:
if not os.path.exists(fn_products):
    files_products = glob.glob(pattern_products)
    print(len(files_products))
    
    product_data = []
    with Pool(processes=64) as pool:
        for record in tqdm(pool.imap_unordered(P.parse_product_page, files_products), 
                           total=len(files_products)):
            product_data.append(record)
            
    df_product = pd.DataFrame(product_data)
    
    # create some columns for ads
    df_product['n_ads'] = df_product.ads.apply(lambda x: len(x))
    df_product["asin"] = df_product.fn.apply(lambda x: x.split('/2021')[0].split('/')[-1])
    
    # dedupe and filter to only products in the search.
    df_product.drop_duplicates(subset='asin', keep='last', inplace=True)
    df_product = df_product[df_product.asin.isin(df_search.asin)]
    
    # create boolean column if the asin is an Amazon product
    amazon_asin = set(df_amazon.asin.unique())
    df_product['is_amazon'] = df_product.asin.apply(lambda x: x in amazon_asin)
    
    # create boolean column for Amazon's shippers and sellers
    df_product["is_sold_by_amazon"] = df_product.apply(is_sold_by_amazon, axis=1)
    df_product["is_shipped_by_amazon"] = df_product.apply(is_shipped_by_amazon, axis=1)
    df_product.to_csv(fn_products_plus, index=False, compression='bz2')
    
else:
    df_product = pd.read_csv(fn_products_plus, compression='bz2')

In [33]:
df_product.sold_by.value_counts().head(20)

Amazon.com                 38647
AmazonFresh                 1481
Whole Foods Market          1045
Pharmapacks                  308
Amazon.com Services LLC      246
shein online store           165
Zappos                       150
Songmics Direct              141
VM Express                   138
SweatyRocks                  131
EPFamily Direct              129
BestChoiceproducts           124
JoyinDirect                  122
Just Love Fashion            119
MYBATTERYSUPPLIER            108
PajamaGram                   107
iServe                       105
Mr. Pen                      104
TheNewMall                    95
Baleaf Sports                 92
Name: sold_by, dtype: int64

## Tranining set for regression

In [81]:
df_seller_central = pd.read_csv(fn_seller_central, skiprows=1, index_col=0)
top_clicked = list(set(
    df_seller_central['#1 Clicked ASIN'].unique().tolist() + 
    df_seller_central['#2 Clicked ASIN'].unique().tolist() +
    df_seller_central['#3 Clicked ASIN'].unique().tolist()
))

In [82]:
df_amazon = pd.read_csv(fn_amazon)

In [83]:
amazon_searches = df_search[
    (df_search.product_type == 'regular_placement') &
    (df_search.asin.isin(df_amazon.asin)) 
].search_term.unique()

In [84]:
df_product = pd.read_csv(fn_products_plus, compression='bz2')

In [85]:
qualified_searches = df_search[
    (df_search.product_order >= 8) &
    (df_search.search_term.isin(amazon_searches))
].search_term.unique()

In [86]:
training_set = df_search[
    (df_search.product_type == 'regular_placement') & 
    (df_search.search_term.isin(qualified_searches))
]

In [87]:
training_set['top_clicked'] = training_set['asin'].isin(top_clicked)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_set['top_clicked'] = training_set['asin'].isin(top_clicked)


In [88]:
len(training_set)

248938

In [89]:
# merge product metadata.
training_set = training_set.merge(df_product[[
    'asin','is_amazon', 'is_sold_by_amazon', 
    'is_shipped_by_amazon', 'has_third_party_sellers'
]], how='left')

In [90]:
len(training_set)

248938

In [91]:
training_set.to_csv(fn_training_set, index=False, compression='gzip')