In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from statsmodels.tsa.stattools import acf

sns.set_theme()

print("Packages Imported")

Packages Imported


### Load Data

In [6]:
# load data
# Read each retailer's CSV into its own DataFrame. The paths are relative,
# so they resolve against the kernel's current working directory.
walmart = pd.read_csv('data/walmart.csv')
amazon = pd.read_csv('data/amazon.csv')

In [7]:
# select relevant columns
# .copy() makes each selection an independent frame, so adding columns to it
# in later cells does not raise pandas' SettingWithCopyWarning.
walmart = walmart[['Product Name', 'Sale Price', 'Brand', 'Category']].copy()
amazon = amazon[['Product Name', 'Brand Name', 'Category', 'Selling Price']].copy()

### Preprocessing

In [8]:
# extract primary category
def extract_primary_category(s):
    """Return the first segment of a ' | '-delimited category path.

    Missing values (anything ``pd.isna`` reports as NA) are passed through
    as ``np.nan``.
    """
    if pd.isna(s):
        return np.nan
    # Everything before the first ' | ' separator is the top-level category;
    # a string without the separator is returned whole.
    primary, _, _ = s.partition(' | ')
    return primary
# Derive 'Primary Category' from the raw 'Category' path and then discard the
# raw column. assign() returns a new frame instead of writing into a frame
# that was sliced from another one, which avoids SettingWithCopyWarning and
# keeps the resulting column order unchanged (new column last).
walmart = (
    walmart
    .assign(**{'Primary Category': walmart['Category'].apply(extract_primary_category)})
    .drop(columns=['Category'])
)
amazon = (
    amazon
    .assign(**{'Primary Category': amazon['Category'].apply(extract_primary_category)})
    .drop(columns=['Category'])
)

In [9]:
# extract price
def extract_price(s):
    """Convert a raw price value to a float.

    Parameters
    ----------
    s : str, number, or missing value
        A price such as ``'$1,234.50'``, or a price range written as
        ``'$10.00 - $20.00'``.

    Returns
    -------
    float
        The parsed price. For a range, the midpoint of its two endpoints.
        ``np.nan`` when the value is missing or cannot be parsed.
    """
    if pd.isna(s):
        return np.nan

    # str() lets already-numeric values through; then strip the currency
    # symbol and thousands separators so float() can parse the rest.
    cleaned = str(s).replace('$', '').replace(',', '')
    try:
        if ' - ' in cleaned:
            # A range must have exactly two endpoints; anything else fails
            # to unpack with ValueError and is treated as unparseable.
            low, high = cleaned.split(' - ')
            return (float(low) + float(high)) / 2
        return float(cleaned)
    except ValueError:
        # Only parse failures are mapped to NaN; unrelated errors still surface.
        return np.nan
    
# Parse every Amazon price to a float, then discard the rows whose price
# could not be parsed. dropna() filters by position rather than by index
# label, so it cannot remove extra rows if index labels are ever repeated.
amazon['Selling Price'] = amazon['Selling Price'].apply(extract_price)
amazon = amazon.dropna(subset=['Selling Price'])

In [10]:
# category selection
walmart_categories_to_keep = ['Sports & Outdoors', 'Health', 'Household Essentials', 'Home', 'Baby', 'Toys', 'Home Improvement', 'Clothing', 'Jewelry']
amazon_categories_to_keep = ['Toys & Games', 'Home & Kitchen', 'Clothing, Shoes & Jewelry', 'Sports & Outdoors', 'Baby Products', 'Health & Household']

# Keep only the rows whose primary category is on the store's list.
# .copy() detaches the filtered frames from the originals so that adding the
# 'Universal Category' column below does not raise SettingWithCopyWarning.
walmart = walmart[walmart['Primary Category'].isin(walmart_categories_to_keep)].copy()
amazon = amazon[amazon['Primary Category'].isin(amazon_categories_to_keep)].copy()

def universalize_walmart(category):
    """Map a Walmart primary category onto the shared category naming.

    Categories that belong to none of the merged groups are returned unchanged.
    """
    merged_groups = (
        ('Health & Household', ('Health', 'Household Essentials', 'Home', 'Home Improvement')),
        ('Clothing, Shoes & Jewelry', ('Clothing', "Jewelry")),
    )
    for universal_name, walmart_names in merged_groups:
        if category in walmart_names:
            return universal_name
    return category

def universalize_amazon(category):
    """Map an Amazon primary category onto the shared category naming.

    Categories without a dedicated rule are returned unchanged.
    """
    household_sources = ('Home & Kitchen', 'Health & Household')
    if category in household_sources:
        universal = 'Health & Household'
    elif category == 'Baby Products':
        universal = 'Baby'
    elif category == 'Toys & Games':
        universal = 'Toys'
    else:
        universal = category
    return universal

# Translate each store's own primary category into the shared naming so the
# two stores can be compared category by category.
walmart['Universal Category'] = walmart['Primary Category'].map(universalize_walmart)
amazon['Universal Category'] = amazon['Primary Category'].map(universalize_amazon)

In [11]:
# outlier elimination
# For every universal category, prices above the upper fence
# Q3 + iqr_mult * IQR are treated as outliers and removed, separately for
# each store. Only the upper fence is applied.
walmart_outlier_lims = {}
amazon_outlier_lims = {}

iqr_mult = 1.5


def _upper_fence(prices, mult):
    """Return Q3 + mult * (Q3 - Q1) for a Series of prices."""
    q1 = prices.quantile(0.25)
    q3 = prices.quantile(0.75)
    return q3 + (q3 - q1) * mult


walmart_outlier_indices = []
amazon_outlier_indices = []
# Categories are taken from the Walmart frame; the same names are looked up
# in the Amazon frame.
for cat in walmart['Universal Category'].unique():
    walmart_pr = walmart.loc[walmart['Universal Category'] == cat, 'Sale Price']
    amazon_pr = amazon.loc[amazon['Universal Category'] == cat, 'Selling Price']

    walmart_limit = _upper_fence(walmart_pr, iqr_mult)
    amazon_limit = _upper_fence(amazon_pr, iqr_mult)

    walmart_outlier_lims[cat] = walmart_limit
    amazon_outlier_lims[cat] = amazon_limit

    # Boolean masks marking the rows that lie above the fence.
    walmart_prop = walmart_pr > walmart_limit
    amazon_prop = amazon_pr > amazon_limit

    # The printed number after '<' is the upper limit; Prop / Num describe
    # the rows ABOVE that limit, i.e. the ones about to be removed.
    print()
    print(f'Walmart - {cat} < {walmart_limit.round(2)}: Prop = {walmart_prop.mean()}; Num = {walmart_prop.sum()}')
    print(f'Amazon - {cat} < {amazon_limit.round(2)}: Prop = {amazon_prop.mean()}, Num = {amazon_prop.sum()}')

    # Collect the labels now; the frames themselves are only modified after
    # the loop so every limit is computed on the untrimmed data.
    walmart_outlier_indices += list(walmart_pr[walmart_prop].index)
    amazon_outlier_indices += list(amazon_pr[amazon_prop].index)

walmart = walmart.drop(walmart_outlier_indices)
amazon = amazon.drop(amazon_outlier_indices)


Walmart - Health & Household < 66.04: Prop = 0.0962962962962963; Num = 624
Amazon - Health & Household < 155.94: Prop = 0.1048158640226629, Num = 74

Walmart - Sports & Outdoors < 161.54: Prop = 0.11378872984236274; Num = 1256
Amazon - Sports & Outdoors < 224.39: Prop = 0.10144927536231885, Num = 49

Walmart - Baby < 89.96: Prop = 0.13217138707334786; Num = 364
Amazon - Baby < 63.26: Prop = 0.07109004739336493, Num = 15

Walmart - Clothing, Shoes & Jewelry < 102.14: Prop = 0.0881057268722467; Num = 40
Amazon - Clothing, Shoes & Jewelry < 62.55: Prop = 0.04952076677316294, Num = 31

Walmart - Toys < 243.27: Prop = 0.135632183908046; Num = 118
Amazon - Toys < 51.61: Prop = 0.10001536334306345, Num = 651


### Hypothesis tests

We saw in our EDA that both Walmart and Amazon have their own respective item counts per universal category. Here we'll implement a Chi-Squared test of homogeneity to evaluate whether or not the proportions of items in each category are identical across stores. We want to test if the observed frequencies of the categories are **consistent** with each other.

In [12]:
# Number of Walmart items per universal category (outlier rows already dropped)
walmart['Universal Category'].value_counts()

Universal Category
Sports & Outdoors            9782
Health & Household           5856
Baby                         2390
Toys                          752
Clothing, Shoes & Jewelry     414
Name: count, dtype: int64

In [13]:
# Number of Amazon items per universal category (outlier rows already dropped)
amazon['Universal Category'].value_counts()

Universal Category
Toys                         5858
Health & Household            632
Clothing, Shoes & Jewelry     595
Sports & Outdoors             434
Baby                          196
Name: count, dtype: int64

In [15]:
# Build a (stores x categories) contingency table of item counts.
# The two count Series are aligned on category name when the frame is built;
# a category present in only one store would come through as NaN, so it is
# filled with an observed count of 0 before the test.
combined_counts = pd.DataFrame({'walmart': walmart['Universal Category'].value_counts().sort_index(), 
                   'amazon': amazon['Universal Category'].value_counts().sort_index()}).fillna(0).astype(int)
contingency_table = combined_counts.T.to_numpy()

# Chi-squared test on the table: row 0 is Walmart, row 1 is Amazon, and the
# columns follow the sorted category names.
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print(f"Chi-Squared Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(expected)

Chi-Squared Statistic: 16749.285450576455
P-value: 0.0
Degrees of Freedom: 4
Expected Frequencies:
[[1844.57556951  719.71258687 4627.84466164 7287.00078041 4714.86640158]
 [ 741.42443049  289.28741313 1860.15533836 2928.99921959 1895.13359842]]


We can conclude that the observed frequencies are, in fact, not consistent with each other across stores, as shown by a p-value less than $\alpha=0.01$.

In [16]:
# TODO: fit a parametric distribution to our price data
# (candidates to try: Poisson, chi-square, F, gamma)