In [None]:
# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from IPython.display import Markdown, display

In [None]:
# %%
# Location of the zipped CSV with the product listings; read_csv is given the
# archive path directly.
ARCHIVE_PATH = "./archive.zip"
data = pd.read_csv(ARCHIVE_PATH)

# Part 1: Best selling categories
## Crosstab analysis

In [None]:
# %%
# Contingency table of category x best-seller flag, ranked by best-seller rate.
# margins=True adds "All" totals on both axes. The "All" COLUMN is kept (it is the
# per-category product count used as the rate denominator), but the "All" ROW is
# dropped: it is the grand total, not a category, so it must not be ranked with
# the categories nor be fed into statistics computed from this table.
best_seller_flag = data["isBestSeller"].replace({True: "yes", False: "no"})
crosstab = (
    pd.crosstab(data["category"], best_seller_flag, margins=True)
    .drop(index="All")
    .eval("BestSellerRate = yes / All")
    .sort_values("BestSellerRate", ascending=False)
)
print(crosstab["BestSellerRate"].head(5))

%%
Above are the top 5 categories by best-seller rate. Across all categories the best-seller rate ranges from 0% to 5.8%, and even within the top 5 it ranges from 3.5% to 5.8%. So yes, best sellers are more prevalent in some categories than in others.

## Statistical tests
### Chi-square test of independence

In [None]:
# %%
import scipy.stats as stats

In [None]:
# Chi-square test of independence between product category and best-seller status.
# The test must see only the observed "no"/"yes" counts of real categories. The
# crosstab is built with margins=True, so its "All" totals row is removed first:
# being the sum of every other row, it would double the sample size and add a
# spurious degree of freedom. errors="ignore" keeps this line working when the
# table carries no "All" row.
observed_counts = crosstab.drop(index="All", errors="ignore").iloc[:, :-2]
chi_sq, p, dof, ex = stats.chi2_contingency(observed_counts)
# Degrees of freedom and the p-value are reported next to the statistic so the
# stated conclusion is backed by numbers the reader can see.
markdown_text = f"""
$\\chi^2$ statistic is {chi_sq:.2f} (dof = {dof}, p-value = {p:.3g}) indicating that there is relationship between variables (we reject null hypothesis with very high probability)
"""
display(Markdown(markdown_text))
print(markdown_text)

%% [markdown]
### Cramér's V

In [None]:
# %%
# Cramér's V: strength of the association between category and best-seller status.
# V is derived from chi-square divided by the total number of observations, so the
# "All" totals row produced by margins=True must be excluded: leaving it in doubles
# the observation count and shrinks V by a factor of sqrt(2). errors="ignore"
# keeps this line working when the table carries no "All" row.
observed_counts = crosstab.drop(index="All", errors="ignore").iloc[:, :-2]
cramers_v = stats.contingency.association(observed_counts, method="cramer")
markdown_text = f"""
Cramér's V is {cramers_v:.2f} indicating that the strength of association is very weak or even negligible
"""
display(Markdown(markdown_text))
print(markdown_text)

%%
## Visualization

A stacked bar chart helps us visualize the relationship between the two variables. The absolute counts differ too much between categories, so we normalize them and randomly select some categories to visualize the fraction of best-seller products per category.

In [None]:
# %%
# Stacked bars of the best-seller share for a random subset of categories.
# - The "All" totals row (from margins=True) is not a category, so it is excluded
#   before sampling.
# - Only the "no"/"yes" counts are divided by each category's total, so every
#   stacked bar sums to 1.
# - A fixed random_state makes the figure reproducible across kernel restarts,
#   and the sample size is capped so a table with fewer than 25 rows does not raise.
category_rows = crosstab.drop(index="All", errors="ignore")
category_shares = category_rows.iloc[:, :-2].div(category_rows["All"], axis=0)
sample_size = min(25, len(category_shares))
category_shares.sample(n=sample_size, random_state=42).plot(kind="bar", stacked=True)
plt.show()

%%

# Part 2: Prices and ratings across categories and brands

In [None]:
# %%
# ## 0. Filter using iqr method
# Keep only rows whose price lies within 1.5 * IQR of the first/third quartile.
# NOTE: `data` is reassigned here, so re-running this cell recomputes the fences
# on already-filtered data and trims further; reload the data before re-running.
q1 = data["price"].quantile(0.25)
q3 = data["price"].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
data = data.query("price >= @lower and price <= @upper")
# Select the two columns directly instead of stacking the Series as rows and
# transposing: that round-trip forces both columns to object dtype (so "price"
# stops being numeric) and is very slow on large frames.
prices_per_categories = data[["price", "category"]].copy()

%%

## 1. Violin plots

In [None]:
# %%
# Violin plot of the price distribution for the 20 categories with the most rows.
category_counts = data["category"].value_counts()
top20_categories = category_counts.index[:20].tolist()
in_top20 = prices_per_categories["category"].isin(top20_categories)
sns.violinplot(x="category", y="price", data=prices_per_categories[in_top20])
plt.show()

In [None]:
# %%
# Rank every category by its median price (highest first) and report the leader.
median_by_category = prices_per_categories.groupby("category").median()
top_median_prices = median_by_category.sort_values("price", ascending=False)
leader_name = top_median_prices.index[0]
leader_price = top_median_prices.iloc[0, 0]
markdown_text = f"""
The category with the highest median price is {leader_name} with median price of {leader_price:.2f}
"""
print(markdown_text)
display(Markdown(markdown_text))

%%

## 2. Bar plots

In [None]:
# %%
# Bar chart of the mean price within each of the 10 categories with the most rows.
top10_categories = data["category"].value_counts().index[:10].tolist()
top10_rows = data[data["category"].isin(top10_categories)]
mean_price_by_category = top10_rows.groupby("category")[["price"]].mean()
mean_price_by_category.plot(kind="bar")
plt.show()

In [None]:
# %%
# Rank every category by its mean price (highest first) and report the leader.
# The message says "average": the value comes from .mean(), not from a median.
mean_by_category = prices_per_categories.groupby("category").mean()
top_average_prices = mean_by_category.sort_values("price", ascending=False)
markdown_text = f"""
The category with the highest average price is {top_average_prices.index[0]} with average price of {top_average_prices.iloc[0, 0]:.2f}
"""
print(markdown_text)
display(Markdown(markdown_text))

%%

## 3. Box plots

In [None]:
# %%
# Side-by-side box plots of star ratings, one box per top-10 category.
# Rows with stars == 0 are filtered out before plotting.
is_top10 = data["category"].isin(top10_categories)
has_rating = data["stars"] != 0
data[is_top10 & has_rating].boxplot(column="stars", by="category", grid=False, rot=45)
plt.show()

In [None]:
# %%
# Rank categories by median star rating (highest first) and report the leader.
# The other rating analyses in this notebook (box plots, correlation, scatter
# plot) all filter out stars == 0, so the same filter is applied here to rank
# categories on the same population.
# NOTE(review): presumably stars == 0 marks products without any rating — confirm.
rated_products = data.query("stars!=0")
top_median_ratings = rated_products.groupby("category").stars.median().sort_values(ascending=False)
markdown_text = f"""
The category with the highest median rating is {top_median_ratings.index[0]} with median rating of {top_median_ratings.iloc[0]:.2f}
"""
print(markdown_text)
display(Markdown(markdown_text))

%% [markdown]
# Part 3: Prices and ratings interplay

## Correlation coefficients

In [None]:
# %%
# Correlation matrix of price and star rating, computed on rows with stars != 0.
rated_rows = data[data["stars"] != 0]
price_rating_corr = rated_rows[["price", "stars"]].corr()
price_vs_stars = price_rating_corr.loc["price", "stars"]
markdown_text = f"""
Correlation coefficient between price and rating is {price_vs_stars:.4f} indicating that there is very weak positive correlation between them, i.e. as price increases, rating tends to increase slightly.
"""
print(markdown_text)
display(Markdown(markdown_text))

%% [markdown]
## Visualizations

### Scatter plot

In [None]:
# %%
# Scatter plot of price against star rating (rows with stars == 0 left out).
# The low alpha lets overlapping points show up as darker regions.
rated_for_scatter = data.loc[data["stars"] != 0]
sns.scatterplot(x="stars", y="price", data=rated_for_scatter, alpha=0.3)
plt.show()

%% [markdown]
### Correlation heatmap

In [None]:
# %%
# Annotated heatmap of the price/rating correlation matrix. The colour scale is
# pinned to [-1, 1] so colours map to the full range a correlation can take.
heatmap_kwargs = {"annot": True, "cmap": "coolwarm", "vmin": -1, "vmax": 1}
sns.heatmap(price_rating_corr, **heatmap_kwargs)
plt.show()

%% [markdown]
### QQ plot for prices

According to the QQ plot, prices are not normally distributed. Visually, the dataset appears to contain several normally distributed subsets, one for each price segment.

In [None]:
# %%
# Q-Q plot of the non-missing prices (sm.qqplot is called with its default settings).
price_values = data["price"].dropna().to_numpy()
sm.qqplot(price_values)
plt.show()