In [70]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [71]:
df = pd.read_csv('amazon_products_sales_data_cleaned.csv')
print("dataset shape :",df.shape)
df.describe()
print(f"Price Skewness: {df['current/discounted_price'].skew()}")

dataset shape : (42675, 16)
Price Skewness: 3.7362601699387445


In [72]:
df.head()

Unnamed: 0,title,rating,number_of_reviews,bought_in_last_month,current/discounted_price,price_on_variant,listed_price,is_best_seller,is_sponsored,is_couponed,buy_box_availability,delivery_details,sustainability_badges,image_url,product_url,collected_at
0,BOYA BOYALINK 2 Wireless Lavalier Microphone f...,4.6,375,300,89.68,2.0,159.0,False,True,True,Add to cart,"Delivery Mon, Sep 1",Carbon impact,https://m.media-amazon.com/images/I/71pAqiVEs3...,/sspa/click?ie=UTF8&spc=MTo4NzEzNDY2NTQ5NDYxND...,2025-08-21 11:14:29
1,"LISEN USB C to Lightning Cable, 240W 4 in 1 Ch...",4.3,2457,6000,9.99,,15.99,False,True,False,Add to cart,"Delivery Fri, Aug 29",,https://m.media-amazon.com/images/I/61nbF6aVIP...,/sspa/click?ie=UTF8&spc=MTo4NzEzNDY2NTQ5NDYxND...,2025-08-21 11:14:29
2,"DJI Mic 2 (2 TX + 1 RX + Charging Case), Wirel...",4.6,3044,2000,314.0,,349.0,False,True,False,Add to cart,"Delivery Mon, Sep 1",,https://m.media-amazon.com/images/I/61h78MEXoj...,/sspa/click?ie=UTF8&spc=MTo4NzEzNDY2NTQ5NDYxND...,2025-08-21 11:14:29
3,"Apple AirPods Pro 2 Wireless Earbuds, Active N...",4.6,35882,10000,,162.24,,True,False,False,,,,https://m.media-amazon.com/images/I/61SUj2aKoE...,/Apple-Cancellation-Transparency-Personalized-...,2025-08-21 11:14:29
4,Apple AirTag 4 Pack. Keep Track of and find Yo...,4.8,28988,10000,,72.74,,False,False,False,,,,https://m.media-amazon.com/images/I/61bMNCeAUA...,/Apple-MX542LL-A-AirTag-Pack/dp/B0D54JZTHY/ref...,2025-08-21 11:14:29


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42675 entries, 0 to 42674
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     42675 non-null  object 
 1   rating                    41651 non-null  float64
 2   number_of_reviews         42675 non-null  int64  
 3   bought_in_last_month      42675 non-null  int64  
 4   current/discounted_price  30926 non-null  float64
 5   price_on_variant          21298 non-null  float64
 6   listed_price              11673 non-null  float64
 7   is_best_seller            42675 non-null  bool   
 8   is_sponsored              42675 non-null  bool   
 9   is_couponed               42675 non-null  bool   
 10  buy_box_availability      28022 non-null  object 
 11  delivery_details          30955 non-null  object 
 12  sustainability_badges     3408 non-null   object 
 13  image_url                 42675 non-null  object 
 14  produc

In [74]:
#Current Price distibution
sns.set_style("whitegrid")
plt.figure(figsize=(12, 6))

# Creating a histogram for current price
ax = sns.histplot(df['current/discounted_price'].dropna(), kde=True, color='blue')

# Adding a vertical line for the mean price
plt.axvline(df['current/discounted_price'].mean(), color='red', linestyle='--', 
            label=f'Mean: ${df["current/discounted_price"].mean():.2f}')

# Adding a vertical line for the median price
plt.axvline(df['current/discounted_price'].median(), color='green', linestyle='--', 
            label=f'Median: ${df["current/discounted_price"].median():.2f}')

plt.xlabel('Price ($)')
plt.ylabel('Frequency')
plt.title('Distribution of Current/Discounted Prices', fontsize=15)
plt.legend()

# Show the plot
plt.tight_layout()
plt.savefig('Current Price distibution.png', dpi = 300)
plt.close()

In [75]:
#Best Sellers vs. Regular Products
sns.set_style("whitegrid")
plt.figure(figsize=(14, 8))

# Group metrics by best seller status
metrics = ['number_of_reviews', 'current/discounted_price', 'bought_in_last_month']
grouped = df.groupby('is_best_seller')[metrics].mean().reset_index()

x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(14, 8))
best_seller_bars = ax.bar(x - width/2, grouped.loc[grouped['is_best_seller'] == True, metrics].values[0], 
                          width, label='Best Sellers', color='forestgreen')
regular_bars = ax.bar(x + width/2, grouped.loc[grouped['is_best_seller'] == False, metrics].values[0], 
                      width, label='Regular Products', color='lightcoral')

def add_labels(bars):
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

add_labels(best_seller_bars)
add_labels(regular_bars)

# Add labels, title and legend
ax.set_ylabel('Average Value')
ax.set_title('Comparison of Metrics: Best Sellers vs. Regular Products', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(['Number of Reviews', 'Price ($)', 'Monthly Purchases'])
ax.legend()

plt.figtext(0.5, 0.01, 
            "This chart compares key metrics between best-selling products and regular products.", 
            ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('Best Sellers vs. Regular Products.png')
plt.close()

<Figure size 1400x800 with 0 Axes>

In [76]:
#Impact on metrics over a period of time
sns.set_style("whitegrid")

df['collected_at'] = pd.to_datetime(df['collected_at'])

daily_data = df.groupby(df['collected_at'].dt.date).agg({
    'current/discounted_price': 'mean',
    'number_of_reviews': 'mean',
    'rating': 'mean',
    'is_best_seller': 'mean'  # This gives the proportion of best sellers each day
}).reset_index()

fig, axs = plt.subplots(4, 1, figsize=(14, 16), sharex=True)

# average price over time
axs[0].plot(daily_data['collected_at'], daily_data['current/discounted_price'], 
           marker='o', linestyle='-', color='blue')
axs[0].set_ylabel('Avg. Price ($)')
axs[0].set_title('Average Price Over Time')
axs[0].grid(True, alpha=0.3)

# average number of reviews over time
axs[1].plot(daily_data['collected_at'], daily_data['number_of_reviews'], 
           marker='s', linestyle='-', color='green')
axs[1].set_ylabel('Avg. Reviews')
axs[1].set_title('Average Number of Reviews Over Time')
axs[1].grid(True, alpha=0.3)

#average rating over time
axs[2].plot(daily_data['collected_at'], daily_data['rating'], 
           marker='^', linestyle='-', color='orange')
axs[2].set_ylabel('Avg. Rating')
axs[2].set_title('Average Rating Over Time')
axs[2].set_ylim([min(daily_data['rating'])-0.1, 5.0])  # Assuming ratings are out of 5
axs[2].grid(True, alpha=0.3)

# proportion of best sellers over time
axs[3].plot(daily_data['collected_at'], daily_data['is_best_seller']*100, 
           marker='D', linestyle='-', color='red')
axs[3].set_ylabel('Best Sellers (%)')
axs[3].set_title('Proportion of Best Sellers Over Time')
axs[3].set_ylim([0, 100])
axs[3].grid(True, alpha=0.3)

for ax in axs:
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())


plt.suptitle('Product Metrics Time Series Analysis', fontsize=20, y=0.98)
plt.xlabel('Date')
plt.xticks(rotation=45)

# Add annotations for significant trends or events if they exist
# For example, if there's a notable price drop:
# Find the date with the lowest average price
min_price_date = daily_data.loc[daily_data['current/discounted_price'].idxmin(), 'collected_at']
min_price = daily_data['current/discounted_price'].min()
axs[0].annotate(f'Lowest: ${min_price:.2f}', 
                xy=(min_price_date, min_price),
                xytext=(10, -30),
                textcoords='offset points',
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))

# Add a text explaining the chart
plt.figtext(0.5, 0.01, 
            "This time series analysis shows how key product metrics have changed over the data collection period.",
            ha="center", fontsize=12, bbox={"facecolor":"lightblue", "alpha":0.2, "pad":5})

# Adjust layout and save
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('time_series_analysis.png', dpi=300)
plt.close()

In [77]:
#rating and bestseller comparison
sns.set_style("whitegrid")
plt.figure(figsize=(12, 8))

# Create rating bins
bins = [0, 1, 2, 3, 4, 5]
labels = ['0-1', '1-2', '2-3', '3-4', '4-5']

# Add a new column with binned ratings
df['rating_group'] = pd.cut(df['rating'], bins=bins, labels=labels, include_lowest=True)

# Create a crosstab to count products in each rating group by best_seller status
rating_counts = pd.crosstab(df['rating_group'], df['is_best_seller'])

# Rename columns for clarity
rating_counts.columns = ['Regular Products', 'Best Sellers']

# Plot the stacked bar chart
ax = rating_counts.plot(kind='bar', stacked=False, figsize=(12, 8), 
                        color=['lightcoral', 'forestgreen'])

# Add value labels on top of bars
for container in ax.containers:
    ax.bar_label(container, fmt='%d', fontsize=10)

# Add labels and title
plt.xlabel('Rating Range', fontsize=12)
plt.ylabel('Number of Products', fontsize=12)
plt.title('Distribution of Ratings: Best Sellers vs Regular Products', fontsize=16)
plt.legend(title='Product Type')

# Add percentage annotations for best sellers in each category
for i, rating_group in enumerate(rating_counts.index):
    if rating_counts.loc[rating_group, 'Best Sellers'] > 0:
        best_seller_pct = (rating_counts.loc[rating_group, 'Best Sellers'] / 
                          (rating_counts.loc[rating_group, 'Best Sellers'] + 
                           rating_counts.loc[rating_group, 'Regular Products'])) * 100
        
        plt.annotate(f'{best_seller_pct:.1f}% are\nbest sellers', 
                    xy=(i, rating_counts.loc[rating_group, 'Best Sellers']),
                    xytext=(0, 10),
                    textcoords='offset points',
                    ha='center', va='bottom',
                    fontsize=9, color='darkgreen')

# Add a text explaining the chart
plt.figtext(0.5, 0.01, 
            "This chart shows the distribution of products across rating ranges, comparing best sellers to regular products.",
            ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})

# Adjust layout and save
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('rating_bestseller_histogram.png', dpi=300)
plt.close()

<Figure size 1200x800 with 0 Axes>

In [78]:
# correlation between some important metrics
plt.figure(figsize=(8, 6))
corr = df[['current/discounted_price', 'rating', 'number_of_reviews', 'listed_price']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_Matrix.png')
plt.close()

In [79]:
# Hypothesis Testing
from scipy import stats

# Split data into two groups
best_sellers = df[df['is_best_seller'] == True]['rating'].dropna()
regular = df[df['is_best_seller'] == False]['rating'].dropna()

# Perform T-Test
t_stat, p_val = stats.ttest_ind(best_sellers, regular)

print(f"P-Value: {p_val}")
if p_val < 0.05:
    print("Result: Significant difference! Best Sellers have different average ratings.")
else:
    print("Result: No significant difference found.")

P-Value: 4.2934228619028974e-35
Result: Significant difference! Best Sellers have different average ratings.
