In [23]:
# Essential libraries for data manipulation and analysis
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Load the raw product listing from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='amazon.csv')

# Clean price columns
def clean_price(x):
    """Convert a rupee price string such as '₹1,099' to a float.

    The value is stringified, the '₹' symbol and ',' thousands separators are
    removed, and the remainder is parsed with ``float``.

    Args:
        x: Raw cell value (string, number, or missing value).

    Returns:
        The numeric price, or ``np.nan`` when the text is not a valid number.
    """
    try:
        return float(str(x).replace('₹','').replace(',',''))
    except (TypeError, ValueError):
        # Only parsing failures map to NaN. A bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan

def clean_percentage(x):
    """Convert a percentage string such as '64%' to a float (64.0).

    The value is stringified, every '%' character is removed, and the
    remainder is parsed with ``float``. The result stays on the 0-100 scale;
    it is not divided by 100.

    Args:
        x: Raw cell value (string, number, or missing value).

    Returns:
        The numeric percentage, or ``np.nan`` when the text is not a valid number.
    """
    try:
        return float(str(x).replace('%',''))
    except (TypeError, ValueError):
        # Only parsing failures map to NaN. A bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan

def clean_rating(x):
    """Convert a raw rating value to a float.

    A cell whose stripped text is exactly '|' is treated as missing. Any other
    value is passed to ``float`` unchanged.

    Args:
        x: Raw cell value (string, number, or missing value).

    Returns:
        The numeric rating, or ``np.nan`` for the '|' placeholder and for
        values that ``float`` cannot convert.
    """
    try:
        if str(x).strip() == '|':
            return np.nan
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # float(x) gets the raw object here, so besides ValueError it can raise
        # TypeError (e.g. None) or OverflowError (huge ints). Anything else,
        # including KeyboardInterrupt, is allowed to propagate.
        return np.nan

def clean_rating_count(x):
    """Convert a rating-count string such as '24,269' to a float.

    The value is stringified, ',' thousands separators are removed, and the
    remainder is parsed with ``float``.

    Args:
        x: Raw cell value (string, number, or missing value).

    Returns:
        The numeric count as a float, or ``np.nan`` when the text is not a
        valid number.
    """
    try:
        return float(str(x).replace(',',''))
    except (TypeError, ValueError):
        # Only parsing failures map to NaN. A bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan

# Convert the raw text columns to floats; each cleaner returns NaN for values it cannot parse
df = df.assign(
    discounted_price=df['discounted_price'].apply(clean_price),
    actual_price=df['actual_price'].apply(clean_price),
    discount_percentage=df['discount_percentage'].apply(clean_percentage),
    rating=df['rating'].apply(clean_rating),
    rating_count=df['rating_count'].apply(clean_rating_count),
)

# Keep only the rows where every key numeric column is present
df = df.dropna(
    subset=['actual_price', 'discounted_price', 'rating', 'rating_count', 'discount_percentage']
)

# Derived metrics, all computed from the cleaned base columns
df = df.assign(
    # Absolute saving in rupees
    price_difference=df['actual_price'] - df['discounted_price'],
    # Rating weighted by the log of the review volume
    success_score=df['rating'] * np.log1p(df['rating_count']),
    # Rating points per rupee of list price
    value_for_money=df['rating'] / df['actual_price'],
    # Same quantity as price_difference, kept under a second name
    discount_amount=df['actual_price'] - df['discounted_price'],
    # Five equal-frequency list-price buckets
    price_range=pd.qcut(
        df['actual_price'],
        q=5,
        labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
    ),
    # Log-scaled review volume
    popularity_score=np.log1p(df['rating_count']),
    # Rating scaled by the discount expressed as a fraction
    efficiency_score=df['rating'] * df['discount_percentage'] / 100,
)

In [24]:
# Objective: Understand product distribution and category performance

# Number of products listed under each category
category_counts = df['category'].value_counts()

# One horizontal bar per category, annotated with its count
fig = go.Figure(
    data=go.Bar(
        orientation='h',
        y=category_counts.index,
        x=category_counts.values,
        text=category_counts.values,
        textposition='auto',
        marker=dict(color='rgb(55, 83, 109)'),
    )
)

fig.update_layout(
    title='Product Distribution by Category',
    yaxis_title='Category',
    xaxis_title='Number of Products',
    width=1000,
    height=600,
    hovermode='closest',
    showlegend=False,
)

fig.show()

In [25]:
# Objective: Analyze correlation between price points and customer ratings
#
# One marker per product: x = list price, y = rating, marker size grows with
# log1p(rating_count), and colour encodes the discount percentage.

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['actual_price'],
    y=df['rating'],
    mode='markers',
    marker=dict(
        # Log-scaled and halved so heavily reviewed products do not dwarf the rest
        size=np.log1p(df['rating_count'])/2,
        color=df['discount_percentage'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Discount %')
    ),
    text=df['product_name'],
    # Carry the raw review count for the tooltip. marker.size holds the
    # transformed value (log1p(count)/2), so showing it under the label
    # "Rating Count" reported the wrong number.
    customdata=df['rating_count'],
    hovertemplate=
    '<b>Product</b>: %{text}<br>'+
    '<b>Price</b>: ₹%{x:.2f}<br>'+
    '<b>Rating</b>: %{y:.1f}<br>'+
    '<b>Rating Count</b>: %{customdata:,.0f}<br>'+
    '<extra></extra>'
))

fig.update_layout(
    title='Price vs Rating Relationship (Size: Rating Count, Color: Discount %)',
    xaxis_title='Price (₹)',
    yaxis_title='Rating',
    height=600,
    width=1000,
    hovermode='closest'
)

fig.show()

In [26]:
# Objective: Evaluate effectiveness of discount strategies on sales performance
#
# One marker per product: x = discount percentage, y = log1p(rating_count),
# marker size grows with the rating, and colour encodes the list price.

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['discount_percentage'],
    y=np.log1p(df['rating_count']),
    mode='markers',
    marker=dict(
        # Rating is multiplied by 3 purely to get a visible marker size
        size=df['rating']*3,
        color=df['actual_price'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Original Price (₹)')
    ),
    text=df['product_name'],
    # Carry the unscaled rating for the tooltip. marker.size is rating*3, so
    # showing it under the label "Rating" reported three times the real value.
    customdata=df['rating'],
    hovertemplate=
    '<b>Product</b>: %{text}<br>'+
    '<b>Discount</b>: %{x:.1f}%<br>'+
    '<b>Log Rating Count</b>: %{y:.2f}<br>'+
    '<b>Rating</b>: %{customdata:.1f}<br>'+
    '<extra></extra>'
))

fig.update_layout(
    title='Discount Impact on Sales (Size: Rating, Color: Original Price)',
    xaxis_title='Discount Percentage',
    yaxis_title='Log(Rating Count + 1)',
    height=600,
    width=1000,
    hovermode='closest'
)

fig.show()

In [64]:
# Objective: Analyze pricing strategies across different categories

# One box per category showing the spread of list prices; only outliers are drawn as points
fig = go.Figure(
    go.Box(
        name='Original Price',
        x=df['category'],
        y=df['actual_price'],
        boxpoints='outliers',
        marker=dict(color='rgb(55, 83, 109)'),
        hovertemplate=(
            '<b>Category</b>: %{x}<br>'
            '<b>Price</b>: ₹%{y:.2f}<br>'
            '<extra></extra>'
        ),
    )
)

fig.update_layout(
    title='Price Distribution by Category',
    yaxis_title='Price (₹)',
    xaxis_title='Category',
    # Category labels are rotated to vertical
    xaxis=dict(tickangle=90),
    width=2000,
    height=1400,
    showlegend=False,
)

fig.show()


In [28]:
# Objective: Identify successful products and their characteristics
#
# One marker per product: x = list price, y = min-max normalised success score,
# marker size grows with the discount percentage, and colour encodes the rating.

# Calculate normalized success score (min-max scaling of success_score)
df['success_score_normalized'] = (df['success_score'] - df['success_score'].min()) / \
                                (df['success_score'].max() - df['success_score'].min())

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['actual_price'],
    y=df['success_score_normalized'],
    mode='markers',
    marker=dict(
        # Halved and offset by 5 so that 0%-discount products remain visible
        size=df['discount_percentage']/2 + 5,
        color=df['rating'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Rating')
    ),
    text=df['product_name'],
    # Carry the real discount percentage for the tooltip. marker.size is
    # discount/2 + 5, so showing it under the label "Discount" reported a
    # wrong percentage.
    customdata=df['discount_percentage'],
    hovertemplate=
    '<b>Product</b>: %{text}<br>'+
    '<b>Price</b>: ₹%{x:.2f}<br>'+
    '<b>Success Score</b>: %{y:.3f}<br>'+
    '<b>Discount</b>: %{customdata:.1f}%<br>'+
    '<extra></extra>'
))

fig.update_layout(
    title='Product Success Score Analysis (Size: Discount %, Color: Rating)',
    xaxis_title='Price (₹)',
    yaxis_title='Normalized Success Score',
    height=600,
    width=1000,
    hovermode='closest'
)

fig.show()


In [30]:
# Category Distribution
# Treemap with one tile per category, sized by rating_count and coloured by rating
fig = px.treemap(
    data_frame=df,
    path=['category'],
    values='rating_count',
    color='rating',
    color_continuous_scale='Viridis',
    title='Product Category Distribution with Rating',
).update_layout(width=1000, height=700)
fig.show()

In [68]:
# Category Performance Overview
# Rating (x) against review volume (y); marker size follows list price, colour follows category
fig = px.scatter(
    data_frame=df,
    x='rating',
    y='rating_count',
    size='actual_price',
    color='category',
    hover_data=['product_name', 'discount_percentage'],
    title='Category Performance Matrix',
).update_layout(showlegend=False, width=1000, height=1000)
fig.show()

In [61]:
# Category Price Range Analysis
# One violin per category with an embedded box plot; every product is also drawn as a point
fig = px.violin(
    data_frame=df,
    x='category',
    y='actual_price',
    color='category',
    points="all",
    box=True,
    title='Price Distribution Across Categories',
).update_layout(showlegend=False, width=1500, height=1500)
fig.show()

In [33]:
# Price vs Discount Relationship
# List price (x) against discount percentage (y); size follows rating_count, colour follows rating
fig = px.scatter(
    data_frame=df,
    x='actual_price',
    y='discount_percentage',
    size='rating_count',
    color='rating',
    hover_data=['product_name', 'category'],
    title='Price vs Discount Strategy Analysis',
).update_layout(width=1000, height=700)
fig.show()

In [34]:
# Discount Distribution
# Histogram of discount percentages coloured by category, with a marginal box plot
fig = px.histogram(
    data_frame=df,
    x='discount_percentage',
    color='category',
    marginal='box',
    title='Discount Distribution Analysis',
).update_layout(width=1000, height=700)
fig.show()

In [69]:
# Price Range Analysis
# Notched box plots of rating for each price bucket, split by category (legend hidden)
fig = px.box(
    data_frame=df,
    x='price_range',
    y='rating',
    color='category',
    notched=True,
    title='Rating Distribution by Price Range',
).update_layout(showlegend=False, width=1400, height=600)
fig.show()


In [37]:
# Rating Pattern Analysis
# 2-D density heatmap over rating (x) and discount percentage (y), with rating_count as z
fig = px.density_heatmap(
    data_frame=df,
    x='rating',
    y='discount_percentage',
    z='rating_count',
    title='Rating Pattern Analysis',
).update_layout(width=1000, height=700)
fig.show()

In [38]:
# Customer Response to Pricing
# One facet column per price bucket; list price (x) against review volume (y),
# marker size follows discount percentage and colour follows rating
fig = px.scatter(
    data_frame=df,
    x='actual_price',
    y='rating_count',
    size='discount_percentage',
    color='rating',
    facet_col='price_range',
    title='Customer Response across Price Ranges',
).update_layout(width=1200, height=700)
fig.show()

In [40]:
# Value for Money Analysis
# 3-D scatter of list price, rating and review volume; size follows discount percentage,
# colour follows category
fig = px.scatter_3d(
    data_frame=df,
    x='actual_price',
    y='rating',
    z='rating_count',
    size='discount_percentage',
    color='category',
    title='3D Value Analysis',
).update_layout(width=1200, height=600)
fig.show()


In [62]:
# Success Score Distribution
# Violins of success_score per category, coloured by price bucket, with embedded box plots
fig = px.violin(
    data_frame=df,
    x='category',
    y='success_score',
    color='price_range',
    box=True,
    title='Success Score Distribution by Category and Price Range',
).update_layout(width=1500, height=1500)
fig.show()

In [43]:
# Efficiency Score Analysis
# efficiency_score (x) against review volume (y); size follows list price, colour follows category
fig = px.scatter(
    data_frame=df,
    x='efficiency_score',
    y='rating_count',
    size='actual_price',
    color='category',
    title='Efficiency Score vs Popularity',
).update_layout(width=1200, height=600)
fig.show()

In [44]:
# Category-wise Performance Metrics
# Per-category means of the four headline metrics, with category restored as a regular column
category_metrics = (
    df.groupby('category')[['rating', 'rating_count', 'discount_percentage', 'success_score']]
    .mean()
    .reset_index()
)

# One line per category across the four metric axes, coloured by mean rating
fig = px.parallel_coordinates(
    data_frame=category_metrics,
    dimensions=['rating', 'rating_count', 'discount_percentage', 'success_score'],
    color='rating',
    title='Category Performance Metrics',
).update_layout(width=1000, height=700)
fig.show()

In [45]:
# Discount Impact Heatmap
# 2-D density heatmap over discount percentage (x) and review volume (y), with rating as z
fig = px.density_heatmap(
    data_frame=df,
    x='discount_percentage',
    y='rating_count',
    z='rating',
    title='Discount Impact on Rating and Popularity',
).update_layout(width=1000, height=700)
fig.show()

In [48]:
# Discount Amount Analysis
# Absolute discount in rupees (x) against review volume (y); size follows rating,
# colour follows category
fig = px.scatter(
    data_frame=df,
    x='discount_amount',
    y='rating_count',
    size='rating',
    color='category',
    title='Discount Amount vs Popularity',
).update_layout(width=1400, height=600)
fig.show()

In [70]:
# Discount Strategy Effectiveness
# Discount percentage (x) against success_score (y) with an OLS trendline;
# size follows list price, colour follows category (legend hidden)
fig = px.scatter(
    data_frame=df,
    x='discount_percentage',
    y='success_score',
    size='actual_price',
    color='category',
    trendline="ols",
    title='Discount Strategy Effectiveness',
).update_layout(showlegend=False, width=1400, height=600)
fig.show()


In [52]:
# Product Positioning Matrix
# One facet column per price bucket; list price (x) against rating (y),
# marker size follows review volume and colour follows category
fig = px.scatter(
    data_frame=df,
    x='actual_price',
    y='rating',
    size='rating_count',
    color='category',
    facet_col='price_range',
    title='Product Positioning Matrix',
).update_layout(width=1400, height=600)
fig.show()

In [53]:
# Price Sensitivity Analysis
# Animated scatter with one frame per category: discount percentage (x) against
# review volume (y), marker size follows rating, colour follows price bucket.
#
# The axis ranges are pinned to the extent of the whole dataset (plus a small
# margin). Without explicit ranges the axes are scaled to the first frame only,
# so points in later categories can fall outside the visible area.
fig = px.scatter(df,
                x='discount_percentage',
                y='rating_count',
                color='price_range',
                size='rating',
                animation_frame='category',
                range_x=[df['discount_percentage'].min() - 5,
                         df['discount_percentage'].max() + 5],
                range_y=[-0.05 * df['rating_count'].max(),
                         1.05 * df['rating_count'].max()],
                title='Price Sensitivity Analysis by Category')
fig.update_layout(height=700, width=1000)
fig.show()

In [55]:
# Success Factors Analysis
# Pairwise scatter matrix of the four core numeric columns, coloured by category
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=['rating', 'rating_count', 'discount_percentage', 'actual_price'],
    color='category',
    title='Success Factors Correlation Matrix',
).update_layout(width=2000, height=1000)
fig.show()


In [71]:
# Market Segmentation Analysis
# 3-D scatter of list price, rating and discount percentage; size follows review volume,
# colour follows category (legend hidden)
fig = px.scatter_3d(
    data_frame=df,
    x='actual_price',
    y='rating',
    z='discount_percentage',
    size='rating_count',
    color='category',
    title='Market Segmentation 3D Analysis',
).update_layout(showlegend=False, width=800, height=800)
fig.show()

In [58]:
# Value Proposition Analysis
# Sunburst with categories on the inner ring and price buckets on the outer ring;
# sectors are sized by rating_count and coloured by rating
fig = px.sunburst(
    data_frame=df,
    path=['category', 'price_range'],
    values='rating_count',
    color='rating',
    title='Value Proposition by Category and Price Range',
).update_layout(width=1000, height=800)
fig.show()
