In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [4]:
# Location of the cleaned Walmart dataset, relative to this notebook's folder.
# Kept in one named constant so the path only has to be edited in one place.
# NOTE(review): the traceback saved under this cell mentions
# '../walmart_cleaned.csv', which is not the path used here, so that output is
# stale - re-run the cell to refresh it.
DATA_PATH = '../../walmart_cleaned.csv'

df = pd.read_csv(DATA_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../walmart_cleaned.csv'

In [None]:
# Print the frame's column summary (pandas `DataFrame.info`).
df.info()

# Univariate Analysis

In [None]:
# Names of all numeric-dtype columns, as a plain Python list.
num_cols = list(df.select_dtypes(include="number").columns)
num_cols

In [None]:
# Names of every column that is NOT numeric, as a plain Python list.
cat_cols = list(df.select_dtypes(exclude="number").columns)
cat_cols

In [None]:
def univ_num_explore_plotly(data, num_col):
    """Print summary statistics for one column and show its distribution.

    Parameters
    ----------
    data : DataFrame holding the column.
    num_col : name of the column to describe and plot.

    Prints a short header, a 40-dash rule and ``describe()`` for the column,
    then displays a 50-bin Plotly histogram with a box plot in the margin.
    Returns nothing.
    """
    header = f"Analysis for '{num_col}'"
    rule = "-" * 40
    print(header, rule, sep="\n")

    # Summary statistics for the column
    print(data[num_col].describe())

    # Histogram with a marginal box plot
    histogram_options = dict(
        x=num_col,
        marginal="box",
        nbins=50,
        title=f"Distribution and Boxplot of {num_col}",
        opacity=0.7,
    )
    figure = px.histogram(data, **histogram_options)
    figure.update_layout(bargap=0.1)
    figure.show()

In [None]:
# Run the numeric summary + histogram helper once for every column in num_cols.
for num_col in num_cols:
    univ_num_explore_plotly(df, num_col)

In [None]:
def univ_cat_explore_plotly(data, cat_col, top_n=5):
    """Print value counts for one column and chart its distribution.

    Parameters
    ----------
    data : DataFrame holding the column.
    cat_col : name of the column to summarise.
    top_n : number of most frequent categories kept in the bar chart
        (not used by the pie chart). Defaults to 5.

    Columns with at most 4 distinct values are drawn as a pie chart of all
    categories; anything larger becomes a bar chart of the ``top_n`` most
    frequent categories. Returns nothing.
    """
    # Count once and reuse for the printout, the pie/bar decision and the
    # plotted frame. value_counts() and nunique() both skip missing values by
    # default, so len(counts) equals the number of distinct categories.
    counts = data[cat_col].value_counts()

    # Value Counts
    print(f"Value counts for {cat_col}:")
    print(counts)
    print("\n")

    if len(counts) <= 4:
        # Pie chart
        pie_df = counts.reset_index()
        # Renamed explicitly because reset_index() column names differ
        # between pandas versions.
        pie_df.columns = [cat_col, 'count']
        fig = px.pie(pie_df, names=cat_col, values='count', title=f"Distribution of {cat_col}")
    else:
        # Bar chart (value_counts is sorted descending, so head() = most frequent)
        bar_df = counts.head(top_n).reset_index()
        bar_df.columns = [cat_col, 'count']
        fig = px.bar(bar_df, x=cat_col, y='count', title=f"Top {top_n} Categories in {cat_col}")
        fig.update_layout(xaxis_title=cat_col, yaxis_title="Count")

    fig.show()

In [None]:
# Run the value-count + pie/bar helper once for every column in cat_cols
# (top_n is left at its default).
for cat_col in cat_cols:
    univ_cat_explore_plotly(df, cat_col)

# Bivariate Analysis

In [None]:
# Function 1: Numerical vs Numerical
def num_vs_num_plotly(df, num_col1, num_col2):
    """Print the correlation of two columns and scatter them with a trendline.

    Parameters
    ----------
    df : DataFrame holding both columns.
    num_col1 : column plotted on the x axis.
    num_col2 : column plotted on the y axis.

    The correlation comes from ``Series.corr`` with its default method and is
    printed to two decimals. The scatter plot asks Plotly Express for an
    ``"ols"`` trendline. Returns nothing.
    """
    x_series, y_series = df[num_col1], df[num_col2]
    coefficient = x_series.corr(y_series)
    print(f'Linear Correlation between {num_col1} and {num_col2}: {coefficient:.2f}')

    chart_title = f"Scatter Plot with Trendline: {num_col1} vs {num_col2}"
    axis_labels = {num_col1: num_col1, num_col2: num_col2}
    scatter = px.scatter(
        df,
        x=num_col1,
        y=num_col2,
        trendline="ols",
        title=chart_title,
        labels=axis_labels,
    )
    scatter.show()

# Function 2: Numerical vs Categorical
def num_vs_cat_plotly(df, num_col, cat_col):
    """Show box plots of a numeric column, one box per category.

    Parameters
    ----------
    df : DataFrame holding both columns.
    num_col : column plotted on the y axis.
    cat_col : column used for the x axis and for the box colours.

    A warning is printed (the plot is still drawn) when ``cat_col`` has more
    than 10 distinct values. Returns nothing.
    """
    is_crowded = df[cat_col].nunique() > 10
    if is_crowded:
        print(f"Warning: {cat_col} has more than 10 unique categories. Plot may get cluttered.")

    box_options = dict(
        x=cat_col,
        y=num_col,
        color=cat_col,
        title=f"Box Plot of {num_col} across {cat_col}",
        labels={cat_col: cat_col, num_col: num_col},
    )
    boxes = px.box(df, **box_options)
    # Tilt the category tick labels by 45 degrees.
    boxes.update_layout(xaxis_tickangle=45)
    boxes.show()

# Function 3: Numerical vs Binary
def num_vs_binary_plotly(df, num_col, binary_col):
    """Bar-chart the mean of a numeric column for each value of a flag column.

    Parameters
    ----------
    df : DataFrame holding both columns.
    num_col : column whose mean is plotted.
    binary_col : column used for grouping, the x axis and the bar colour.

    A dashed red horizontal line marks the mean of ``num_col`` over all rows,
    annotated with its value to two decimals. Returns nothing.
    """
    # One row per group value, with the group's mean of num_col.
    group_means = df.groupby(binary_col)[num_col].mean().reset_index()
    overall_mean = df[num_col].mean()

    bars = px.bar(
        group_means,
        x=binary_col,
        y=num_col,
        color=binary_col,
        title=f"Mean {num_col} by {binary_col}",
        labels={binary_col: binary_col, num_col: f"Mean {num_col}"},
    )

    # Reference line: mean across the whole frame, not the mean of group means.
    bars.add_hline(
        y=overall_mean,
        line_dash="dash",
        line_color="red",
        annotation_text=f"Overall Mean: {overall_mean:.2f}",
        annotation_position="bottom right",
    )

    bars.show()

# Function 4: Categorical vs Binary
def cat_vs_binary_plotly(df, cat_col, binary_col):
    """Show two bar charts relating a categorical column to a flag column.

    Parameters
    ----------
    df : DataFrame holding both columns (not modified).
    cat_col : column placed on the x axis of both charts.
    binary_col : flag column; used as the colour split of the first chart and
        averaged per category in the second.

    Chart 1 is a grouped bar chart of row counts per (category, flag) pair.
    Chart 2 is a bar chart of the mean of ``binary_col`` per category.
    Returns nothing.
    """
    # Count plot
    count_df = df.groupby([cat_col, binary_col]).size().reset_index(name='count')
    # Plotly Express maps a numeric colour column (e.g. a 0/1 flag) onto a
    # continuous colour scale and a single trace, so barmode='group' would not
    # put the bars side by side. Casting to str makes the flag a discrete
    # colour with one trace per value. count_df is a fresh frame, so the
    # caller's data is untouched.
    count_df[binary_col] = count_df[binary_col].astype(str)
    fig1 = px.bar(count_df, x=cat_col, y='count', color=binary_col, barmode='group',
                  title=f"Count of {cat_col} by {binary_col}")
    fig1.show()

    # Mean plot (uses the original, un-cast flag values so they can be averaged)
    mean_df = df.groupby(cat_col)[binary_col].mean().reset_index()
    fig2 = px.bar(mean_df, x=cat_col, y=binary_col, color=cat_col,
                  title=f"Mean {binary_col} by {cat_col}")
    fig2.show()

## Numerical Vs. Numerical

In [None]:
# Correlation printout + scatter with trendline: Size (x) against Weekly_Sales (y).
num_vs_num_plotly(df,'Size', 'Weekly_Sales' )

## Categorical Vs. Numerical

In [None]:
# Box plot of Weekly_Sales, one box per value of Type.
num_vs_cat_plotly(df, 'Weekly_Sales', 'Type')

In [None]:
# Box plot of Weekly_Sales, one box per value of Season.
num_vs_cat_plotly(df, 'Weekly_Sales', 'Season')

In [None]:
# Mean Weekly_Sales per store Type, largest first, shown as a labelled bar chart.
type_sales = (
    df.groupby("Type", as_index=False)["Weekly_Sales"]
    .mean()
    .sort_values(by="Weekly_Sales", ascending=False)
)

fig2 = px.bar(
    type_sales,
    x="Type",
    y="Weekly_Sales",
    text="Weekly_Sales",
    title="Average Weekly Sales by Store Type",
    labels={"Weekly_Sales": "Avg Weekly Sales ($)", "Type": "Store Type"},
)
# Print each bar's value above it, using the '.2s' number format.
fig2.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig2.show()

In [None]:
# Relabel quarters with a "Q" prefix, e.g. 1 -> "Q1".
# Any existing leading "Q" is stripped first so that re-running this cell
# leaves "Q1" as "Q1" instead of producing "QQ1" (which would make the
# reindex below return only NaN).
df["Quarter"] = "Q" + df["Quarter"].astype(str).str.lstrip("Q")

# sns.set_theme is the current name of sns.set and is what the import cell uses.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.2)

# Plot average weekly sales by Quarter
plt.figure(figsize=(8, 5))
# reindex fixes the bar order to Q1..Q4; a quarter absent from the data shows as NaN.
Quarter_sales = df.groupby("Quarter")["Weekly_Sales"].mean().reindex(["Q1", "Q2", "Q3", "Q4"])
sns.barplot(x=Quarter_sales.index, y=Quarter_sales.values)
plt.title("Average Weekly Sales by Quarter")
plt.ylabel("Average Weekly Sales")
plt.xlabel("Quarter")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Plot the average (mean) weekly sales for each month as a line with markers
month_sales = df.groupby("Month")["Weekly_Sales"].mean()

plt.figure(figsize=(10, 6))
ax = sns.lineplot(x=month_sales.index, y=month_sales.values, marker='o')
ax.set_title("Average Weekly Sales by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Average Weekly Sales")
# One tick per month value present in the data.
ax.set_xticks(month_sales.index)
plt.tight_layout()
plt.show()

In [None]:
# Average weekly sales for the rows flagged IsPromoWeek, one point per
# (Year, Month, WeekOfYear) combination.
holiday_sales = (
    df.loc[df["IsPromoWeek"] == True]
    .groupby(["Year", "Month", "WeekOfYear"], as_index=False)["Weekly_Sales"]
    .mean()
)
# x-axis label such as "2010-W6", built from the Year and WeekOfYear columns.
holiday_sales["Year-Week"] = holiday_sales["Year"].astype(str).str.cat(
    holiday_sales["WeekOfYear"].astype(str), sep="-W"
)

fig4 = px.line(
    holiday_sales,
    x="Year-Week",
    y="Weekly_Sales",
    markers=True,
    title="Weekly Sales During Promo Weeks (Holiday Trend)",
    labels={"Weekly_Sales": "Avg Weekly Sales ($)"},
)
fig4.update_layout(
    xaxis_title="Year-Week",
    yaxis_title="Weekly Sales",
    xaxis_tickangle=45,
)
fig4.show()

In [None]:
# Total Weekly_Sales per store, keeping the 10 largest.
store_sales = df.groupby("Store", as_index=False)["Weekly_Sales"].sum()
top_10 = (
    store_sales.sort_values(by="Weekly_Sales", ascending=False)
    .head(10)
    .assign(
        # Convert Store to string to prevent overlap/compression
        Store=lambda frame: frame["Store"].astype(str),
        # The axis title and bar labels are expressed in billions, so plot the
        # totals in that unit rather than raw dollars.
        Sales_Billions=lambda frame: frame["Weekly_Sales"] / 1e9,
    )
)

fig = px.bar(top_10, x="Store", y="Sales_Billions",
             text="Sales_Billions",
             labels={"Sales_Billions": "Total Sales (in Billions)", "Store": "Store ID"},
             title="Top 10 Stores by Total Sales")

# Value above each bar, two decimals with a "B" suffix.
fig.update_traces(texttemplate='%{text:.2f}B', textposition='outside',
                  marker_color='steelblue')

fig.update_layout(
    xaxis=dict(
        title="Store ID",
        tickmode='array',
        tickvals=top_10["Store"].tolist(),
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title=dict(
            text="Total Sales (in Billions)",
            font=dict(size=16)
        ),
        # Plain two-decimal ticks: the values are already in billions, so an SI
        # format ("~s") would render fractions of a billion with a milli prefix.
        tickformat=".2f",
        tickfont=dict(size=14)
    ),
    title=dict(x=0.5, xanchor="center", font=dict(size=22)),
    uniformtext_minsize=12,
    uniformtext_mode='hide',
    plot_bgcolor="rgba(255,255,255,1)",
    paper_bgcolor="rgba(255,255,255,1)",
    margin=dict(l=40, r=40, t=80, b=60),
    font=dict(family="Arial", size=14)
)

fig.show()

In [None]:
# Mean Weekly_Sales per department, keeping the 10 largest; Dept is cast to
# str before plotting.
dept_avg = (
    df.groupby("Dept")["Weekly_Sales"]
    .mean()
    .reset_index()
    .sort_values(by="Weekly_Sales", ascending=False)
    .head(10)
    .assign(Dept=lambda frame: frame["Dept"].astype(str))
)

fig = px.bar(
    dept_avg,
    y="Dept",
    x="Weekly_Sales",
    orientation='h',
    text=dept_avg["Weekly_Sales"],
    title="Top 10 Departments by Average Weekly Sales",
    labels={"Weekly_Sales": "Average Weekly Sales ($)", "Dept": "Department"},
    color="Weekly_Sales",
    color_continuous_scale="Viridis",
)

# Whole-number labels with thousands separators, placed outside the bars.
fig.update_traces(texttemplate='%{text:,.0f}', textposition='outside')

layout_options = dict(
    xaxis_tickformat=",",
    # 'total ascending' orders the department axis by bar value.
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Avg Weekly Sales ($)",
    yaxis_title="Department",
    plot_bgcolor="white",
    title_font=dict(size=20),
    font=dict(size=14),
)
fig.update_layout(**layout_options)
fig.show()

## Numerical Vs. Binary

In [None]:
# Mean Weekly_Sales per IsPromoWeek value, with the overall mean as a dashed line.
num_vs_binary_plotly(df, 'Weekly_Sales', 'IsPromoWeek')

## Categorical Vs. Binary

In [None]:
# Row counts per (Season, IsPromoWeek) pair, then mean IsPromoWeek per Season.
cat_vs_binary_plotly(df, 'Season', 'IsPromoWeek')

In [None]:
# Row counts per (Type, Holiday_Flag) pair, then mean Holiday_Flag per Type.
cat_vs_binary_plotly(df, 'Type', 'Holiday_Flag')

# Multivariate Analysis

In [None]:
# Mean Weekly_Sales for every (Type, IsPromoWeek) combination
avg_sales = (
    df.groupby(['Type', 'IsPromoWeek'])['Weekly_Sales']
    .mean()
    .reset_index()
)

# Grouped bars: one cluster per store type, one bar per IsPromoWeek value
fig = px.bar(
    avg_sales,
    x='Type',
    y='Weekly_Sales',
    color='IsPromoWeek',
    barmode='group',
    title="Average Weekly Sales by Store Type with Promo Week Info",
    labels={
        'Weekly_Sales': 'Avg Weekly Sales',
        'Type': 'Store Type',
        'IsPromoWeek': 'Promo Week',
    },
)

fig.show()

In [None]:
# Weekly_Sales against Month, one line per IsPromoWeek value. seaborn
# aggregates the repeated Month values itself (lineplot defaults).
ax = sns.lineplot(data=df, x='Month', y='Weekly_Sales', hue='IsPromoWeek')
ax.set_title("Monthly Sales Trend - Promo vs Non-Promo Weeks")
plt.show()

In [None]:
# Bubble chart on a random subset of rows: Temperature (x) vs Weekly_Sales (y),
# marker size from Size, colour from Type.
fig = px.scatter(
    # Fixed random_state so the same rows are drawn on every run; the sample
    # size is capped at the frame length because sample() raises when asked
    # for more rows than exist (without replacement).
    df.sample(n=min(1000, len(df)), random_state=42),
    x='Temperature',
    y='Weekly_Sales',
    size='Size',
    color='Type',
    opacity=0.6,
    title='Sales vs Temperature with Store Size & Type',
    labels={'Weekly_Sales': 'Weekly Sales', 'Temperature': 'Temperature', 'Size': 'Store Size', 'Type': 'Store Type'}
)
fig.show()

In [None]:
# Size vs Weekly_Sales over the full frame, coloured by Type, with an "ols"
# trendline requested; the figure is displayed as the cell's last expression.
px.scatter(df, x='Size', y='Weekly_Sales', color='Type', trendline="ols")