In [14]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.figure_factory as ff


In [15]:
# Read the "Cleaned_Data" sheet of the workbook into a DataFrame,
# using openpyxl as the Excel engine.
file_path = "CMST80HH_enhanced.xlsx"
df = pd.read_excel(
    file_path,
    engine="openpyxl",
    sheet_name="Cleaned_Data",
)

In [40]:
# Take an independent copy of just the columns of interest for EDA,
# so later changes to eda_df cannot affect df.
eda_columns = [
    "internet_access_binary",
    "online_purchase_binary",
    "Usual monthly consumption expenditure of the household",
    "Household size",
    "sector",
    "Social group",
    "Religion",
]
eda_df = df.loc[:, eda_columns].copy()

In [41]:
# Bar plot: mean of internet_access_binary per sector.
# NOTE(review): the mean is an access *rate* only if the column is 0/1-coded — confirm.
access_by_sector = eda_df.groupby("sector")["internet_access_binary"].mean().reset_index()
fig1 = px.bar(
    access_by_sector,
    x="sector",
    y="internet_access_binary",
    # Show each bar's value, rounded to 2 decimals, as its text label.
    text=access_by_sector["internet_access_binary"].round(2),
    title="Internet Access Rate by Sector",
    # The labels key must match the plotted column name exactly; the previous
    # key "internet_access" matched no column, so the axis kept the raw name.
    labels={"internet_access_binary": "Access Rate"},
)
fig1.update_traces(textposition="outside")
fig1.update_layout(width=800, height=500)
fig1.show()


In [42]:
# Bar plot: mean of online_purchase_binary per social group.
# NOTE(review): the mean is a purchase *rate* only if the column is 0/1-coded — confirm.
purchase_by_group = eda_df.groupby("Social group")["online_purchase_binary"].mean().reset_index()
fig2 = px.bar(
    purchase_by_group,
    x="Social group",
    y="online_purchase_binary",
    # Show each bar's value, rounded to 2 decimals, as its text label.
    text=purchase_by_group["online_purchase_binary"].round(2),
    title="Online Purchase Rate by Social Group",
    # The labels key must match the plotted column name exactly; the previous
    # key "online_purchase" matched no column, so the axis kept the raw name.
    labels={"online_purchase_binary": "Purchase Rate"},
)
fig2.update_traces(textposition="outside")
fig2.update_layout(width=800, height=500)
fig2.show()

In [43]:
# Bar chart of the per-religion mean of online_purchase_binary.
religion_means = eda_df.groupby("Religion")["online_purchase_binary"].mean()
purchase_by_religion = religion_means.reset_index()

# Bar annotations: the plotted values rounded to two decimals.
religion_bar_text = purchase_by_religion["online_purchase_binary"].round(2)

fig3 = px.bar(
    purchase_by_religion,
    x="Religion",
    y="online_purchase_binary",
    text=religion_bar_text,
    title="Online Purchase Rate by Religion",
    labels={"online_purchase_binary": "Purchase Rate"},
)
fig3.update_traces(textposition="outside")
fig3.update_layout(height=500, width=800)
fig3.show()

In [44]:
# 50-bin histogram of the household monthly consumption expenditure column.
expenditure_col = "Usual monthly consumption expenditure of the household"
fig4 = px.histogram(
    eda_df,
    x=expenditure_col,
    nbins=50,
    title="Distribution of Household Expenditure",
    # Replace the long column name with a short axis label.
    labels={expenditure_col: "Monthly Expenditure (INR)"},
)
fig4.update_layout(height=500, width=800)
fig4.show()

In [45]:
# 15-bin histogram of the "Household size" column.
household_size_col = "Household size"
fig5 = px.histogram(
    eda_df,
    x=household_size_col,
    nbins=15,
    title="Distribution of Household Size",
    labels={household_size_col: "Household Size"},
)
fig5.update_layout(height=500, width=800)
fig5.show()

In [47]:
# Correlation heatmap for numeric variables:
# pairwise correlations (DataFrame.corr defaults) between the four columns below.

numeric_cols = ["internet_access_binary", "online_purchase_binary",
                "Usual monthly consumption expenditure of the household", "Household size"]
corr_matrix = eda_df[numeric_cols].corr()

fig6 = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.columns.tolist(),
    # Without annotation_text the factory prints the raw z values, i.e.
    # full-precision floats that crowd each cell. Annotate with values rounded
    # to 2 decimals; the cell colours still use the exact z values.
    annotation_text=np.round(corr_matrix.values, 2),
    colorscale="Viridis",
)
fig6.update_layout(title="Correlation Heatmap of Numeric Variables", width=1100, height=800)
fig6.show()