# Supermarket Sales Data Exploration

Notebook này dùng để khám phá và phân tích bộ dữ liệu Supermarket Sales trước khi thực hiện quy trình ETL.


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation notices from pandas/matplotlib; consider narrowing
# the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets with a
# 'seaborn-v0_8' prefix; older releases only ship the plain 'seaborn' name.
# Use whichever one this installation provides so the cell does not raise on
# older environments. If neither is present, matplotlib's default style is
# kept and only the seaborn palette below is applied.
_style_candidates = ('seaborn-v0_8', 'seaborn')
_chosen_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    None,
)
if _chosen_style is not None:
    plt.style.use(_chosen_style)
sns.set_palette("husl")


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Load data
# Read the raw CSV into `df`; the path is relative, so it resolves against
# the current working directory.
data_path = Path('../data/raw/SuperMarketAnalysis.csv')
df = pd.read_csv(data_path)

# Quick sanity check: frame dimensions and column names.
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")


In [None]:
# Basic info
# Print the dtype / non-null summary, followed by a divider line.
df.info()
divider = "\n" + "=" * 50
print(divider)

# The first five rows are rendered as the cell's output.
df.head(n=5)


In [None]:
# Statistical summary
# describe() with default arguments; the resulting table is the cell output.
summary_stats = df.describe()
summary_stats


In [None]:
# Check for missing values
# Count nulls per column, then report either the affected columns or a single
# "nothing missing" message. Branching avoids printing the "Missing values:"
# header followed by an empty Series when the data is complete.
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

if columns_with_missing.empty:
    print("No missing values found!")
else:
    print("Missing values:")
    print(columns_with_missing)


In [None]:
# Sales analysis
# 2x2 grid: bar charts of total sales on the top row, pie charts of sales
# share on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: (axis, grouping column, title) for each bar chart.
bar_panels = [
    (axes[0, 0], 'Branch', 'Total Sales by Branch'),
    (axes[0, 1], 'Product line', 'Total Sales by Product Line'),
]
for panel_ax, group_col, panel_title in bar_panels:
    df.groupby(group_col)['Sales'].sum().plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Total Sales')

# Only the product-line panel has its x tick labels rotated.
axes[0, 1].tick_params(axis='x', rotation=45)

# Bottom row: (axis, grouping column, title) for each pie chart.
pie_panels = [
    (axes[1, 0], 'Customer type', 'Sales by Customer Type'),
    (axes[1, 1], 'Payment', 'Sales by Payment Method'),
]
for panel_ax, group_col, panel_title in pie_panels:
    df.groupby(group_col)['Sales'].sum().plot(
        kind='pie', ax=panel_ax, autopct='%1.1f%%'
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Time series analysis
# Parse the Date column once, store it back on df, and derive the
# month / day-of-month helper columns from the parsed values.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['Month'] = parsed_dates.dt.month
df['Day'] = parsed_dates.dt.day

# Daily sales trend: one row per calendar date with the summed Sales.
daily_sales = df.groupby('Date', as_index=False)['Sales'].sum()

trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
trend_ax.plot(daily_sales['Date'], daily_sales['Sales'])
trend_ax.set_title('Daily Sales Trend')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Total Sales')
trend_ax.tick_params(axis='x', rotation=45)
trend_fig.tight_layout()
plt.show()


In [None]:
# Correlation analysis
# Pairwise correlations between every numeric column currently on df.
numeric_columns = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_columns].corr()

# Annotated heatmap with the colour scale centred on zero.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()


In [None]:
# Data quality insights
# Parse the dates locally rather than relying on an earlier cell having
# already converted df['Date']: if the column is still stored as strings,
# min()/max() would compare text instead of calendar dates. When the column
# is already datetime this conversion leaves the values unchanged.
invoice_dates = pd.to_datetime(df['Date'])
sales = df['Sales']

print("=== DATA QUALITY INSIGHTS ===")
print(f"Total records: {len(df)}")
print(f"Unique invoices: {df['Invoice ID'].nunique()}")
print(f"Date range: {invoice_dates.min()} to {invoice_dates.max()}")
print(f"Total sales amount: ${sales.sum():,.2f}")
print(f"Average transaction: ${sales.mean():.2f}")
print(f"Highest transaction: ${sales.max():.2f}")
print(f"Lowest transaction: ${sales.min():.2f}")
