In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Candidate header spellings for each logical field. The first entry is the
# name this analysis was originally written against; the later entries are the
# spellings that appear in the sales_data.csv preview (Date, Quantity,
# Unit_Price). The first candidate present in the frame wins.
DATE_COLUMNS = ("Order Date", "Date")
UNITS_COLUMNS = ("Units Sold", "Quantity")
PRICE_COLUMNS = ("Unit Price", "Unit_Price")
REGION_COLUMNS = ("Region",)


def resolve_column(df, candidates):
    """Return the first name in *candidates* that is a column of *df*, else None."""
    return next((name for name in candidates if name in df.columns), None)


def load_sales_data(path="sales_data.csv"):
    """Task 1: load the sales CSV, print an overview, and forward-fill gaps.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame with missing values forward-filled, or ``None``
        when the file could not be read (a message is printed in that case).
    """
    # Only the read itself is wrapped, so a failure here cannot be confused
    # with a problem in the reporting code below.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print("⚠️ File not found. Please check the filename and path.")
        return None
    except Exception as e:
        print(f"⚠️ Error loading dataset: {e}")
        return None

    # Display first few rows
    print("🔍 First 5 rows of the dataset:")
    print(df.head())

    # Check data types
    print("\n📦 Data types:")
    print(df.dtypes)

    # Check for missing values
    print("\n❓ Missing values:")
    print(df.isnull().sum())

    # Clean missing values. DataFrame.ffill() replaces the deprecated
    # fillna(method='ffill') spelling.
    return df.ffill()


def mean_units_by_region(df):
    """Return the mean units sold per region, or None if a column is missing."""
    region_col = resolve_column(df, REGION_COLUMNS)
    units_col = resolve_column(df, UNITS_COLUMNS)
    if region_col is None or units_col is None:
        return None
    return df.groupby(region_col)[units_col].mean()


def plot_units_over_time(df):
    """Line chart: units sold over time.

    Converts the date column to datetime and sorts *df* by it in place (the
    original script did the same, so later cells see the sorted frame).

    Returns:
        True if the chart was drawn, False if a required column is missing.
    """
    date_col = resolve_column(df, DATE_COLUMNS)
    units_col = resolve_column(df, UNITS_COLUMNS)
    if date_col is None or units_col is None:
        return False

    df[date_col] = pd.to_datetime(df[date_col])
    df.sort_values(date_col, inplace=True)
    plt.figure(figsize=(10, 5))
    plt.plot(df[date_col], df[units_col], label='Units Sold', color='teal')
    plt.title('📈 Units Sold Over Time')
    plt.xlabel('Date')
    plt.ylabel('Units Sold')
    plt.legend()
    plt.tight_layout()
    plt.show()
    return True


def plot_region_averages(region_units):
    """Bar chart: average units sold by region.

    Args:
        region_units: Series indexed by region (as produced by
            ``mean_units_by_region``), or None when it could not be computed.

    Returns:
        True if the chart was drawn, False if there was nothing to plot.
    """
    # Previously this chart ran unconditionally and raised NameError whenever
    # the grouped series had not been computed.
    if region_units is None or region_units.empty:
        return False

    plt.figure(figsize=(8, 5))
    sns.barplot(x=region_units.index, y=region_units.values, palette='coolwarm')
    plt.title('📊 Average Units Sold by Region')
    plt.xlabel('Region')
    plt.ylabel('Average Units Sold')
    plt.tight_layout()
    plt.show()
    return True


def plot_price_distribution(df):
    """Histogram: distribution of unit price.

    Returns:
        True if the chart was drawn, False if the price column is missing.
    """
    price_col = resolve_column(df, PRICE_COLUMNS)
    if price_col is None:
        return False

    plt.figure(figsize=(8, 5))
    # Forward-fill cannot fill a leading gap, so drop any NaN that survived.
    plt.hist(df[price_col].dropna(), bins=20, color='skyblue', edgecolor='black')
    plt.title('📉 Distribution of Unit Price')
    plt.xlabel('Unit Price')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    return True


def plot_price_vs_units(df):
    """Scatter plot: unit price vs units sold, coloured by region if available.

    Returns:
        True if the chart was drawn, False if a required column is missing.
    """
    price_col = resolve_column(df, PRICE_COLUMNS)
    units_col = resolve_column(df, UNITS_COLUMNS)
    if price_col is None or units_col is None:
        return False

    # Region is optional here: without it the points are drawn in one colour
    # instead of failing on a missing hue column.
    region_col = resolve_column(df, REGION_COLUMNS)
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=df, x=price_col, y=units_col, hue=region_col, palette='deep')
    plt.title('🔎 Unit Price vs Units Sold by Region')
    plt.xlabel('Unit Price')
    plt.ylabel('Units Sold')
    if region_col is not None:
        plt.legend(title='Region')
    plt.tight_layout()
    plt.show()
    return True


def main(path="sales_data.csv"):
    """Run the full load -> analyse -> visualise pipeline.

    Args:
        path: Location of the sales CSV file.

    Returns:
        A ``(df, region_units)`` tuple. Both are ``None`` when the dataset
        could not be loaded; ``region_units`` alone is ``None`` when the
        region/units columns are not present.
    """
    # Set Seaborn style
    sns.set(style="whitegrid")

    # Task 1: Load and Explore the Dataset
    df = load_sales_data(path)
    if df is None:
        # Nothing to analyse; stop here instead of failing on an undefined df.
        return None, None

    # Task 2: Basic Data Analysis
    print("\n📊 Basic statistics:")
    print(df.describe())

    region_units = mean_units_by_region(df)
    if region_units is not None:
        print("\n📍 Average Units Sold by Region:")
        print(region_units)

    # Task 3: Data Visualization
    drawn = {
        "Units Sold over time": plot_units_over_time(df),
        "Average Units Sold by Region": plot_region_averages(region_units),
        "Distribution of Unit Price": plot_price_distribution(df),
        "Unit Price vs Units Sold": plot_price_vs_units(df),
    }
    # Report skipped charts so a header mismatch is visible rather than silent.
    for chart, was_drawn in drawn.items():
        if not was_drawn:
            print(f"⚠️ Skipped chart '{chart}': required columns not found.")

    return df, region_units


if __name__ == "__main__":
    # Bind the results at module level, as the original flat script did, so
    # later notebook cells can keep using `df` and `region_units`.
    df, region_units = main()


🔍 First 5 rows of the dataset:
         Date    Product Region Salesperson  Quantity  Unit_Price  Total_Sales
0  2023-11-24      Mouse  North       Frank         5      527.54      2637.70
1  2023-03-13      Mouse  North         Eve         7      111.98       783.83
2  2023-02-17    Monitor  South         Eve        10      101.75      1017.45
3  2023-04-12  USB Cable  South       Diana        10      592.47      5924.72
4  2023-01-04   Keyboard   West     Charlie         5      353.19      1765.93

📦 Data types:
Date            object
Product         object
Region          object
Salesperson     object
Quantity         int64
Unit_Price     float64
Total_Sales    float64
dtype: object

❓ Missing values:
Date           0
Product        0
Region         0
Salesperson    0
Quantity       0
Unit_Price     0
Total_Sales    0
dtype: int64

📊 Basic statistics:
          Quantity   Unit_Price   Total_Sales
count  1000.000000  1000.000000   1000.000000
mean      5.610000  1018.323010   5709.39

  df.fillna(method='ffill', inplace=True)


NameError: name 'region_units' is not defined

<Figure size 800x500 with 0 Axes>