In [None]:
# Adidas US Sales Analysis: Robust, Dynamic, Error-Free
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Utility: Get latest data file from a directory

def get_latest_data(directory, pattern='*.xlsx'):
    """
    Load the most recently modified data file from the specified directory.

    Files matching ``pattern`` are preferred; when none of them is loadable the
    search falls back to ``*.csv``. Entries that are not regular files (for
    example a sub-directory whose name matches the pattern) and Office lock
    files (names starting with ``~$``, present while a workbook is open) are
    ignored so they can never be selected as the "latest" file.

    Args:
        directory (str or Path): Directory to search for files
        pattern (str): Glob pattern to match (default: '*.xlsx')

    Returns:
        pandas.DataFrame: Data from the most recent file

    Raises:
        FileNotFoundError: If neither ``pattern`` nor ``*.csv`` matches a
            loadable file in ``directory``.
    """
    cleaned_dir = Path(directory)

    def _loadable(glob_pattern):
        # Keep regular files only and skip Office "~$" lock files.
        return [
            entry for entry in cleaned_dir.glob(glob_pattern)
            if entry.is_file() and not entry.name.startswith('~$')
        ]

    # Prefer the requested pattern; fall back to CSV only when it yields nothing.
    files = _loadable(pattern) or _loadable('*.csv')
    if not files:
        raise FileNotFoundError(f"No {pattern} or CSV files found in {cleaned_dir}")

    # "Latest" means the newest modification time on disk.
    latest_file = max(files, key=lambda f: f.stat().st_mtime)
    print(f"Loading file: {latest_file}")

    # Choose the reader from the extension (case-insensitive); anything that
    # is not .csv is handed to the Excel reader, as before.
    if latest_file.suffix.lower() == '.csv':
        return pd.read_csv(latest_file)
    else:
        return pd.read_excel(latest_file)

# Adidas US Sales Analysis

This notebook provides a robust, dynamic workflow for loading, cleaning, and visualizing Adidas US Sales data. All steps are organized for clarity and error-free execution.

## Data Loading & Cleaning

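This section installs the required packages, loads the most recently modified file from the cleaned-data directory, and drops rows with missing values in the required columns, reporting any required columns that are absent.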

In [None]:
# Install required packages (run once)
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# openpyxl is not imported directly in this notebook; presumably it is needed by
# pd.read_excel for .xlsx files — confirm.
# NOTE(review): the import cell runs before this one, so on a fresh environment
# run this cell first; versions are unpinned.
%pip install matplotlib seaborn openpyxl

In [None]:
# Load the latest cleaned Adidas US Sales data
# Update the path to your cleaned data directory
# ('cleaned_data' instead if running from the project root)
cleaned_data_dir = '../cleaned_data'
try:
    df = get_latest_data(cleaned_data_dir)
    print(f"Data loaded. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
except Exception as load_error:
    # Best-effort load: report the problem and leave `df` as None so the
    # following cells can detect the failure and skip their work.
    print(f"Error loading data: {load_error}")
    df = None

In [None]:
# Data cleaning: drop every row that has a missing value in any required column
required_columns = ['Retailer', 'Product', 'Total Sales', 'Operating Profit', 'State', 'Order Date']
if df is None:
    print("No data loaded. Skipping cleaning.")
else:
    # Split the required columns into those the frame has and those it lacks.
    present_cols = [column for column in required_columns if column in df.columns]
    missing_cols = [column for column in required_columns if column not in df.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}. Cleaning will skip these.")
    # One pass over all present columns removes the same rows as dropping
    # column by column; skipped entirely when none of them is present.
    if present_cols:
        df = df.dropna(subset=present_cols)
    print(f"Data cleaned. Shape: {df.shape}")

## Visualization & Analysis

This section plots total sales by product, total sales by state, and operating profit by retailer from the cleaned Adidas US Sales data. Each plot is skipped with a message when the data is not loaded or a required column is missing.

In [None]:
# Visualization: Total Sales by Product
if df is None or not all(needed in df.columns for needed in ('Product', 'Total Sales')):
    print("Required columns for plotting ('Product', 'Total Sales') are missing or data not loaded.")
else:
    plt.figure(figsize=(12, 6))
    # Total sales per product, sorted from largest to smallest.
    sales_by_product = (
        df.groupby('Product')['Total Sales']
        .sum()
        .sort_values(ascending=False)
    )
    sns.barplot(
        x=sales_by_product.index,
        y=sales_by_product.values,
    )
    plt.xticks(rotation=90)  # vertical tick labels
    plt.title('Total Sales by Product')
    plt.xlabel('Product')
    plt.ylabel('Total Sales')
    plt.tight_layout()
    plt.show()

DataFrame 'df' is not defined, is None, or required columns are missing.


In [7]:
# Visualization: Total Sales by State
if df is None or not all(needed in df.columns for needed in ('State', 'Total Sales')):
    print("Required columns for plotting ('State', 'Total Sales') are missing or data not loaded.")
else:
    plt.figure(figsize=(12, 6))
    # Total sales per state, sorted from largest to smallest.
    sales_by_state = (
        df.groupby('State')['Total Sales']
        .sum()
        .sort_values(ascending=False)
    )
    sns.barplot(
        x=sales_by_state.index,
        y=sales_by_state.values,
    )
    plt.xticks(rotation=90)  # vertical tick labels
    plt.title('Total Sales by State')
    plt.xlabel('State')
    plt.ylabel('Total Sales')
    plt.tight_layout()
    plt.show()

NameError: name 'df' is not defined

In [None]:
# Visualization: Operating Profit by Retailer
if df is None or not all(needed in df.columns for needed in ('Retailer', 'Operating Profit')):
    print("Required columns for plotting ('Retailer', 'Operating Profit') are missing or data not loaded.")
else:
    plt.figure(figsize=(12, 6))
    # Total operating profit per retailer, sorted from largest to smallest.
    profit_by_retailer = (
        df.groupby('Retailer')['Operating Profit']
        .sum()
        .sort_values(ascending=False)
    )
    sns.barplot(
        x=profit_by_retailer.index,
        y=profit_by_retailer.values,
    )
    plt.xticks(rotation=90)  # vertical tick labels
    plt.title('Operating Profit by Retailer')
    plt.xlabel('Retailer')
    plt.ylabel('Operating Profit')
    plt.tight_layout()
    plt.show()

DataFrame 'df' is not defined, is None, or required columns are missing.
