In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Load the three CSV inputs and join them into one transaction-level frame
def load_data(data_dir='DataFile'):
    """Load the customer, product and transaction CSVs and merge them.

    Parameters
    ----------
    data_dir : str or os.PathLike, default 'DataFile'
        Directory that contains ``Customers.csv``, ``Products.csv`` and
        ``Transactions.csv``. The default keeps the original hard-coded
        location, so existing ``load_data()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Transactions inner-joined to customers on ``CustomerID`` and then to
        products on ``ProductID``. Transactions without a matching customer
        or product are dropped. Non-key columns present on both sides of the
        product join are disambiguated with ``_txn`` / ``_prod`` suffixes.
    """
    data_dir = Path(data_dir)

    customers = pd.read_csv(data_dir / 'Customers.csv', parse_dates=['SignupDate'])
    products = pd.read_csv(data_dir / 'Products.csv')
    transactions = pd.read_csv(data_dir / 'Transactions.csv',
                               parse_dates=['TransactionDate'])

    # Both joins are inner joins; the second one states it explicitly so the
    # two steps read consistently (inner is also pandas' default).
    return (transactions
            .merge(customers, on='CustomerID', how='inner')
            .merge(products, on='ProductID', how='inner',
                   suffixes=('_txn', '_prod')))

In [2]:
def generate_plots(df, customers):
    """Build the five EDA figures and return them in report order.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged transaction data. The columns read here are ``TotalValue``,
        ``ProductName``, ``TransactionDate`` and ``Region``.
    customers : pandas.DataFrame
        Customer table; only its ``Region`` column is used.

    Returns
    -------
    list
        The matplotlib figures, in order: transaction value distribution,
        customers per region, top 10 products by revenue, monthly revenue
        trend, and the two-panel regional spending comparison.

    Notes
    -----
    A ``Month`` column holding monthly period strings is written onto ``df``
    itself, so the caller's frame is modified. One progress line is printed
    after each figure is built.
    """
    # Figure 1: histogram with KDE overlay of per-transaction totals
    value_fig, value_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['TotalValue'], bins=30, kde=True, ax=value_ax)
    value_ax.set_title('Distribution of Total Transaction Values')
    value_ax.set_xlabel('Total Value')
    value_ax.set_ylabel('Frequency')
    print("Transaction Value Distribution successfully completed")

    # Figure 2: number of customers in each region
    region_fig, region_ax = plt.subplots(figsize=(10, 6))
    customers_per_region = customers['Region'].value_counts()
    sns.barplot(x=customers_per_region.index,
                y=customers_per_region.values,
                ax=region_ax)
    region_ax.set_title('Customer Distribution by Region')
    region_ax.set_xlabel('Region')
    region_ax.set_ylabel('Number of Customers')
    print("Customer Distribution by Region successfully completed")

    # Figure 3: the ten products with the highest summed revenue
    revenue_by_product = df.groupby('ProductName')['TotalValue'].sum()
    best_sellers = revenue_by_product.nlargest(10)
    products_fig, products_ax = plt.subplots(figsize=(12, 6))
    best_sellers.plot.barh(color='teal', ax=products_ax)
    products_ax.set_title('Top 10 Products by Total Revenue')
    products_ax.set_xlabel('Total Revenue')
    products_ax.set_ylabel('Product')
    print("Top Performing Products successfully completed")

    # Figure 4: revenue per calendar month.
    # The Month column is stored on df (not a local) on purpose: it stays
    # visible to the caller after this function returns.
    df['Month'] = df['TransactionDate'].dt.to_period('M').astype(str)
    revenue_by_month = df.groupby('Month')['TotalValue'].sum()
    monthly_fig, monthly_ax = plt.subplots(figsize=(12, 6))
    revenue_by_month.plot.line(marker='o', ax=monthly_ax)
    monthly_ax.set_title('Monthly Transaction Value Trends')
    monthly_ax.set_xlabel('Month')
    monthly_ax.set_ylabel('Total Revenue')
    print("Monthly Trends successfully completed")

    # Figure 5: side-by-side mean order value and total revenue per region
    spend_by_region = df.groupby('Region')['TotalValue'].agg(['mean', 'sum'])
    regional_fig, panels = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
    mean_ax, sum_ax = panels
    spend_by_region['mean'].plot.bar(color='salmon', ax=mean_ax)
    mean_ax.set_title('Average Order Value by Region')
    mean_ax.set_ylabel('Average Value')
    spend_by_region['sum'].plot.bar(color='lightseagreen', ax=sum_ax)
    sum_ax.set_title('Total Revenue by Region')
    sum_ax.set_ylabel('Total Value')
    print("Regional Spending Analysis successfully completed")

    return [value_fig, region_fig, products_fig, monthly_fig, regional_fig]

# Main script: build every plot plus a summary page and write them to one PDF
if __name__ == '__main__':
    # Configure visual settings
    sns.set_style('whitegrid')
    plt.rcParams.update({'font.size': 12})

    # Load and prepare data. The customer table is also read on its own
    # because the region plot counts customers rather than transactions.
    customers = pd.read_csv('DataFile/Customers.csv', parse_dates=['SignupDate'])
    merged_data = load_data()

    # Derive the month labels here instead of reading a 'Month' column off
    # merged_data: that column only exists as a side effect of
    # generate_plots(), so relying on it couples the statistics page to the
    # plotting code's internals.
    transaction_months = (merged_data['TransactionDate']
                          .dt.to_period('M')
                          .astype(str))

    # Create the output folder up front; otherwise writing the PDF fails with
    # FileNotFoundError when 'Data/' does not exist yet.
    report_path = Path('Data/EDA_Report.pdf')
    report_path.parent.mkdir(parents=True, exist_ok=True)

    # Generate all plots and save them in a PDF, one figure per page
    with PdfPages(report_path) as pdf:
        for fig in generate_plots(merged_data, customers):
            pdf.savefig(fig)
            plt.close(fig)  # release the figure once it is on the page

        # Add statistics page: a blank figure carrying only text
        stats_page = plt.figure(figsize=(11, 8))
        stats_text = [
            "Key Statistics:",
            f"- Total Revenue: ${merged_data['TotalValue'].sum():,.2f}",
            f"- Average Order Value: ${merged_data['TotalValue'].mean():.2f}",
            f"- Total Customers: {merged_data['CustomerID'].nunique()}",
            f"- Most Popular Category: {merged_data['Category'].mode()[0]}",
            # mode() of the month labels = month with the most transactions
            f"- Busiest Month: {transaction_months.mode()[0]}"
        ]
        stats_page.text(0.1, 0.5, '\n'.join(stats_text), fontsize=14)
        pdf.savefig(stats_page)
        plt.close(stats_page)

    print("EDA Report generated successfully and pdf saved to Data Folder.")



Transaction Value Distribution successfully completed
Customer Distribution by Region successfully completed
Top Performing Products successfully completed
Monthly Trends successfully completed
Regional Spending Analysis successfully completed
EDA Report generated successfully and pdf saved to Data Folder.
