In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime



In [11]:

def load_data(data_dir=None):
    """Load the customer, product and transaction CSV files.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``Customers.csv``, ``Products.csv`` and
        ``Transactions.csv``. When omitted, the directory this notebook was
        originally written against is used, so existing ``load_data()``
        calls keep working.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(customers_df, products_df, transactions_df)`` with
        ``customers_df['SignupDate']`` and
        ``transactions_df['TransactionDate']`` converted to datetime.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in pathlib.
    from pathlib import Path

    if data_dir is None:
        # Backward-compatible default: the original hard-coded location.
        data_dir = "C:\\Users\\chatu\\Downloads"
    data_dir = Path(data_dir)

    customers_df = pd.read_csv(data_dir / "Customers.csv")
    products_df = pd.read_csv(data_dir / "Products.csv")
    transactions_df = pd.read_csv(data_dir / "Transactions.csv")

    # Convert date columns to datetime
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    return customers_df, products_df, transactions_df


In [12]:
def perform_customer_analysis(transactions_df):
    """Summarise transactions per customer.

    Groups ``transactions_df`` by ``CustomerID`` and returns one row per
    customer with:

    - ``TotalTransactions``: count of ``TransactionID``
    - ``TotalSpend``: sum of ``TotalValue``
    - ``TotalItems``: sum of ``Quantity``
    - ``AvgOrderValue``: ``TotalSpend / TotalTransactions``
    """
    per_customer = (
        transactions_df
        .groupby('CustomerID')
        .agg(
            TotalTransactions=('TransactionID', 'count'),
            TotalSpend=('TotalValue', 'sum'),
            TotalItems=('Quantity', 'sum'),
        )
        .reset_index()
    )

    avg_order_value = per_customer['TotalSpend'] / per_customer['TotalTransactions']
    return per_customer.assign(AvgOrderValue=avg_order_value)

In [13]:
def perform_product_analysis(transactions_df, products_df):
    """Summarise transactions per product and category.

    Joins ``transactions_df`` to ``products_df`` on ``ProductID`` and returns
    one row per ``(ProductID, Category)`` pair holding the count of
    ``TransactionID``, the sum of ``TotalValue`` and the sum of ``Quantity``.
    The aggregated columns keep their source column names.
    """
    aggregations = {
        'TransactionID': 'count',
        'TotalValue': 'sum',
        'Quantity': 'sum',
    }

    sales_with_products = pd.merge(transactions_df, products_df, on='ProductID')
    by_product = sales_with_products.groupby(['ProductID', 'Category'])

    return by_product.agg(aggregations).reset_index()

In [14]:
def perform_regional_analysis(customers_df, transactions_df):
    """Summarise transactions per region.

    Joins ``customers_df`` to ``transactions_df`` on ``CustomerID`` and
    returns one row per ``Region`` holding the count of ``TransactionID``,
    the sum of ``TotalValue`` and the number of distinct ``CustomerID``
    values. The aggregated columns keep their source column names, so the
    ``CustomerID`` column of the result is a distinct-customer count.
    """
    aggregations = {
        'TransactionID': 'count',
        'TotalValue': 'sum',
        'CustomerID': 'nunique',
    }

    customer_sales = pd.merge(customers_df, transactions_df, on='CustomerID')
    by_region = customer_sales.groupby('Region')

    return by_region.agg(aggregations).reset_index()

In [15]:
def perform_time_analysis(transactions_df):
    """Summarise transactions per calendar month.

    Derives ``Year`` and ``Month`` from ``transactions_df['TransactionDate']``
    (which must already be a datetime column) and returns one row per
    ``(Year, Month)`` holding the sum of ``TotalValue`` and the count of
    ``TransactionID``.

    The input frame is not modified: the helper columns are added to a copy,
    so calling this function does not leak ``Year``/``Month`` columns into
    the caller's ``transactions_df`` or change what other analyses see.
    """
    transaction_dates = transactions_df['TransactionDate']

    # ``assign`` returns a new frame, leaving the caller's DataFrame untouched.
    dated = transactions_df.assign(
        Month=transaction_dates.dt.month,
        Year=transaction_dates.dt.year,
    )

    time_metrics = dated.groupby(['Year', 'Month']).agg({
        'TotalValue': 'sum',
        'TransactionID': 'count'
    }).reset_index()

    return time_metrics

In [16]:
def generate_visualizations(customer_metrics, product_metrics, regional_metrics, time_metrics):
    """Render the four summary charts and save each as a PNG in the working directory.

    Parameters
    ----------
    customer_metrics : DataFrame
        Must contain ``TotalSpend``.
    product_metrics : DataFrame
        Must contain ``Category`` and ``TotalValue``; may hold several rows
        per category (e.g. one per product).
    regional_metrics : DataFrame
        Must contain ``Region`` and ``TotalValue``.
    time_metrics : DataFrame
        Must contain ``TotalValue``, in the order it should be plotted. When
        ``Year`` and ``Month`` columns are also present they are used to
        label the x axis.

    Files written: ``customer_spend_distribution.png``,
    ``category_performance.png``, ``regional_performance.png`` and
    ``sales_trend.png``.
    """
    # Customer Distribution Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=customer_metrics, x='TotalSpend', bins=50, ax=ax)
    ax.set_title('Distribution of Customer Total Spend')
    fig.savefig('customer_spend_distribution.png')
    plt.close(fig)

    # Product Category Performance
    # seaborn's barplot aggregates repeated x values with the *mean* by
    # default, so plotting rows that are per product would show average
    # per-product sales (with error bars) under a "Sales by ... Category"
    # title. Sum to one row per category first so the bars are totals.
    # sort=False keeps categories in their order of first appearance.
    category_totals = product_metrics.groupby(
        'Category', as_index=False, sort=False
    )['TotalValue'].sum()
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=category_totals, x='Category', y='TotalValue', ax=ax)
    ax.set_title('Sales by Product Category')
    ax.tick_params(axis='x', rotation=45)
    fig.savefig('category_performance.png')
    plt.close(fig)

    # Regional Performance
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=regional_metrics, x='Region', y='TotalValue', ax=ax)
    ax.set_title('Sales by Region')
    fig.savefig('regional_performance.png')
    plt.close(fig)

    # Time Series Plot
    fig, ax = plt.subplots(figsize=(15, 6))
    positions = list(range(len(time_metrics)))
    ax.plot(positions, time_metrics['TotalValue'])
    ax.set_title('Sales Trend Over Time')
    ax.set_ylabel('TotalValue')
    if {'Year', 'Month'}.issubset(time_metrics.columns):
        # Label each point with its period instead of a bare row index.
        period_labels = [
            f"{int(year)}-{int(month):02d}"
            for year, month in zip(time_metrics['Year'], time_metrics['Month'])
        ]
        ax.set_xticks(positions)
        ax.set_xticklabels(period_labels, rotation=45)
    fig.savefig('sales_trend.png')
    plt.close(fig)


In [19]:
def main():
    """Run the full EDA pipeline: load data, compute metrics, save charts and the insights PDF.

    Writes the chart PNGs produced by ``generate_visualizations`` and a
    one-page ``FirstName_LastName_EDA.pdf`` containing the insight text, all
    in the current working directory.
    """
    # Load data
    customers_df, products_df, transactions_df = load_data()

    # Perform analyses
    customer_metrics = perform_customer_analysis(transactions_df)
    product_metrics = perform_product_analysis(transactions_df, products_df)
    regional_metrics = perform_regional_analysis(customers_df, transactions_df)
    time_metrics = perform_time_analysis(transactions_df)

    # Generate visualizations
    generate_visualizations(customer_metrics, product_metrics, regional_metrics, time_metrics)

    # Save insights to PDF
    # TODO: these are still placeholders; replace them with findings derived
    # from the metrics computed above.
    insights = [
        "Insight 1: [Based on actual data analysis]",
        "Insight 2: [Based on actual data analysis]",
        "Insight 3: [Based on actual data analysis]",
        "Insight 4: [Based on actual data analysis]",
        "Insight 5: [Based on actual data analysis]"
    ]

    # Writing raw text to a file named *.pdf does not produce a PDF that a
    # viewer can open. Render the text onto an A4-sized figure and let
    # matplotlib emit a real PDF (format is inferred from the extension).
    report = plt.figure(figsize=(8.27, 11.69))
    report.text(0.1, 0.9, "\n\n".join(insights), va='top', ha='left', wrap=True)
    report.savefig('FirstName_LastName_EDA.pdf')
    plt.close(report)

# Run the pipeline when this code is executed as a script or notebook cell.
if __name__ == "__main__":
    main()