In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

# --- Configuration for Data Size (24 Months) ---
NUM_MONTHS = 24  # Number of consecutive calendar months to generate
ENTRIES_PER_MONTH = 900 # Total rows: 21,600
START_YEAR = 2024  # The first generated month is January of this year
BASE_REVENUE_LOC = 5000  # Mean of the normal distribution used for base revenue

# --- Reproducibility ---
# Seed NumPy's global generator so that every np.random.* draw in this script
# (and therefore the exported dataset) is identical on each fresh run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# --- Categorical Variables ---
# Each generated row samples one value, uniformly at random, from each list.
PROFIT_CENTERS = ['Product_A', 'Product_B', 'Service_C', 'Sales_Channel_X']
REGIONS = ['North America', 'EMEA', 'APAC', 'LATAM']
PRODUCT_CATEGORIES = ['Software Subscription', 'Hardware Sales', 'Professional Services', 'Maintenance']
LINE_ITEM_DESCRIPTIONS = [
    'Annual Premium License', 'Standard Monthly Fee', 'Consulting Project Alpha', 
    'Cloud Server Unit', 'Legacy Product Support', 'Installation Fee'
]

# --- Step 1: Create Base DataFrame and Time Features ---
# One timestamp per generated month, each anchored on the first day of the month.
all_months = pd.date_range(start=f'{START_YEAR}-01-01', periods=NUM_MONTHS, freq='MS')

size = ENTRIES_PER_MONTH
df_list = []
for month_index, month in enumerate(all_months):
    # Draw the categorical columns one after another, in a fixed order, so the
    # sequence of random draws stays the same from month to month.
    categorical_draws = {
        column: np.random.choice(options, size=size, replace=True)
        for column, options in (
            ('profit_center', PROFIT_CENTERS),
            ('region', REGIONS),
            ('product_category', PRODUCT_CATEGORIES),
            ('line_item_description', LINE_ITEM_DESCRIPTIONS),
        )
    }

    # The scalar month (and the scalar calendar features below) are broadcast
    # to every row of this month's frame.
    month_df = pd.DataFrame({'month': month, **categorical_draws}).assign(
        year=month.year,
        month_int=month.month,
        quarter=month.quarter,
        is_qtr_end=int(month.month % 3 == 0),
        month_index=month_index,  # Used for trend calculation
    )
    df_list.append(month_df)

# Stack the monthly frames once, renumbering the index from 0.
df = pd.concat(df_list, ignore_index=True)

# --- Step 2: Calculate Complex Revenue (Trend + Seasonality + Noise) ---

# 1. Base revenue: one normally distributed draw per row.
noise_scale = 1500
df['revenue'] = np.random.normal(loc=BASE_REVENUE_LOC, scale=noise_scale, size=len(df))

# 2. Trend: a 5% annual growth rate converted to its monthly equivalent,
#    (1 + annual)^(1/12) - 1, then compounded once per elapsed month.
annual_growth_rate = 0.05
monthly_growth_rate = (1 + annual_growth_rate)**(1/12) - 1
monthly_multiplier = 1 + monthly_growth_rate
df['trend_factor'] = monthly_multiplier ** df['month_index']
df['revenue'] = df['revenue'].mul(df['trend_factor'])

# 3. Add Seasonality (Adjustment based on quarter)
def get_seasonality_factor(q):
    """Return the revenue multiplier for quarter ``q``.

    Quarter 4 gets a 1.30 multiplier (strong peak), quarter 3 gets 0.90
    (slight dip), and every other value is neutral at 1.00.
    """
    q4_peak, q3_dip, neutral = 1.30, 0.90, 1.00
    if q == 4:
        return q4_peak
    return q3_dip if q == 3 else neutral

# Look up each row's quarterly multiplier and scale its revenue by it.
df['seasonality_factor'] = df['quarter'].map(get_seasonality_factor)
df['revenue'] = df['revenue'] * df['seasonality_factor']

# Final cleanup
# Floor revenue at 500 and round to 2 decimals. Series.clip is vectorized,
# replacing a Python-level lambda that was called once per row.
df['revenue'] = df['revenue'].clip(lower=500).round(2)
df = df.drop(columns=['month_index', 'trend_factor', 'seasonality_factor']) # Drop helper columns

# --- Step 3: Verification and Output ---
# Sanity check: every generated month contributes exactly ENTRIES_PER_MONTH rows.
assert len(df) == NUM_MONTHS * ENTRIES_PER_MONTH, "unexpected row count in generated dataset"

print(f"✅ Successfully generated a dataset with {len(df)} rows.")
print("The revenue now includes a long-term **5% trend** and **quarterly seasonality**.")
print(f"Date range: {df['month'].min().strftime('%Y-%m')} to {df['month'].max().strftime('%Y-%m')}.")

# Save the dataset. Create the target folder first so the export does not
# fail on a fresh checkout where the directory does not exist yet.
output_path = Path('../data/raw/revenue_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

✅ Successfully generated a dataset with 21600 rows.
The revenue now includes a long-term **5% trend** and **quarterly seasonality**.
Date range: 2024-01 to 2025-12.
