Create a dataset about apples. 
Defining a dataset generator

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


def generate_apple_sales_data_with_promo_adjustment(
        base_demand: int = 1000, n_rows: int = 5000
):
    """
    Generates a synthetic dataset for predicting apple sales demand with seasonality
    and inflation.

    This function creates a pandas DataFrame with features relevant to apple sales.
    The features include date, average_temperature, rainfall, weekend flag, holiday flag,
    promotional flag, price_per_kg, and the previous day's demand. The target variable,
    'demand', is generated based on a combination of these features with some added noise.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_seasonality(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility
    np.random.seed(9999)

    # Create date range
    dates = [datetime.now() - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (years)
    df["inflation_multiplier"] = (
            1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03
    )

    # Incorporate seasonality due to apple harvests
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
                           base_demand
                           + base_price_effect
                           + seasonality_effect
                           + promo_effect
                           + df["weekend"] * 300
                           + np.random.normal(0, 50, n_rows)
                   ) * df[
                       "inflation_multiplier"
                   ]  # adding random noise

    # Add previous day's demand
    df["previous_days_demand"] = df["demand"].shift(1)
    df["previous_days_demand"].fillna(
        method="bfill", inplace=True
    )  # fill the first row

    # Drop temporary columns
    df.drop(columns=["inflation_multiplier", "harvest_effect", "month"], inplace=True)

    return df


In [2]:
generate_apple_sales_data_with_promo_adjustment()

  df["previous_days_demand"].fillna(


Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand
0,2010-05-24 08:23:14.271950,30.584727,1.199291,0,0,1.726258,0,851.276659,851.276659
1,2010-05-25 08:23:14.271947,15.465069,1.037626,0,0,0.576471,0,906.836626,851.276659
2,2010-05-26 08:23:14.271945,10.786525,5.656089,0,0,2.513328,0,857.895424,906.836626
3,2010-05-27 08:23:14.271942,23.648154,12.030937,0,0,1.839225,0,848.961007,857.895424
4,2010-05-28 08:23:14.271940,13.861391,4.303812,0,0,1.531772,0,983.128282,848.961007
...,...,...,...,...,...,...,...,...,...
4995,2024-01-26 08:23:14.263375,21.643051,3.821656,0,0,2.391010,0,1215.125948,1170.799278
4996,2024-01-27 08:23:14.263374,13.808813,1.080603,1,1,0.898693,0,1738.128382,1215.125948
4997,2024-01-28 08:23:14.263372,11.698227,1.911000,1,0,2.839860,0,1462.323379,1738.128382
4998,2024-01-29 08:23:14.263370,18.052081,1.000521,0,0,1.188440,0,1397.144493,1462.323379


In [3]:
# Generate the data
df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)

# Save the DataFrame to CSV file
df.to_csv('apple_sales_data.csv', index=False)

  df["previous_days_demand"].fillna(
