Create a synthetic sales dataset from the bikes currently offered, using random distributions while accounting for market trends (holidays, Black Friday, ...).

In [2]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta


# df  = pd.read_csv('/datasets/bike_types_and_names.csv')

# Catalogue of bikes on offer; generate_sales_data reads the 'bike_name' and
# 'bike_type' fields of each record.
bike_data = pd.read_csv('./datasets/bike_types_and_names.csv')

# Parameters for dataset creation
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 10, 1)
date_range = (end_date - start_date).days  # length of the order window in days
bikes = bike_data.to_dict('records')

# Special events for sales spikes.
# The event calendars must cover every year an order can fall in, so they are
# derived from the order window instead of being hard-coded.  The year before
# start_date is included because the +/-7 day Christmas window of the previous
# year (25 Dec + 7 days = 1 Jan) still reaches the first day of the window.
event_years = range(start_date.year - 1, end_date.year + 1)

christmas = [datetime(year, 12, 25) for year in event_years]
# NOTE: Black Friday is approximated by a fixed 26 Nov every year.
black_friday = [datetime(year, 11, 26) for year in event_years]
# NOTE: the Tour de France is approximated by a fixed 1 Jul every year.
tour_de_france = [datetime(year, 7, 1) for year in event_years]
# (first day, last day) of the summer season, both inclusive.
summer_season = [(datetime(year, 6, 1), datetime(year, 8, 31)) for year in event_years]

# Function to generate random sales with seasonal spikes
def generate_sales_data(n=1000):
    """Generate ``n`` synthetic bike sales records with seasonal spikes.

    Each record picks a random bike from the module-level ``bikes`` list and
    a random order date between ``start_date`` and ``start_date`` plus
    ``date_range`` days (both ends inclusive).  A base quantity drawn from a
    normal distribution (mean 3, std 1) is increased by the first matching
    rule, checked in this order:

    1. within 7 days of a ``christmas`` date: + normal(10, 2)
    2. within 7 days of a ``black_friday`` date: + normal(15, 2)
    3. a 'Road Bike' within 30 days of a ``tour_de_france`` date:
       + normal(15, 2)
    4. inside a ``summer_season`` range: + normal(5, 2)
    5. any other bike within 30 days of a ``tour_de_france`` date:
       + normal(1, 1)

    Every draw is truncated to an int, and the final quantity is clamped to
    a minimum of 1.

    Args:
        n: Number of records to generate.

    Returns:
        A ``pandas.DataFrame`` with one row per record and the columns
        ``bike_name``, ``bike_type``, ``date_ordered`` (``'YYYY-MM-DD'``
        string) and ``quantity``.
    """

    def is_near(date, events, window_days):
        """Return True if ``date`` is within ``window_days`` of any event."""
        return any(abs((date - event).days) <= window_days for event in events)

    sales_data = []
    for _ in range(n):
        bike = random.choice(bikes)
        # randint includes both endpoints, so the last day of the window can
        # be drawn as well.
        random_days = random.randint(0, date_range)
        date_ordered = start_date + timedelta(days=random_days)

        # Base sales quantity; may be zero or negative until clamped below.
        base_quantity = int(np.random.normal(3, 1))

        # Adjust sales quantity for special events (first match wins).
        if is_near(date_ordered, christmas, 7):
            boost = int(np.random.normal(10, 2))  # Christmas spike
        elif is_near(date_ordered, black_friday, 7):
            boost = int(np.random.normal(15, 2))  # Black Friday spike
        elif (bike['bike_type'] == 'Road Bike'
              and is_near(date_ordered, tour_de_france, 30)):
            # Must be tested before the summer rule: the Tour de France
            # window (1 Jul +/- 30 days) lies entirely inside the summer
            # season, so checking summer first made this spike unreachable.
            boost = int(np.random.normal(15, 2))  # Tour de France spike for road bikes
        elif any(start <= date_ordered <= end for start, end in summer_season):
            boost = int(np.random.normal(5, 2))  # Summer season boost
        elif is_near(date_ordered, tour_de_france, 30):
            # Small bump for other bikes; only reachable if the Tour de
            # France window extends beyond the summer season.
            boost = int(np.random.normal(1, 1))
        else:
            boost = 0

        quantity = base_quantity + boost

        sales_data.append({
            'bike_name': bike['bike_name'],
            'bike_type': bike['bike_type'],
            'date_ordered': date_ordered.strftime('%Y-%m-%d'),
            'quantity': max(1, quantity),  # Ensure at least 1 sale
        })

    return pd.DataFrame(sales_data)

# Generate the sales dataset
sales_dataset = generate_sales_data(20000)  # Generate 20000 sales records



Save the generated dataset to a .csv file

In [3]:
# Write the generated records to disk; index=False leaves the DataFrame's
# row index out of the file.
sales_dataset.to_csv('./datasets/sales_df.csv',index=False)