# Mocking retail sales data

This notebook aims to mock retail sales data to a sufficient level by which demonstatable forecasts can be produced with Amazon Forecast.

It has been adapted from the R program found here: https://github.com/techhound/GenerateOrders/blob/master/GenerateOrders.R
https://datasciencereview.com/how-to-generate-mock-sales-data/

The key attributes we are interested in generating are:

* timestamp: The date and time of a transaction
* order_id: An identifier for the order, an order may contain multiple products sold
* item_id: An identifier for a particular item, for now we are not adding additional metadata
* store_id: An identifier of where the transaction took place
* quantity: The number of item_id sold in the order

In [None]:
#We need to upgrade pandas to get around this bug https://github.com/pandas-dev/pandas/issues/39520 with the default sagemaker studio version 
!pip3 install --upgrade pandas

In [16]:
import numpy as np
import pandas as pd
from datetime import datetime
import random


In [2]:
def getProductQuantity():
    #Number of product quantity
    x = np.array([1,2,3,4])
    #Product quantity probability
    # 70% Chance of selecting a 1, followed by a 17% chance of selecting a 2, following by 10% chance, etc.
    px = np.array([0.70, 0.17, 0.1, 0.03])
    
    return np.random.choice(x, size=1, replace=True, p=px).item()
    
# for n in range(1, 101):
#     print(getProductQuantity())

In [3]:
def getCustomerQuantity():
    x = np.array([1,2,3,4])
    px = np.array([0.40, 0.30, 0.20, 0.10])
    
    return np.random.choice(x, size=1, replace=True, p=px).item()

# for n in range(1, 101):
#     print(getProductQuantity())

In [32]:
def createTransaction(dt, customerID, productMax, oNum):
    productID = np.random.randint(1, productMax, size=None, dtype=int)
    quantity = getProductQuantity()
    d = {
        'orderNum': [oNum],
        'transactionDate': [dt],
        'customerID': [customerID],
        'productID': [productID],
        'quantity': [quantity]
    }
    df = pd.DataFrame(data=d)
    
    return df

# print(createTransaction('1/1/2018', '20', 1000, 70))


In [22]:
def generateCustomers(dt, customerMax, productMax, oNum):
    df = pd.DataFrame(columns=['orderNum', 'transactionDate', 'customerID', 'productID', 'quantity'])
    customersToGenerate = getCustomerQuantity()
    
    for n in range(0, customersToGenerate):
        customerIDs = np.random.choice(customerMax, customersToGenerate, replace=False)
        keepSameCustomer = np.random.choice(np.array([True,False]), p = np.array([0.40, 0.60]), size = 1)
#         print(customerIDs)
#         print(keepSameCustomer)
        
        if customersToGenerate > 1 and keepSameCustomer:
            customerIDs[range(0, customersToGenerate)] = customerIDs[0] 
            
#         print(customerIDs)
        for cust in range(0,len(customerIDs)):
            dfTemp = createTransaction(dt, customerIDs[cust], productMax, oNum)
            df = pd.concat([df, dfTemp], axis=0)
            
            if not keepSameCustomer and cust < len(customerIDs):
                oNum = oNum + 1
                
        oNum = oNum + 1
    
#     display(df)
    return([oNum,df])
        
print(generateCustomers('1/1/2018', 20, 100, 70))

[72,   orderNum transactionDate customerID productID quantity  oNum        dt
0      NaN             NaN          7         9        1  70.0  1/1/2018]


In [30]:
def generateOrders(customerMax, productMax, numDates):
    df = pd.DataFrame(columns=['orderNum', 'transactionDate', 'customerID', 'productID', 'quantity'])
    
    oNum = 1
    #We use the Pandas timestamp here but that can be changed https://stackoverflow.com/questions/993358/creating-a-range-of-dates-in-python
    dateArr = pd.date_range(datetime.today(), periods=numDates).tolist()

    for dt in dateArr:
        dfTemp = generateCustomers(dt, customerMax, productMax, oNum)
        df = pd.concat([df, dfTemp[1]], axis=0)
        oNum = dfTemp[0]
    
    return(df)


## To generate orders, run the cell below modifying the three varibles to suit your needs

In [35]:
#generateOrders(mximum number of customers, maximum number of products, number of dates)
orders = generateOrders(7000, 20000, 2000)
display(orders)

Unnamed: 0,orderNum,transactionDate,customerID,productID,quantity
0,1,2021-07-20 17:40:27.114598,5932,13886,2
0,3,2021-07-21 17:40:27.114598,6176,3289,1
0,4,2021-07-22 17:40:27.114598,174,13109,1
0,5,2021-07-23 17:40:27.114598,5930,12938,1
0,6,2021-07-24 17:40:27.114598,2462,12437,1
...,...,...,...,...,...
0,10024,2027-01-08 17:40:27.114598,908,579,3
0,10024,2027-01-08 17:40:27.114598,908,145,1
0,10025,2027-01-08 17:40:27.114598,2029,11645,1
0,10025,2027-01-08 17:40:27.114598,2029,6826,2


## To output to CSV run this cell

In [None]:
orders.to_csv(index=False)