In [1]:
#Simulate input data for e-commerce project

In [2]:
#Import libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import timedelta

In [3]:
# Set up simulation
# A single seed is shared by every random source so that
# Restart Kernel -> Run All regenerates the same datasets.
SEED = 42

# Instantiate the Faker object used for dates and product names
fake = Faker()

# Seed Faker's generator; without this, every fake.* call below
# (signup dates, order dates, product names) differs between runs.
# NOTE: relative bounds such as 'today' are still resolved at run time,
# so generated dates shift with the day the notebook is executed.
Faker.seed(SEED)

# Seed numpy's global random state (np.random.* functions)
np.random.seed(SEED)

# Seed Python's built-in random module (random.choice etc.)
random.seed(SEED)

In [4]:
# Build the simulated USERS table and write it to users.csv
#   user_id     : sequential ids 1..n_users
#   signup_date : Faker date between two years ago and today
#   country     : picked at random from US, UK, CA, DE, JP
n_users = 1000
user_ids = range(1, n_users + 1)
country_codes = ['US', 'UK', 'CA', 'DE', 'JP']

users = (
    pd.DataFrame({'user_id': user_ids})
    .assign(
        # One Faker draw per user, then one random.choice draw per user
        signup_date=[
            fake.date_between(start_date='-2y', end_date='today')
            for _ in user_ids
        ],
        country=[random.choice(country_codes) for _ in user_ids],
    )
)
users.to_csv('users.csv', index=False)

In [5]:
# Build the simulated PRODUCTS table and write it to products.csv
#   product_id   : sequential ids 1..n_products
#   product_name : a capitalized random word from Faker
#   category     : picked at random from Electronics, Clothing, Books, Beauty, Home
#   price        : uniform draw between $5 and $300, rounded to cents
# Name, category and price are drawn independently, so a given combination
# may not be realistic. For more realistic data, restrict the names and
# price ranges that can be chosen within each category.
n_products = 100
product_ids = range(1, n_products + 1)
category_names = ['Electronics', 'Clothing', 'Books', 'Beauty', 'Home']

products = (
    pd.DataFrame({'product_id': product_ids})
    .assign(
        product_name=[fake.word().capitalize() for _ in product_ids],
        category=[random.choice(category_names) for _ in product_ids],
        price=np.random.uniform(5.0, 300.0, size=n_products).round(2),
    )
)
products.to_csv('products.csv', index=False)

In [6]:
# Build the simulated ORDERS table and write it to orders.csv
#   order_id   : sequential ids 1..n_orders
#   user_id    : picked at random from user_ids (defined with USERS)
#   product_id : picked at random from product_ids (defined with PRODUCTS)
#   quantity   : random integer 1, 2 or 3 (randint's upper bound 4 is exclusive)
#   order_date : Faker date between one year ago and today
# NOTE(review): order_date is drawn independently of the user's signup_date
# (which spans two years), so an order can predate its user's signup.
# Confirm this is acceptable for the downstream analysis.
n_orders = 10000
order_rows = range(n_orders)

orders = (
    pd.DataFrame({'order_id': range(1, n_orders + 1)})
    .assign(
        user_id=[random.choice(user_ids) for _ in order_rows],
        product_id=[random.choice(product_ids) for _ in order_rows],
        quantity=np.random.randint(1, 4, size=n_orders),
        order_date=[
            fake.date_between(start_date='-1y', end_date='today')
            for _ in order_rows
        ],
    )
)
orders.to_csv('orders.csv', index=False)

In [7]:
# Build the simulated PAYMENTS table and write it to payments.csv
#   payment_id   : sequential ids 1..n_orders (one payment per order)
#   order_id     : the order this payment settles, in the same row order as `orders`
#   payment_type : picked at random from Credit Card, Paypal, Gift Card
#   amount       : quantity * unit price of the ordered product, rounded to cents
payment_methods = ['Credit Card', 'Paypal', 'Gift Card']

# Look up each order's unit price with one vectorised map instead of
# filtering `products` once per order.
price_by_product = products.set_index('product_id')['price']
unit_prices = orders['product_id'].map(price_by_product)

# map() yields NaN for ids absent from `products`; fail loudly rather than
# write NaN amounts to disk.
assert unit_prices.notna().all(), "orders contains a product_id missing from products"

payments = pd.DataFrame({
    'payment_id': range(1, n_orders + 1),
    'order_id': orders['order_id'].to_numpy(),
    'payment_type': [random.choice(payment_methods) for _ in range(n_orders)],
    # to_numpy() drops the index so rows are matched by position, not by label
    'amount': (orders['quantity'] * unit_prices).round(2).to_numpy(),
})
payments.to_csv('payments.csv', index=False)

Instead of using a list comprehension to compute `amount`, I could merge `price` from the products dataset onto the orders dataset by `product_id`:

orders_price = orders.merge(  # left join: keep every order and attach its product's unit price
    products[['product_id', 'price']],  # the key column is 'product_id'; 'products_id' does not exist
    how='left',
    on='product_id',
)