<a href="https://colab.research.google.com/github/tim-sadler/tutorials/blob/main/PromotionAnalyses/Utils/GenerateRandomTransactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install faker
from faker import Faker

from hashlib import md5
import random
import string
from random import randrange

import pandas as pd
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
no_of_customers = 1000

start_date = pd.to_datetime("2018-01-01")
end_date = pd.to_datetime("2022-12-31")

In [27]:
fake = Faker()

customer_ids = [md5(fake.name().encode()).hexdigest() for i in range(0,no_of_customers)]

start_end_range = pd.Series(pd.date_range(start_date, end_date, freq = "d"))
first_purchases = start_end_range.sample(no_of_customers).reset_index(drop = True)

total_purchases = list(np.ceil(np.random.chisquare(2, no_of_customers)).astype(int))

customer_df = pd.DataFrame({"customer_id": customer_ids, "acquisition_date": first_purchases, "total_purchases": total_purchases, "date": first_purchases})

transactions_df = pd.DataFrame()

for customer in customer_df.itertuples():
  temp_range = pd.Series(pd.date_range(customer.acquisition_date,end_date))
  temp_df = pd.DataFrame()
  temp_df["date"] = temp_range.sample(customer.total_purchases, replace = True)
  temp_df["customer_id"] = customer.customer_id
  temp_df["acquisition_date"] = customer.acquisition_date
  temp_df["total_purchases"] = customer.total_purchases
  temp_df = temp_df[customer_df.columns]
  transactions_df = pd.concat([transactions_df, temp_df]).reset_index(drop = True)

transactions_df["quantity"] = np.clip(np.ceil(np.random.normal(5, 3, len(transactions_df))), 1, None).astype(int)
transactions_df["revenue"] = round(np.random.randint(500, 1500, len(transactions_df))/100 * transactions_df['quantity'],2)

transactions_df = transactions_df.sort_values(["customer_id", "date"]).reset_index(drop = True)
transactions_df.to_csv("transactions.csv", index = False)


In [28]:
def random_string(length) -> str:
  letters = string.ascii_uppercase
  result = ''.join(random.choice(letters) for i in range(length))
  return result

In [29]:
def add_ongoing_campaign(df: pd.DataFrame, no_of_recipients: int, fraction: float, campaign_name: str, discount: float):
  
  draw_from_customers = np.ceil(no_of_customers*fraction).astype(int)
  
  remaining_recipients = no_of_recipients-draw_from_customers
  
  recipients = [md5(fake.name().encode()).hexdigest() for i in range(remaining_recipients)]
  
  active_recipients = df.groupby("customer_id")[["revenue"]].sum().sort_values("revenue")
  active_recipients["weight"] = 1
  active_recipients["weight"] = 1/active_recipients["weight"].cumsum()
  active_recipients = active_recipients.sample(draw_from_customers, weights=active_recipients["weight"]).index
  
  if "campaign" in df.columns and "coupon_code" in df.columns and "discount" in df.columns:
    df["campaign"] = np.where(df["customer_id"].isin(active_recipients), campaign_name, df["campaign"])
    df["coupon_code"] = np.where(df["customer_id"].isin(active_recipients), random_string(6), df["coupon_code"])
    df["discount"] = np.where(df["customer_id"].isin(active_recipients), discount, df["discount"])
  else:
    df["campaign"] = np.where(df["customer_id"].isin(active_recipients), campaign_name, np.nan)
    df["coupon_code"] = np.where(df["customer_id"].isin(active_recipients), random_string(6), np.nan)
    df["discount"] = np.where(df["customer_id"].isin(active_recipients), discount, 0)
  
  all_recipients = [*recipients, *active_recipients]
  
  return all_recipients, df

In [30]:
def add_temporary_campaign(df: pd.DataFrame, no_of_recipients: int, fraction: float, campaign_name: str, discount: float, campaign_start: str, campaign_end: str):
  
  draw_from_customers = np.ceil(len(df[(df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end))])*fraction).astype(int)
  
  remaining_recipients = no_of_recipients-draw_from_customers
  
  recipients = [md5(fake.name().encode()).hexdigest() for i in range(remaining_recipients)]
  
  active_recipients = df[(df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end))].groupby("customer_id")[["revenue"]].sum().sort_values("revenue")
  active_recipients["weight"] = 1
  active_recipients["weight"] = 1/active_recipients["weight"].cumsum()
  active_recipients = active_recipients.sample(draw_from_customers, weights=active_recipients["weight"]).index
  
  if "campaign" in df.columns and "coupon_code" in df.columns and "discount" in df.columns:
    df["campaign"] = np.where((df["customer_id"].isin(active_recipients)) & (df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end)), campaign_name, df["campaign"])
    df["coupon_code"] = np.where((df["customer_id"].isin(active_recipients)) & (df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end)), random_string(6), df["coupon_code"])
    df["discount"] = np.where((df["customer_id"].isin(active_recipients)) & (df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end)), discount, df["discount"])
  else:
    df["campaign"] = np.where((df["customer_id"].isin(active_recipients)) & (df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end)), campaign_name, np.nan)
    df["coupon_code"] = np.where((df["customer_id"].isin(active_recipients)) & (df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end)), random_string(6), np.nan)
    df["discount"] = np.where((df["customer_id"].isin(active_recipients)) & (df["date"] >= pd.to_datetime(campaign_start)) & (df["date"] <= pd.to_datetime(campaign_end)), discount, 0)
  
  all_recipients = [*recipients, *active_recipients]
  
  return all_recipients, df

In [31]:
all_newsletter_recipients, transactions_df = add_ongoing_campaign(transactions_df, 500, 0.2, "Newsletter", 0.1)

In [32]:
all_partner_recipients, transactions_df = add_ongoing_campaign(transactions_df, 200, 0.05, "Partner", 0.15)

In [33]:
all_valentines21_recipients, transactions_df = add_temporary_campaign(transactions_df, 400, 0.07, "Valentines2021", 0.20, "2021-02-14", "2021-02-14")

In [34]:
all_valentines22_recipients, transactions_df = add_temporary_campaign(transactions_df, 500, 0.1, "Valentines2022", 0.25, "2022-02-14", "2022-02-14")

In [35]:
transactions_df.to_csv("transactions_with_coupons.csv", index = False)

In [36]:
transactions_df.groupby("campaign")["discount"].max()

campaign
Newsletter        0.10
Partner           0.15
Valentines2021    0.20
Valentines2022    0.25
nan               0.00
Name: discount, dtype: float64