In [14]:
import pandas as pd
import numpy as np
import random

In [15]:
np.random.seed(42)

def generate_10_digit_code():
    # Generate 10 random digits (0-9) as a string
    return ''.join(np.random.choice(list('0123456789'), 10))

# Generate 100 unique codes
hcp_codes = set()
while len(hcp_codes) < 100:
    code = generate_10_digit_code()
    hcp_codes.add(code)

hcp_codes = list(hcp_codes)

# Generate monthly periods from Jan 2021 to Dec 2023
dates = pd.date_range(start='2021-01-01', end='2023-12-31', freq='W-MON')

# Build the DataFrame efficiently
sales_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2)) for hcp in hcp_codes for date in dates],
    columns=['HCP_ID', 'Date', 'Sales']
)

print(sales_df.head())

       HCP_ID       Date    Sales
0  9686008838 2021-01-04  2024.06
1  9686008838 2021-01-11  3904.38
2  9686008838 2021-01-18  3371.85
3  9686008838 2021-01-25  1408.85
4  9686008838 2021-02-01  4675.00


In [16]:
# Randomly select 89 HCP codes from the full list of 100
subset_hcps = random.sample(hcp_codes, 89)

# Then build calls_df only for this subset
calls_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2), np.round(np.random.uniform(100, 500), 2)) for hcp in subset_hcps for date in dates],
    columns=['HCP_ID', 'Date', 'Calls', 'Calls Spend']
)

print(calls_df.head())

       HCP_ID       Date    Calls  Calls Spend
0  5783009361 2021-01-04  1649.48       149.13
1  5783009361 2021-01-11  4862.21       212.61
2  5783009361 2021-01-18  4634.37       264.23
3  5783009361 2021-01-25  2543.69       380.37
4  5783009361 2021-02-01  1039.53       235.24


In [17]:
# Randomly select 89 HCP codes from the full list of 100
subset_hcps = random.sample(hcp_codes, 92)

# Then build calls_df only for this subset
samples_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2), np.round(np.random.uniform(100, 500), 2)) for hcp in subset_hcps for date in dates],
    columns=['HCP_ID', 'Date', 'Samples', 'Samples Spend']
)

print(samples_df.head())

       HCP_ID       Date  Samples  Samples Spend
0  2189592771 2021-01-04  4987.65         289.45
1  2189592771 2021-01-11  2534.93         321.00
2  2189592771 2021-01-18  1659.06         488.51
3  2189592771 2021-01-25  4353.34         145.20
4  2189592771 2021-02-01  4780.33         477.13


In [18]:
# Randomly select 90 HCP codes from the full list of 100
subset_hcps = random.sample(hcp_codes, 90)

# Then build calls_df only for this subset
print_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2), np.round(np.random.uniform(100, 500), 2)) for hcp in subset_hcps for date in dates],
    columns=['HCP_ID', 'Date', 'Print', 'Print Spend']
)

print(print_df.head())

       HCP_ID       Date    Print  Print Spend
0  3772541751 2021-01-04  4610.69       497.56
1  3772541751 2021-01-11  4977.71       275.38
2  3772541751 2021-01-18  1957.17       168.81
3  3772541751 2021-01-25  2528.44       339.58
4  3772541751 2021-02-01  3974.35       306.55


In [19]:
sales_df.to_csv(r'dummy_data/sales_data_dummy.csv')
calls_df.to_csv(r'dummy_data/calls_data_dummy.csv')
samples_df.to_csv(r'dummy_data/samples_data_dummy.csv')
print_df.to_csv(r'dummy_data/print_data_dummy.csv')