In [2]:
import pandas as pd
import numpy as np
import random

In [4]:
zip_dma_map = pd.read_csv(r'zip_dma_mapping.csv')
zip_dma_map

Unnamed: 0,ZIP_CODE,DMA_code,DMA
0,1001,543,SPRINGFIELD - HOLYOKE
1,1002,543,SPRINGFIELD - HOLYOKE
2,1003,543,SPRINGFIELD - HOLYOKE
3,1004,543,SPRINGFIELD - HOLYOKE
4,1005,506,BOSTON (MANCHESTER)
...,...,...,...
41012,99803,747,JUNEAU
41013,99811,747,JUNEAU
41014,99821,747,JUNEAU
41015,99824,747,JUNEAU


In [5]:
np.random.seed(42)

def generate_10_digit_code():
    # Generate 10 random digits (0-9) as a string
    return ''.join(np.random.choice(list('0123456789'), 10))

# Generate 100 unique codes
hcp_codes = set()
while len(hcp_codes) < 100:
    code = generate_10_digit_code()
    hcp_codes.add(code)

hcp_codes = list(hcp_codes)
hcp_codes

['9686008838',
 '3010446882',
 '9845391754',
 '3434686499',
 '5696921879',
 '5783009361',
 '3955407446',
 '9307023759',
 '2863294428',
 '1792459532',
 '1937860280',
 '0182046504',
 '3948720231',
 '0242049668',
 '1304808756',
 '6279303635',
 '5679819144',
 '1769915521',
 '2189592771',
 '4095809263',
 '8242648613',
 '2363807617',
 '3776200256',
 '9055373774',
 '1098988570',
 '0676406682',
 '3519193768',
 '0501049850',
 '0757842444',
 '4322381800',
 '2813517702',
 '2481971467',
 '6528959950',
 '4128665737',
 '1152830304',
 '4524644499',
 '8003852038',
 '0881692698',
 '6374692674',
 '1066742752',
 '4704203460',
 '2938981776',
 '4232004528',
 '6097999128',
 '2657840297',
 '5552571400',
 '6515111213',
 '4064562924',
 '3625198453',
 '0517946917',
 '7414798808',
 '0729694946',
 '1477262552',
 '5085233292',
 '8409901587',
 '4793979148',
 '7557699718',
 '6394173848',
 '2040700115',
 '8507692043',
 '3508043251',
 '6400214956',
 '3046995431',
 '6833072611',
 '9926033466',
 '8705459454',
 '95228649

In [6]:
hcp_codes_df = pd.DataFrame({'HCP_ID' : hcp_codes})
hcp_codes_df['ZIP'] = zip_dma_map['ZIP_CODE'].sample(n = len(hcp_codes_df), replace = True, random_state = 42).reset_index(drop = True)
 
hcp_codes_df

Unnamed: 0,HCP_ID,ZIP
0,9686008838,37919
1,3010446882,3243
2,9845391754,93205
3,3434686499,28466
4,5696921879,17049
...,...,...
95,5270530683,62866
96,3030095432,95191
97,1335507528,53926
98,2237570730,21778


In [7]:
# Generate monthly periods from Jan 2021 to Dec 2023
dates = pd.date_range(start='2021-01-01', end='2023-12-31', freq='W-MON')

# Build the DataFrame efficiently
sales_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2)) for hcp in hcp_codes for date in dates],
    columns=['HCP_ID', 'Date', 'Sales']
)

sales_df = pd.merge(sales_df, hcp_codes_df, on = 'HCP_ID', how = 'left')

print(sales_df.head())

       HCP_ID       Date    Sales    ZIP
0  9686008838 2021-01-04  2024.06  37919
1  9686008838 2021-01-11  3904.38  37919
2  9686008838 2021-01-18  3371.85  37919
3  9686008838 2021-01-25  1408.85  37919
4  9686008838 2021-02-01  4675.00  37919


In [8]:
# Randomly select 89 HCP codes from the full list of 100
subset_hcps = random.sample(hcp_codes, 89)

# Then build calls_df only for this subset
calls_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2), np.round(np.random.uniform(100, 500), 2)) for hcp in subset_hcps for date in dates],
    columns=['HCP_ID', 'Date', 'Calls', 'Calls Spend']
)
calls_df = pd.merge(calls_df, hcp_codes_df, on = 'HCP_ID', how = 'left')

print(calls_df.head())

       HCP_ID       Date    Calls  Calls Spend    ZIP
0  8705459454 2021-01-04  1649.48       149.13  68503
1  8705459454 2021-01-11  4862.21       212.61  68503
2  8705459454 2021-01-18  4634.37       264.23  68503
3  8705459454 2021-01-25  2543.69       380.37  68503
4  8705459454 2021-02-01  1039.53       235.24  68503


In [9]:
# Randomly select 89 HCP codes from the full list of 100
subset_hcps = random.sample(hcp_codes, 92)

# Then build calls_df only for this subset
samples_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2), np.round(np.random.uniform(100, 500), 2)) for hcp in subset_hcps for date in dates],
    columns=['HCP_ID', 'Date', 'Samples', 'Samples Spend']
)
samples_df = pd.merge(samples_df, hcp_codes_df, on = 'HCP_ID', how = 'left')

print(samples_df.head())

       HCP_ID       Date  Samples  Samples Spend    ZIP
0  5619190708 2021-01-04  4987.65         289.45  41544
1  5619190708 2021-01-11  2534.93         321.00  41544
2  5619190708 2021-01-18  1659.06         488.51  41544
3  5619190708 2021-01-25  4353.34         145.20  41544
4  5619190708 2021-02-01  4780.33         477.13  41544


In [10]:
# Randomly select 90 HCP codes from the full list of 100
subset_hcps = random.sample(hcp_codes, 90)

# Then build calls_df only for this subset
print_df = pd.DataFrame(
    [(hcp, date, np.round(np.random.uniform(1000, 5000), 2), np.round(np.random.uniform(100, 500), 2)) for hcp in subset_hcps for date in dates],
    columns=['HCP_ID', 'Date', 'Print', 'Print Spend']
)
print_df = pd.merge(print_df, hcp_codes_df, on = 'HCP_ID', how = 'left')

print(print_df.head())

       HCP_ID       Date    Print  Print Spend    ZIP
0  1066742752 2021-01-04  4610.69       497.56  28089
1  1066742752 2021-01-11  4977.71       275.38  28089
2  1066742752 2021-01-18  1957.17       168.81  28089
3  1066742752 2021-01-25  2528.44       339.58  28089
4  1066742752 2021-02-01  3974.35       306.55  28089


In [11]:
sales_df.to_csv(r'sales_data_zip_dummy.csv')
calls_df.to_csv(r'calls_data_zip_dummy.csv')
samples_df.to_csv(r'samples_data_zip_dummy.csv')
print_df.to_csv(r'print_data_zip_dummy.csv')

In [12]:
combined_hcp_data = pd.merge(pd.merge(sales_df, calls_df, on = ['HCP_ID', 'Date', 'ZIP'], how = 'left'), zip_dma_map, left_on= 'ZIP', right_on = 'ZIP_CODE', how = 'left')
combined_hcp_data

Unnamed: 0,HCP_ID,Date,Sales,ZIP,Calls,Calls Spend,ZIP_CODE,DMA_code,DMA
0,9686008838,2021-01-04,2024.06,37919,1188.56,307.05,37919,557,KNOXVILLE
1,9686008838,2021-01-11,3904.38,37919,4182.84,370.52,37919,557,KNOXVILLE
2,9686008838,2021-01-18,3371.85,37919,3036.68,431.50,37919,557,KNOXVILLE
3,9686008838,2021-01-25,1408.85,37919,1911.15,432.32,37919,557,KNOXVILLE
4,9686008838,2021-02-01,4675.00,37919,2105.45,244.99,37919,557,KNOXVILLE
...,...,...,...,...,...,...,...,...,...
15595,4200791212,2023-11-27,3554.29,72203,1881.15,338.14,72203,693,LITTLE ROCK - PINE BLUFF
15596,4200791212,2023-12-04,3359.02,72203,4621.63,241.28,72203,693,LITTLE ROCK - PINE BLUFF
15597,4200791212,2023-12-11,1547.10,72203,1843.07,408.86,72203,693,LITTLE ROCK - PINE BLUFF
15598,4200791212,2023-12-18,4379.80,72203,1707.53,201.50,72203,693,LITTLE ROCK - PINE BLUFF


In [13]:
combined_hcp_data['DMA_code'].nunique

<bound method IndexOpsMixin.nunique of 0        557
1        557
2        557
3        557
4        557
        ... 
15595    693
15596    693
15597    693
15598    693
15599    693
Name: DMA_code, Length: 15600, dtype: int64>

In [14]:
dma_codes = list(zip_dma_map['DMA_code'].unique())
dma_codes

[np.int64(543),
 np.int64(506),
 np.int64(532),
 np.int64(521),
 np.int64(523),
 np.int64(500),
 np.int64(537),
 np.int64(552),
 np.int64(533),
 np.int64(501),
 np.int64(504),
 np.int64(526),
 np.int64(502),
 np.int64(549),
 np.int64(555),
 np.int64(538),
 np.int64(514),
 np.int64(565),
 np.int64(508),
 np.int64(574),
 np.int64(511),
 np.int64(516),
 np.int64(536),
 np.int64(577),
 np.int64(566),
 np.int64(576),
 np.int64(512),
 np.int64(556),
 np.int64(584),
 np.int64(569),
 np.int64(573),
 np.int64(544),
 np.int64(560),
 np.int64(518),
 np.int64(531),
 np.int64(559),
 np.int64(564),
 np.int64(598),
 np.int64(554),
 np.int64(597),
 np.int64(517),
 np.int64(545),
 np.int64(567),
 np.int64(570),
 np.int64(550),
 np.int64(575),
 np.int64(524),
 np.int64(546),
 np.int64(520),
 np.int64(519),
 np.int64(507),
 np.int64(503),
 np.int64(525),
 np.int64(522),
 np.int64(561),
 np.int64(530),
 np.int64(534),
 np.int64(656),
 np.int64(686),
 np.int64(592),
 np.int64(548),
 np.int64(528),
 np.int6

In [15]:
# Generate monthly periods from Jan 2021 to Dec 2023
dates = pd.date_range(start='2021-01-01', end='2023-12-31', freq='W-MON')

# Build the DataFrame efficiently
tv_df = pd.DataFrame(
    [(dma, date, np.round(np.random.uniform(100, 500), 2)) for dma in dma_codes for date in dates],
    columns=['DMA_code', 'Date', 'TV GRP']
)


print(tv_df.head())

   DMA_code       Date  TV GRP
0       543 2021-01-04  294.44
1       543 2021-01-11  422.57
2       543 2021-01-18  162.56
3       543 2021-01-25  312.76
4       543 2021-02-01  291.17


In [16]:
# Build the DataFrame efficiently
display_df = pd.DataFrame(
    [(dma, date, np.round(np.random.uniform(1000, 5000), 2)) for dma in dma_codes for date in dates],
    columns=['DMA_code', 'Date', 'Display Impressions']
)


print(display_df.head())

   DMA_code       Date  Display Impressions
0       543 2021-01-04              3519.77
1       543 2021-01-11              3580.55
2       543 2021-01-18              4472.14
3       543 2021-01-25              3086.39
4       543 2021-02-01              2084.56


In [17]:
tv_df.to_csv(r'tv_data_dma_dummy.csv')
display_df.to_csv(r'display_data_dma_dummy.csv')