In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', 100)      

# Load dataset

In [3]:
users_df = pd.read_csv(r"data\final\users.csv")
ad_events_df = pd.read_csv(r"data\final\ad_events.csv")
ads_df = pd.read_csv(r"data\final\ads.csv")
campaigns_df = pd.read_csv(r"data\final\campaigns.csv")

# Create generation column 

In [4]:
users_df.sample(3)

Unnamed: 0,user_id,user_gender,user_age,age_group,country,location,interests,birth_year,generation
3098,3099,Male,23,18-24,Brazil,Josephport,gaming,2002,Generation Z
8885,8886,Female,16,16-17,United States,New Allisonfort,"finance, sports, news",2009,Generation Z
6213,6214,Male,37,35-44,United States,North Dawn,gaming,1988,Millennials


In [5]:
# Create birth_year column from user_age column (currently year is 2025)
users_df['birth_year'] = 2025 - users_df['user_age']

# Categorize generation from birth_year column
def categorize_generation(birth_year):
    if birth_year >= 2013:
        return 'Gen Alpha'
    elif birth_year <= 2012 and birth_year >= 1995:
        return 'Generation Z'
    elif birth_year <= 1994 and birth_year >= 1980:
        return 'Millennials'
    elif birth_year <= 1979 and birth_year >= 1965:
        return 'Generation X '
    else:
        return 'The Baby Boomer Generation '

users_df['generation'] = users_df['birth_year'].apply(categorize_generation)

users_df.drop('birth_year', axis=1)

Unnamed: 0,user_id,user_gender,user_age,age_group,country,location,interests,generation
0,1,Female,24,18-24,United Kingdom,New Mariomouth,"fitness, health",Generation Z
1,2,Male,21,18-24,Germany,Danielsfort,"food, fitness, lifestyle",Generation Z
2,3,Male,27,25-34,Australia,Vincentchester,"fashion, news",Generation Z
3,4,Female,28,25-34,India,Lisaport,"health, news, finance",Generation Z
4,5,Male,28,25-34,United States,Brownmouth,"health, photography, lifestyle",Generation Z
...,...,...,...,...,...,...,...,...
9995,9996,Male,18,18-24,United States,Curtisside,"travel, fashion, art",Generation Z
9996,9997,Male,24,18-24,Mexico,Brownland,finance,Generation Z
9997,9998,Male,29,25-34,United States,Watersburgh,health,Generation Z
9998,9999,Male,31,25-34,United Kingdom,South Kenneth,"art, fashion",Millennials


In [6]:
users_df.sample(3)

Unnamed: 0,user_id,user_gender,user_age,age_group,country,location,interests,birth_year,generation
1871,1872,Male,22,18-24,United States,South Patrick,"lifestyle, food, health",2003,Generation Z
2724,2725,Male,26,25-34,United States,North Peter,health,1999,Generation Z
6827,6828,Male,39,35-44,United States,Nicolebury,gaming,1986,Millennials


In [7]:
campaigns_df.sample(3)

Unnamed: 0,campaign_id,name,start_date,end_date,duration_days,total_budget
33,34,Campaign_34_Winter,10-07-25,08-09-25,60,26104.3
38,39,Campaign_39_Q3,18-05-25,27-07-25,70,55638.18
47,48,Campaign_48_Winter,08-04-25,01-06-25,54,13842.07


In [8]:
users_df['generation'].value_counts()

generation
Generation Z                   6830
Millennials                    2840
Generation X                    301
The Baby Boomer Generation       29
Name: count, dtype: int64

# Save dataset to csv file

In [9]:
# users_df.to_csv(r"data\final\users.csv", index=False)

# Fake dataset

In [10]:
import random 
from datetime import datetime, timedelta

In [11]:
ads_df.sample(3)

Unnamed: 0,ad_id,campaign_id,ad_platform,ad_type,target_gender,target_age_group,target_interests
148,149,11,Facebook,Stories,All,18-24,"health, fitness"
122,123,33,Facebook,Carousel,All,18-24,"finance, sports"
79,80,46,Facebook,Stories,Male,35-44,lifestyle


In [12]:
ads_df['ad_platform'].unique()

array(['Facebook', 'Instagram'], dtype=object)

In [13]:
ads_df['ad_type'].unique()

array(['Video', 'Stories', 'Carousel', 'Image'], dtype=object)

In [14]:
print(ads_df['ad_id'].min())
print(ads_df['ad_id'].max())

1
200


In [15]:
ads_df['target_age_group'].unique()

array(['35-44', '25-34', '18-24', 'All'], dtype=object)

In [16]:
users_df.sample(3)

Unnamed: 0,user_id,user_gender,user_age,age_group,country,location,interests,birth_year,generation
585,586,Female,23,18-24,United Kingdom,New Troy,"health, gaming, finance",2002,Generation Z
2430,2431,Female,28,25-34,Australia,North Crystal,food,1997,Generation Z
1151,1152,Female,48,45-54,Canada,Evanborough,"lifestyle, news",1977,Generation X


In [17]:
users_df['user_gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [18]:
users_df['interests'].unique()

array(['fitness, health', 'food, fitness, lifestyle', 'fashion, news',
       ..., 'travel, food, technology', 'fashion, technology, food',
       'gaming, food, health'], shape=(1641,), dtype=object)

In [19]:
print(users_df['user_age'].min())
print(users_df['user_age'].max())

16
65


In [20]:
users_df['age_group'].unique()

array(['18-24', '25-34', '35-44', '45-54', '16-17', '55-65'], dtype=object)

In [21]:
ad_events_df.sample(3)

Unnamed: 0,event_id,ad_id,user_id,timestamp,day_of_week,time_of_day,event_type,date,time,day,month,quarter,year,hour,minute,second
113080,111805,106,7992,23-06-25 3:14,Monday,Night,Click,23-06-25,3:14:58,23,6,2,2025,3,14,58
391314,386650,54,3000,12-05-25 8:35,Monday,Morning,Impression,12-05-25,8:35:03,12,5,2,2025,8,35,3
57397,56725,74,8153,29-05-25 9:29,Thursday,Morning,Impression,29-05-25,9:29:10,29,5,2,2025,9,29,10


In [22]:
ad_events_df['event_type'].unique()

array(['Like', 'Share', 'Impression', 'Purchase', 'Click', 'Comment'],
      dtype=object)

In [23]:
ad_events_df['time_of_day'].unique()

array(['Night', 'Morning', 'Evening', 'Afternoon'], dtype=object)

In [24]:
campaigns_df.sample(3)

Unnamed: 0,campaign_id,name,start_date,end_date,duration_days,total_budget
9,10,Campaign_10_Winter,17-05-25,21-07-25,65,19669.27
0,1,Campaign_1_Launch,25-05-25,23-07-25,59,24021.32
42,43,Campaign_43_Winter,23-04-25,12-06-25,50,81350.3


In [48]:
# Config number of rows and ID 
num_rows = 70000
start_event_id = 400001

ad_platform_lists = ['Facebook', 'Instagram']
ad_types = ['Carousel', 'Image', 'Stories', 'Video']
genders = ['Female', 'Male', 'Other']
age_group = ['18-24', '25-34', '35-44', '45-54', '16-17', '55-65']
target_age_group = ['35-44', '25-34', '18-24', 'All']

campaign_lists = ["Campaign_1_Launch", "Campaign_2_Launch", "Campaign_3_Winter", "Campaign_4_Summer", "Campaign_5_Launch", 
                "Campaign_6_Winter", "Campaign_7_Winter", "Campaign_8_Q3", "Campaign_9_Launch", "Campaign_10_Winter", 
                "Campaign_11_Q3", "Campaign_12_Q3", "Campaign_13_Winter", "Campaign_14_Summer", "Campaign_15_Launch", 
                "Campaign_16_Winter", "Campaign_17_Launch", "Campaign_18_Q3", "Campaign_19_Winter", "Campaign_20_Winter", 
                "Campaign_21_Winter", "Campaign_22_Q3", "Campaign_23_Winter", "Campaign_24_Summer", "Campaign_25_Winter", 
                "Campaign_26_Winter", "Campaign_27_Q3", "Campaign_28_Winter", "Campaign_29_Winter", "Campaign_30_Winter", 
                "Campaign_31_Summer", "Campaign_32_Summer", "Campaign_33_Summer", "Campaign_34_Winter", "Campaign_35_Launch", 
                "Campaign_36_Q3", "Campaign_37_Launch", "Campaign_38_Q3", "Campaign_39_Q3", "Campaign_40_Summer", 
                "Campaign_41_Winter", "Campaign_42_Summer", "Campaign_43_Winter", "Campaign_44_Q3", "Campaign_45_Summer", 
                "Campaign_46_Winter", "Campaign_47_Launch", "Campaign_48_Winter", "Campaign_49_Winter", "Campaign_50_Summer"]


countries = {
    'United States': ['New York', 'Los Angeles', 'Chicago'],
    'United Kingdom': ['London', 'Manchester', 'Liverpool'],
    'Vietnam': ['Ho Chi Minh City', 'Hanoi', 'Da Nang'],
    'Germany': ['Berlin', 'Munich', 'Hamburg']
}
interests = ['gaming', 'fashion', 'travel', 'technology', 'fitness', 'cooking', 'music']
event_types = ['Like', 'Share', 'Impression', 'Purchase', 'Click', 'Comment']

data = []

def get_time_of_day(h):
    if 6 <= h < 12:
        return 'Morning'
    elif 12 <= h < 18:
        return 'Afternoon'
    elif 18 <= h < 22:
        return 'Evening'
    else:
        return 'Night'

def get_user_age_group(age):
    if 16 <= age <= 17:
        return '16-17'
    elif 18 <= age <= 24:
        return '18-24'
    elif 25 <= age <= 34:
        return '25-34'
    elif 35 <= age <= 44:
        return '35-44'
    elif 45 <= age <= 54:
        return '45-54'
    elif 55 <= age <= 65:
        return '55-65'
    

for i in range(num_rows):
    ad_platform_id = random.randint(0,1)
    ad_platform = ad_platform_lists[ad_platform_id]
    ad_type = random.choice(ad_types)
    
    if ad_type == 'Video':
        if ad_platform == 'Facebook':
            # Facebook Video - high Click rate
            weights = [5, 5, 40, 5, 40, 5]  # Higher Click weight
        else:
            # Instagram Video - moderate Click rate
            weights = [5, 5, 50, 5, 30, 5]
    elif ad_type == 'Stories':
        if ad_platform == 'Instagram':
            # Instagram Stories - high Click rate
            weights = [5, 5, 45, 5, 38, 7]  # Higher Click weight
        else:
            # Facebook Stories - lower Click rate
            weights = [5, 5, 75, 2, 8, 5]
    else:
        # Other ad types - lower Click rate
        weights = [5, 5, 80, 2, 5, 3]
        
    event_types_selected = random.choices(event_types, weights=weights, k=1)[0]
    
    start_date = datetime(2025, 7, 5)
    end_date = datetime(2025, 7, 31)
    days_range = (end_date - start_date).days
    date = start_date + timedelta(days=random.randint(0, days_range))
    
    
    ad_platform_id = random.randint(0,1)
    
    # Select campaign and get the selected campaign id 
    campaign = random.choice(campaign_lists)
    campaign_id = campaign_lists.index(campaign) + 1
    target_interests = ', '.join(random.choices(interests, k=random.randint(1, 3)))
    
    # Time
    second = random.randint(0, 59)
    minute = random.randint(0, 59)
    hour = random.randint(0, 23)
    time_of_day = get_time_of_day(hour)
    
    # User
    user_gender = random.choice(genders)
    user_age = random.randint(16, 65)
    age_group = get_user_age_group(user_age)
    user_interests = random.choice(interests)
    
    # Country & Location
    country = random.choice(list(countries.keys()))
    country_id = list(countries.keys()).index(country) + 1
    location = random.choice(countries[country])
    location_id = random.randint(1, 7706)
    
    row = {
        'event_id': start_event_id + i,
        'date': date.strftime('%Y-%m-%d'),
        'time': f"{hour}:{minute}:{second}",
        
        'ad_id': random.randint(1, 200),
        'target_gender': random.choice(genders),
        'target_age_group': random.choice(target_age_group),
        'target_interests': target_interests,
        
        'ad_platform_id': ad_platform_id + 1,
        'ad_type_id': ad_types.index(ad_type) + 1,
        'campaign_id': campaign_id, 
        'ad_platform': ad_platform_lists[ad_platform_id],
        'ad_type': ad_type,
        'name': campaign,  
        'duration_days': random.randint(7, 60),
        'total_budget': round(random.uniform(5000, 50000), 2),
        
        'day': date.day,
        'month': date.month,
        'quarter': (date.month - 1) // 3 + 1,
        'year': date.year,
        'day_of_week': date.strftime('%A'),
        
        'second': second,
        'minute': minute,
        'hour': hour,
        'time_of_day': time_of_day,
        
        
        'user_gender': user_gender,
        'user_age': user_age,
        'age_group': age_group,
        
        'location_id': location_id,
        'country_id': country_id,
        'interests': user_interests,
        'event_type_name': event_types_selected,
        'location_name': location,
        'country_name': country
    }
    data.append(row)

In [49]:
fake_data = pd.DataFrame(data)

In [50]:
fake_data.shape

(70000, 33)

In [51]:
fake_data.sample(4)

Unnamed: 0,event_id,date,time,ad_id,target_gender,target_age_group,target_interests,ad_platform_id,ad_type_id,campaign_id,ad_platform,ad_type,name,duration_days,total_budget,day,month,quarter,year,day_of_week,second,minute,hour,time_of_day,user_gender,user_age,age_group,location_id,country_id,interests,event_type_name,location_name,country_name
60547,460548,2025-07-19,1:14:38,62,Female,35-44,music,2,2,24,Instagram,Image,Campaign_24_Summer,26,20968.38,19,7,3,2025,Saturday,38,14,1,Night,Female,32,25-34,403,1,fitness,Impression,Chicago,United States
36123,436124,2025-07-15,8:13:42,175,Male,18-24,"music, technology",1,4,14,Facebook,Video,Campaign_14_Summer,10,14734.81,15,7,3,2025,Tuesday,42,13,8,Morning,Female,55,55-65,6171,3,music,Impression,Da Nang,Vietnam
14972,414973,2025-07-08,8:10:49,198,Female,25-34,fashion,1,2,28,Facebook,Image,Campaign_28_Winter,13,19496.69,8,7,3,2025,Tuesday,49,10,8,Morning,Female,41,35-44,3387,2,cooking,Impression,Manchester,United Kingdom
69850,469851,2025-07-26,13:38:57,200,Other,18-24,"gaming, travel",1,3,23,Facebook,Stories,Campaign_23_Winter,15,12909.39,26,7,3,2025,Saturday,57,38,13,Afternoon,Male,56,55-65,418,1,fitness,Impression,Los Angeles,United States


In [52]:
fake_data['event_type_name'].value_counts()

event_type_name
Impression    46118
Click         11758
Share          3507
Like           3476
Comment        2950
Purchase       2191
Name: count, dtype: int64

## Save fake data

In [53]:
# fake_data.to_csv(r'data\fake_data\fake_data.csv', index=False)

# Combine locker_data with fake_data

In [54]:
locker_data_df = pd.read_csv(r'data\final\looker_data.csv')
locker_data_df.sample(3)

Unnamed: 0,event_id,date,time,ad_id,target_gender,target_age_group,target_interests,ad_platform_id,ad_type_id,campaign_id,ad_platform,ad_type,name,duration_days,total_budget,day,month,quarter,year,day_of_week,second,minute,hour,time_of_day,user_gender,user_age,age_group,location_id,country_id,interests,event_type_name,location_name,country_name
57788,73251,2025-06-07,13:26:54,71,Male,All,"art, fashion",1,3,21,Facebook,Stories,Campaign_21_Winter,64,37290.81,7,6,2,2025,Saturday,54,26,13,Afternoon,Other,30,25-34,5217,6,"food, travel",Impression,Port Daletown,India
138969,40614,2025-06-14,23:22:47,162,All,35-44,gaming,2,1,6,Instagram,Carousel,Campaign_6_Winter,84,78607.49,14,6,2,2025,Saturday,47,22,23,Evening,Male,42,35-44,5846,6,"fashion, photography",Impression,Ryanborough,India
303314,235067,2025-05-10,9:29:49,99,Female,35-44,"photography, fitness",1,2,8,Facebook,Image,Campaign_8_Q3,41,39953.19,10,5,2,2025,Saturday,49,29,9,Morning,Male,36,35-44,6856,7,"travel, finance, lifestyle",Impression,Trevormouth,Japan


In [55]:
fake_data.sample(3)

Unnamed: 0,event_id,date,time,ad_id,target_gender,target_age_group,target_interests,ad_platform_id,ad_type_id,campaign_id,ad_platform,ad_type,name,duration_days,total_budget,day,month,quarter,year,day_of_week,second,minute,hour,time_of_day,user_gender,user_age,age_group,location_id,country_id,interests,event_type_name,location_name,country_name
9691,409692,2025-07-15,22:34:11,115,Other,35-44,technology,2,3,40,Instagram,Stories,Campaign_40_Summer,15,19965.14,15,7,3,2025,Tuesday,11,34,22,Night,Other,51,45-54,4501,4,technology,Impression,Munich,Germany
8046,408047,2025-07-22,4:36:20,46,Other,All,"gaming, travel",2,1,3,Instagram,Carousel,Campaign_3_Winter,58,41924.35,22,7,3,2025,Tuesday,20,36,4,Night,Female,46,45-54,5661,2,technology,Impression,Manchester,United Kingdom
6040,406041,2025-07-20,0:28:40,4,Female,25-34,"gaming, music, cooking",1,4,49,Facebook,Video,Campaign_49_Winter,37,37316.31,20,7,3,2025,Sunday,40,28,0,Night,Other,45,45-54,3329,2,technology,Click,Liverpool,United Kingdom


In [56]:
processed_df = pd.concat([locker_data_df, fake_data], ignore_index=True).drop_duplicates()
print(f"Combined (no duplicates): {len(processed_df)} rows")

Combined (no duplicates): 470000 rows


In [57]:
processed_df.sample(5)

Unnamed: 0,event_id,date,time,ad_id,target_gender,target_age_group,target_interests,ad_platform_id,ad_type_id,campaign_id,ad_platform,ad_type,name,duration_days,total_budget,day,month,quarter,year,day_of_week,second,minute,hour,time_of_day,user_gender,user_age,age_group,location_id,country_id,interests,event_type_name,location_name,country_name
166786,42836,2025-07-18,10:22:31,67,All,18-24,"technology, travel",1,1,40,Facebook,Carousel,Campaign_40_Summer,47,53936.41,18,7,3,2025,Friday,31,22,10,Morning,Male,29,25-34,5525,10,"lifestyle, health, art",Like,Port Ryanland,United States
300827,233567,2025-06-13,14:49:13,85,Female,18-24,"finance, sports",1,1,20,Facebook,Carousel,Campaign_20_Winter,90,98904.66,13,6,2,2025,Friday,13,49,14,Afternoon,Male,22,18-24,2238,6,technology,Click,Jamesberg,India
23033,104071,2025-06-23,13:01:55,23,All,18-24,news,1,2,33,Facebook,Image,Campaign_33_Summer,87,59264.68,23,6,2,2025,Monday,55,1,13,Afternoon,Male,33,25-34,6332,10,"fashion, lifestyle",Impression,South Johnborough,United States
55585,121364,2025-07-04,20:26:01,33,Female,All,photography,2,1,15,Instagram,Carousel,Campaign_15_Launch,82,85407.23,4,7,3,2025,Friday,1,26,20,Evening,Male,27,25-34,3354,9,"lifestyle, news, gaming",Impression,Lewisview,United Kingdom
44909,22316,2025-08-06,0:26:49,170,Male,35-44,sports,2,2,40,Instagram,Image,Campaign_40_Summer,47,53936.41,6,8,3,2025,Wednesday,49,26,0,Night,Female,16,16-17,3887,2,technology,Impression,New Ann,Brazil


In [62]:
print(processed_df['date'].min())
print(processed_df['date'].max())

2025-05-07
2025-08-06


## 

## Save combined dataset

In [59]:
processed_df.to_csv(r'data\fake_data\processed_data.csv', index=False)