# Generating Fake Data

Since I could not find the data I was initially looking for, I decided this would be a great time to practice generating artificial data.

In [1]:
#importing libraries
from faker import Faker
from faker.providers import BaseProvider
import datetime
import numpy as np
import pandas as pd

#one seed value shared by both random sources so every run reproduces the same data
SEED = 12
Faker.seed(SEED)
np.random.seed(SEED)

#Faker instance used for all generated values in this notebook
create = Faker()

In [2]:
#number of rows in the dataset; every generated column is built to this length
dataset_length = 500

In [3]:
def campaign_date(year, month, start_date, end_date, length = dataset_length):
    """
    Draw `length` random timestamps that all fall inside one calendar month.

    The notebook calls this once for the campaign start dates and once for
    the campaign end dates.

    Parameters
    ----------
    year, month : int
        Calendar year and month shared by both bounds of the window.
    start_date, end_date : int
        Day of the month used for the lower and upper bound of the window.
    length : int
        How many timestamps to draw. Defaults to the value `dataset_length`
        held when this function was defined.

    Returns
    -------
    list
        The values returned by `create.date_time_between_dates`, in draw order.
    """
    window_opens = datetime.date(year, month, start_date)  #lower bound of the window
    window_closes = datetime.date(year, month, end_date)  #upper bound of the window

    #one draw from the shared Faker instance per requested row
    return [
        create.date_time_between_dates(
            datetime_start=window_opens,
            datetime_end=window_closes,
        )
        for _ in range(length)
    ]
    

In [4]:
#start timestamps are bounded by 1 and 30 October 2019, end timestamps by 10 and 15 December 2019
campaign_start = campaign_date(year=2019, month=10, start_date=1, end_date=30)
campaign_end = campaign_date(year=2019, month=12, start_date=10, end_date=15)

I decided to start the campaign in October for an approximately month-long engagement. According to the following [link](https://joinative.com/holiday-ads-tips#), Statista surveyed individuals on when they would be likely to begin holiday shopping. I also modeled the end of the campaign date range on the results of this survey.

In [5]:
#MyProvider adds three custom generator methods to the Faker instance.
class MyProvider(BaseProvider):
    """Custom provider with `version`, `state` and `sex`, each picked via np.random.choice."""

    #option pools the methods below choose from
    _AD_VERSIONS = ['A', 'B']
    _US_STATES = [
        "Alaska", "Alabama", "Arkansas", "Arizona", "California",
        "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
        "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
        "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland",
        "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
        "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
        "New Jersey", "New Mexico", "Nevada", "New York", "Ohio",
        "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
        "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
        "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming",
    ]
    _SEXES = ['M', 'F']

    def version(self):
        """Return the ad version shown, either 'A' or 'B'."""
        return np.random.choice(self._AD_VERSIONS)

    def state(self):
        """Return the name of the state in which the person lives."""
        return np.random.choice(self._US_STATES)

    def sex(self):
        """Return the person's sex, either 'M' or 'F'."""
        return np.random.choice(self._SEXES)

#register the provider so its methods can be called on `create`
create.add_provider(MyProvider)

In [6]:
#the lists below are going to be columns in the dataframe
version_of_ad = []
age = []
state = []
sex = []
clicked_on_ad = []

#One iteration per row. The calls are kept in this exact order because they
#consume seeded random streams; reordering them would change the generated data.
for _ in range(dataset_length):
    version_of_ad.append(create.version())
    age.append(create.pyint(min_value=18, max_value=85))
    state.append(create.state())
    sex.append(create.sex())
    clicked_on_ad.append(create.boolean())

#Only rows that clicked on the ad get a spend between 1 and 200; the rest are
#recorded as 0.0 so every entry in the list is a float.
amount_spent = [
    create.pyfloat(min_value=1, max_value=200) if clicked else 0.0
    for clicked in clicked_on_ad
]

    



In [7]:
#assembling the generated columns into a single dataframe
ad_data = pd.DataFrame(
    data={
        'campaign_start': campaign_start,
        'campaign_end': campaign_end,
        'ad_version': version_of_ad,
        'age': age,
        'sex': sex,
        'state': state,
        'clicked_on_ad': clicked_on_ad,
        #spend is rounded to two decimal places
        'amount_spent': [round(spend, 2) for spend in amount_spent],
    }
)

In [8]:
#displaying the dataframe to inspect the generated rows
ad_data

Unnamed: 0,campaign_start,campaign_end,ad_version,age,sex,state,clicked_on_ad,amount_spent
0,2019-10-24 00:54:53,2019-12-10 19:54:30,B,29,M,North Dakota,True,8.54
1,2019-10-14 01:24:06,2019-12-10 03:35:27,B,44,M,Wyoming,True,86.92
2,2019-10-26 16:28:55,2019-12-10 10:49:22,B,73,M,Arizona,False,0.00
3,2019-10-17 23:31:46,2019-12-14 01:32:07,A,33,F,Minnesota,True,4.00
4,2019-10-07 22:08:29,2019-12-14 19:02:02,A,28,F,Colorado,False,0.00
...,...,...,...,...,...,...,...,...
495,2019-10-25 08:21:48,2019-12-10 20:54:20,B,80,M,Rhode Island,True,61.00
496,2019-10-17 11:41:07,2019-12-10 03:08:58,B,48,F,Minnesota,False,0.00
497,2019-10-18 05:55:39,2019-12-14 19:18:19,A,71,F,Delaware,True,115.71
498,2019-10-20 08:51:27,2019-12-12 05:44:18,B,83,F,Louisiana,True,40.24


In [9]:
#counting how many rows were assigned to each of the 2 ad versions (A and B)
ad_data['ad_version'].value_counts()

B    258
A    242
Name: ad_version, dtype: int64

The two ad versions are roughly evenly distributed (258 rows for B and 242 for A).

In [10]:
#saving this dataframe to a csv; index = False leaves the row index out of the file
ad_data.to_csv('ad_data.csv', index = False)