# Generating Fake Data

In [1]:
from faker import Faker
from faker.providers import BaseProvider
import datetime
import numpy as np

# Seed both random sources from one constant so the notebook reproduces the same
# data on every run: Faker.seed covers the values drawn through Faker itself, and
# np.random.seed covers the np.random.choice calls made by the custom provider.
SEED = 12
Faker.seed(SEED)
np.random.seed(SEED)

# Shared Faker instance used by every generator cell in this notebook
create = Faker()

In [2]:
# Number of rows in the generated dataset. It is the default sample count of
# campaign_date and the loop bound used when the other columns are generated.
dataset_length: int = 300

In [3]:
def campaign_date(year, month, start_date, end_date, length=None):
    """
    Draw random timestamps inside a day-of-month window of a single month.

    The notebook uses this for both the campaign start and the campaign end
    columns.

    Parameters
    ----------
    year, month : int
        Calendar year and month of the window.
    start_date, end_date : int
        Day-of-month bounds of the window; each is combined with ``year`` and
        ``month`` into a ``datetime.date``.
    length : int, optional
        Number of timestamps to draw. When omitted, the current value of the
        notebook-level ``dataset_length`` is used.

    Returns
    -------
    list
        ``length`` values returned by ``create.date_time_between_dates``.

    Raises
    ------
    ValueError
        If the year/month/day combination is not a valid calendar date.
    """
    # Resolve the default at call time: a default written as `length=dataset_length`
    # is evaluated once when the function is defined, so changing dataset_length
    # later without re-running this cell would silently keep the old size.
    if length is None:
        length = dataset_length

    window_start = datetime.date(year, month, start_date)  # earliest bound of the window
    window_end = datetime.date(year, month, end_date)      # latest bound of the window

    return [
        create.date_time_between_dates(
            datetime_start=window_start,
            datetime_end=window_end,
        )
        for _ in range(length)
    ]
    

In [4]:
# Start timestamps are drawn from the 1-30 August 2017 window,
# end timestamps from the 10-15 November 2017 window.
campaign_start = campaign_date(year=2017, month=8, start_date=1, end_date=30)
campaign_end = campaign_date(year=2017, month=11, start_date=10, end_date=15)

In [5]:
# Custom Faker provider: after registration, its public methods are called
# through the shared `create` instance (create.version(), create.state(), ...).
class MyProvider(BaseProvider):
    """Supplies the ad-experiment fields: ad version, US state and sex.

    Every method picks one entry from a fixed pool with ``np.random.choice``,
    so the picks follow NumPy's global seed rather than Faker's.
    """

    # Candidate values; the order matters because the random pick is by position.
    _AD_VERSIONS = ('A', 'B')
    _SEXES = ('M', 'F')
    _STATES = (
        "Alaska", "Alabama", "Arkansas", "Arizona", "California",
        "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
        "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
        "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland",
        "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
        "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
        "New Jersey", "New Mexico", "Nevada", "New York", "Ohio",
        "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
        "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
        "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming",
    )

    def version(self):
        """Return the version of the ad shown: 'A' or 'B'."""
        return np.random.choice(self._AD_VERSIONS)

    def state(self):
        """Return the US state in which the person lives."""
        return np.random.choice(self._STATES)

    def sex(self):
        """Return the sex of the person: 'M' or 'F'."""
        return np.random.choice(self._SEXES)


# Register the provider so its methods become available on `create`
create.add_provider(MyProvider)

In [6]:
# Each list below becomes one column of the dataframe.
version_of_ad = []
age = []
state = []
sex = []
clicked_on_ad = []

# One pass per row. The calls stay in this exact order inside a single loop
# because they share seeded random streams: reordering them, or splitting them
# into per-column loops, would produce different data for the same seed.
for _ in range(dataset_length):
    version_of_ad.append(create.version())
    age.append(create.pyint(min_value=18, max_value=85))
    state.append(create.state())
    sex.append(create.sex())
    clicked_on_ad.append(create.boolean())

# Amounts are drawn only after every row's click flag exists (again to keep the
# random stream order). Rows without a click spend nothing; 0.0 keeps the column
# float-typed no matter how many rows clicked.
amount_spent = [
    create.pyfloat(min_value=1, max_value=200) if clicked else 0.0
    for clicked in clicked_on_ad
]

    



In [7]:
import pandas as pd

# Assemble the generated columns into one dataframe; amounts are rounded to
# two decimal places on the way in.
ad_data = pd.DataFrame(
    data={
        'campaign_start': campaign_start,
        'campaign_end': campaign_end,
        'ad_version': version_of_ad,
        'age': age,
        'sex': sex,
        'state': state,
        'clicked_on_ad': clicked_on_ad,
        'amount_spent': [round(spent, 2) for spent in amount_spent],
    }
)

In [8]:
# Bare last expression: the notebook renders the dataframe as the cell output
ad_data

Unnamed: 0,campaign_start,campaign_end,ad_version,age,sex,state,clicked_on_ad,amount_spent
0,2017-08-24 00:54:53,2017-11-14 18:52:28,B,70,M,North Dakota,True,58.95
1,2017-08-14 01:24:06,2017-11-11 05:52:06,B,36,M,Wyoming,True,196.00
2,2017-08-26 16:28:55,2017-11-11 13:05:21,B,75,M,Arizona,True,76.48
3,2017-08-17 23:31:46,2017-11-10 15:58:09,A,85,F,Minnesota,True,102.33
4,2017-08-07 22:08:29,2017-11-11 15:01:02,A,49,F,Colorado,True,17.30
...,...,...,...,...,...,...,...,...
295,2017-08-24 21:44:56,2017-11-13 12:03:19,A,80,F,New Hampshire,True,194.88
296,2017-08-19 16:08:59,2017-11-11 21:46:03,B,35,F,Kansas,False,0.00
297,2017-08-20 09:30:40,2017-11-11 02:06:31,A,65,F,Montana,False,0.00
298,2017-08-09 20:52:01,2017-11-11 13:44:23,A,22,M,Alaska,True,154.20


In [9]:
# Count the rows assigned to each ad version (A and B) to see how balanced the two groups are
ad_data['ad_version'].value_counts()

B    159
A    141
Name: ad_version, dtype: int64

In [10]:
# Write the generated dataset to ad_data.csv, leaving out the dataframe index column
ad_data.to_csv('ad_data.csv', index = False)