In [1]:
# Safe to re-run: the parent directory is resolved only once per kernel session.
import os
import sys

# Resolve the notebook's parent directory (presumably the project root that holds
# `src`) to an absolute path exactly once. Guarding on the global keeps a second
# execution of this cell from climbing one more directory level up.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)
    os.chdir(PROJECT_ROOT)

# Register the absolute path: a relative "../" entry would stop pointing at this
# directory as soon as the working directory changes. Skip it if already present.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import pandas as pd
import numpy as np
from src.config import DATA_PATH
from src.common import input_keys
from src.generate.generators import generate_ip_space, generate_bot_entries, generate_human_entries

%load_ext autoreload
%autoreload 2

In [3]:
# Read the assessment sample, keeping only the columns named in input_keys.USECOLS.
sample = pd.read_csv(
    DATA_PATH / 'Sample Data for Candidate Assessment.csv',
    usecols=input_keys.USECOLS,
)

In [4]:
# Preview the first five rows of the loaded sample.
sample.head(n=5)

Unnamed: 0,REQUEST_TIME,DEVICE_IP,DEVICE_IFA,DEVICE_LANGUAGE,GEO_CURRENT_CITY
0,2022-10-19T15:51:07.387Z,5.28.186.193,741b39aa-3cf6-4fdf-863c-4ca8ab11102f,he,Petah Tiqwa
1,2022-10-19T15:50:05Z,5.28.177.198,5b932c2d-a3b9-4220-9b28-aa4d93c57a59,he,Givatayim
2,2022-10-19T15:56:35.587Z,5.28.186.210,96aa31a6-3ec3-4bab-b33d-ddcb164312e7,en,Bat Yam
3,2022-10-19T15:50:31.205Z,5.28.185.0,AADvyU7EZrYAAC9tt5XFAA,en,Givatayim
4,2022-10-19T15:51:30.363Z,5.28.185.3,AADwHU7GFEoAAA8i1vj7-Q,ru,Givatayim


In [5]:
# Number of distinct non-null values in each column of the sample.
sample.nunique(axis=0)

REQUEST_TIME        489
DEVICE_IP           347
DEVICE_IFA          391
DEVICE_LANGUAGE       9
GEO_CURRENT_CITY     36
dtype: int64

# Generate sampling space for attributes: IP, Language, City

In [6]:
# NOTE(review): 2000 is presumably the number of IP addresses to generate —
# confirm against generate_ip_space.
IP_SPACE = generate_ip_space(2000)

# Empirical language distribution of the sample: the index holds the observed
# values (most frequent first) and the values hold their relative frequencies.
lang_freq = sample[input_keys.LANG_KEY].value_counts(normalize=True)
LANG_SPACE, LANG_P = lang_freq.index, lang_freq.values

# Same construction for the city distribution.
city_freq = sample[input_keys.CITY_KEY].value_counts(normalize=True)
CITY_SPACE, CITY_P = city_freq.index, city_freq.values

# Define the samples generation process

The pipeline is as follows:
1) Define the mass distribution of requests over 24 hours (applying sleeping windows for human entries)
2) Sample a random total number of entries per day based on that distribution
3) Sample the entries for each hour:
    - Take the desired number of entries for the hour and sample their request times randomly
    - For bot entries, variations can also be applied by sampling a large number of events within a single minute, or by sampling the hour at equal intervals
    - For bot entries, noise can also be introduced by selecting marginal cities or languages

In [7]:
def generate_dataset(n_entries=1500, proportion_of_bots=0.2):
    """Build a synthetic dataset by mixing bot and human generator output.

    Performs ``n_entries`` independent draws. Each draw uses the bot generator
    with probability ``proportion_of_bots`` and the human generator otherwise,
    and appends everything that call returns. Because each call's output is
    added with ``list.extend``, a single draw may contribute several rows, so
    the resulting frame is not guaranteed to have exactly ``n_entries`` rows.

    Reads the notebook-level IP_SPACE, LANG_SPACE, LANG_P, CITY_SPACE and CITY_P.

    Args:
        n_entries: Number of generator calls to make.
        proportion_of_bots: Probability, in [0, 1], that a draw uses the bot
            generator.

    Returns:
        pandas.DataFrame whose columns are ``input_keys.USECOLS`` followed by
        ``GENERATED_AS_BOT``.
    """
    records = []
    for _ in range(n_entries):
        # np.random.random() is uniform on [0, 1), so a strict "<" guarantees that
        # proportion_of_bots=0 never yields a bot while 1 always does.
        is_bot = np.random.random() < proportion_of_bots
        generate_entries = generate_bot_entries if is_bot else generate_human_entries
        records.extend(
            generate_entries(IP_SPACE, LANG_SPACE, LANG_P, CITY_SPACE, CITY_P)
        )
    # Unpacking works for any sequence type of USECOLS, not only a list.
    return pd.DataFrame(records, columns=[*input_keys.USECOLS, 'GENERATED_AS_BOT'])

In [8]:
# Seed NumPy's global generator so that the random dataset sizes below and the
# bot/human draws made through np.random inside generate_dataset repeat on every
# Restart & Run All.
# NOTE(review): the helpers in src.generate.generators may draw from other
# randomness sources (e.g. for DEVICE_IFA) — confirm whether this seed covers them.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# np.random.randint excludes the upper bound, e.g. 2000 <= n_entries < 4000.
data = generate_dataset(n_entries=np.random.randint(2000, 4000))
test_data = generate_dataset(n_entries=np.random.randint(400, 600))
api_data = generate_dataset(n_entries=np.random.randint(10, 20))

In [9]:
# Preview the first five generated rows.
data.head(n=5)

Unnamed: 0,REQUEST_TIME,DEVICE_IP,DEVICE_IFA,GEO_CURRENT_CITY,DEVICE_LANGUAGE,GENERATED_AS_BOT
0,2022-12-21 00:50:53.675734,181.160.87.69,336a7994-5681-4bf4-aedd-094a9f66fc8b,Tel Aviv,en,0
1,2022-12-21 08:12:54.768470,181.160.87.69,336a7994-5681-4bf4-aedd-094a9f66fc8b,Tel Aviv,en,0
2,2022-12-21 09:23:27.587814,246.251.252.84,336a7994-5681-4bf4-aedd-094a9f66fc8b,Tel Aviv,en,0
3,2022-12-21 10:10:07.653369,246.251.252.84,336a7994-5681-4bf4-aedd-094a9f66fc8b,Tel Aviv,en,0
4,2022-12-21 11:00:28.802860,181.160.87.69,336a7994-5681-4bf4-aedd-094a9f66fc8b,Tel Aviv,en,0


In [10]:
# The two larger datasets share one format: gzip-compressed, ';'-separated, no index.
gzip_csv_options = dict(sep=';', compression='gzip', index=False)
data.to_csv(DATA_PATH / 'generated_ad_data.csv.gz', **gzip_csv_options)
test_data.to_csv(DATA_PATH / 'generated_ad_data_test.csv.gz', **gzip_csv_options)

# The small API fixture is written as a plain, comma-separated, uncompressed CSV.
api_data.to_csv(DATA_PATH / 'test_api.csv', index=False)