In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
def load_dataset(path, isBidSet=False):
    """Read a pickled DataFrame and return it without its ``LogType`` column.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file, handed directly to ``pd.read_pickle``.
    isBidSet : bool, default False
        When falsy, only the first 23 columns are kept before ``LogType`` is
        dropped; when truthy, every column is kept.

    Returns
    -------
    pandas.DataFrame
        The loaded frame minus the ``LogType`` column.

    Raises
    ------
    KeyError
        If ``LogType`` is not among the columns that remain after slicing.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    frame = pd.read_pickle(path)
    kept = frame if isBidSet else frame.iloc[:, :23]
    return kept.drop(columns=["LogType"])

In [3]:
def _known_values(series):
    """Return distinct values of ``series`` (first-seen order) without NaN or "unknown".

    Returns ``np.nan`` instead of an empty list when nothing is left.
    """
    values = [v for v in series.unique() if not pd.isna(v) and v != "unknown"]
    return values if values else np.nan


def initialize_dataset(work_df):
    """Collapse a log frame to one row per (CreativeID, AdslotID, Adslotvisibility).

    Parameters
    ----------
    work_df : pandas.DataFrame
        Must contain the three grouping columns plus ``AdvertiserID``,
        ``User-Agent``, ``Region``, ``City``, ``AdExchange``, ``Adslotwidth``,
        ``Adslotheight``, ``Adslotformat``, ``Adslotfloorprice`` and
        ``BiddingPrice``.

    Returns
    -------
    pandas.DataFrame
        One row per group. Scalar attributes are copied from the first row of
        the group, ``Regions`` / ``City`` hold the list of distinct known
        values seen in the group (``np.nan`` if none), and ``num_impressions``
        is the number of rows in the group.
    """
    group_keys = ['CreativeID', 'AdslotID', 'Adslotvisibility']
    records = []

    # A single multi-key groupby visits the same groups, in the same sorted
    # order, as three nested single-key groupbys.
    for (creative_id, adslot_id, visibility), grp in work_df.groupby(group_keys):
        # The first row stands in for the per-group scalar attributes.
        first = grp.iloc[0]
        records.append({
            'CreativeID': creative_id,
            'AdvertiserID': first['AdvertiserID'],
            'User-Agent': first['User-Agent'],
            # The original city filter (`not x == "unknown" or not x == np.nan`)
            # was always true, so placeholders leaked into the list.
            'Regions': _known_values(grp['Region']),
            'City': _known_values(grp['City']),
            'AdExchange': first['AdExchange'],
            'Adslotwidth': first['Adslotwidth'],
            'Adslotheight': first['Adslotheight'],
            'AdslotID': adslot_id,
            'Adslotvisibility': visibility,
            'Adslotformat': first['Adslotformat'],
            'Adslotfloorprice': first['Adslotfloorprice'],
            'BiddingPrice': first['BiddingPrice'],
            'num_impressions': len(grp),
        })

    return pd.DataFrame(records)

In [4]:
def get_number(row, df):
    """Count the rows of ``df`` that share ``row``'s creative, ad slot and visibility.

    Parameters
    ----------
    row : mapping or pandas.Series
        Provides ``CreativeID``, ``AdslotID`` and ``Adslotvisibility``.
    df : pandas.DataFrame
        Frame with the same three columns to search.

    Returns
    -------
    int
        Number of rows in ``df`` matching on all three keys.
    """
    match_keys = ('CreativeID', 'AdslotID', 'Adslotvisibility')
    # Start from the first key's mask and narrow it with each remaining key.
    mask = df[match_keys[0]] == row[match_keys[0]]
    for key in match_keys[1:]:
        mask = mask & (df[key] == row[key])
    return len(df[mask])

In [5]:
def get_device_type(row):
    """Classify a row's ``User-Agent`` string as Desktop, Tablet, Mobile or unknown.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a ``User-Agent`` entry; it is converted with ``str()``
        and lower-cased before matching, so missing values fall through to
        ``'unknown'``.

    Returns
    -------
    str
        One of ``'Desktop'``, ``'Tablet'``, ``'Mobile'`` or ``'unknown'``.
    """
    agent = str(row['User-Agent']).lower()

    # Unambiguous desktop markers are checked first.
    if 'windows nt' in agent or 'macintosh' in agent:
        return 'Desktop'

    # Tablets come before phones: iPad user agents also contain "Mobile".
    if 'ipad' in agent or 'tablet' in agent:
        return 'Tablet'

    if 'android' in agent or 'iphone' in agent or 'mobile' in agent:
        return 'Mobile'

    # "linux" is tested last because Android user agents contain "Linux" too;
    # testing it earlier labelled every Android device as Desktop.
    if 'linux' in agent:
        return 'Desktop'

    return 'unknown'

In [41]:
# Load the impression, click and conversion logs (in that order), then
# aggregate the impressions into the working frame.
imp_df, clk_df, conv_df = (
    load_dataset(pkl_path)
    for pkl_path in ("data/imp_12.pkl", "data/clk_12.pkl", "data/conv_12.pkl")
)

df = initialize_dataset(imp_df)

In [42]:
# Derive a device category for every aggregated row from its User-Agent.
df['device_type'] = df.apply(get_device_type, axis=1)

In [43]:
# Count matching conversion-log rows per aggregated row; get_number filters
# the whole of conv_df once for each row of df.
df['num_conversions'] = df.apply(get_number, axis=1, args=(conv_df,))

In [44]:
# Count matching click-log rows per aggregated row; get_number filters the
# whole of clk_df once for each row of df.
df['num_clicks'] = df.apply(get_number, axis=1, args=(clk_df,))

In [45]:
# Click-through rate: clicks divided by impressions, element-wise.
df['ctr'] = df['num_clicks'].div(df['num_impressions'])

In [46]:
# The raw User-Agent string is no longer needed once device_type exists.
df = df.drop(columns='User-Agent')

In [47]:
# Persist the aggregated frame as both pickle and CSV. The output directory
# is created first so a fresh checkout does not fail at the very last step.
os.makedirs("processed_data", exist_ok=True)
df.to_pickle("processed_data/df_12.pkl")
df.to_csv("processed_data/df_12.csv", index=False)

In [76]:
# Replace missing City / Region values with the "unknown" placeholder,
# then collect the distinct region values.
for _col in ('City', 'Region'):
    imp_df[_col] = imp_df[_col].fillna('unknown')
all_regions = imp_df['Region'].unique()

In [77]:
# Map every known region to the distinct cities observed in it, leaving out
# the "unknown" placeholder on both sides. The frame is grouped once rather
# than once per region; sort=False keeps regions in first-seen order, which
# is the order of imp_df['Region'].unique().
region_city_map = {
    region: [city for city in cities.unique().tolist() if city != "unknown"]
    for region, cities in imp_df.groupby('Region', sort=False)['City']
    if region != "unknown"
}

In [81]:
import json

# Serialise the region -> cities mapping to JSON on disk.
with open("region_city_map.json", "w") as out_file:
    out_file.write(json.dumps(region_city_map))

In [87]:
# Inspect the Python type of the first timestamp. `.iloc[0]` is positional,
# so it works even when the frame's index has no label 0.
type(imp_df['Timestamp'].iloc[0])

str

In [88]:
from datetime import datetime

In [None]:
# Hour-of-day -> event count, one independent mapping per log type.
imp_hour_map, click_hour_map = {}, {}

In [122]:
# Count impressions per hour of day. The map is rebuilt from scratch here so
# re-running the cell does not add the same impressions a second time.
imp_hour_map = {}
for raw_ts in imp_df['Timestamp']:
    # Drop everything after the first '.' before parsing the timestamp.
    hour = datetime.strptime(raw_ts.split('.')[0], '%Y-%m-%d %H:%M:%S').hour
    imp_hour_map[hour] = imp_hour_map.get(hour, 0) + 1

In [123]:
# Count clicks per hour of day. The map is rebuilt from scratch here so
# re-running the cell does not add the same clicks a second time.
click_hour_map = {}
for raw_ts in clk_df['Timestamp']:
    # Drop everything after the first '.' before parsing the timestamp.
    hour = datetime.strptime(raw_ts.split('.')[0], '%Y-%m-%d %H:%M:%S').hour
    click_hour_map[hour] = click_hour_map.get(hour, 0) + 1

In [126]:
# Combine the hourly counts and derive the click-through rate per hour.
# An hour with impressions but no clicks has no key in click_hour_map, so
# it is looked up with a default of 0 instead of raising KeyError.
full_hour_map = {
    'Impressions': imp_hour_map,
    'Clicks': click_hour_map,
    'CTR': {
        hour: click_hour_map.get(hour, 0) / n_impressions
        for hour, n_impressions in imp_hour_map.items()
    },
}

In [129]:
import json

# Serialise the hourly impression / click / CTR tables to JSON on disk.
with open("hourly_data.json", "w") as out_file:
    out_file.write(json.dumps(full_hour_map))