In [1]:
import glob
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# City lookup table: numeric city id -> city name.
# The file is tab-separated; column names are supplied here via `names`.
city_df = pd.read_csv(
    'city.txt',
    sep='\t',
    names=['Id', 'City'],
)
city_df

Unnamed: 0,Id,City
0,0,unknown
1,4,shijiazhuang
2,5,tangshan
3,6,qinhuangdao
4,7,handan
...,...,...
357,388,aletai
358,389,shihezi
359,390,alaer
360,391,tumushuke


In [3]:
# Region lookup table: numeric region id -> region name.
# The file is tab-separated; column names are supplied here via `names`.
region_df = pd.read_csv(
    'region.txt',
    sep='\t',
    names=['Id', 'Region'],
)
region_df.head()

Unnamed: 0,Id,Region
0,0,unknown
1,1,beijing
2,2,tianjin
3,3,hebei
4,15,shanxi


In [4]:
# User-profile tag lookup table: numeric tag id -> human-readable tag label.
# The file is tab-separated; column names are supplied here via `names`.
user_tags_df = pd.read_csv(
    'user.profile.tags.txt',
    sep='\t',
    names=['Id', 'Tag'],
)
user_tags_df.head()

Unnamed: 0,Id,Tag
0,10006,Long-term interest/news
1,10024,Long-term interest/eduation
2,10031,Long-term interest/automobile
3,10048,Long-term interest/real estate
4,10052,Long-term interest/IT


In [5]:
# Plain dict {tag id -> tag label}, used for per-row lookups when decoding
# the comma-separated UserProfileTags column.
user_tags = user_tags_df.set_index('Id')['Tag'].to_dict()

# Id 0 is the placeholder substituted for rows without any profile tags;
# it decodes to 'unknown', a column that is dropped again after encoding.
user_tags[0] = 'unknown'

In [6]:
# Column names, in positional order, applied to the headerless log files
# that are read with `names=col_list`.
col_list = [
    # request / visitor identity
    "BidID", "Timestamp", "LogType", "VisitorID", "User-Agent", "IP",
    # geography and exchange
    "Region", "City", "AdExchange",
    # page the ad was shown on
    "Domain", "URL", "AnonymousURLID",
    # ad slot description
    "AdslotID", "Adslotwidth", "Adslotheight",
    "Adslotvisibility", "Adslotformat", "Adslotfloorprice",
    # creative, prices and advertiser
    "CreativeID", "BiddingPrice", "PayingPrice", "KeyPageURL", "AdvertiserID",
    # comma-separated tag ids
    "UserProfileTags",
]

# Integer LogType code -> readable event name.
log_type_map = {
    1: 'Impression',
    2: 'Click',
    3: 'Conversion',
}

In [7]:
# Integer AdExchange code -> exchange label.
ad_exchange_map = {
    1: 'Tanx (Alibaba)',
    2: 'Adx (Google DoubleClick AdX)',
    3: 'Tencent (Tencent)',
    4: 'Baidu (Baidu)',
    5: 'Youku (Youku)',
    6: 'Amx (Google Mobile)',
}

# Integer Adslotvisibility code -> placement label.
adslot_visibility_map = {
    0: 'unknown',
    1: 'Above Fold',
    2: 'Below Fold',
}

# Integer Adslotformat code -> format label.
# NOTE(review): the recorded bid.10 preview contains Adslotformat == 0, which
# has no entry here and therefore maps to NaN - confirm whether 0 should be
# labelled (e.g. 'unknown', as in adslot_visibility_map).
adslot_format_map = {
    1: 'fixed',
    2: 'popup',
}

In [8]:
# Folder holding the raw tab-separated log files. The trailing slash is
# required because the glob pattern below is built by string interpolation.
path = '../Adobe Devcraft Dataset/dataset/'

# glob returns matches in arbitrary, platform-dependent order. Sorting keeps
# the batch order reproducible between runs, which matters because the
# processing loops stop at the first file that fails.
dataset_files = sorted(glob.glob(f'{path}*.txt'))

In [9]:
def convert_timestamp_format(timestamp):
    """Convert a compact ``YYYYmmddHHMMSSfff`` log timestamp to a readable string.

    Args:
        timestamp: The timestamp as an int or str, e.g. ``20130606000110312``.
            Any digits after the seconds are parsed as a fractional second.

    Returns:
        str: The timestamp formatted as ``'YYYY-mm-dd HH:MM:SS.ffffff'``,
        with a six-digit, zero-padded microsecond field.

    Raises:
        ValueError: If ``str(timestamp)`` does not match the expected layout.
    """
    parsed = datetime.strptime(str(timestamp), '%Y%m%d%H%M%S%f')

    # Let strftime render the fraction: %f always emits six zero-padded
    # digits. Formatting the integer `microsecond` attribute directly drops
    # leading zeros, so 12 ms (12000 us) would be written as ".12000" and be
    # read back as 120 ms.
    return parsed.strftime('%Y-%m-%d %H:%M:%S.%f')

In [10]:
def process_dataset(df, bidRequest=False):
    """Decode the integer-coded columns of a raw log frame into readable values.

    Region, City, AdExchange, Adslotvisibility and Adslotformat are replaced
    by their labels, and Timestamp is reformatted with
    ``convert_timestamp_format``. Codes without an entry in the corresponding
    lookup become NaN. When ``bidRequest`` is False, LogType is decoded as
    well and UserProfileTags is expanded into one 0/1 indicator column per
    tag label (the placeholder 'unknown' column is removed).

    Args:
        df: Frame read from one raw log file. The decoded columns are
            assigned onto this frame, so the caller's object is modified.
        bidRequest: True for bid-request files; LogType decoding and the
            UserProfileTags expansion are skipped for them.

    Returns:
        pandas.DataFrame: The processed frame. With ``bidRequest=True`` this
        is the same object as ``df``; otherwise it is a new frame that also
        holds the tag indicator columns and no longer has UserProfileTags.

    Raises:
        KeyError: If a profile tag id is missing from ``user_tags``.
        ValueError: If a tag id is not an integer string or a timestamp does
            not match the expected layout.
    """
    # Build the id -> name lookup Series once per call.
    region_names = region_df.set_index('Id')['Region']
    city_names = city_df.set_index('Id')['City']

    df['Region'] = df['Region'].map(region_names)
    df['City'] = df['City'].map(city_names)
    print('Converted Region & City.....')

    df['AdExchange'] = df['AdExchange'].map(ad_exchange_map)
    df['Adslotvisibility'] = df['Adslotvisibility'].map(adslot_visibility_map)
    df['Adslotformat'] = df['Adslotformat'].map(adslot_format_map)

    print('Mapped other columns.....')

    if not bidRequest:
        df['LogType'] = df['LogType'].map(log_type_map)

        # Rows without tags get the placeholder id '0', which user_tags maps
        # to 'unknown'; each row then becomes a list of tag labels.
        df['UserProfileTags'] = (
            df['UserProfileTags']
            .fillna('0')
            .str.split(',')
            .apply(lambda tag_ids: [user_tags[int(tag_id)] for tag_id in tag_ids])
        )

        mlb = MultiLabelBinarizer()
        encoded_data = mlb.fit_transform(df['UserProfileTags'])
        # Reuse df's own index: concat(axis=1) aligns rows by index label, so
        # a fresh 0..n-1 index would misalign the indicators (and add NaN
        # rows) whenever df does not carry a default RangeIndex.
        ohe_df = pd.DataFrame(encoded_data, columns=mlb.classes_, index=df.index)

        print('Encoded User Profile Tags.....')

        df = pd.concat([df, ohe_df], axis=1)
        df = df.drop('UserProfileTags', axis=1)

        # 'unknown' only marks rows that had no tags; it is not a feature.
        if 'unknown' in df.columns:
            df = df.drop('unknown', axis=1)

        print('Dropped cols.....')

    df['Timestamp'] = df['Timestamp'].apply(convert_timestamp_format)
    print('Coverted timestamp.....')

    return df

In [13]:
# Preprocess every non-bid log file (read with the `col_list` layout) and
# store each result as a NumPy array under np_data/. Processing stops at the
# first file that fails.
for file in dataset_files:
    # Strip the directory part by hand so both '/' and '\' separators are
    # handled (the recorded run shows paths like 'dataset\clk.06.txt').
    file_name = file.split('/')[-1].split('\\')[-1]

    # Filter on the file name only: a directory whose name happens to
    # contain "bid" must not cause every log file to be skipped.
    if 'bid' in file_name:
        continue

    print('-'*20)
    print(f'Processing file: {file}')
    print()

    data = pd.read_csv(file, sep='\t', header=None, names=col_list)
    print('File Loaded.....')

    try:
        data = process_dataset(data)
        print('Preprocessed data.....')

        # 'clk.06.txt' -> 'clk_06'
        name = file_name.split('.')
        name = name[0] + '_' + name[1]

        # Alternative output formats, currently disabled. Note that a later
        # cell reads 'data/clk_06.pkl', which only exists if the pickle line
        # below has been run.
        # data.to_csv(f'data/{name}.csv', index=False)
        # data.to_pickle(f'data/{name}.pkl')

        # np.save does not create missing directories.
        os.makedirs('np_data', exist_ok=True)
        np_data = data.to_numpy()
        np.save(f'np_data/{name}.npy', np_data)

        print('Saved file.....')
        print()
        print(f'Processed file: {file}')
        print('-'*20)

    except Exception as e:
        print(f'Error processing file: {file}')
        print(e)
        break

--------------------
Processing file: ../Adobe Devcraft Dataset/dataset\clk.06.txt

File Loaded.....
Converted Region & City.....
Mapped other columns.....
Encoded User Profile Tags.....
Dropped cols.....
Coverted timestamp.....
Preprocessed data.....
Saved file.....

Processed file: ../Adobe Devcraft Dataset/dataset\clk.06.txt
--------------------
--------------------
Processing file: ../Adobe Devcraft Dataset/dataset\clk.07.txt

File Loaded.....
Converted Region & City.....
Mapped other columns.....
Encoded User Profile Tags.....
Dropped cols.....
Coverted timestamp.....
Preprocessed data.....
Saved file.....

Processed file: ../Adobe Devcraft Dataset/dataset\clk.07.txt
--------------------
--------------------
Processing file: ../Adobe Devcraft Dataset/dataset\clk.08.txt

File Loaded.....
Converted Region & City.....
Mapped other columns.....
Encoded User Profile Tags.....
Dropped cols.....
Coverted timestamp.....
Preprocessed data.....
Saved file.....

Processed file: ../Adobe Devc

In [2]:
# Column names, in positional order, applied to the headerless bid-request
# files that are read with `names=bidRequest_cols`.
# Note: the first column is spelled "BidId" here but "BidID" in `col_list`.
bidRequest_cols = [
    # request / visitor identity
    "BidId", "Timestamp", "iPinYouID", "User-Agent", "IP",
    # geography and exchange
    "Region", "City", "AdExchange",
    # page the ad was requested for
    "Domain", "URL", "AnonymousURLID",
    # ad slot description
    "AdslotID", "Adslotwidth", "Adslotheight",
    "Adslotvisibility", "Adslotformat", "Adslotfloorprice",
    # creative, bid and advertiser
    "CreativeID", "BiddingPrice", "AdvertiserID",
    # comma-separated tag ids
    "UserProfileTags",
]

In [3]:
# Load one raw bid-request file for inspection.
# NOTE(review): the recorded run emitted a warning pointing at this call; its
# message text is not preserved in the saved output.
df = pd.read_csv(
    '../Adobe Devcraft Dataset/dataset/bid.10.txt',
    sep='\t',
    header=None,
    names=bidRequest_cols,
)

  df = pd.read_csv('../Adobe Devcraft Dataset/dataset/bid.10.txt', sep='\t', header=None, names=bidRequest_cols)


In [4]:
# Preview the first rows of the raw (still integer-coded) bid-request frame.
df.head()

Unnamed: 0,BidId,Timestamp,iPinYouID,User-Agent,IP,Region,City,AdExchange,Domain,URL,...,AdslotID,Adslotwidth,Adslotheight,Adslotvisibility,Adslotformat,Adslotfloorprice,CreativeID,BiddingPrice,AdvertiserID,UserProfileTags
0,a39ef4a90a053c414d0116008a2df8d7,20130610000102810,,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,125.68.251.*,276,282,2,,c96b158cfea64c1752b6e43d2f90d550,...,3175797632,120,600,0,0,250,e87d7633d474589c2e2e3ba4eda53f6c,300,3386,
1,b55c94584c3df13b45010e1d01ca140d,20130610000102810,VhC0LDK3OTssXsj,mozilla/4.0 (compatible; msie 6.0; windows nt ...,115.237.8.*,94,100,1,DFpETJn8Penx1m54,cf67e9aeaeac8dbc16c869f4877fc5aa,...,mm_10941301_943789_11236963,728,90,0,1,0,7184c9560e68e977187e67e45a4f3198,227,3427,
2,4c4773749c3ac8a0313cb937f0438802,20130610000102812,VhTC1psxD4Nc3gL,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,110.87.213.*,124,131,2,trqRTJkyOeas1m58uG,3b53e082b33c617f3b7faa749a18ae7b,...,477946803,728,90,1,0,5,7184c9560e68e977187e67e45a4f3198,238,3427,
3,78306eb271d9ff74862400de0106de3b,20130610000102812,,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,219.133.128.*,216,219,2,,14813226981bea2d88d5a25b72bd895b,...,3366799063,300,250,2,0,5,44966cc8da1ed40c95d59e863c8c75f0,300,3386,
4,41b8bfc4395b2765b83f70cc75ee47d5,20130610000102813,VhkyL5MG3ZFfGVn,mozilla/5.0 (compatible; msie 9.0; windows nt ...,114.217.166.*,80,85,1,trqRTuqUGZc7gsk,b1244d82ae90d86e5c4def950088ab0a,...,mm_12062233_2267501_10873713,160,600,0,1,0,47905feeb59223468fb898b3c9ac024d,300,3386,


In [7]:
# Store the raw bid.10 frame as a pickle inside the bidder submission folder.
bid_pickle_path = './bidder.submission.code/python/bid.10.pkl'
df.to_pickle(bid_pickle_path)

In [15]:
# Preprocess every bid-request file (read with the `bidRequest_cols` layout)
# and store each result as a NumPy array under np_data/. Processing stops at
# the first file that fails.
for file in dataset_files:
    # Strip the directory part by hand so both '/' and '\' separators are
    # handled (the recorded run shows paths like 'dataset\bid.06.txt').
    file_name = file.split('/')[-1].split('\\')[-1]

    # Filter on the file name only: a directory whose name happens to
    # contain "bid" must not make every file look like a bid-request file.
    if 'bid' not in file_name:
        continue

    data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)

    try:
        data = process_dataset(data, bidRequest=True)

        # 'bid.06.txt' -> 'bid_06'
        name = file_name.split('.')
        name = name[0] + '_' + name[1]

        # Alternative output formats, currently disabled.
        # data.to_csv(f'data/{name}.csv', index=False)
        # data.to_pickle(f'data/{name}.pkl')

        # np.save does not create missing directories.
        os.makedirs('np_data', exist_ok=True)
        np_data = data.to_numpy()
        np.save(f'np_data/{name}.npy', np_data)

        print(f'Processed file: {file}')

    except Exception as e:
        print(f'Error processing file: {file}')
        print(e)
        break

  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.06.txt


  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.07.txt


  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.08.txt


  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.09.txt


  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.10.txt


  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.11.txt


  data = pd.read_csv(file, sep='\t', header=None, names=bidRequest_cols)


Converted Region & City.....
Mapped other columns.....
Coverted timestamp.....
Processed file: ../Adobe Devcraft Dataset/dataset\bid.12.txt


In [2]:
# Reload a previously processed click log for inspection.
# NOTE(review): none of the active code in this notebook writes
# 'data/clk_06.pkl' - the matching `to_pickle` call in the processing loop is
# commented out - so this file must come from an earlier run.
# Unpickling can execute arbitrary code; only load pickle files you created.
df = pd.read_pickle('data/clk_06.pkl')

In [3]:
# Preview the first rows of the reloaded, processed click log.
df.head()

Unnamed: 0,BidID,Timestamp,LogType,VisitorID,User-Agent,IP,Region,City,AdExchange,Domain,...,Long-term interest/health,Long-term interest/home and lifestyle,Long-term interest/luxury,Long-term interest/motherhood&parenting,Long-term interest/news,Long-term interest/online literature,Long-term interest/real estate,Long-term interest/social,Long-term interest/sports,Long-term interest/travel&outdoors
0,c7654fe86bc7f66d75242d5e12a6aad4,2013-06-06 00:01:10.312000,Click,Vh1OPiSeP2kfQGj,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1...,49.223.203.*,liaoning,dalian,Tanx (Alibaba),trqRTvpogNlyDok4JKTI,...,1,0,0,0,1,0,0,0,0,0
1,82d22846bc757857f11064c60e8a0f3b,2013-06-06 00:01:10.864000,Click,Vh111kuiLlaWqG8,Mozilla/5.0 (Linux; U; Android 4.0.4; zh-cn; 7...,110.17.170.*,neimenggu,baotou,Tanx (Alibaba),2h5wl5SvdNmRaqKbu-,...,0,0,0,0,1,0,0,0,0,0
2,6ad8bc340c03cbcf6a5bd1f9d20ce07d,2013-06-06 00:01:11.403000,Click,Vhkr1vTlOt56qfn,K-TouchC986t_TD/1.0 Android 4.0.3 Release/10.0...,183.7.247.*,guangdong,shantou,Tanx (Alibaba),5Sc-GxfIL5scFsf,...,0,0,0,0,0,0,0,0,0,0
3,22bbc00c2db42aa8a60497028845e85f,2013-06-06 00:01:19.661000,Click,Vh1DCnKPDvuLqOE,Mozilla/5.0 (Linux; U; Android 3.2; zh-cn; Ide...,123.144.19.*,chongqing,,Tanx (Alibaba),trqRTu1uGTdUgNK4wJB,...,0,0,0,0,0,1,0,0,0,0
4,47e7c5523cbda762a7c00e0c47c41993,2013-06-06 00:01:20.463000,Click,Vhk7ZApCP4BWjgk,MQQBrowser/43 Mozilla/5.0 (iPhone 5ATT; CPU iP...,14.212.204.*,guangdong,foshan,Tanx (Alibaba),erdbexb0gI5RgsxfU11_qxMzmw,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# List all column names, including the per-tag indicator columns.
df.columns

Index(['BidID', 'Timestamp', 'LogType', 'VisitorID', 'User-Agent', 'IP',
       'Region', 'City', 'AdExchange', 'Domain', 'URL', 'AnonymousURLID',
       'AdslotID', 'Adslotwidth', 'Adslotheight', 'Adslotvisibility',
       'Adslotformat', 'Adslotfloorprice', 'CreativeID', 'BiddingPrice',
       'PayingPrice', 'KeyPageURL', 'AdvertiserID',
       'Demographic/gender/famale', 'Demographic/gender/male',
       'In-market/3c product', 'In-market/Beauty& Personal Care',
       'In-market/appliances', 'In-market/automobile', 'In-market/book',
       'In-market/clothing、shoes&bags', 'In-market/education',
       'In-market/electronic game', 'In-market/finance',
       'In-market/food&drink', 'In-market/health care products',
       'In-market/household&home improvement', 'In-market/luxury',
       'In-market/medicine', 'In-market/real estate', 'In-market/service',
       'In-market/sports item', 'In-market/travel', 'Long-term interest/3c',
       'Long-term interest/IT', 'Long-term interest/