In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer

In [8]:
# read raw data: load the three CSV tables with pandas' C parser
raw_sample, ad_feature, user_profile = (
    pd.read_csv(csv_path, engine='c')
    for csv_path in ('./raw_sample.csv', './ad_feature.csv', './user_profile.csv')
)

In [9]:
# join samples: attach ad features (keyed by adgroup_id) and user profiles
# (keyed by userid) to every raw sample row, then normalise the id column
# names and discard the 'nonclk' column.
sample = (
    raw_sample
    .join(ad_feature.set_index('adgroup_id'), on='adgroup_id')
    .join(user_profile.set_index('userid'), on='user')
    .rename(columns={"adgroup_id": "item_id", "user": "user_id"})
    .drop('nonclk', axis=1)
)

In [10]:
# define unique function
def _unique(sample, fname):
    tmp_df = pd.DataFrame()
    tmp_df[fname] = sample[fname].unique()
    num = len(tmp_df)
    tmp_df['tmp_feature'] = range(num)
    sample = sample.join(tmp_df.set_index(fname), on=fname)
    sample.drop(fname, axis=1, inplace=True)
    sample = sample.rename(columns = {"tmp_feature": fname})
    return num, sample

# preprocess frequency: attach to every row the number of rows sharing its item_id
item2count = sample.groupby(['item_id']).size().reset_index(name='count').sort_values(by='count')
sample = sample.join(item2count.set_index('item_id'), on='item_id')

# preprocess and generate description
# `description` collects one (feature name, cardinality, kind) tuple per column:
# kind is 'spr' for integer-encoded features, 'ctn' for min-max scaled ones
# (cardinality -1) and 'label' for the target.
# NOTE(review): 'new_user_class_level ' carries a trailing space — presumably it
# mirrors the CSV header exactly; verify before "fixing" it.
spr_features = ['user_id', 'item_id', 'pid', 'cate_id', 'campaign_id', 'customer', 'brand', 'cms_segid', \
       'cms_group_id', 'final_gender_code', 'age_level', 'pvalue_level', \
       'shopping_level', 'occupation', 'new_user_class_level ']
ctn_features = ['time_stamp', 'price', 'count']
label = ['clk']
description = []
for spr_f in spr_features:
    print(spr_f)
    # Replace raw values by integer ids 0..num-1.
    num, sample = _unique(sample, spr_f)
    # Reserve one extra id (num - 1 after the increment) for entries that are
    # still missing after the re-encoding.
    num += 1
    sample[spr_f] = sample[spr_f].fillna(num - 1).astype('int')
    description.append((spr_f, num, 'spr'))
for ctn_f in ctn_features:
    filled = sample[ctn_f].fillna(0.0)
    min_v = filled.min()
    max_v = filled.max()
    span = max_v - min_v
    if span == 0:
        # Constant column: min-max scaling would be 0/0, so map everything to 0.0.
        sample[ctn_f] = 0.0
    else:
        # Vectorised min-max scaling to [0, 1]; same arithmetic as a per-row
        # lambda but without a Python-level call for every row.
        sample[ctn_f] = (filled - min_v) / span
    description.append((ctn_f, -1, 'ctn'))
for label_name in label:
    description.append((label_name, 2, 'label'))
description

user_id
item_id
pid
cate_id
campaign_id
customer
brand
cms_segid
cms_group_id
final_gender_code
age_level
pvalue_level
shopping_level
occupation
new_user_class_level 


[('user_id', 1141730, 'spr'),
 ('item_id', 846812, 'spr'),
 ('pid', 3, 'spr'),
 ('cate_id', 6770, 'spr'),
 ('campaign_id', 423437, 'spr'),
 ('customer', 255876, 'spr'),
 ('brand', 99816, 'spr'),
 ('cms_segid', 99, 'spr'),
 ('cms_group_id', 15, 'spr'),
 ('final_gender_code', 4, 'spr'),
 ('age_level', 9, 'spr'),
 ('pvalue_level', 5, 'spr'),
 ('shopping_level', 5, 'spr'),
 ('occupation', 4, 'spr'),
 ('new_user_class_level ', 6, 'spr'),
 ('time_stamp', -1, 'ctn'),
 ('price', -1, 'ctn'),
 ('count', -1, 'ctn'),
 ('clk', 2, 'label')]

In [11]:
# (item_id, user_id) pairs of the rows whose encoded pid is 0 and 1 respectively
pair_columns = ['item_id', 'user_id']
x_pid0 = sample.loc[sample['pid'] == 0, pair_columns]
x_pid1 = sample.loc[sample['pid'] == 1, pair_columns]

In [12]:
# Number of distinct user ids that occur under both pid 0 and pid 1
users_in_both = set(x_pid0['user_id']) & set(x_pid1['user_id'])
print(len(users_in_both))

13767


In [14]:
# split dataset
# Items with more than N rows are "hot": all their rows go to train_base.
# Items with between 3*K and N rows are "cold": their rows, ordered by
# time_stamp, are cut into three consecutive chunks of K rows
# (train_warm_a / _b / _c) and everything after the first 3*K rows goes to test.
N, K = 2000, 500
item2count = sample.groupby(['item_id']).size().reset_index(name='count').sort_values(by='count')
item_ids = np.asarray(item2count['item_id'])
counts = np.asarray(item2count['count'])

hot_item_ids = item_ids[counts > N]
cold_item_ids = item_ids[np.logical_and(counts <= N, counts >= 3 * K)]
item_group = sample.groupby('item_id')

# Collect the per-item frames in lists and concatenate once per split.
# DataFrame.append copied the whole accumulated frame on every call (quadratic
# in the number of items) and was removed in pandas 2.0.
hot_parts = [
    item_group.get_group(item_id).sort_values(by='time_stamp')
    for item_id in hot_item_ids
]
warm_a_parts, warm_b_parts, warm_c_parts, test_parts = [], [], [], []
for item_id in cold_item_ids:
    df_cold = item_group.get_group(item_id).sort_values(by='time_stamp')
    warm_a_parts.append(df_cold[: K])
    warm_b_parts.append(df_cold[K: 2*K])
    warm_c_parts.append(df_cold[2*K: 3*K])
    test_parts.append(df_cold[3*K:])

# pd.concat rejects an empty list, so fall back to an empty frame that keeps
# the columns of `sample` when a split received no items.
empty_split = sample.iloc[0:0]
train_base = pd.concat(hot_parts, ignore_index=True) if hot_parts else empty_split.copy()
train_warm_a = pd.concat(warm_a_parts, ignore_index=True) if warm_a_parts else empty_split.copy()
train_warm_b = pd.concat(warm_b_parts, ignore_index=True) if warm_b_parts else empty_split.copy()
train_warm_c = pd.concat(warm_c_parts, ignore_index=True) if warm_c_parts else empty_split.copy()
test = pd.concat(test_parts, ignore_index=True) if test_parts else empty_split.copy()

save_dic = {
    'train_base': train_base.sort_values('time_stamp'),
    'train_warm_a': train_warm_a.sort_values('time_stamp'),
    'train_warm_b': train_warm_b.sort_values('time_stamp'),
    'train_warm_c': train_warm_c.sort_values('time_stamp'),
    'test': test.sort_values('time_stamp'),
    'description': description
}
for name, saved_obj in save_dic.items():
    print("{} size: {}".format(name, len(saved_obj)))
# Write-only access is sufficient here; the target directory must already exist.
with open('./cold_start/emb_warm_split_preprocess_taobao-ad.pkl', 'wb') as f:
    pickle.dump(save_dic, f)

train_base size: 3592047
train_warm_a size: 270500
train_warm_b size: 270500
train_warm_c size: 270500
test size: 109712
description size: 19


In [15]:
import pickle
import numpy as np
# Reload the split written by the "split dataset" cell. The file is only read,
# so open it read-only ('rb+' would also demand write permission).
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('./cold_start/emb_warm_split_preprocess_taobao-ad.pkl', 'rb') as f:
    data = pickle.load(f)

In [16]:
# Build a smaller copy of the dataset: keep a random subset of at most
# SMALL_BASE_ROWS rows of train_base and save the result to its own pickle.
SMALL_BASE_ROWS = 800 * 2048
SUBSAMPLE_SEED = 0  # fixed seed so Restart & Run All selects the same rows every time
train_base = data['train_base']
# A dedicated RandomState keeps the draw reproducible without touching numpy's global RNG.
shuffle_idx = np.random.RandomState(SUBSAMPLE_SEED).permutation(len(train_base))[:SMALL_BASE_ROWS]
res = train_base.iloc[shuffle_idx]
# `data` itself is updated, so from here on data['train_base'] is the subset.
data['train_base'] = res
with open('./small_emb_warm_split_preprocess_taobao-ad.pkl', 'wb') as f:
    pickle.dump(data, f)

In [17]:
# Display the down-sampled base training frame built in the previous cell.
res

Unnamed: 0,time_stamp,clk,price,count,user_id,item_id,pid,cate_id,campaign_id,customer,brand,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level
381354,0.964761,0,1.149900e-06,0.031877,130980,496304,0,245,127238,75272,1,17,8,2,3,1,0,0,1
2685511,0.481650,0,8.699000e-07,0.112382,300602,777757,1,207,145741,99989,1,25,5,2,4,1,0,0,1
2426105,0.047275,0,1.419900e-06,0.088780,54498,248167,0,245,124130,97913,47615,0,8,2,3,1,0,0,0
1659223,0.633922,0,7.899000e-07,0.054154,101639,508094,1,75,291439,121158,55523,0,5,2,4,0,0,0,1
336622,0.097956,0,9.899000e-07,0.031581,168064,793772,0,75,157160,109914,1,0,8,2,3,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345508,0.699780,0,7.498000e-07,0.047245,113692,246973,0,42,154175,99989,1,4,8,2,3,1,0,0,2
419563,0.803185,0,9.990000e-08,0.032117,62553,720616,0,328,84325,69274,34550,39,9,2,0,3,0,2,2
2987667,0.496871,1,2.599000e-07,0.150561,107667,745015,0,3570,76407,24022,1,59,10,2,6,1,0,0,1
1299,0.695152,0,1.369900e-06,0.028198,235300,490017,0,207,135601,99989,1,0,8,2,3,0,0,0,0


In [19]:
# Get the train set for the Meta-Embedding method
# Every item's rows in train_base are cut into four consecutive, equally sized
# chunks (a, b, c, d); rows beyond 4 * (group size // 4) are dropped. The four
# resulting frames have the same length and are shuffled with one shared
# permutation, so row i of each frame belongs to the same item.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
META_SHUFFLE_SEED = 0  # fixed seed so the saved shuffle is reproducible
with open('./cold_start/emb_warm_split_preprocess_taobao-ad.pkl', 'rb') as f:
    data = pickle.load(f)
df_base = data['train_base']
item2group = df_base.groupby('item_id')

# Collect the chunks per quarter and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
quarter_parts = [[], [], [], []]
for item_id, df_group in item2group:
    e = df_group.shape[0] // 4
    for quarter, parts in enumerate(quarter_parts):
        parts.append(df_group.iloc[quarter * e: (quarter + 1) * e])

# pd.concat rejects an empty list; fall back to an empty frame with df_base's columns.
train_a, train_b, train_c, train_d = (
    pd.concat(parts, ignore_index=True) if parts else df_base.iloc[0:0].copy()
    for parts in quarter_parts
)

shuffle_idx = np.random.RandomState(META_SHUFFLE_SEED).permutation(train_a.shape[0])
train_a = train_a.iloc[shuffle_idx]
train_b = train_b.iloc[shuffle_idx]
train_c = train_c.iloc[shuffle_idx]
train_d = train_d.iloc[shuffle_idx]
data["metaE_a"] = train_a
data["metaE_b"] = train_b
data["metaE_c"] = train_c
data["metaE_d"] = train_d
with open('./taobaoAD_data.pkl', 'wb') as f:
    pickle.dump(data, f)