# Define functions to deal with different features

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from pandas import DataFrame

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
ad = pd.read_csv('ad.csv')
app = pd.read_csv('app_categories.csv')
user = pd.read_csv('user.csv')

## train test feature

In [2]:
train.head()

Unnamed: 0,label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator
0,0,170000,,3089,2798058,293,1,1
1,0,170000,,1259,463234,6161,1,2
2,0,170000,,4465,1857485,7434,4,1
3,0,170000,,1004,2038823,977,1,1
4,0,170000,,1887,2015141,3688,1,1


In [3]:
test.head()

Unnamed: 0,instanceID,label,clickTime,creativeID,userID,positionID,connectionType,telecomsOperator
0,1,-1,310000,3745,1164848,3451,1,3
1,2,-1,310000,2284,2127247,1613,1,3
2,3,-1,310000,1456,2769125,5510,2,1
3,4,-1,310000,4565,9762,4113,2,3
4,5,-1,310000,49,2513636,3615,1,3


In [4]:
def get_time_day(t): #click time is DDHHMM
    t = str(t)
    t=int(t[0:2])
    return t

def get_time_hour(t):
    t = str(t)
    t=int(t[2:4])
    if t<6:
        return 0
    elif t<12:
        return 1
    elif t<18:
        return 2
    else:
        return 3

In [5]:
train['clickTime_day'] = train['clickTime'].apply(get_time_day)
train['clickTime_hour']= train['clickTime'].apply(get_time_hour)

test['clickTime_day'] = test['clickTime'].apply(get_time_day)
test['clickTime_hour']= test['clickTime'].apply(get_time_hour)

## ad feature

In [6]:
ad.head()

Unnamed: 0,creativeID,adID,camgaignID,advertiserID,appID,appPlatform
0,4079,2318,147,80,14,2
1,4565,3593,632,3,465,1
2,3170,1593,205,54,389,1
3,6566,2390,205,54,389,1
4,5187,411,564,3,465,1


## app feature

In [7]:
app.head()

Unnamed: 0,appID,appCategory
0,14,2
1,25,203
2,68,104
3,75,402
4,83,203


App开发者设定的App类目标签，类目标签有两层，使用3位数字编码，百位数表示一级类目，十位个位数表示二级类目，如“210”表示一级类目编号为2，二级类目编号为10，类目未知或者无法获取时，标记为0。

In [8]:
app['appCategory'].value_counts()[:10]

0      67757
106    34314
401    10779
209     9875
104     9355
301     7467
203     7056
402     6880
105     6797
503     6708
Name: appCategory, dtype: int64

In [9]:
# method 1
def categories_first_class(cate):
    cate = str(cate)
    if len(cate)==1:
        if int(cate)==0:
            return 0
    else:
        return int(cate[0])

# method 2
def categories_second_class(cate):
    cate = str(cate)
    if len(cate)<3:
        return 0
    else:
        return int(cate[1:])

In [10]:
app["app_categories_first_class"] = app['appCategory'].apply(categories_first_class)
app["app_categories_second_class"] = app['appCategory'].apply(categories_second_class)

## user feature

In [11]:
user.head()

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence
0,1,42,1,0,2,0,512,503
1,2,18,1,5,1,0,1403,1403
2,3,0,2,4,0,0,0,0
3,4,21,2,5,3,0,607,607
4,5,22,2,0,0,0,0,1301


用户出生地，取值具体到市级城市，使用二级编码，千位百位数表示省份，十位个位数表示省内城市，如1806表示省份编号为18，城市编号是省内的6号，编号0表示未知。

In [12]:
user[user['age']!=0].describe() #age 0 means unknown

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence
count,2510847.0,2510847.0,2510847.0,2510847.0,2510847.0,2510847.0,2510847.0,2510847.0
mean,1393745.0,22.77593,1.410284,1.97558,1.017286,0.3033893,737.2152,979.7279
std,810565.2,9.675687,0.5321244,1.57753,0.9508679,0.8098684,774.6096,785.5014
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,690360.5,15.0,1.0,1.0,0.0,0.0,0.0,305.0
50%,1388503.0,21.0,1.0,2.0,1.0,0.0,503.0,803.0
75%,2095096.0,28.0,2.0,3.0,2.0,0.0,1301.0,1513.0
max,2805118.0,80.0,2.0,7.0,3.0,6.0,3401.0,3401.0


In [13]:
# deal with age
def age_process(age):
    age = int(age)
    if age==0:
        return 0
    elif age<15:
        return 1
    elif age<21:
        return 2
    elif age<28:
        return 3
    else:
        return 4

# deal with provice
def process_province(hometown):
    hometown = str(hometown)
    province = int(hometown[0:2])
    return province

#城市处理
def process_city(hometown):
    hometown = str(hometown)
    if len(hometown)>1:
        province = int(hometown[2:])
    else:
        province = 0
    return province

In [14]:
user['age_process'] = user['age'].apply(age_process)
user["hometown_province"] = user['hometown'].apply(process_province)
user["hometown_city"] = user['hometown'].apply(process_city)
user["residence_province"] = user['residence'].apply(process_province)
user["residence_city"] = user['residence'].apply(process_city)

## merge data

In [15]:
# merge data
train_user = pd.merge(train,user,on='userID')
train_user_ad = pd.merge(train_user,ad,on='creativeID')
train_user_ad_app = pd.merge(train_user_ad,app,on='appID')
train_user_ad_app.head()

Unnamed: 0,label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour,...,residence_province,residence_city,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_categories_first_class,app_categories_second_class
0,0,170000,,3089,2798058,293,1,1,17,0,...,13,1,1321,83,10,434,1,108,1.0,8
1,0,170001,,3089,195578,3659,0,2,17,0,...,13,1,1321,83,10,434,1,108,1.0,8
2,0,170014,,3089,1462213,3659,0,3,17,0,...,13,1,1321,83,10,434,1,108,1.0,8
3,0,170030,,3089,1985880,5581,1,1,17,0,...,0,0,1321,83,10,434,1,108,1.0,8
4,0,170047,,3089,2152167,5581,1,1,17,0,...,13,3,1321,83,10,434,1,108,1.0,8


In [16]:
test_user = pd.merge(test,user,on='userID')
test_user_ad = pd.merge(test_user,ad,on='creativeID')
test_user_ad_app = pd.merge(test_user_ad,app,on='appID')
test_user_ad_app.head()

Unnamed: 0,instanceID,label,clickTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour,...,residence_province,residence_city,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_categories_first_class,app_categories_second_class
0,1,-1,310000,3745,1164848,3451,1,3,31,0,...,60,5,1166,430,80,14,2,2,,0
1,9,-1,310000,3745,1113275,3347,1,2,31,0,...,50,4,1166,430,80,14,2,2,,0
2,43,-1,310000,3745,1215329,3347,1,1,31,0,...,50,4,1166,430,80,14,2,2,,0
3,83,-1,310000,3745,2077956,3347,1,2,31,0,...,16,10,1166,430,80,14,2,2,,0
4,108,-1,310000,3745,2343346,7422,1,1,31,0,...,13,3,1166,430,80,14,2,2,,0


In [17]:
feature_columns = ['creativeID','userID','positionID',
 'connectionType','telecomsOperator','clickTime_day','clickTime_hour','age', 'gender' ,'education',
 'marriageStatus' ,'haveBaby' , 'residence' ,'age_process',
 'hometown_province', 'hometown_city','residence_province', 'residence_city',
 'adID', 'camgaignID', 'advertiserID', 'appID' ,'appPlatform' ,
 'app_categories_first_class' ,'app_categories_second_class']

In [18]:
train_x = train_user_ad_app[feature_columns]
test_x = test_user_ad_app[feature_columns]

In [20]:
train_y = DataFrame(train_user_ad_app['label'])

In [22]:
# save as csv
train_x.to_csv('train_x.csv',index=False)
train_y.to_csv('train_y.csv',index=False)
test_x.to_csv('test_x.csv',index=False)

In [23]:
train_x.head()

Unnamed: 0,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour,age,gender,education,...,hometown_city,residence_province,residence_city,adID,camgaignID,advertiserID,appID,appPlatform,app_categories_first_class,app_categories_second_class
0,3089,2798058,293,1,1,17,0,25,2,1,...,0,13,1,1321,83,10,434,1,1.0,8
1,3089,195578,3659,0,2,17,0,27,2,2,...,1,13,1,1321,83,10,434,1,1.0,8
2,3089,1462213,3659,0,3,17,0,35,2,0,...,0,13,1,1321,83,10,434,1,1.0,8
3,3089,1985880,5581,1,1,17,0,26,1,1,...,0,0,0,1321,83,10,434,1,1.0,8
4,3089,2152167,5581,1,1,17,0,23,1,0,...,4,13,3,1321,83,10,434,1,1.0,8
