In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np

In [2]:
train = pd.read_csv('../../data_ori/train.csv')
test = pd.read_csv('../../data_ori/test.csv')

In [3]:
#时间离散化
train['clickTime_day'] = train['clickTime'].map(lambda x: x / 10000)
train['clickTime_hour'] = train['clickTime'].map(lambda x: x / 100 % 100)
train['clickTime_minute'] = train['clickTime'].map(lambda x: x % 100)

In [4]:
train.groupby(['clickTime_day'])['label'].value_counts()

clickTime_day  label
17.0000        0        118
17.0001        0        179
               1          5
17.0002        0        164
               1          5
17.0003        0        182
               1          4
17.0004        0        165
               1          5
17.0005        0        166
               1          6
17.0006        0        145
               1         10
17.0007        0        157
               1          6
17.0008        0        153
               1          2
17.0009        0        142
               1          4
17.0010        0        164
               1          4
17.0011        0        134
               1          4
17.0012        0        136
               1          4
17.0013        0        154
               1          8
17.0014        0        141
               1          2
17.0015        0        149
                       ... 
30.2345        0        213
               1          4
30.2346        0        232
               1          5

In [5]:
train.head()

Unnamed: 0,label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator,clickTime_day,clickTime_hour,clickTime_minute
0,0,170000,,3089,2798058,293,1,1,17.0,0.0,0
1,0,170000,,1259,463234,6161,1,2,17.0,0.0,0
2,0,170000,,4465,1857485,7434,4,1,17.0,0.0,0
3,0,170000,,1004,2038823,977,1,1,17.0,0.0,0
4,0,170000,,1887,2015141,3688,1,1,17.0,0.0,0


In [6]:
test['clickTime_day'] = test['clickTime'].map(lambda x: x / 10000)
test['clickTime_hour'] = test['clickTime'].map(lambda x: x / 100 % 100)
test['clickTime_minute'] = test['clickTime'].map(lambda x: x % 100)

In [7]:
#position直接加上去
position = pd.read_csv('../../data_ori/position.csv')
train = pd.merge(train, position, on='positionID', how='left')
test = pd.merge(test, position, on='positionID', how='left')

In [8]:
feature_name = [a for a in train.columns if a not in ['label', 'conversionTime']]
categorical_feature = ['creativeID', 'userID', 'positionID', 'connectionType', 'telecomsOperator']

In [None]:
train_label = train['label']
train = train[feature_name]
#test_label = test['label']
test = test[feature_name]

In [None]:
#添加appID特征（tfidf）
user_installedapps = pd.read_csv('../../data_ori/user_installedapps.csv')
user_installedapps_count = user_installedapps.groupby('userID').agg(len).reset_index()  #计数特征
user_installedapps_count.columns = ['userID', 'user_appID_count']
user_installedapps = user_installedapps.groupby('userID').agg(
    lambda x: ' '.join(['app' + str(s) for s in x.values])).reset_index()

user_id_all = pd.concat([train.userID, test.userID], axis=0)
user_id_all = pd.DataFrame(user_id_all, columns=['userID'])

#不同用户的先提取出来
user_installedapps = pd.merge(user_id_all.drop_duplicates(), user_installedapps, on='userID', how='left')
user_installedapps = user_installedapps.fillna('Missing')

tfv = TfidfVectorizer()
tfv.fit(user_installedapps.appID)

#按照顺序转化为tfidf特征
user_installedapps = pd.merge(user_id_all, user_installedapps, on='userID', how='left')
user_installedapps = user_installedapps.fillna('Missing')
user_installedapps_tfv = tfv.transform(user_installedapps.appID)

In [None]:
user = pd.read_csv('../../data_ori/user.csv')
user['hometown_city'] = user['hometown'] % 100
user['hometown_province'] = (user['hometown'] / 100).astype('int')
user['residence_city'] = user['residence'] % 100
user['residence_province'] = (user['residence'] / 100).astype('int')
user.head()

In [None]:
ad = pd.read_csv('../../data_ori/ad.csv')
ad.head()

In [None]:
#合并特征
train = pd.merge(train, user_installedapps_count, on='userID', how='left')
train = pd.merge(train, user, on='userID', how='left')
train = pd.merge(train, ad, on='creativeID', how='left')

test = pd.merge(test, user_installedapps_count, on='userID', how='left')
test = pd.merge(test, user, on='userID', how='left')
test = pd.merge(test, ad, on='creativeID', how='left')

In [None]:
train.head()

In [None]:
train.dtypes

In [None]:
del train['user_appID_count'], test['user_appID_count']

In [None]:
oneEnc = OneHotEncoder()
data_one = pd.concat([train, test])
data_one = oneEnc.fit_transform(data_one)
train_one = data_one[:train.shape[0]]
test_one = data_one[train.shape[0]:]

In [None]:
train = hstack([train_one, user_installedapps_tfv[:train.shape[0]]])
test = hstack([test_one, user_installedapps_tfv[train.shape[0]:]])