In [2]:
import json
import datetime
import pandas as pd
from sklearn import preprocessing

# Locations of the raw training and test CSV files.
tr_csv_path = 'data/train.csv'
ts_csv_path = 'data/test.csv'

# Columns that pandas must parse as text ('U') instead of inferring a numeric dtype:
# 'hour', 'device_type', 'C1', 'C15' and 'C16' are sliced or concatenated as strings
# later in this notebook, and 'id' is used as the index.
data_type = dict.fromkeys(['id', 'hour', 'device_type', 'C1', 'C15', 'C16'], 'U')

In [3]:
# Load the training set with 'id' as the index; the trailing expression
# makes the notebook display (rows, columns).
train = pd.read_csv(
    tr_csv_path,
    index_col='id',
    dtype=data_type,
)
train.shape

(40428967, 23)

In [4]:
# Load the test set with 'id' as the index.
test = pd.read_csv(
    ts_csv_path,
    index_col='id',
    dtype=data_type,
)
# Add a placeholder 'click' column of zeros as the first column.
test.insert(loc=0, column='click', value=0)
test.shape

(4577464, 23)

In [5]:
# Stack the two frames into one: test rows come first, then train rows.
# The final export cell relies on this order to split them again by position.
tr_ts = pd.concat(objs=[test, train], copy=False)

## 一、 统计类别型

#### 1. site_id：出现次数不超过 20 的取值归为低频类

In [None]:
# Bucket every site_id by how often it occurs in the combined test+train frame:
# bucket 0 = more than 20 occurrences, bucket 1 = 20 or fewer.
# (`json` is already imported in the first cell of the notebook.)
site_id_count = tr_ts.site_id.value_counts()
site_id_category = {
    0: site_id_count.loc[site_id_count > 20].index.values,
    1: site_id_count.loc[site_id_count <= 20].index.values,
}

# Flatten to {site_id as str: bucket} so the mapping can be stored as JSON.
site_id_C_type_dict = {
    str(item): key
    for key, values in site_id_category.items()
    for item in values
}

# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("data/site_id_C_type_dict.json", "w") as json_file:
    json.dump(site_id_C_type_dict, json_file)

#### 2. site_domain：出现次数不超过 20 的取值归为低频类

In [None]:
# Bucket every site_domain by how often it occurs in the combined test+train frame:
# bucket 0 = more than 20 occurrences, bucket 1 = 20 or fewer.
# (`json` is already imported in the first cell of the notebook.)
site_domain_count = tr_ts.site_domain.value_counts()
site_domain_category = {
    0: site_domain_count.loc[site_domain_count > 20].index.values,
    1: site_domain_count.loc[site_domain_count <= 20].index.values,
}

# Flatten to {site_domain as str: bucket} so the mapping can be stored as JSON.
site_domain_C_type_dict = {
    str(item): key
    for key, values in site_domain_category.items()
    for item in values
}

# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("data/site_domain_C_type_dict.json", "w") as json_file:
    json.dump(site_domain_C_type_dict, json_file)

#### 3. app_id：出现次数不超过 20 的取值归为低频类

In [6]:
# Bucket every app_id by how often it occurs in the combined test+train frame:
# bucket 0 = more than 20 occurrences, bucket 1 = 20 or fewer.
# (`json` is already imported in the first cell of the notebook.)
app_id_count = tr_ts.app_id.value_counts()
app_id_category = {
    0: app_id_count.loc[app_id_count > 20].index.values,
    1: app_id_count.loc[app_id_count <= 20].index.values,
}

# Flatten to {app_id as str: bucket} so the mapping can be stored as JSON.
app_id_C_type_dict = {
    str(item): key
    for key, values in app_id_category.items()
    for item in values
}

# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("data/app_id_C_type_dict.json", "w") as json_file:
    json.dump(app_id_C_type_dict, json_file)

#### 4. device_model：出现次数不超过 200 的取值归为低频类

In [None]:
# Bucket every device_model by how often it occurs in the combined test+train frame:
# bucket 0 = more than 200 occurrences, bucket 1 = 200 or fewer.
# (`json` is already imported in the first cell of the notebook.)
device_model_count = tr_ts.device_model.value_counts()
device_model_category = {
    0: device_model_count.loc[device_model_count > 200].index.values,
    1: device_model_count.loc[device_model_count <= 200].index.values,
}

# Flatten to {device_model as str: bucket} so the mapping can be stored as JSON.
device_model_C_type_dict = {
    str(item): key
    for key, values in device_model_category.items()
    for item in values
}

# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("data/device_model_C_type_dict.json", "w") as json_file:
    json.dump(device_model_C_type_dict, json_file)

## 二、 处理类别型数据

In [7]:
# Keep only the last two characters of the raw 'hour' string
# (presumably the hour-of-day part of the timestamp -- confirm against the data spec).
tr_ts['hour'] = tr_ts['hour'].map(lambda stamp: stamp[-2:])

# Binary flag: 0 when device_id equals 'a99f214a', 1 for every other value.
tr_ts['is_device'] = tr_ts['device_id'].ne('a99f214a').astype('int64')

In [8]:
# Reload the {value: frequency bucket} lookups that the counting cells wrote to disk.
# Each file is opened in a context manager so its handle is closed right after parsing.
with open("data/app_id_C_type_dict.json", "r") as json_file:
    app_id_C_type_dict = json.load(json_file)

with open("data/site_id_C_type_dict.json", "r") as json_file:
    site_id_C_type_dict = json.load(json_file)

with open("data/site_domain_C_type_dict.json", "r") as json_file:
    site_domain_C_type_dict = json.load(json_file)

with open("data/device_model_C_type_dict.json", "r") as json_file:
    device_model_C_type_dict = json.load(json_file)

In [9]:
# Collapse rare identifiers: a value is kept only when its lookup bucket is 0;
# anything in bucket 1 or absent from the lookup becomes a single "other_*" label.
for raw_col, bucket_lookup, placeholder in (
    ("app_id", app_id_C_type_dict, "other_app_id"),
    ("site_id", site_id_C_type_dict, "other_site_id"),
    ("site_domain", site_domain_C_type_dict, "other_site_domain"),
    ("device_model", device_model_C_type_dict, "other_device_model"),
):
    # Values missing from the lookup map to NaN, which compares unequal to 0.
    is_frequent = tr_ts[raw_col].map(bucket_lookup).eq(0)
    tr_ts['C_' + raw_col] = tr_ts[raw_col].where(is_frequent, placeholder)

In [10]:
# Build crossed features by joining two string columns with '&':
#   C_pix           = C15 & C16
#   C_device_type_1 = device_type & C1
for crossed_col, left_col, right_col in (
    ("C_pix", "C15", "C16"),
    ("C_device_type_1", "device_type", "C1"),
):
    tr_ts[crossed_col] = tr_ts[left_col] + '&' + tr_ts[right_col]

In [11]:
# Remove the listed raw columns from the frame in place.
raw_columns_to_drop = [
    'device_id', "device_type", 'app_id', 'site_id', 'site_domain',
    'device_model', "C1", "C17", 'C15', 'C16',
]
tr_ts.drop(columns=raw_columns_to_drop, inplace=True)

In [13]:
# Integer-encode every categorical column in place.
# One LabelEncoder instance is re-fitted for each column, so after the loop `lenc`
# only holds the classes of the last column; per-column mappings are not kept.
lenc = preprocessing.LabelEncoder()
C_fields = [ 'hour', 'banner_pos', 'site_category', 'app_domain', 'app_category',
            'device_conn_type', 'C14', 'C18', 'C19', 'C20','C21', 'is_device', 'C_app_id', 'C_site_id',
            'C_site_domain', 'C_device_model', 'C_pix', 'C_device_type_1']
# Iterate the names directly: the positional index from enumerate() was never used.
for column in C_fields:
    print("convert " + column + "...")
    tr_ts[column] = lenc.fit_transform(tr_ts[column])

convert hour...
convert banner_pos...
convert site_category...
convert app_domain...
convert app_category...
convert device_conn_type...
convert C14...
convert C18...
convert C19...
convert C20...
convert C21...
convert is_device...
convert C_app_id...
convert C_site_id...
convert C_site_domain...
convert C_device_model...
convert C_pix...
convert C_device_type_1...


In [14]:
# The combined frame was built as [test, train], so the first len(test) rows are the
# test set and the remainder is the training set; write each part to its own CSV.
n_test_rows = test.shape[0]
tr_ts.iloc[:n_test_rows].to_csv('data/test_FE.csv')
tr_ts.iloc[n_test_rows:].to_csv('data/train_FE.csv')

In [12]:
# Display the column labels currently present in the combined frame.
tr_ts.columns

Index(['click', 'hour', 'banner_pos', 'site_category', 'app_domain',
       'app_category', 'device_ip', 'device_conn_type', 'C14', 'C18', 'C19',
       'C20', 'C21', 'is_device', 'C_app_id', 'C_site_id', 'C_site_domain',
       'C_device_model', 'C_pix', 'C_device_type_1'],
      dtype='object')