In [1]:
import time
import gc
import numpy as np
import pandas as pd
import keras
from keras.callbacks import EarlyStopping
from keras.layers import Input, Embedding, Dense, Dropout, concatenate, Reshape,Flatten
from keras.layers import Lambda, GaussianDropout, CuDNNGRU, BatchNormalization, PReLU
from keras.models import Model
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import roc_auc_score,classification_report,f1_score,accuracy_score,recall_score
import warnings
warnings.filterwarnings('ignore')

def get_keras_data(dataset, sparse_cate_list, num_list):
    """Package a DataFrame into the dict of arrays fed to the two-input Keras model.

    Args:
        dataset: DataFrame holding every column named in the two lists.
        sparse_cate_list: column names routed to the 'category_inp' input.
        num_list: column names routed to the 'numerical_inp' input.

    Returns:
        dict with keys 'category_inp' and 'numerical_inp', each mapping to the
        ``.values`` array of the corresponding column selection.
    """
    categorical_block = dataset[sparse_cate_list].values
    numerical_block = dataset[num_list].values
    return {
        'category_inp': categorical_block,
        'numerical_inp': numerical_block,
    }

def reduce_mem_usage(data):
    """Downcast numeric columns of ``data`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns are narrowed to int8/int16/int32/int64 and
    float columns to float16/float32 (falling back to float64) based on the
    column's min and max. Bounds are inclusive, so a column spanning exactly
    [-128, 127] is stored as int8.

    Note: the float16 downcast only checks the value range, not precision.

    Args:
        data: pandas DataFrame; its columns are reassigned in place.

    Returns:
        The same DataFrame object, after downcasting. A one-line summary of the
        resulting memory footprint is printed as a side effect.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = data.memory_usage().sum() / 1024**2
    for col in data.columns:
        col_type = data[col].dtype
        if str(col_type) not in numerics:
            continue
        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type).startswith('int'):
            # Pick the first (narrowest) integer type whose range covers the column.
            # If min/max are NaN (empty column) nothing matches and the column is left as is.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            # Columns containing +/-inf, or with NaN min/max, fail every
            # comparison and therefore end up as float64.
            target = np.float64
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    target = candidate
                    break
            data[col] = data[col].astype(target)
    end_mem = data.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return data

# Directory prefixes (each ends with '/' because they are joined by string concatenation).
feature_path = 'feature/'
model_path = 'model/'
submit_path = 'submit/'
# stacking feature data
metafeature_path = 'meta-feature/'
pic_path = 'pic/'

# Row counts of the two training parts and of the test part.
train_data_1_len = 10 ** 6
train_data_2_len = 5 * 10 ** 6
test_data_len = 10 ** 6

Using TensorFlow backend.


In [2]:
# Load the combined (train + test) feature table.
# The former `nthreads=12` argument is dropped: pandas deprecated it in 0.24
# and removed it in 1.0, where passing it raises TypeError. Omitting it works
# on both old and new pandas.
data = pd.read_feather(feature_path + "combine.bin")

In [9]:
# High-cardinality identifier columns.
dense_cate_list = ['adidmd5', 'imeimd5', 'macmd5', 'ip', 'openudidmd5', 'ip3']

# Categorical columns fed to the model's 'category_inp' input; the order here
# fixes the column order of that input.
sparse_cate_list = [
    'make', 'model', 'big_model',
    'ip1', 'reqrealip', 'ip2',
    'reqrealip1', 'reqrealip2', 'reqrealip3',
    'adunitshowid', 'mediashowid',
    'city', 'province',
    'creative_dpi', 'dvctype', 'apptype', 'carrier',
    'lan', 'ntt', 'new_ntt',
    'osv', 'osv_summary',
    'ver', 'pkgname',
    'hour', 'period',
]

# Columns dropped from `data` before the feature lists are built.
drift_ft = ['ip', 'adidmd5', 'imeimd5', 'macmd5', 'ip3']
# List of derived feature names (counts and max/mean/var aggregates).
# NOTE(review): not referenced by any other cell visible in this notebook;
# presumably a correlation-based feature selection - confirm before relying on it.
corr_ft = ['reqrealip3_count', 'creative_dpi_count', 'size_count', 'adunitshowid_count_ip', 'adunitshowid_count_ip3', 'mediashowid_count_ip', 'mediashowid_count_ip3', 'apptype_count_ip', 'apptype_count_ip3', 'model_count_ip', 'model_count_adidmd5', 'model_count_macmd5', 'model_count_ip3', 'make_count_ip', 'make_count_macmd5', 'make_count_ip3', 'osv_count_ip', 'osv_count_adidmd5', 'osv_count_macmd5', 'osv_count_ip3', 'pkgname_count_ip', 'pkgname_count_macmd5', 'pkgname_count_ip3', 'reqrealip_count_ip', 'reqrealip_count_ip3', 'reqrealip2_count_ip', 'reqrealip2_count_ip3', 'reqrealip3_count_ip', 'reqrealip3_count_macmd5', 'reqrealip3_count_ip3', 'ver_count_ip', 'ver_count_adidmd5', 'ver_count_macmd5', 'ver_count_ip2', 'ver_count_ip3', 'model_count_adunitshowid', 'adunitshowid_count_reqrealip2', 'adunitshowid_count_reqrealip3', 'pkgname_count_ver', 'pkgname_count_reqrealip', 'pkgname_count_reqrealip2', 'pkgname_count_reqrealip3', 'apptype_count_ver', 'adunitshowid_max_log_screen_area', 'adunitshowid_mean_log_screen_area', 'adunitshowid_max_size', 'adunitshowid_mean_size', 'adunitshowid_var_size', 'adunitshowid_mean_px', 'mediashowid_mean_h', 'mediashowid_mean_w', 'mediashowid_var_w', 'mediashowid_mean_log_screen_area', 'mediashowid_mean_h_w_ratio', 'mediashowid_max_size', 'mediashowid_mean_size', 'mediashowid_mean_px', 'apptype_mean_h', 'apptype_mean_w', 'apptype_mean_log_screen_area', 'apptype_max_size', 'apptype_mean_size', 'apptype_mean_px', 'model_mean_w', 'model_max_size', 'model_mean_size', 'model_var_size', 'make_max_h', 'make_max_w', 'make_mean_w', 'make_max_log_screen_area', 'make_max_size', 'make_mean_size', 'make_var_size', 'osv_max_w', 'osv_mean_w', 'osv_var_w', 'osv_max_log_screen_area', 'osv_mean_log_screen_area', 'osv_var_h_w_ratio', 'osv_max_size', 'osv_mean_size', 'osv_var_size', 'pkgname_var_w', 'pkgname_mean_log_screen_area', 'pkgname_max_h_w_ratio', 'pkgname_max_size', 'pkgname_mean_size', 'pkgname_var_size', 'pkgname_max_px', 'pkgname_mean_px', 
'reqrealip_mean_h', 'reqrealip_var_h', 'reqrealip_mean_w', 'reqrealip_var_w', 'reqrealip_max_log_screen_area', 'reqrealip_mean_log_screen_area', 'reqrealip_mean_h_w_ratio', 'reqrealip_var_h_w_ratio', 'reqrealip_max_size', 'reqrealip_mean_size', 'reqrealip_mean_px', 'reqrealip2_mean_h', 'reqrealip2_max_w', 'reqrealip2_mean_w', 'reqrealip2_mean_log_screen_area', 'reqrealip2_var_log_screen_area', 'reqrealip2_var_h_w_ratio', 'reqrealip2_max_size', 'reqrealip2_mean_size', 'reqrealip3_max_h', 'reqrealip3_mean_h', 'reqrealip3_max_w', 'reqrealip3_mean_w', 'reqrealip3_var_w', 'reqrealip3_max_log_screen_area', 'reqrealip3_mean_log_screen_area', 'reqrealip3_var_log_screen_area', 'reqrealip3_max_h_w_ratio', 'reqrealip3_mean_h_w_ratio', 'reqrealip3_var_h_w_ratio', 'reqrealip3_max_size', 'reqrealip3_mean_size', 'reqrealip3_max_px', 'reqrealip3_mean_px', 'reqrealip3_var_px', 'ver_var_w', 'ver_mean_log_screen_area', 'ver_var_log_screen_area', 'ver_max_size', 'ver_mean_size', 'ver_var_size', 'ver_mean_px', 'adunitshowid_var_hour', 'mediashowid_mean_period', 'mediashowid_var_period', 'mediashowid_mean_hour', 'mediashowid_var_hour', 'apptype_mean_period', 'apptype_var_period', 'apptype_var_hour', 'osv_var_hour', 'pkgname_var_hour', 'reqrealip_var_hour', 'reqrealip2_var_hour', 'reqrealip3_mean_period', 'reqrealip3_var_period', 'reqrealip3_mean_hour', 'reqrealip3_var_hour', 'make_mediashowid_dvctype_var_hour', 'adunitshowid_mediashowid_count_ip', 'adunitshowid_mediashowid_count_ip2', 'adunitshowid_mediashowid_count_ip3']

In [None]:
# Drop the drift-prone identifier columns. errors='ignore' keeps this cell
# re-runnable: on a second run the columns are already gone and a plain drop
# would raise KeyError.
data = data.drop(drift_ft, axis=1, errors='ignore')

# Every column except the row id and the target is a model feature.
non_feature_cols = {'sid', 'label'}
features = [col for col in data.columns if col not in non_feature_cols]

# Features that are not embedded as categoricals go to the numerical input.
sparse_cate_set = set(sparse_cate_list)
numerical = [col for col in features if col not in sparse_cate_set]

In [4]:
%time data = reduce_mem_usage(data)
gc.collect()
data = data.replace([np.inf,-np.inf],np.nan)
imp = SimpleImputer(missing_values=np.nan, strategy='mean',copy=False)
imp.fit_transform(data[features])

%time data = reduce_mem_usage(data)
gc.collect()

scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
scaler.fit_transform(data[numerical])

%time data = reduce_mem_usage(data)
gc.collect()

minmax = MinMaxScaler(copy=False)
minmax .fit_transform(data[numerical])

%time data = reduce_mem_usage(data)
gc.collect()

%time data = reduce_mem_usage(data)
train_x = data[:6000000]
train_y=train_x['label'].values
test=data[6000000:].reset_index(drop=True)
del data
gc.collect()

Mem. usage decreased to 885.01 Mb (60.8% reduction)
Wall time: 6.14 s
Mem. usage decreased to 885.01 Mb (67.0% reduction)
Wall time: 13 s
Mem. usage decreased to 885.01 Mb (0.0% reduction)
Wall time: 1.22 s
Mem. usage decreased to 885.01 Mb (0.0% reduction)
Wall time: 1.26 s
Mem. usage decreased to 885.01 Mb (0.0% reduction)
Wall time: 1.23 s


0

In [23]:
# Short alias for the Keras backend module; f1_loss and cal_f1 below use it for tensor ops.
K = keras.backend
def f1_loss(y_true, y_pred):
    """Soft F1 loss: ``1 - mean(F1)`` computed from un-rounded predictions.

    Predictions are used as-is (not thresholded), so the confusion-matrix
    terms are sums of products of ``y_true`` and ``y_pred`` along axis 0.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor ``1 - mean(F1)``.
    """
    # Soft true positives, false positives and false negatives per output column.
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # Precision and recall; K.epsilon() is a small positive constant that keeps
    # the denominators away from zero.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())
    # F1
    f1 = 2 * p * r / (p + r + K.epsilon())
    # Replace NaN with 0. NaN is the only value not equal to itself, so the
    # check uses K.not_equal / K.switch instead of K.where / K.is_nan, which
    # the Keras backend module does not appear to provide.
    f1 = K.switch(K.not_equal(f1, f1), K.zeros_like(f1), f1)
    return 1 - K.mean(f1)

# F1-score metric: takes the true labels and the predictions and returns the F1 score.
def cal_f1(true_valid, pred_valid):
    """Batch-wise F1 score with predictions rounded to the nearest integer.

    Args:
        true_valid: tensor of ground-truth labels.
        pred_valid: tensor of predicted probabilities, same shape as ``true_valid``.

    Returns:
        Tensor holding the F1 score computed along axis 0.
    """
    pred_label = K.round(pred_valid)
    TP = K.sum(true_valid * pred_label, axis=0)
    FP = K.sum((1 - true_valid) * pred_label, axis=0)
    FN = K.sum(true_valid * (1 - pred_label), axis=0)

    # K.epsilon() keeps every denominator non-zero, so a batch with no predicted
    # or no actual positives yields 0 instead of NaN (same guard as f1_loss).
    precision = TP / (TP + FP + K.epsilon())
    recall = TP / (TP + FN + K.epsilon())
    F1 = 2 * precision * recall / (precision + recall + K.epsilon())
    return F1

def _dense_block(x, units, dropout_rate):
    """Apply Dense(units) -> PReLU -> BatchNormalization -> Dropout(dropout_rate) to ``x``."""
    x = Dense(units)(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    return Dropout(dropout_rate)(x)


def gru_model():
    """Build the two-input (categorical + numerical) GRU classifier.

    Reads the module-level ``sparse_cate_list`` (order of the categorical input
    columns) and ``numerical`` (width of the numerical input). Each categorical
    column gets its own embedding; the embeddings and the numerical vector are
    concatenated into a single time step that is fed to a CuDNNGRU, followed by
    a stack of dense blocks and a sigmoid output.

    Returns:
        An uncompiled keras ``Model`` with inputs 'category_inp' and
        'numerical_inp'. The model summary is printed as a side effect.
    """
    emb_n_1 = 30
    emb_n_2 = 20
    # column name -> (embedding input_dim, embedding output_dim)
    category_num = {
        'reqrealip': (27027, emb_n_1),
        'reqrealip1': (106, emb_n_2),
        'reqrealip2': (3233, emb_n_1),
        'reqrealip3': (21836, emb_n_1),
        'ip1': (661, emb_n_2),
        'ip2': (12452, emb_n_1),
        'adunitshowid': (965, emb_n_1),
        'apptype': (92, emb_n_2),
        'carrier': (5, emb_n_2),
        'city': (346, emb_n_1),
        'province': (23, emb_n_2),
        'dvctype': (4, emb_n_2),
        'model': (14434, emb_n_1),
        'make': (2297, emb_n_1),
        'mediashowid': (356, emb_n_1),
        'ntt': (10, emb_n_2),
        'new_ntt': (6, emb_n_2),
        'osv': (156, emb_n_2),
        'pkgname': (7263, emb_n_1),
        'ver': (6836, emb_n_1),
        'creative_dpi': (6663, emb_n_1),
        'hour': (24, emb_n_2),
        'osv_summary': (11, emb_n_2),
        'period': (6, emb_n_2),
        'minute': (1440, emb_n_1),
        'lan': (56, emb_n_2),
        'big_model': (10024, emb_n_1),
    }
    # Categorical input: one column per entry of sparse_cate_list.
    category_inp = Input(shape=(len(sparse_cate_list),), name='category_inp')
    cat_embeds = []
    for idx, col in enumerate(sparse_cate_list):
        vocab_size, emb_dim = category_num[col]
        # Bind the column index as a default argument. A bare closure over `idx`
        # would see the loop's final value if the layer were called again later
        # (e.g. after cloning or reloading the model).
        x = Lambda(lambda t, i=idx: t[:, i, None])(category_inp)
        x = Embedding(vocab_size, emb_dim, input_length=1)(x)
        cat_embeds.append(x)
    embeds = concatenate(cat_embeds, axis=2)
    embeds = GaussianDropout(0.2)(embeds)
    # Numerical input, reshaped to a single time step so it can be joined with the embeddings.
    numerical_inp = Input(shape=(len(numerical),), name='numerical_inp')
    x2 = Reshape([1, len(numerical)])(numerical_inp)
    x = concatenate([embeds, x2], axis=2)
    # Backbone network.
    x = CuDNNGRU(256)(x)
    x = BatchNormalization()(x)
    x_gru = Dropout(0.2)(x)
    x = _dense_block(x_gru, 128, 0.2)
    # Skip connection: re-inject the GRU output next to the first dense block.
    x = concatenate([x, x_gru], axis=1)
    x = _dense_block(x, 128, 0.2)
    x = _dense_block(x, 64, 0.2)
    x = _dense_block(x, 64, 0.2)
    x = _dense_block(x, 32, 0.05)
    out_p = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[category_inp, numerical_inp], outputs=out_p)
    model.summary()
    return model

In [24]:
# Build the network; gru_model() also prints the layer summary.
model = gru_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
category_inp (InputLayer)       (None, 26)           0                                            
__________________________________________________________________________________________________
lambda_93 (Lambda)              (None, 1)            0           category_inp[0][0]               
__________________________________________________________________________________________________
lambda_94 (Lambda)              (None, 1)            0           category_inp[0][0]               
__________________________________________________________________________________________________
lambda_95 (Lambda)              (None, 1)            0           category_inp[0][0]               
__________________________________________________________________________________________________
lambda_96 

In [25]:
batch_size = 2048  # 20000 512
epochs = 10

# Count samples from train_y, not train_x: train_x is later rebound to a dict
# with two keys, so len(train_x) would be 2 (and steps 0) if this cell runs
# after that conversion.
steps = int(len(train_y) / batch_size) * epochs


def exp_decay(init, fin, steps):
    """Return the per-step rate r such that init / (1 + r) ** (steps - 1) == fin."""
    return (init / fin) ** (1 / (steps - 1)) - 1


lr_init, lr_fin = 0.002, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)
# NOTE(review): exp_decay computes an exponential per-step rate, but the Keras
# optimizer `decay` argument is presumably applied as lr / (1 + decay * iterations),
# so the final learning rate will likely not reach lr_fin - confirm the intended schedule.
optimizer_adam = Adam(lr=lr_init, decay=lr_decay)
model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=[cal_f1])

In [18]:
# Convert the training frame into the dict of arrays the model expects.
# train_x is rebound from a DataFrame to a dict, so the guard keeps this cell
# safe to re-run (indexing the dict with a column list would raise).
if not isinstance(train_x, dict):
    train_x = get_keras_data(train_x, sparse_cate_list, numerical)

In [None]:
early_stopping = EarlyStopping(monitor='va', patience=3)
%time model.fit(train_x, train_y, callbacks=[early_stopping], validation_split=0.1, batch_size=batch_size, epochs=epochs,shuffle=True, verbose=1)

Train on 5400000 samples, validate on 600000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10