In [1]:
import numpy as np
import os
import pandas as pd
import time
import ast
from tqdm import tqdm
import datetime
from multiprocessing import Pool, cpu_count
from itertools import zip_longest
from collections import defaultdict, OrderedDict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2,SelectKBest
%matplotlib inline

In [2]:
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from keras.models import Model
from keras.layers import Add,Input, Dense, Concatenate, Reshape, Dropout
from keras.layers import Flatten,Activation,LeakyReLU,PReLU,Lambda,LSTM,GRU
from keras.layers import Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,TimeDistributed
from keras.layers.wrappers import Bidirectional
from keras.layers.embeddings import Embedding
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.models import load_model
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.utils import multi_gpu_model
import keras

#进行配置，每个GPU使用60%上限现存
config = tf.ConfigProto()
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
# config.gpu_options.per_process_gpu_memory_fraction = 0.4 # 每个GPU现存上届控制在60%以内
session = tf.Session(config=config)

# 设置session
KTF.set_session(session )

Using TensorFlow backend.


In [3]:
base_dir = './dataset/'

In [None]:
### 用户的激活APP列表

In [4]:
app_actived_df = pd.read_csv(base_dir+'user_app_actived.csv', header=None, names=['uId','appId'],
                             dtype={'uId':np.uint32,'appId':str})
app_actived_df.head()

Unnamed: 0,uId,appId
0,1000110,a001048#a003072#a004443#a006024#a007087#a00743...
1,1000542,a001010#a00158#a001671#a002450#a003484#a003577...
2,1000866,a001048#a00108#a004622#a007104#a0075
3,1001028,a001012#a001055#a001062#a001275#a001403#a00158...
4,1001190,a001012#a00107#a001304#a001403#a001533#a001679...


In [5]:
print(app_actived_df.shape)

(4999341, 2)


In [None]:
### 原始训练集

In [6]:
app_info=pd.read_csv(base_dir+'app_info.csv',header=None,names=['appId','category'],dtype={'appId':str,'category':'category'})
app_category_list=list(app_info.category.unique())

In [7]:
len(app_category_list)

32

In [8]:
user_set_map = {}
user_set_map['uId'] = np.uint32
user_set_map['age_group'] = np.uint8
user_set_map['gender'] = np.uint8
user_set_map['city'] = str
user_set_map['prodName'] = str
user_set_map['ramCapacity'] = np.float32
user_set_map['romCapacity'] = np.float32
user_set_map['fontSize'] = np.float32
user_set_map['color'] = str
user_set_map['ct'] = str
user_set_map['carrier'] = str
user_set_map['os'] = np.float32
user_set_map['appNums'] = np.uint32
user_set_map['bootTimes'] = np.int32
user_set_map['AFuncTimes'] = np.float32
user_set_map['BFuncTimes'] = np.float32
user_set_map['CFuncTimes'] = np.float32
user_set_map['DFuncTimes'] = np.float32
user_set_map['EFuncTimes'] = np.float32
user_set_map['FFuncTimes'] = np.float32
user_set_map['GFuncSum'] = np.int32
user_set_map['totalGame'] = np.uint32
user_set_map['used_rom'] = np.float32
user_set_map['used_ram'] = np.float32
user_set_map['all_used_features'] = np.int8
user_set_map['gender_color'] = str
user_set_map['young_feature'] = np.uint32
user_set_map['business_feature'] = np.uint32
user_set_map['middle_feature'] = np.uint32
for app_category_name in app_category_list:
    user_set_map[app_category_name] = np.uint32

In [9]:
train_set = pd.read_csv(base_dir+'train_user_set.csv',dtype=user_set_map)
_ = user_set_map.pop('age_group')
test_set = pd.read_csv(base_dir+'test_user_set.csv',dtype=user_set_map)
print(train_set.shape)
print(test_set.shape)

(4000000, 61)
(1000000, 60)


In [10]:
train_set.columns

Index(['uId', 'age_group', 'gender', 'city', 'prodName', 'ramCapacity',
       'romCapacity', 'color', 'fontSize', 'ct', 'carrier', 'os', 'bootTimes',
       'AFuncTimes', 'BFuncTimes', 'CFuncTimes', 'DFuncTimes', 'EFuncTimes',
       'FFuncTimes', 'GFuncSum', '运动健康', '实用工具', '新闻阅读', '图书阅读', '金融理财',
       '社交通讯', '便捷生活', '休闲益智', '拍摄美化', '经营策略', '儿童', '汽车', '教育', '主题个性',
       '影音娱乐', '棋牌桌游', '购物比价', '旅游住宿', '出行导航', '商务', '角色扮演', '动作射击', '体育竞速',
       '美食', '休闲娱乐', '表盘个性', '学习办公', '网络游戏', '主题铃声', '动漫', '休闲游戏', '资讯生活',
       'appNums', 'totalGame', 'young_feature', 'business_feature',
       'middle_feature', 'used_rom', 'used_ram', 'gender_color',
       'all_used_features'],
      dtype='object')

In [None]:
### app 分类使用数据

In [10]:
# user_app_usage_statistic_dtype_map={}
# for app_category_name in app_category_list:
#     user_app_usage_statistic_dtype_map[app_category_name+'_times']=np.float32
#     user_app_usage_statistic_dtype_map[app_category_name+'_duration']=np.float32
#     user_app_usage_statistic_dtype_map[app_category_name+'_avg']=np.float32
# user_app_usage_statistic_dtype_map['uId']=np.int32
# user_app_usage_statistic_dtype_map['all_times']=np.int32
# user_app_usage_statistic_dtype_map['all_duration']=np.int32
# user_app_usage_statistic_dtype_map['use_days']=np.int32

In [11]:
user_app_usage_stat = pd.read_hdf('/home/uniml/work/huawei/temp_data/user_app_usage_statistic.h5',key='data')
print(user_app_usage_stat.shape)

(4020281, 101)


In [12]:
for col in user_app_usage_stat.columns:
    if '_avg' in col:
        _ = user_app_usage_stat.pop(col)

In [13]:
user_app_usage_stat.columns

Index(['运动健康_times', '运动健康_duration', '实用工具_times', '实用工具_duration',
       '新闻阅读_times', '新闻阅读_duration', '图书阅读_times', '图书阅读_duration',
       '金融理财_times', '金融理财_duration', '社交通讯_times', '社交通讯_duration',
       '便捷生活_times', '便捷生活_duration', '休闲益智_times', '休闲益智_duration',
       '拍摄美化_times', '拍摄美化_duration', '经营策略_times', '经营策略_duration',
       '儿童_times', '儿童_duration', '汽车_times', '汽车_duration', '教育_times',
       '教育_duration', '主题个性_times', '主题个性_duration', '影音娱乐_times',
       '影音娱乐_duration', '棋牌桌游_times', '棋牌桌游_duration', '购物比价_times',
       '购物比价_duration', '旅游住宿_times', '旅游住宿_duration', '出行导航_times',
       '出行导航_duration', '商务_times', '商务_duration', '角色扮演_times',
       '角色扮演_duration', '动作射击_times', '动作射击_duration', '体育竞速_times',
       '体育竞速_duration', '美食_times', '美食_duration', '休闲娱乐_times',
       '休闲娱乐_duration', '表盘个性_times', '表盘个性_duration', '学习办公_times',
       '学习办公_duration', '网络游戏_times', '网络游戏_duration', '主题铃声_times',
       '主题铃声_duration', '动漫_times', '动漫_

In [None]:
### 神经网络

In [14]:
train_set['gender'] = train_set['gender'].astype(str)
test_set['gender'] = test_set['gender'].astype(str)

In [15]:
print(len(train_set['city'].unique()))
print(len(train_set['prodName'].unique()))
print(len(train_set['color'].unique()))
print(len(train_set['ct'].unique()))
print(len(train_set['carrier'].unique()))
print(len(train_set['gender_color'].unique()))

343
113
117
5
4
234


In [16]:
train_set['age_group'] = train_set['age_group'].apply(lambda x : x-1)
label = train_set['age_group'].values
_ = train_set.pop('age_group')

In [17]:
train_set = train_set.merge(user_app_usage_stat,on=['uId'],how='left')
train_set.fillna(0, inplace=True)
test_set = test_set.merge(user_app_usage_stat,on=['uId'],how='left')
test_set.fillna(0, inplace=True)
print(train_set.shape,test_set.shape)

(4000000, 128) (1000000, 128)


In [18]:
num_train = train_set[['gender','ct','carrier','city','prodName','color','gender_color']]
_ = train_set.pop('gender')
_ = train_set.pop('ct')
_ = train_set.pop('carrier')
_ = train_set.pop('city')
_ = train_set.pop('prodName')
_ = train_set.pop('color')
_ = train_set.pop('gender_color')
print(num_train.shape)

(4000000, 7)


In [19]:
num_test = test_set[['gender','ct','carrier','city','prodName','color','gender_color']]
_ = test_set.pop('gender')
_ = test_set.pop('ct')
_ = test_set.pop('carrier')
_ = test_set.pop('city')
_ = test_set.pop('prodName')
_ = test_set.pop('color')
_ = test_set.pop('gender_color')
print(num_test.shape)

(1000000, 7)


In [None]:
### app使用情况统计类特征

In [20]:
train_app_usage_stat_df = pd.read_hdf(base_dir+'train_app_usage_stat_df.h5',key='data')
test_app_usage_stat_df = pd.read_hdf(base_dir+'test_app_usage_stat_df.h5',key='data')
print(train_app_usage_stat_df.shape,test_app_usage_stat_df.shape)

(4000000, 26) (1000000, 26)


In [21]:
train_set = train_set.merge(train_app_usage_stat_df,on='uId',how='left')
test_set = test_set.merge(test_app_usage_stat_df,on='uId',how='left')
print(train_set.shape,test_set.shape)

(4000000, 146) (1000000, 146)


In [23]:
# train_set = train_set.merge(lbl_emb,on='uId',how='left')
# train_set.fillna(0,inplace=True)
# test_set = test_set.merge(lbl_emb,on='uId',how='left')
# test_set.fillna(0,inplace=True)
# print(train_set.shape,test_set.shape)

(4000000, 274) (1000000, 274)


In [64]:
# train_app_usage_tfidf_input_csr = pd.read_hdf('../huawei/temp_data/train_app_usage_tfidf_input_csr.h5')
# train_app_usage_tfidf_input_csr.shape

(4000000, 500)

In [65]:
# test_app_usage_tfidf_input_csr = pd.read_hdf('../huawei/temp_data/test_app_usage_tfidf_input_csr.h5')
# test_app_usage_tfidf_input_csr.shape

(1000000, 500)

In [23]:
# train_app_actived_tfidf_input_csr = pd.read_hdf('../huawei/temp_data/train_app_actived_tfidf_input_csr.h5')
# train_app_actived_tfidf_input_csr.shape

(4000000, 500)

In [24]:
# test_app_actived_tfidf_input_csr = pd.read_hdf('../huawei/temp_data/test_app_actived_tfidf_input_csr.h5')
# test_app_actived_tfidf_input_csr.shape

(1000000, 500)

In [None]:
### app使用的分类情况及转化率

In [22]:
train_app_trans_rate_with_usage = pd.read_hdf(base_dir+'train_app_trans_rate_with_usage.h5')
test_app_trans_rate_with_usage = pd.read_hdf(base_dir+'test_app_trans_rate_with_usage.h5')
print(train_app_trans_rate_with_usage.shape,test_app_trans_rate_with_usage.shape)

(4000000, 66) (1000000, 66)


In [None]:
### app激活和使用的分类比例

In [23]:
train_app_actived_rate = pd.read_hdf(base_dir+'train_app_actived_rate.h5')
test_app_actived_rate = pd.read_hdf(base_dir+'test_app_actived_rate.h5')
print(train_app_actived_rate.shape,test_app_actived_rate.shape)

(4000000, 32) (1000000, 32)


In [24]:
train_app_usage_rate = pd.read_hdf(base_dir+'train_app_usage_rate.h5')
test_app_usage_rate = pd.read_hdf(base_dir+'test_app_usage_rate.h5')
print(train_app_usage_rate.shape,test_app_usage_rate.shape)

(4000000, 32) (1000000, 32)


In [25]:
trainID = train_set['uId']
testID = test_set['uId']
train_set.drop('uId', axis=1, inplace=True)
test_set.drop('uId', axis=1, inplace=True)
print(train_set.shape)
print(test_set.shape)

(4000000, 145)
(1000000, 145)


In [26]:
# 对数值类型进行归一化
scaler = StandardScaler().fit(train_set)
train_set = scaler.transform(train_set)  
test_set = scaler.transform(test_set)
print(train_set.shape,test_set.shape)

(4000000, 145) (1000000, 145)


In [27]:
nTrain = num_train.shape[0]
nTest = num_test.shape[0]
all_data = pd.concat([num_train, num_test]).reset_index(drop=True)
print(all_data.shape)

(5000000, 7)


In [28]:
cols = ['city','prodName','color','gender_color']
for c in cols:
    lbl = LabelEncoder()   
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))     
print('Shape all_data: {}'.format(all_data.shape))

Shape all_data: (5000000, 7)


In [29]:
print(all_data['city'].min())
print(all_data['city'].max())
print(all_data['prodName'].min())
print(all_data['prodName'].max())
print(all_data['color'].min())
print(all_data['color'].max())
print(all_data['gender_color'].min())
print(all_data['gender_color'].max())

0
342
0
113
0
116
0
233


In [30]:
print(0)

0


In [30]:
# 对gender、ct、carrier做one-hot处理
all_data = pd.get_dummies(all_data)
print(all_data.shape)

(5000000, 15)


In [31]:
num_train = all_data[:nTrain]
num_test = all_data[nTrain:]
print(num_train.shape)
print(num_test.shape)

(4000000, 15)
(1000000, 15)


In [32]:
print(train_set.shape)
print(test_set.shape)

(4000000, 145)
(1000000, 145)


In [33]:
train_id = pd.DataFrame(trainID)
test_id = pd.DataFrame(testID)
print(train_id.shape,test_id.shape)

(4000000, 1) (1000000, 1)


In [35]:
from keras.layers import *
from keras.models import *
from keras.optimizers import *

In [58]:
class AdamW(Optimizer):
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4,  # decoupled weight decay (1/4)
                 epsilon=1e-8, decay=0., **kwargs):
        super(AdamW, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            # decoupled weight decay (2/4)
            self.wd = K.variable(weight_decay, name='weight_decay')
        self.epsilon = epsilon
        self.initial_decay = decay

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        wd = self.wd  # decoupled weight decay (3/4)

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            # decoupled weight decay (4/4)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'weight_decay': float(K.get_value(self.wd)),
                  'epsilon': self.epsilon}
        base_config = super(AdamW, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [34]:
callbacks_list = [
    EarlyStopping(
        monitor='val_acc',
        patience=30
    ),
    ModelCheckpoint(
        filepath='model.h5', 
        monitor='val_acc', 
        save_best_only=True
    )
]

In [None]:
### 残差网络

In [34]:
def build_res_net():
    inputs = []
    city_embedding_size = 20
    prodName_embedding_size = 15
    color_embedding_size = 10
    gender_color_embedding_size = 15
    app_actived_size = 32
    app_usage_size = 32
    
    city_input = Input(shape=(1,))
    city_embedded = Embedding(343, city_embedding_size, input_length=1)(city_input)
    inputs.append(city_input)
    
    prodName_input = Input(shape=(1,))
    prodName_embedded = Embedding(114, prodName_embedding_size, input_length=1)(prodName_input)
    inputs.append(prodName_input)
    
    color_input = Input(shape=(1,))
    color_embedded = Embedding(117, color_embedding_size, input_length=1)(color_input)
    inputs.append(color_input)
    
    gender_color_input = Input(shape=(1,))
    gender_color_embedded = Embedding(234, gender_color_embedding_size, input_length=1)(gender_color_input)
    inputs.append(gender_color_input)
    
    input_numeric = Input(shape=(286,))
    inputs.append(input_numeric)
    
    app_actived_input = Input(shape=(150,))
    app_actived_embedded = Embedding(10010,app_actived_size,input_length=150)(app_actived_input)
    t1 = TimeDistributed(Dense(128, activation='relu'))(app_actived_embedded)
    t1 = TimeDistributed(Dropout(0.15))(t1)
    app_actived_embedded = Lambda(lambda x:K.sum(x,axis=1))(app_actived_embedded)
    inputs.append(app_actived_input)
    
    app_usage_input = Input(shape=(200,))
    app_usage_embedded = Embedding(10010,app_usage_size,input_length=200)(app_usage_input)
    t2 = TimeDistributed(Dense(128, activation='relu'))(app_usage_embedded)
    t2 = TimeDistributed(Dropout(0.15))(t2)
    app_usage_embedded = Lambda(lambda x:K.sum(x,axis=1))(app_usage_embedded)
    inputs.append(app_usage_input)
    
    fasttext_app_actived_input = Input(shape=(128,))
    inputs.append(fasttext_app_actived_input)
    
    fasttext_app_usage_input = Input(shape=(128,))
    inputs.append(fasttext_app_usage_input)

    gru_input = Input(shape=(7,200))
    gru = Bidirectional(GRU(128,dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(gru_input)
    inputs.append(gru_input)
    
    # 1D convolutions that can iterate over the word vectors
    conv1 = Conv1D(filters=64, kernel_size=1,
                   padding='same', activation='relu',)
#     conv2 = Conv1D(filters=64, kernel_size=2,
#                    padding='same', activation='relu', )
#     conv3 = Conv1D(filters=64, kernel_size=3,
#                    padding='same', activation='relu',)
    conv5 = Conv1D(filters=32, kernel_size=5,
                   padding='same', activation='relu',)
    
    conv1a = conv1(gru)
    gap1a = GlobalAveragePooling1D()(conv1a)
    gmp1a = GlobalMaxPooling1D()(conv1a)

#     conv2a = conv2(gru)
#     gap2a = GlobalAveragePooling1D()(conv2a)
#     gmp2a = GlobalMaxPooling1D()(conv2a)

#     conv3a = conv3(gru)
#     gap3a = GlobalAveragePooling1D()(conv3a)
#     gmp3a = GlobalMaxPooling1D()(conv3a)

    conv5a = conv5(gru)
    gap5a = GlobalAveragePooling1D()(conv5a)
    gmp5a = GlobalMaxPooling1D()(conv5a)
    
    con = Concatenate()([city_embedded, prodName_embedded, color_embedded,gender_color_embedded])
    flatten = Flatten()(con)
    x = Concatenate()([flatten,input_numeric,app_actived_embedded,app_usage_embedded,fasttext_app_actived_input,
                       fasttext_app_usage_input,gap1a,gmp1a,gap5a,gmp5a])
    
    x = Dense(1024)(x)
    y = BatchNormalization()(x)
    y = PReLU()(y)
    y = Dropout(0.3)(y)
    y = Dense(1024)(y)
    y = BatchNormalization()(y)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    residual = x
    y = Add()([y, residual])
    
    x = Dense(1024)(y)
    residual = y
    y = BatchNormalization()(x)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Dense(1024)(y)
    y = BatchNormalization()(y)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Add()([y, residual])
    
    x = Dense(1024)(y)
    residual = y
    y = BatchNormalization()(x)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Dense(1024)(y)
    y = BatchNormalization()(y)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Add()([y, residual])
    
    x = Dense(1024)(y)
    residual = y
    y = BatchNormalization()(x)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Dense(1024)(y)
    y = BatchNormalization()(y)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Add()([y, residual])
    
    x = Dense(1024)(y)
    residual = y
    y = BatchNormalization()(x)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Dense(1024)(y)
    y = BatchNormalization()(y)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Add()([y, residual])
    
    x = Dense(256)(y)
    y = BatchNormalization()(x)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    y = Dense(256)(y)
    y = BatchNormalization()(y)
    y = PReLU()(y)
    y = Dropout(0.2)(y)
    residual = x
    y = Add()([y, residual])
    
    output = Dense(6)(y)
    output = Activation('softmax',name='cate_out')(output)
    model = Model(inputs, output)
    model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=0.0001),metrics=['accuracy'])
    return model

In [35]:
model = build_res_net()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_10 (

In [None]:
### 构造经网络输入

In [36]:
city_input = num_train.pop('city')
prodName_input = num_train.pop('prodName')
color_input = num_train.pop('color')
gender_color = num_train.pop('gender_color')
city_input.shape

(4000000,)

In [37]:
numeric_input = np.concatenate((train_set,num_train.values),axis=1)
numeric_input = np.concatenate((numeric_input, train_app_trans_rate_with_usage.values),axis=1)
numeric_input = np.concatenate((numeric_input, train_app_actived_rate.values),axis=1)
numeric_input = np.concatenate((numeric_input, train_app_usage_rate.values),axis=1)
# numeric_input = np.concatenate((numeric_input, train_app_actived_tfidf_input_csr.values),axis=1)
print(numeric_input.shape)

(4000000, 286)


In [None]:
### fasttext

In [38]:
train_app_activted_emb_fasttext_sum = pd.read_hdf(base_dir+'train_app_activted_emb_fasttext_sum.h5')
train_app_activted_emb_fasttext_sum.fillna(0, inplace=True)
train_app_usage_emb_fasttext_sum = pd.read_hdf(base_dir+'train_app_usage_emb_fasttext_sum.h5')
train_app_usage_emb_fasttext_sum.fillna(0,inplace=True)
print(train_app_activted_emb_fasttext_sum.shape,train_app_usage_emb_fasttext_sum.shape)

(4000000, 128) (4000000, 128)


In [39]:
test_app_activted_emb_fasttext_sum = pd.read_hdf(base_dir+'test_app_activted_emb_fasttext_sum.h5')
test_app_activted_emb_fasttext_sum.fillna(0, inplace=True)
test_app_usage_emb_fasttext_sum = pd.read_hdf(base_dir+'test_app_usage_emb_fasttext_sum.h5')
test_app_usage_emb_fasttext_sum.fillna(0,inplace=True)
print(test_app_activted_emb_fasttext_sum.shape,test_app_usage_emb_fasttext_sum.shape)

(1000000, 128) (1000000, 128)


In [40]:
scaler = StandardScaler().fit(train_app_activted_emb_fasttext_sum)
train_app_activted_emb_fasttext_sum_scaler = scaler.transform(train_app_activted_emb_fasttext_sum)  
test_app_activted_emb_fasttext_sum_scaler = scaler.transform(test_app_activted_emb_fasttext_sum)
print(train_app_activted_emb_fasttext_sum_scaler.shape,test_app_activted_emb_fasttext_sum_scaler.shape)

(4000000, 128) (1000000, 128)


In [41]:
scaler = StandardScaler().fit(train_app_usage_emb_fasttext_sum)
train_app_usage_emb_fasttext_sum_scaler = scaler.transform(train_app_usage_emb_fasttext_sum)  
test_app_usage_emb_fasttext_sum_scaler = scaler.transform(test_app_usage_emb_fasttext_sum)
print(train_app_usage_emb_fasttext_sum_scaler.shape,test_app_usage_emb_fasttext_sum_scaler.shape)

(4000000, 128) (1000000, 128)


In [40]:
print(0)

0


In [None]:
### 激活的app做embedding¶

In [42]:
apps = app_actived_df[['appId']].values
vocab_size = 10010
encoded_docs = [one_hot(app[0], vocab_size, split='#') for app in apps]
max_length = 150
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_docs = np.array(padded_docs)
padded_docs_uId = pd.concat([app_actived_df[['uId']],pd.DataFrame(padded_docs)], axis=1)
train_apps_input = train_id.merge(padded_docs_uId, on=['uId'], how='left')
test_apps_input = test_id.merge(padded_docs_uId, on=['uId'], how='left')
train_apps_input.fillna(0,inplace=True)
test_apps_input.fillna(0,inplace=True)
_ = train_apps_input.pop('uId')
_ = test_apps_input.pop('uId')
print(train_apps_input.shape,test_apps_input.shape)

(4000000, 150) (1000000, 150)


In [None]:
### 使用的app做embedding

In [43]:
user_app_usage_df = pd.read_hdf(base_dir+'user_app_usage_df.h5',key='data')
user_app_usage_df['appIds'] = user_app_usage_df['appIds'].astype(str)
usage_apps = user_app_usage_df[['appIds']].values
usage_app_size = 10010
encoded_apps = [one_hot(app[0], usage_app_size, split='#') for app in usage_apps]
usage_max_length = 200
padded_apps = pad_sequences(encoded_apps, maxlen=usage_max_length, padding='post')
padded_apps = np.array(padded_apps)
padded_apps_uId = pd.concat([user_app_usage_df[['uId']],pd.DataFrame(padded_apps)], axis=1)
train_app_usage_input = train_id.merge(padded_apps_uId, on=['uId'], how='left')
train_app_usage_input.fillna(0,inplace=True)
test_app_usage_input = test_id.merge(padded_apps_uId, on=['uId'], how='left')
test_app_usage_input.fillna(0,inplace=True)
_ = train_app_usage_input.pop('uId')
_ = test_app_usage_input.pop('uId')
print(train_app_usage_input.shape,test_app_usage_input.shape)

(4000000, 200) (1000000, 200)


In [None]:
### LSTM

In [44]:
train_lstm_input = pd.read_hdf(base_dir+'train_lstm_input.h5',key='data')
train_lstm_input = train_lstm_input.values.reshape((train_id.shape[0],7,200))
test_lstm_input = pd.read_hdf(base_dir+'test_lstm_input.h5',key='data')
test_lstm_input = test_lstm_input.values.reshape((test_id.shape[0],7,200))
print(train_lstm_input.shape,test_lstm_input.shape)

(4000000, 7, 200) (1000000, 7, 200)


In [None]:
### 训练

In [45]:
X = [city_input,prodName_input,color_input,gender_color,numeric_input,train_apps_input.values,
     train_app_usage_input.values,train_app_activted_emb_fasttext_sum_scaler,train_app_usage_emb_fasttext_sum_scaler,
     train_lstm_input]

In [46]:
Y = np_utils.to_categorical(label, 6)

In [47]:
test_city_input = num_test.pop('city')
test_prodName_input = num_test.pop('prodName')
test_color_input = num_test.pop('color')
test_gender_color_input = num_test.pop('gender_color')
test_city_input.shape

(1000000,)

In [48]:
test_numeric_input = np.concatenate((test_set,num_test.values),axis=1)
test_numeric_input = np.concatenate((test_numeric_input, test_app_trans_rate_with_usage.values),axis=1)
test_numeric_input = np.concatenate((test_numeric_input, test_app_actived_rate.values),axis=1)
test_numeric_input = np.concatenate((test_numeric_input, test_app_usage_rate.values),axis=1)
# test_numeric_input = np.concatenate((test_numeric_input, test_app_actived_tfidf_input_csr.values),axis=1)
print(test_numeric_input.shape)

(1000000, 286)


In [49]:
test_X = [test_city_input,test_prodName_input,test_color_input,test_gender_color_input,test_numeric_input,
          test_apps_input.values,test_app_usage_input.values,test_app_activted_emb_fasttext_sum_scaler,
          test_app_usage_emb_fasttext_sum_scaler,test_lstm_input]

In [None]:
###### try

In [53]:
# row_indices = np.random.permutation(Y.shape[0])
# train_x=[x_data[row_indices] for x_data in X]
# train_y=Y[row_indices] 

In [None]:
##### end try

In [None]:
###### 5 折

In [50]:
row_indices = np.random.permutation(Y.shape[0])
split_size=5
step=int(Y.shape[0]/float(split_size))
split_index_list=[]
for i in range(split_size):
    start=i*step
    end=(i+1)*step
    print(start,end)
    split_index_list.append(row_indices[start:end])

0 800000
800000 1600000
1600000 2400000
2400000 3200000
3200000 4000000


In [50]:
class ParallelModelCheckpoint(keras.callbacks.ModelCheckpoint):
    def __init__(self,model,filepath, monitor='val_loss', verbose=0,
                 save_best_only=False, save_weights_only=False,
                 mode='auto', period=1):
        self.single_model = model
        super(ParallelModelCheckpoint,self).__init__(filepath, monitor, verbose,save_best_only, save_weights_only,mode, period)

    def set_model(self, model):
        super(ParallelModelCheckpoint,self).set_model(self.single_model)

In [52]:
prefix='try_2_'

In [53]:
pred_list=[]
test_x=test_X

for i in range(split_size):
    ####切分数据集
    validation_index=list(split_index_list[i])
    train_index=[]
    for j in range(split_size):
        if j==i:
            continue
        train_index+=list(split_index_list[j])
    train_x=[x_data[train_index] for x_data in X]
    train_y=Y[train_index]  
    validation_x=[x_data[validation_index] for x_data in X]
    validation_y=Y[validation_index]
    print('finish split data')
    
    model_name='my_model_split_'+prefix+str(i)+'.h5'
    
    model = build_res_net()
#     callbacks_list = [
#     EarlyStopping(
#         monitor='val_loss',
#         patience=20
#     ),
#     ModelCheckpoint(
#         filepath=model_name, 
#         monitor='val_loss', 
#         save_best_only=True
#     )
# ]
#     model.fit(train_x, train_y, epochs=1,batch_size=4096,validation_data=[validation_x,validation_y],callbacks=callbacks_list)

    callbacks_list = [
        EarlyStopping(
            monitor='val_loss',
            patience=20
        ),
        ParallelModelCheckpoint(model,model_name,save_best_only=True,monitor='val_loss')

    ]
    
    model_parallel=multi_gpu_model(model,gpus=4)
    model_parallel.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=0.0001),metrics=['accuracy'])
    
    
    model_parallel.fit(train_x, train_y, epochs=100,batch_size=8192*4,validation_data=[validation_x,validation_y],callbacks=callbacks_list,verbose=2)
    
    
    my_model = load_model(model_name)
    y_pred = my_model.predict(test_x)
    pred_list.append(y_pred)
    
    ##备份
    pd_y_pred=pd.DataFrame(y_pred)
    pd_y_pred.to_csv('pred_nn_'+prefix+str(i)+'.csv',index=False,header=None)

    print('finish:',str(i))

finish split data
Train on 3200000 samples, validate on 800000 samples
Epoch 1/100
 - 230s - loss: 1.9642 - acc: 0.4039 - val_loss: 1.2760 - val_acc: 0.5163
Epoch 2/100
 - 178s - loss: 1.3885 - acc: 0.4768 - val_loss: 1.2043 - val_acc: 0.5330
Epoch 3/100
 - 178s - loss: 1.2686 - acc: 0.5085 - val_loss: 1.1518 - val_acc: 0.5510
Epoch 4/100
 - 177s - loss: 1.2017 - acc: 0.5304 - val_loss: 1.1197 - val_acc: 0.5613
Epoch 5/100
 - 179s - loss: 1.1574 - acc: 0.5469 - val_loss: 1.1056 - val_acc: 0.5659
Epoch 6/100
 - 174s - loss: 1.1269 - acc: 0.5583 - val_loss: 1.0760 - val_acc: 0.5777
Epoch 7/100
 - 180s - loss: 1.1048 - acc: 0.5668 - val_loss: 1.0653 - val_acc: 0.5826
Epoch 8/100
 - 166s - loss: 1.0883 - acc: 0.5735 - val_loss: 1.0570 - val_acc: 0.5852
Epoch 9/100
 - 165s - loss: 1.0757 - acc: 0.5787 - val_loss: 1.0481 - val_acc: 0.5891
Epoch 10/100
 - 114s - loss: 1.0651 - acc: 0.5825 - val_loss: 1.0334 - val_acc: 0.5943
Epoch 11/100
 - 99s - loss: 1.0558 - acc: 0.5858 - val_loss: 1.0287 

KeyboardInterrupt: 

In [54]:
##### 

In [51]:
num_Y=np.argmax(Y,axis=1)
pd_Y=pd.DataFrame(num_Y)
index_set=[]
from sklearn.utils import shuffle
index_set.extend(list(shuffle(pd_Y[pd_Y[0]==0]).index)[:190000])
index_set.extend(list(shuffle(pd_Y[pd_Y[0]==1]).index)[:300000])
index_set.extend(list(shuffle(pd_Y[pd_Y[0]==2]).index)[:750000])
index_set.extend(list(shuffle(pd_Y[pd_Y[0]==3]).index)[:750000])
index_set.extend(list(shuffle(pd_Y[pd_Y[0]==4]).index)[:380000])
index_set.extend(list(shuffle(pd_Y[pd_Y[0]==5]).index)[:150000])
index_set = list(shuffle(index_set))
print(len(index_set))

2520000


In [52]:
train_x=[x_data[index_set] for x_data in X]
train_y=Y[index_set]

In [53]:
model_name = 'my_model_final.h5'
callbacks_list = [
        EarlyStopping(
            monitor='val_loss',
            patience=20
        ),
        ParallelModelCheckpoint(model,model_name,save_best_only=True,monitor='val_loss')

    ]
    
model_parallel=multi_gpu_model(model,gpus=4)
model_parallel.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=0.0001),metrics=['accuracy'])


In [54]:
model_parallel.fit(train_x,train_y , epochs=200,batch_size=8192*4,validation_split=0.1,callbacks=callbacks_list)

Train on 2268000 samples, validate on 252000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200

KeyboardInterrupt: 

In [65]:
print(0)

0


In [66]:
print(model_name)
my_model = load_model(model_name)
y_pred = my_model.predict(test_X)
pred_df = pd.DataFrame(y_pred)
pred_df.to_csv('pred_df_final.csv',index=False,header=None)
pred_df.head()

my_model_new_999.h5


Unnamed: 0,0,1,2,3,4,5
0,0.000329,0.004286,0.640718,0.347343,0.006559,0.000765
1,0.000132,0.003537,0.841296,0.152517,0.002025,0.000493
2,0.000322,0.042711,0.928967,0.025311,0.001773,0.000916
3,0.013003,0.001876,0.13051,0.696483,0.147285,0.010843
4,0.001211,0.227102,0.758821,0.009929,0.002386,0.00055


In [67]:
y_pred_1 = pd.read_csv('pred_nn_try_1_0.csv',header=None)
y_pred_1.head()

Unnamed: 0,0,1,2,3,4,5
0,0.001206,0.00851,0.602713,0.364567,0.018083,0.00492
1,0.000297,0.012773,0.8731,0.110121,0.002598,0.001111
2,0.000823,0.195743,0.78687,0.01313,0.002147,0.001287
3,0.015517,0.003229,0.106384,0.659162,0.195471,0.020236
4,0.001412,0.443846,0.541993,0.009397,0.002274,0.001078


In [67]:
result=pred_df.values
result=np.argmax(result,axis=1)+1
result=pd.DataFrame(result)
submission=pd.concat([test_id,result],axis=1)
submission.rename(columns={0:'label', 'uId':'id'},inplace=True)
submission.to_csv('submission.csv',index=False)
print(submission.label.value_counts()/len(submission))

4    0.337564
3    0.310506
5    0.141188
2    0.084006
1    0.082852
6    0.043884
Name: label, dtype: float64


In [61]:
submission.head()

Unnamed: 0,id,label
0,6708326,3
1,6708314,3
2,6708313,3
3,6708284,4
4,6708272,3


In [71]:
# result=pred_list[0]+pred_list[1]#+pred_list[2]+pred_list[3]+pred_list[4]
# result=np.argmax(result,axis=1)+1

In [66]:
save_file_list=[]
for i in range(split_size):
    save_file_list.append(np.array(pd.read_csv('pred_nn_'+prefix+str(i)+'.csv',header=None)))

In [72]:
result_1=save_file_list[0]+save_file_list[1]#+save_file_list[2]+save_file_list[3]+save_file_list[4]
# result_1=np.argmax(result_1,axis=1)+1

In [68]:
sum(result==result_1)

502495

In [70]:
for index,data in enumerate(result==result_1):
    if data==False:
        print(index)

7472
37811
331223
358076
482717


In [77]:
result[358076]

array([1.4564926e-35, 1.0000000e+00, 1.0000000e+00, 2.2958007e-11,
       3.7397034e-33, 0.0000000e+00], dtype=float32)

In [78]:
result_1[358076]

array([1.45649260e-35, 1.00000000e+00, 1.00000000e+00, 2.29580074e-11,
       3.73970356e-33, 0.00000000e+00])

In [64]:
pred_df = pd.read_csv('pred_nn_try_1_0.csv', header=None)
pred_df.shape

(1000000, 6)

In [65]:
pred_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0.001206,0.00851,0.602713,0.364567,0.018083,0.00492
1,0.000297,0.012773,0.8731,0.110121,0.002598,0.001111
2,0.000823,0.195743,0.78687,0.01313,0.002147,0.001287
3,0.015517,0.003229,0.106384,0.659162,0.195471,0.020236
4,0.001412,0.443846,0.541993,0.009397,0.002274,0.001078


In [68]:
result=pred_df.values
result=np.argmax(result,axis=1)+1
result=pd.DataFrame(result)
submission=pd.concat([test_id,result],axis=1)
submission.rename(columns={0:'label'},inplace=True)
submission.to_csv('submission.csv',index=False)
print(submission.label.value_counts()/len(submission))

3    0.289325
4    0.276918
5    0.154590
2    0.108130
1    0.097343
6    0.073694
Name: label, dtype: float64


In [69]:
submission.head()

Unnamed: 0,uId,label
0,6708326,3
1,6708314,3
2,6708313,3
3,6708284,4
4,6708272,3


In [54]:
submission = pd.read_csv('submission_single_model.csv')

In [70]:
submission.rename(columns={'uId':'id'},inplace=True)

In [72]:
submission.head()

Unnamed: 0,id,label
0,6708326,3
1,6708314,3
2,6708313,3
3,6708284,4
4,6708272,3


In [73]:
submission.to_csv('submission.csv',index=False)

In [None]:
pred_lbl=pd.read_csv('../huawei/pred_lgb_guess.csv',header=None)

In [None]:
pred_lbl.shape