In [1]:
#!/usr/bin/env python 
# encoding: utf-8 

"""
@version: v1.0
@author: zhenglinghan
@contact: 422807471@qq.com
@software: PyCharm
@file: baseline.py
@time: 2020/5/7 22:23
"""
import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from deepctr.inputs import  DenseFeat, SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import DeepFM
from xdeepfm import xDeepFM
import tensorflow as tf

import os
import gc
import datetime as dt
import warnings
import joblib
from gensim.models import Word2Vec
import sys
from tqdm import tqdm
import time
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('max_colwidth', 200)
pd.set_option('display.width', 5000)

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, f1_score , accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
    
from catboost import CatBoostClassifier
path_train = '../data/train_preliminary/'
path_test = '../data/test/'



# log
class Logger(object):
    def __init__(self, fileN="Default.log"):
        self.terminal = sys.stdout
        self.log = open(fileN, "a",encoding='utf-8')

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        pass
    

def reduce_mem_usage(df,features, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in tqdm(features):
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def split_self(x):
    key_ans = x .split(' ')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

if __name__ == "__main__":
    
    sys.stdout = Logger("log_deepfm.txt")
    
    datatrain = pd.read_hdf(path_train+'data_train.h5',key='ad')
    datatest = pd.read_hdf(path_test+'data_test.h5',key='ad')

    datatrainlog = pd.read_hdf(path_train+'data_train.h5',key='click_log')
    datatestlog = pd.read_hdf(path_test+'data_test.h5',key='click_log')

    datatrainlabel = pd.read_hdf(path_train + 'data_train.h5', key='user')

    # 2个模型
    
    datatestlabel = pd.DataFrame(list(datatestlog['user_id'].unique()), columns=['user_id'])
    datalabel = pd.concat([datatrainlabel,datatestlabel],ignore_index=True)
    
    del datatrainlabel,datatestlabel
    gc.collect()
    
    datalog = pd.read_hdf('datalog.h5')
    datalabel = pd.read_hdf('datalabel0516_all.h5')# 里面特征很多 不过几个序列特征被我截断到200个了

    print('features finish')
    gc.collect()
    # 建模
    datalabel['age'] = datalabel['age']-1
    datalabel['gender'] = datalabel['gender']-1
    
    sparse_features = ['click_times_max_max',
'click_times_max_min',
'click_times_max_sum',
'click_times_sum_max',
'click_times_sum_min','time_max', 'time_min']# 几个类别特征
    dense_features = [i for i in datalabel.columns if i not in sparse_features +['user_id','age','gender',
'creative_ids_list',
 'ad_ids_list',
 'product_categorys_list',
 'advertiser_ids_list',
 'product_ids_list',
 'industrys_list',
 'click_timess_list']]
    
    datalabel[sparse_features] = datalabel[sparse_features].fillna('-1')
    datalabel[dense_features] = datalabel[dense_features].fillna(0)
    
    # 区分 train test
    traindata = datalabel.loc[~datalabel['age'].isna()].copy()
    testdata = datalabel.loc[datalabel['age'].isna()].copy().reset_index(drop=True)
    
    targets = ['age','gender']

    # 1.Label Encoding for sparse features,and process sequence features
    for feat in sparse_features:
        lbe = LabelEncoder()
        lbe.fit(datalabel[feat])
        traindata[feat] = lbe.transform(traindata[feat])
        testdata[feat] = lbe.transform(testdata[feat])
#     mms = MinMaxScaler(feature_range=(0, 1))
#     traindata[dense_features] = mms.fit_transform(traindata[dense_features])
#     testdata[dense_features] = mms.transform(testdata[dense_features])# 处理了，但是没有放入模型
    print('process 1 finish')
    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, datalabel[feat].nunique(), embedding_dim=16)
                              for feat in sparse_features]
    
    linear_feature_columns = fixlen_feature_columns.copy()
    dnn_feature_columns = fixlen_feature_columns.copy()
    
    # 3.generate input data for model
    model_input = {name: traindata[name] for name in sparse_features}  #  
    model_input_test = {name: testdata[name] for name in sparse_features}  #  
    # 只用一个多值特征
    for var in ['creative_ids_list']:
        # preprocess the sequence feature
        key2index = {}
        genres_list = list(map(split_self, traindata[var].values))
        genres_length = np.array(list(map(len, genres_list)))
        genres_list_test = list(map(split_self, testdata[var].values))
        genres_length_test = np.array(list(map(len, genres_list_test)))
        max_len = max([max(genres_length),max(genres_length_test)])
        print('max_len:',max_len)
        # Notice : padding=`post`
        genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
        genres_list_test = pad_sequences(genres_list_test, maxlen=max_len, padding='post', )
        use_weighted_sequence = False
        if use_weighted_sequence:
            varlen_feature_columns = [VarLenSparseFeat(SparseFeat(var,vocabulary_size=len(
            key2index) + 1, embedding_dim=16), maxlen=max_len, combiner='mean',
                                                   weight_name=var+'_weight')]  # Notice : value 0 is for padding for sequence input feature
        else:
            varlen_feature_columns = [VarLenSparseFeat(SparseFeat(var, vocabulary_size=len(
            key2index) + 1, embedding_dim=16), maxlen=max_len, combiner='mean',
                                                   weight_name=None)]  # Notice : value 0 is for padding for sequence input feature
        
        model_input[var] = genres_list
#         model_input[var + "_weight"] = np.random.randn(traindata.shape[0], max_len, 1) # 有毒没用
        model_input_test[var] = genres_list_test
#         model_input_test[var + "_weight"] = np.random.randn(testdata.shape[0], max_len, 1) # 有毒没用
        
        linear_feature_columns += varlen_feature_columns
        dnn_feature_columns += varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 
    del datalabel
    gc.collect()
#     features = [i for i in datalabel.columns if i != 'age' and i!='gender' and i!='user_id']
    fold_num = 3
    random_states = [ 0]  # 0, # 1111,1024,2019,2020
    models = []
    train_ccuracy_score = []
    val_ccuracy_score = []
    verbose = True
    print('modeling!')

    for label in targets:
        pred_y = np.zeros((len(testdata),traindata[label].nunique()))
        for i in range(len(random_states)):
            for index in range(fold_num):
                model = xDeepFM(linear_feature_columns, 
                dnn_feature_columns, 
                task='multiclass',
                dnn_hidden_units=(64, 64, 64),
                cin_layer_size=(64,64,64),
                cin_activation='relu', 
                l2_reg_linear=0.1,
#                 l2_reg_embedding=0.05, # 有毒参数
#                 l2_reg_dnn=0.05,  # 有毒参数
#                 l2_reg_cin=0.05,  # 有毒参数
#                 init_std=0.01,  # 有毒参数
                seed=2020, 
#                 dnn_dropout=0.5,
                dnn_activation='relu', 
                num_class=traindata[label].nunique())
                model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss="sparse_categorical_crossentropy", metrics=['acc'], )
                
                history = model.fit(model_input, traindata[label].values,
                        batch_size=2048, epochs=3, verbose=2, validation_split=0.2, )
                if verbose:
                    print(history)
                model.save('xdeepfm_model_base_{}.h5'.format(index))
                # model = keras.models.load_model('xdeepfm_model_{}.h5'.format(index))
                y_test_prob  = model.predict(model_input_test)               
                pred_y += y_test_prob /fold_num/len(random_states)
        testdata['predicted_{}'.format(label)] = np.argmax( pred_y,axis=1)+1
    subs = testdata[['user_id','predicted_age','predicted_gender']]
    print(subs['predicted_age'].value_counts())
    print(subs['predicted_gender'].value_counts())
    subs.to_csv('submission_basefeaturexdeepfm_base.csv',index=False,encoding='utf-8')



DeepCTR version 0.7.5 detected. Your version is 0.7.4.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.5


features finish
process 1 finish
max_len: 200
modeling!
Train on 720000 samples, validate on 180000 samples
Epoch 1/3
720000/720000 - 126s - loss: 1.8533 - acc: 0.2704 - val_loss: 1.6541 - val_acc: 0.3382
Epoch 2/3
720000/720000 - 127s - loss: 1.5163 - acc: 0.4081 - val_loss: 1.5961 - val_acc: 0.3826
Epoch 3/3
720000/720000 - 126s - loss: 1.3158 - acc: 0.5179 - val_loss: 1.7247 - val_acc: 0.3710
<tensorflow.python.keras.callbacks.History object at 0x7f66a62b48d0>
Train on 720000 samples, validate on 180000 samples
Epoch 1/3
720000/720000 - 124s - loss: 1.8342 - acc: 0.2802 - val_loss: 1.6334 - val_acc: 0.3441
Epoch 2/3
720000/720000 - 127s - loss: 1.5023 - acc: 0.4144 - val_loss: 1.6061 - val_acc: 0.3830
Epoch 3/3
720000/720000 - 124s - loss: 1.3037 - acc: 0.5251 - val_loss: 1.7450 - val_acc: 0.3681
<tensorflow.python.keras.callbacks.History object at 0x7f66a4012110>
Train on 720000 samples, validate on 180000 samples
Epoch 1/3
720000/720000 - 125s - loss: 1.8279 - acc: 0.2855 - val_lo