In [8]:
!pip install tensorflow -i https://pypi.tuna.tsinghua.edu.cn/simple

In [13]:
!./kesci_submit -token ***************** -file /home/kesci/work/sub.csv

Kesci Submit Tool 4.0.0

> 已验证Token
> 提交文件 /home/kesci/work/sub.csv (38.83 KiB), Target Qiniu
> 已上传 100 %
> 文件已上传        
> 服务器响应: 200 提交成功，请等待评审完成
> 提交完成


In [1]:
from itertools import chain

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import optimizers, layers, losses

import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# from deepctr.feature_column import  SparseFeat, DenseFeat, get_feature_names, build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns
# from deepctr.feature_column import build_input_features, get_linear_logit, input_from_feature_columns
# from deepctr.layers.core import PredictionLayer, DNN
# from deepctr.layers.interaction import SENETLayer, BilinearInteraction
# from deepctr.layers.utils import concat_func, add_func, combined_dnn_input

from deepctr.feature_column import  SparseFeat, DenseFeat, get_feature_names, build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns
from deepctr.layers.core import PredictionLayer, DNN
from deepctr.layers.interaction import FM
from deepctr.layers.utils import concat_func, add_func, combined_dnn_input


In [2]:
pre1 = pd.read_csv('/home/kesci/work/test_pre.csv').dropna(axis=0,how='any')
pre1 = pre1['id'].astype(int)
pre2 = pd.read_csv('/home/kesci/work/train_pre.csv').dropna(axis=0,how='any')
pre2 = pre2['id'].astype(int)
pre3 = pre1.append(pre2).values.flatten().tolist()

data_test = pd.read_csv('test_.csv')

now = data_test['ID'].astype(int).values.flatten().tolist()
know_test_id = list(set(now)&set(pre3))
need_test_id = list(set(now).difference(set(pre3)))

pre1_ = pd.read_csv('/home/kesci/work/test_pre.csv')
pre2_ = pd.read_csv('/home/kesci/work/train_pre.csv')
pre3_ = pre1_.append(pre2_)

pre_know_id = pd.DataFrame({'id':know_test_id})
pre_need_id = pd.DataFrame({'ID':need_test_id})

pre_konw = pd.merge(pre_know_id, pre3_, on=['id'],how='left')

test_ = pd.merge(pre_need_id, data_test, on=['ID'],how='left')
test = test_.drop(columns=['ID'])

data_train = pd.read_csv('train_.csv')
train = data_train.drop(columns=['ID','肝炎'])
label = data_train['肝炎']

data = train.append(test)
data.columns = ['Age','Gender','Area','Weight','Height','Body_mass_index',
                'Obesity_waistline','Waist','Highest_blood_pressure','Minimum_blood_pressure',
                'Good_Cholesterol','Bad_Cholesterol','Total_Cholesterol','Blood_lipid_abnormality',
                'PVD','Sports_activities','Education','Unmarried','Revenue','Source_of_care',
                'Poor_vision','Drinking','Hypertension','Family_hypertension','Diabetes',
                'Family_diabetes','Family_hepatitis','Chronic_fatigue','ALF']

test.shape

(908, 29)

In [3]:
dense_features=['Revenue','Sports_activities','Age','Weight','Height',
                'Body_mass_index','Waist','Highest_blood_pressure',
                'Minimum_blood_pressure','Good_Cholesterol','Bad_Cholesterol',
                'Total_Cholesterol']
sparse_features = list(set(data.columns.tolist()).difference(set(dense_features)))

for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4)
                        for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                        for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input deepfm_data for model
# train, test = train_test_split(deepfm_data, test_size=0.2)

deepfm_train = data.head(train.shape[0])
deepfm_test = data.tail(test.shape[0])

deepfm_train = {name:deepfm_train[name] for name in feature_names}
deepfm_test = {name:deepfm_test[name] for name in feature_names}


In [4]:
def multi_category_focal_loss2(gamma=2., alpha=.25):
    """
    Usage:
     model.compile(loss=[multi_category_focal_loss2(
         alpha=0.35, gamma=2)], metrics=["accuracy"], optimizer=adam)
    """
    epsilon = 1.e-7
    gamma = float(gamma)
    alpha = tf.constant(alpha, dtype=tf.float32)

    def multi_category_focal_loss2_fixed(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

        alpha_t = y_true * alpha + \
            (tf.ones_like(y_true) - y_true) * (1 - alpha)
        y_t = tf.multiply(y_true, y_pred) + tf.multiply(1 - y_true, 1 - y_pred)
        ce = -tf.math.log(y_t)
        weight = tf.pow(tf.subtract(1., y_t), gamma)
        fl = tf.multiply(tf.multiply(weight, ce), alpha_t)
        loss = tf.reduce_mean(fl)
        return loss

    return multi_category_focal_loss2_fixed

In [5]:
 def M( linear_feature_columns, dnn_feature_columns, fm_group=[DEFAULT_GROUP_NAME], dnn_hidden_units=(128, 128),
        l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
        dnn_activation='elu', dnn_use_bn=False, task='binary'):
    
    features = build_input_features(linear_feature_columns + dnn_feature_columns)

    inputs_list = list(features.values())

    linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, l2_reg_embedding,
                                                                        seed, support_group=True)

    fm_logit = add_func([FM()(concat_func(v, axis=1))
                         for k, v in group_embedding_dict.items() if k in fm_group])

    dnn_input = combined_dnn_input(list(chain.from_iterable(
        group_embedding_dict.values())), dense_value_list)

    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(dnn_input)


    dnn_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(dnn_output)

    final_logit = add_func([linear_logit, fm_logit, dnn_logit])
    output = PredictionLayer(task)(final_logit)

    model = Model(inputs=[features], outputs=[output])

    model.compile(optimizer=optimizers.Adam(2.5e-4),
                loss={'prediction_layer':losses.binary_crossentropy},# multi_category_focal_loss2(alpha=0.35, gamma=2)
                metrics=['AUC'])
    return model

In [6]:
model = M(linear_feature_columns=linear_feature_columns, 
        dnn_feature_columns=dnn_feature_columns, task='binary')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
PVD (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
Obesity_waistline (InputLayer)  [(None, 1)]          0                                            
__________________________________________________________________________________________________
Diabetes (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
Family_diabetes (InputLayer)    [(None, 1)]          0                                            
_______________________________________________________________________________________

In [7]:
input_train = deepfm_train
model.fit(input_train,
            {'prediction_layer':label},
            validation_split=0.3,
            epochs=25,
            batch_size=100,)

Epoch 1/25


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fdfe8310390>

In [8]:
input_test = deepfm_test
ans_mtx = model.predict(input_test,
                        batch_size=100)

In [9]:
ans_sub = pd.DataFrame({'ID':test_['ID'].astype(int),'hepatitis':ans_mtx.flatten()})

In [10]:
pre_konw.columns = ['ID','hepatitis']
ans_sub = ans_sub.append(pre_konw)#.drop_duplicates(['ID'])

In [11]:
ans_sub.to_csv('sub.csv',index=0)

In [12]:
ans_sub

Unnamed: 0,ID,hepatitis
0,6146,0.004193
1,8196,0.003300
2,8197,0.008630
3,8198,0.047996
4,8199,0.110326
5,6152,0.040158
6,6151,0.005426
7,8202,0.009288
8,6154,0.001273
9,8204,0.008440
