In [None]:
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
from tqdm import tqdm
import time
import gc
import numpy as np
from scipy.stats import entropy
from gensim.models import Word2Vec
from sklearn.metrics import *
from base import Cache
import joblib

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Pandas display settings: show all rows/columns, 5 decimal places, wide output.
# Option keys are written in full: abbreviated keys are resolved by pattern
# matching, and the bare 'precision' pattern is ambiguous (OptionError) on
# pandas versions that also define 'styler.format.precision'.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.width', 5000)

def reduce_mem(df, use_float16=False):
    """Downcast numeric columns of ``df`` in place to save memory.

    Integer columns (signed or unsigned NumPy dtypes) are converted to the
    smallest of int8 / int16 / int32 whose range contains the column's
    minimum and maximum. Float columns are converted to float32, or to
    float16 when ``use_float16`` is true and the values fit its range.
    A column is only ever narrowed: if no strictly smaller dtype can hold
    its values it is left untouched.

    Columns that are not plain NumPy integer/float dtypes (object, bool,
    datetime, timedelta, categorical and other extension dtypes) are
    skipped, as are columns whose min/max is NaN (empty or all-NaN).

    Note that float downcasting checks only the value *range*; precision
    is reduced by the conversion.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: Allow float16 as a target dtype for float columns.

    Returns:
        The same ``df`` object that was passed in.

    Side effects:
        Prints memory usage before and after (in Mb) and the percentage
        saved.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2

    # Candidate target dtypes, narrowest first, with their inclusive limits.
    int_limits = [(np.dtype(t), np.iinfo(t).min, np.iinfo(t).max)
                  for t in (np.int8, np.int16, np.int32)]
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)
    float_limits = [
        (np.dtype(t), float(np.finfo(t).min), float(np.finfo(t).max))
        for t in float_types
    ]

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, ...) are not NumPy
        # dtypes; min()/astype() on them can fail or change semantics.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind in 'iu':
            candidates = int_limits
        elif col_type.kind == 'f':
            candidates = float_limits
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns give no range information; keep them.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        for target, lower, upper in candidates:
            # Candidates are ordered by size: once one is not strictly
            # narrower than the current dtype, none of the rest are.
            if target.itemsize >= col_type.itemsize:
                break
            if lower <= c_min and c_max <= upper:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, saved_pct))
    return df

print('loading data start!')
gc.collect()
# Step 2: base features + encoded features.
df = Cache.reload_cache('CACHE_data_step_2_feature_0917_r5.pkl')
del df['communication_onlinerate'], df['level_0']
gc.collect()
print(df.shape)

# Step 3 features, stored in three cache files. Each file is loaded,
# left-joined onto df by 'index' and released before the next one is read,
# so only one of these frames is alive next to df at any time. The merge
# order (0, 1, 2) is kept, so the result is the same as merging them after
# loading all three.
for step3_cache in (
        'CACHE_data_step_3_features_0_0917_r5.pkl',
        'CACHE_data_step_3_features_1_0917_r5.pkl',
        'CACHE_data_step_3_features_2_0917_r5.pkl',
):
    df_window = Cache.reload_cache(step3_cache)
    df = df.merge(df_window, on='index', how='left')
    del df_window
    gc.collect()

# # Step 4: uid features (currently disabled).
# df_uid = Cache.reload_cache('CACHE_data_step_4_feature_0917_r5.pkl')
# df_uid = df_uid[['index','uid_pt_d_total_counts',
#  'uid_pt_d_task_id_counts',
#  'uid_pt_d_task_id_sm_curr_rate',
#  'uid_pt_d_task_id_rank_sm_curr_rate',
#  'uid_pt_d_creat_type_cd_counts',
#  'uid_pt_d_creat_type_cd_sm_curr_rate',
#  'uid_pt_d_creat_type_cd_rank_sm_curr_rate',
#  'uid_pt_d_adv_id_counts',
#  'uid_pt_d_adv_id_sm_curr_rate',
#  'uid_pt_d_adv_id_rank_sm_curr_rate',
#  'uid_pt_d_adv_prim_id_counts',
#  'uid_pt_d_adv_prim_id_sm_curr_rate',
#  'uid_pt_d_adv_prim_id_rank_sm_curr_rate',
#  'uid_pt_d_dev_id_counts',
#  'uid_pt_d_dev_id_sm_curr_rate',
#  'uid_pt_d_dev_id_rank_sm_curr_rate',
#  'uid_pt_d_inter_type_cd_counts',
#  'uid_pt_d_inter_type_cd_sm_curr_rate',
#  'uid_pt_d_inter_type_cd_rank_sm_curr_rate',
#  'uid_pt_d_spread_app_id_counts',
#  'uid_pt_d_spread_app_id_sm_curr_rate',
#  'uid_pt_d_spread_app_id_rank_sm_curr_rate',
#  'uid_pt_d_tags_counts',
#  'uid_pt_d_tags_sm_curr_rate',
#  'uid_pt_d_tags_rank_sm_curr_rate',
#  'uid_pt_d_app_first_class_counts',
#  'uid_pt_d_app_first_class_sm_curr_rate',
#  'uid_pt_d_app_first_class_rank_sm_curr_rate',
#  'uid_pt_d_app_second_class_counts',
#  'uid_pt_d_app_second_class_sm_curr_rate',
#  'uid_pt_d_app_second_class_rank_sm_curr_rate',
#  'uid_pt_d_indu_name_counts',
#  'uid_pt_d_indu_name_sm_curr_rate',
#  'uid_pt_d_indu_name_rank_sm_curr_rate',
#  'uid_pt_d_slot_id_counts',
#  'uid_pt_d_slot_id_sm_curr_rate',
#  'uid_pt_d_slot_id_rank_sm_curr_rate',
#  'uid_pt_d_net_type_counts',
#  'uid_pt_d_net_type_sm_curr_rate',
#  'uid_pt_d_net_type_rank_sm_curr_rate',
#  'uid_pt_d_slot_id_net_type_counts',
#  'uid_pt_d_slot_id_net_type_sm_curr_rate',
#  'uid_pt_d_slot_id_net_type_rank_sm_curr_rate']]
# df = df.merge(df_uid,on='index',how='left')
# del df_uid

print(df.shape)
# Rows without a label get -1 so the column can be stored as int.
df['label'] = df['label'].fillna(-1).astype(int)
# for var in tqdm(df.columns):
#     if str(df[var].dtype).find('float32')>-1:
#         df[var] = df[var].astype(np.float16)
gc.collect()
print('loading data finish!')

# Drop every feature that has fewer than two distinct non-null values on the
# pt_d == 8 slice; key/label columns are never dropped.
# query() already returns a new frame, so no additional .copy() is made.
set_tst = df.query('pt_d==8')
droplist = [
    var for var in df.columns
    if var not in ['index', 'uid', 'pt_d', 'label', 'id']
    and set_tst[var].nunique() < 2
]
print('droplist:',droplist)
df = df.drop(droplist,axis=1)
gc.collect()

# Offline split by pt_d: < 7 -> training, == 7 -> validation, == 8 -> test_df.
X_train = df.loc[df["pt_d"] < 7].copy()
X_valid = df.loc[df["pt_d"] == 7]
test_df = df.loc[df["pt_d"] == 8].copy()
y_train = X_train["label"].astype('int32')
y_valid = X_valid["label"].astype('int32')

# Feature selection: every column except keys, label and the split column.
drop_fea = ['pt_d','label','communication_onlinerate','index','uid','id']
feature = [col for col in X_train.columns if col not in drop_fea]
print(len(feature))
print(feature)

# Offline validation: fit on X_train, early-stop on X_valid using AUC, and
# keep the best iteration. The fitted model is saved to ./models/.
cate_fea = []
clf = CatBoostClassifier(
    iterations=10000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    cat_features=cate_fea,
    verbose=True,
    eval_metric='AUC',
    counter_calc_method='Full',
    task_type='GPU',
    metric_period=1000,
)
clf.fit(
    X_train[feature], y_train.astype('int32'),
    eval_set=[(X_valid[feature], y_valid.astype('int32'))],
    early_stopping_rounds=200,
    verbose=True,
    use_best_model=True,
)
joblib.dump(clf,'./models/ctb_local0917_0.pkl')

# Validation AUC; auc_score is also used later in the submission file name.
# The test-set prediction that used to be computed here was dropped: its
# result (y_pre) was overwritten by the submission model before any use.
y_predprob = clf.predict_proba(X_valid[feature])[:, 1]
auc_score = roc_auc_score(y_valid, y_predprob)
print("AUC Score (Valid): %f" % auc_score)

#查看模型的特征重要性
import matplotlib.pyplot as plt 
from matplotlib import cm
%matplotlib inline
score = pd.DataFrame()
score['fea_name'] = clf.feature_names_
score['fea']=clf.feature_importances_
score = score.sort_values(['fea'], ascending=False)
temp = pd.DataFrame()
temp = score[:60]
color = cm.jet(temp['fea']/temp['fea'].max())
plt.figure(figsize=(10, 15))
plt.barh(temp['fea_name'],temp['fea'],height =0.8,color=color,alpha=0.8)
plt.show()

# Model for the online submission: retrain on all rows with pt_d <= 7 using
# the tree count found by the validation run, then predict test_df.
# Filter once and reuse the slice for both features and labels.
train_full = df[df["pt_d"] <= 7]
clf1 = CatBoostClassifier(
    # best_iteration_ is a 0-based index, so the best validation model
    # contains best_iteration_ + 1 trees.
    iterations=clf.best_iteration_ + 1,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='AUC',
    counter_calc_method='Full',
    task_type='GPU',
    metric_period=50,
)
# No eval_set is available here, so use_best_model is not passed: CatBoost
# needs a validation set to pick a best iteration.
clf1.fit(
    train_full[feature], train_full['label'].astype('int32'),
    verbose=True,
)
del train_full
gc.collect()
joblib.dump(clf1,'./models/ctb_sub0917_0.pkl')
y_pre = clf1.predict_proba(test_df[feature])[:, 1]

# Submission file: one probability per test id; the validation AUC is
# embedded in the file name.
res = pd.DataFrame()
res['id'] = test_df['id'].astype('int32')
res['probability'] = y_pre
res.to_csv('baseline0917_0_{}.csv'.format(auc_score),index = False)

loading data start!


[2020-09-17 13:00:22] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_data_step_2_feature_0917_r5.pkl


(8601298, 304)


[2020-09-17 13:00:25] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_data_step_3_features_0_0917_r5.pkl
[2020-09-17 13:00:29] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_data_step_3_features_1_0917_r5.pkl
[2020-09-17 13:00:32] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_data_step_3_features_2_0917_r5.pkl
[2020-09-17 13:00:36] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_data_step_4_feature_0917_r5.pkl
