In [None]:
import pandas as pd
# for循环显示进度条
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from pylab import *

# Render every column and every row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Integer random seed shared by the StratifiedKFold splitters and the LightGBM
# model further down. No integer `seed` is assigned anywhere else in the
# visible notebook, so without this line those cells have no valid random_state.
seed = 2021

In [None]:
# Load the raw training and test tables from CSV.
train_path, test_path = '训练集', '测试集'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Stack the training and test rows into one frame so feature engineering is
# applied to both at once. DataFrame.append was removed in pandas 2.0;
# pd.concat keeps the same row order and the same (non-reset) index.
df_feature = pd.concat([df_train, df_test], sort=False)

# step1:数据分析

In [None]:
# Dataset overview: let info() list up to 200 columns in full.
pd.set_option('display.max_info_columns', 200)
df_feature.info()

In [None]:
# Summary statistics of a single feature column.
feature_values = df_feature['特征']
feature_values.describe()

In [None]:
# Frequency of each distinct value of a categorical feature.
df_feature['特征1'].value_counts()

In [None]:
# Bar chart of the mean label per category value: shows how the label rate
# differs across the values of a categorical feature.
label_rate = df_train.groupby('离散特征')['标签'].mean()
label_rate.plot.bar(figsize=(8, 6))

In [None]:
# Two-factor analysis: mean label for every combination of two categorical
# features, one line per value of the second feature.
ageplot = df_train.pivot_table(index='离散特征1', columns='离散特征2', values='标签', aggfunc='mean')
# grid takes a boolean. The original passed the string 'True', which only
# enabled the grid because any non-empty string is truthy.
ageplot.plot(figsize=(15, 6), style='-o', grid=True)

In [None]:
# Box plot to inspect the distribution of a continuous feature.
df_feature['特征1'].plot.box()

In [None]:
# Histogram (50 bins, filled-step style) of a continuous variable.
early_return = df_feature['early_return_amount_3mon']
early_return.hist(bins=50, histtype='stepfilled')

In [None]:
# Fill missing ages with the median age. Series.median() skips NaN by default,
# so it yields the same value as median(series.dropna()) without relying on a
# bare `median` name leaked into the namespace by `from pylab import *`.
df_feature['p1_age'] = df_feature['p1_age'].fillna(df_feature['p1_age'].median())

# step2:对df_feature特征工程

In [None]:
# Remove training rows where 特征1 == 0 while 特征2 > 0 (conditions combined with &).
# NOTE(review): this filters df_train, but df_feature was already built from
# df_train earlier — confirm the filter is meant to reach df_feature.
bad_rows = (df_train['特征1'] == 0) & (df_train['特征2'] > 0)
df_train = df_train[~bad_rows]

# 文本等信息正则化处理

In [None]:
# A vehicle-model string can be split into brand + usage, e.g. "福特CAF7152A轿车".
# str.extract returns the text matched by the parenthesised group.
model_text = df_feature['车型'].str
# Brand: the first run of one or more Chinese characters (-> "福特").
df_feature['品牌'] = model_text.extract('([\u4e00-\u9fa5]+)', expand=False)
# Usage: the Chinese run that follows a Chinese run and then at least one
# ASCII letter or digit (-> "轿车").
df_feature['用途'] = model_text.extract('[\u4e00-\u9fa5]+[A-Za-z0-9]+([\u4e00-\u9fa5]+)', expand=False)

# 改组特征

In [None]:
# Statistics of a continuous column within each value of a categorical column.
grouped = df_feature.groupby('离散')['连续']
# Group mean, broadcast back to every row.
df_feature['离散-连续-组合特征—平均'] = grouped.transform('mean')
# Group standard deviation ('std'), stored under the column named '方差'.
df_feature['方差'] = grouped.transform('std')
# Smoothed ratio feature: raw value divided by its group mean; 1e-5 avoids
# division by zero.
group_mean = df_feature['离散-连续-组合特征—平均']
df_feature['平滑特征'] = df_feature['原始特征'] / (group_mean + 1e-5)
# "Golden trio": use the raw numeric feature, its group mean, and the
# difference between the two together.
centered = df_feature['特征'] - df_feature['平均']
df_feature['特征-平均'] = centered
# Within-group normalisation; 1e-9 avoids division by zero.
df_feature['归一化'] = centered / (df_feature['方差'] + 1e-9)

# 时间类型数据处理

In [None]:
# Convert the registration date from object dtype to datetime.
# The parsed column is stored as 'regdate_time' because that is the column the
# .dt lookups below read; the original stored it as 'time', so those lookups
# raised KeyError unless the input already carried a 'regdate_time' column.
df_feature['regdate_time'] = pd.to_datetime(df_feature['regdate'])
# Year of the registration date.
df_feature['year'] = df_feature['regdate_time'].dt.year
# Month of the registration date.
df_feature['month'] = df_feature['regdate_time'].dt.month

# 分箱

In [None]:
# Discretise the continuous age feature into 10 bins.
age = df_feature['p1_age']
# Equal-width bins.
df_feature['p1_age_gender'] = pd.cut(age, 10)
# Equal-frequency bins. This writes to the same column, so it replaces the
# equal-width result assigned just above.
df_feature['p1_age_gender'] = pd.qcut(age, 10)

# apply函数

In [None]:
# Turn birth-month strings such as "6月" into the integer 6 by dropping the
# trailing character. Any non-string value (NaN, None, numpy floats) maps to 0;
# the original `type(x) != float` test let None and numpy.float64 through to
# the slice and crashed on them.
# NOTE(review): 'month' is also written by the registration-date cell —
# confirm that overwriting it with the birth month is intended.
df_feature['month'] = df_feature['b_month'].apply(lambda x: int(x[:-1]) if isinstance(x, str) else 0)

# 删除列

In [None]:
# Remove columns that are no longer needed.
df_feature = df_feature.drop(columns=['特征1', '特征2'])

# 有序特征编码

In [None]:
# Ordinal encoding: each tier's position in this list is its integer code (0-5).
rating_dic = {tier: rank for rank, tier in enumerate(['黄铜', '白银', '黄金', '铂金', '钻石', '黑钻'])}
# Values missing from the mapping become NaN.
df_feature['段位'] = df_feature['等级'].map(rating_dic)

# 计数 Frequency编码，可以得到热度信息

In [None]:
# Frequency (count) encoding: for each key set, add a column holding how many
# rows share the same key value(s).
for f in [['特征1'], ['特征2']]:
    count_col = '{}_count'.format('_'.join(f))
    # Drop a count column left over from an earlier run so the cell can be
    # re-executed without changing the join keys or duplicating columns.
    df_feature = df_feature.drop(columns=[count_col], errors='ignore')
    # groupby(...).size() counts the rows per key; reset_index(name=...) turns
    # the result into a frame with the key column(s) plus the count column.
    # Single-column alternative: df[col].map(df[col].value_counts())
    df_temp = df_feature.groupby(f).size().reset_index(name=count_col)
    # Join on the key columns explicitly instead of on whatever columns the
    # two frames happen to share.
    df_feature = df_feature.merge(df_temp, on=f, how='left')

# leave-one-out mean-target编码

In [None]:
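# Mean-target encoding (this section is titled "leave-one-out", but the code below implements the K-fold variant).
# Plain training-set encoding: replace each category with the mean label of all training samples in that category.
# Test-set encoding: replace each category with the mean label of the training samples in that category.
# The plain training-set encoding overfits very easily, which is why K-fold mean-target encoding is used instead.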
def stat(df, df_merge, group_by, agg):
    """Aggregate `df` per group and left-join the result onto `df_merge`.

    Parameters
    ----------
    df : DataFrame the statistics are computed from.
    df_merge : DataFrame that receives the statistics.
    group_by : list of column names used as grouping and join keys.
    agg : dict mapping a column name to a list of aggregation method names.

    Returns
    -------
    A new DataFrame: `df_merge` plus one column per (column, method) pair,
    named '<keys joined by _>_<column>_<method>'.
    """
    group = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) result columns into single-level names.
    key_prefix = '_'.join(group_by)
    group.columns = [
        '{}_{}_{}'.format(key_prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]
    group = group.reset_index()

    merged = df_merge.merge(group, on=group_by, how='left')

    # Release the intermediate table and force a collection to limit memory use.
    del group
    gc.collect()

    return merged

def statis_feat(df_know, df_unknow, label='标签'):
    """Add mean-target encodings computed on `df_know` to `df_unknow`.

    For each encoded feature, the mean of the label column per feature value
    is computed on `df_know` and left-joined onto `df_unknow` by `stat`.

    Parameters
    ----------
    df_know : DataFrame whose labels are used to compute the statistics.
    df_unknow : DataFrame that receives the encoded columns.
    label : name of the label column. Defaults to '标签', the label column the
        rest of the notebook uses; the original hard-coded '标签名称', which no
        other cell references.

    Returns
    -------
    `df_unknow` with one '<feature>_<label>_mean' column per encoded feature.
    """
    # tqdm wraps the loop with a progress bar.
    for f in tqdm(['特征1', '特征2']):
        df_unknow = stat(df_know, df_unknow, [f], {label: ['mean']})

    return df_unknow

# 5-fold out-of-fold mean-target encoding.
# Rows with a label form the training set; rows with a missing label are the test set.
df_train = df_feature[~df_feature['标签'].isnull()]
df_train = df_train.reset_index(drop=True)  # fresh 0..n-1 index for the training rows
df_test = df_feature[df_feature['标签'].isnull()]

# Split the training set into 5 stratified folds.
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

# Training rows: each fold is encoded with statistics from the other four
# folds, so a row's own label never feeds its own encoding.
# Stratify on '标签', the same column that defines df_train above; the original
# stratified on 'y1_is_purchase', a name this cell uses nowhere else.
fold_frames = []
for train_index, val_index in kfold.split(df_train, df_train['标签']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    fold_frames.append(statis_feat(df_fold_train, df_fold_val))

    del df_fold_train, df_fold_val
    gc.collect()

# Concatenate once after the loop instead of growing a frame inside it.
df_stas_feat = pd.concat(fold_frames, axis=0)
del fold_frames

# Test rows: encoded with statistics from the whole training set.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del df_stas_feat, df_train, df_test
gc.collect()

# 编码转换

In [None]:
# Label-encode every remaining object-dtype column as integer codes.
# The listed columns (policy id, first-registration date) are left untouched.
skip_cols = ['不转换的特征1', '不转换的特征2']
for f in df_feature.select_dtypes('object').columns:
    if f not in skip_cols:
        le = LabelEncoder()
        # fit_transform learns the classes and returns their integer codes in
        # one step. Values are cast to str first, so missing values are
        # encoded as the literal string 'nan'.
        encoded = le.fit_transform(df_feature[f].astype('str'))
        df_feature[f] = encoded.astype('int')

# 划分训练集测试集

In [None]:
# Split the combined frame back into training rows (label present) and test
# rows (label missing).
is_labeled = df_feature['标签'].notnull()
df_train = df_feature[is_labeled]
df_test = df_feature[~is_labeled]

# step3:导入lgb模型训练

In [None]:
# Result table: one row per test sample, identified by carid.
# .copy() makes this an independent frame, so adding a column does not write
# into a slice of df_test (SettingWithCopyWarning).
prediction = df_test[['carid']].copy()
# Float accumulator: the fold loop adds fractional probabilities to it.
prediction['label'] = 0.0

In [None]:
# Label column.
ycol = '标签'
# Every column except the label and the explicitly excluded ones is a model feature.
feature_names = list(filter(lambda x: x not in [ycol,'不参与训练的特征'], df_train.columns))

# Number of cross-validation folds. The same value is the divisor when test
# predictions are averaged, so the two can no longer drift apart.
n_splits = 5

# LightGBM offers LGBMClassifier and LGBMRegressor; the classifier is used here.
model = lgb.LGBMClassifier(num_leaves=105,  # should be <= 2^max_depth; larger values tend to overfit
                           max_depth=13,  # maximum tree depth (-1 = unlimited); lower it when the model overfits
                           learning_rate=0.1,  # typical alternatives: 0.01, 0.003, 0.001
                           n_estimators=20000,  # upper bound on the number of trees (alias: num_iterations / num_boost_round)
                           # NOTE(review): per the LightGBM parameter docs, bagging only takes
                           # effect when subsample_freq (bagging_freq) > 0 — confirm whether
                           # subsample=0.8 is meant to be active.
                           subsample=0.8,  # row-sampling ratio per tree (alias: bagging_fraction)
                           feature_fraction=0.6,  # fraction of features sampled per tree (alias: colsample_bytree)
                           min_data_in_leaf=121,  # minimum samples per leaf; raise it to fight overfitting
                           reg_alpha=11,
                           reg_lambda=2.3,
                           random_state=seed,
                           n_jobs=2,  # number of CPU threads
                           metric=None
                           )
# Out-of-fold predictions, one frame per fold.
oof = []
# Feature importances, one frame per fold.
df_importance_list = []


kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # The `verbose` and `early_stopping_rounds` fit keywords were removed in
    # LightGBM 4.0; the callbacks below are their replacement (LightGBM >= 3.3).
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid_train','test'],
                          eval_set=[(X_train,Y_train),(X_val, Y_val)],
                          eval_metric='auc',  # evaluation metric
                          callbacks=[
                              # stop when the validation metric has not improved for 100 rounds
                              lgb.early_stopping(stopping_rounds=100),
                              # log the evaluation results every 100 rounds
                              lgb.log_evaluation(period=100),
                          ]
                          )
    print('\n训练结束！！！')

    pred_val = lgb_model.predict_proba(X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    # Keep this fold's validation rows: id, true label, predicted probability.
    df_oof = df_train.iloc[val_idx][['carid', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Predict the test set and add this fold's share to the running average.
    pred_test = lgb_model.predict_proba(df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['label'] += (pred_test / n_splits)

    df_importance = pd.DataFrame({'column': feature_names,'importance': lgb_model.feature_importances_,})
    df_importance_list.append(df_importance)  # record this fold's feature importances

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()

In [None]:
# Feature-importance ranking: mean importance over the folds, highest first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

In [None]:
# AUC of the out-of-fold predictions over the whole training set.
df_oof = pd.concat(oof)
# The true labels are stored under the column named by ycol: each fold frame
# is built from ['carid', ycol] plus 'pred', so there is no 'y1_is_purchase'
# column to read.
score = roc_auc_score(df_oof[ycol], df_oof['pred'])
score

# CV调参

In [None]:
# Label column and candidate features (everything except the label and the
# carid identifier).
ycol = '标签'
feature_names = [col for col in df_train.columns if col not in (ycol, 'carid')]
# Full training matrix and target used by the grid search.
X_train = df_train[feature_names]
Y_train = df_train[ycol]

from sklearn.model_selection import GridSearchCV

# Base sklearn-style LightGBM estimator, using the learning rate and number of
# estimators chosen earlier.
base_params = dict(
    objective='binary',
    max_depth=7,
    num_leaves=105,
    max_bin=45,
    min_data_in_leaf=101,
    learning_rate=0.1,
    n_estimators=500,
    metric='auc',
    bagging_fraction=0.6,  # secondary: values in 0.5-1.0 gave the same result
    feature_fraction=0.8,  # primary
    reg_alpha=1,
    reg_lambda=0.7,
)
model_lgb = lgb.LGBMClassifier(**base_params)

# Candidate values for the parameter being tuned in this round.
params_test1 = {'min_data_in_leaf': [81, 91, 101, 111, 121, 131]}

# 5-fold cross-validated grid search scored by ROC AUC, on 2 parallel jobs.
gsearch1 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test1,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=2,
)
gsearch1.fit(X_train, Y_train)

In [None]:
# Show the full grid-search results, the best parameter setting and its CV score.
(
    gsearch1.cv_results_,
    gsearch1.best_params_,
    gsearch1.best_score_,
)

# step4:保存结果

In [None]:
# Save the K-fold averaged predictions twice: once under a file named after
# the OOF score, once under the fixed name sub.csv.
out_dir = 'sub'
os.makedirs(out_dir, exist_ok=True)
for out_path in (f'sub/{score}.csv', 'sub/sub.csv'):
    prediction.to_csv(out_path, index=False)