### 编者按
- 生成最后模型一共有3份代码：
- 1）生成w2v特征：构建w2v特征，将特征喂给第一期的训练集，项目第二期的训练集和最终的测试集
- 2）stacking 特征代码：主要作用是将项目第一期的训练分布通过嫁接的方式存储到第二期项目
- 3）生成最终结果：主要用于预测并输出最终的结果

#### 导入所需的包

In [1]:
import os
import pandas as pd
import lightgbm as lgb
import numpy as np
#从sklearn 评价函数中导入f1-score
from sklearn.metrics import f1_score

#### 读取数据

In [2]:
# Project root; every other path below is built from it.
path = './'
# Directory that holds the per-column w2v feature CSV files.
w2v_path = path + 'w2v'
# Load, in this order: the phase-2 training set, the phase-2 test set and the
# phase-1 training set.
train, test, train_first = (
    pd.read_csv(path + relative_csv)
    for relative_csv in (
        'input/train_2.csv',
        'input/test_2.csv',
        'input/train_all.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


#### 设置命名为data_type 列的值
- 0表示第二期项目
- 1表示第一期项目

In [3]:
# Tag every row with the project phase it came from:
# 0 = phase 2 (train and test), 1 = phase 1 (train_first).
for frame, phase_flag in ((train, 0), (test, 0), (train_first, 1)):
    frame['data_type'] = phase_flag

#### 将train,test,train_first合并
- 合并主要用于做特征，并将特征合并

In [4]:
# Stack the three data sets into one frame so features are computed once for
# all of them; missing values (including the test set's absent
# current_service) become 0.
merged = pd.concat([train, test, train_first], ignore_index=True)
data = merged.fillna(0)
# The target is current_service, stored as an integer in 'label'.
data['label'] = data['current_service'].astype(int)

In [5]:
# Cells holding the literal text \N are replaced with the sentinel 999,
# after which gender can be cast to an integer column.
data = data.replace(to_replace='\\N', value=999)
data['gender'] = data['gender'].astype(int)

#### 设置原始特征列

In [6]:
# Raw categorical feature columns.
origin_cate_feature = [
    'service_type',
    'complaint_level',
    'contract_type',
    'gender',
    'is_mix_service',
    'is_promise_low_consume',
    'many_over_bill',
    'net_service',
]
# Raw numeric feature columns.
origin_num_feature = [
    '1_total_fee',
    '2_total_fee',
    '3_total_fee',
    '4_total_fee',
    'age',
    'contract_time',
    'former_complaint_fee',
    'former_complaint_num',
    'last_month_traffic',
    'local_caller_time',
    'local_trafffic_month',
    'month_traffic',
    'online_time',
    'pay_num',
    'pay_times',
    'service1_caller_time',
    'service2_caller_time',
]

#### 将数值特征转成float 型

In [7]:
# Cast every raw numeric column to float in a single astype call.
data = data.astype({column: float for column in origin_num_feature})

#### 将w2v特征喂至 data中

In [8]:
# Attach the w2v features: one CSV per monthly fee column, keyed by the fee
# value itself.
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    embedding = pd.read_csv(w2v_path + '/' + col + '.csv')
    # Keep a single row per key value so the left merge cannot multiply rows.
    embedding = embedding.drop_duplicates([col])
    # Every column except the key is a w2v feature.
    w2v_features.extend(name for name in embedding.columns if name != col)
    data = data.merge(embedding, on=col, how='left')
#设置统计count 特征

#### 新建count 特征列表，用于存储count特征

In [9]:
# Names of the count features; feature_count() appends to this list.
count_feature_list = list()

#### 封装count 特征函数

In [10]:
# Helper that adds a group-size ("count") feature for a set of key columns.
def feature_count(data, features=None):
    """Return ``data`` with a column holding the size of each ``features`` group.

    The new column is named ``count_<f1>_<f2>...`` (any ``add_`` substring is
    stripped from each key name) and its name is recorded in the module-level
    ``count_feature_list``.

    Args:
        data: DataFrame containing every column listed in ``features``.
        features: key columns to group by. Must be non-empty.

    Returns:
        A new DataFrame (the result of a left merge) with the count column
        added. If ``features`` contains a repeated name, a message is printed
        and ``data`` is returned unchanged.

    Raises:
        ValueError: if ``features`` is empty or omitted.
    """
    # A None default avoids sharing one mutable list between calls.
    features = [] if features is None else list(features)

    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    if not features:
        raise ValueError('feature_count needs at least one key column')

    new_feature = 'count_' + '_'.join(f.replace('add_', '') for f in features)

    # Drop a stale column of the same name (e.g. when a notebook cell is
    # re-run) without mutating the caller's frame and without a bare except.
    if new_feature in data.columns:
        data = data.drop(columns=new_feature)

    # Size of every key combination, then broadcast back onto the rows.
    temp = data.groupby(features).size().reset_index(name=new_feature)
    data = data.merge(temp, how='left', on=features)

    # Register the feature once, so repeated calls cannot put the same
    # column into the model's feature list twice.
    if new_feature not in count_feature_list:
        count_feature_list.append(new_feature)
    return data

#### 计算一系列的count 特征
- 计算 ['1_total_fee','2_total_fee','3_total_fee','4_total_fee','former_complaint_fee','pay_num','contract_time','last_month_traffic','online_time'] 中每一列的count特征
- 计算['service_type','contract_type']与其他类型的组合特征

In [11]:
# Columns whose value frequencies become count features.
counted_columns = [
    '1_total_fee',
    '2_total_fee',
    '3_total_fee',
    '4_total_fee',
    'former_complaint_fee',
    'pay_num',
    'contract_time',
    'last_month_traffic',
    'online_time',
]

# Single-column counts.
for counted in counted_columns:
    data = feature_count(data, [counted])

# Pair counts, e.g. how often a ('service_type', '1_total_fee') combination
# occurs.
for i in ['service_type', 'contract_type']:
    for counted in counted_columns:
        data = feature_count(data, [i, counted])

#### 计算差值特征

In [12]:
# Difference features: each new column is (minuend column - subtrahend column).
for new_column, minuend, subtrahend in (
    ('diff_total_fee_1', '1_total_fee', '2_total_fee'),
    ('diff_total_fee_2', '2_total_fee', '3_total_fee'),
    ('diff_total_fee_3', '3_total_fee', '4_total_fee'),
    ('pay_num_1_total_fee', 'pay_num', '1_total_fee'),
    ('last_month_traffic_rest', 'month_traffic', 'last_month_traffic'),
):
    data[new_column] = data[minuend] - data[subtrahend]

#### 计算ratio 特征

In [13]:
# Call time: sum of the two service*_caller_time columns, plus the share of
# that sum taken by service2_caller_time and by local_caller_time.
caller_total = data['service2_caller_time'] + data['service1_caller_time']
data['total_caller_time'] = caller_total
data['service2_caller_ratio'] = data['service2_caller_time'] / caller_total
data['local_caller_ratio'] = data['local_caller_time'] / caller_total

# Traffic: sum of local_trafffic_month and month_traffic, plus two ratios
# against that sum.
traffic_total = data['local_trafffic_month'] + data['month_traffic']
data['total_month_traffic'] = traffic_total
data['month_traffic_ratio'] = data['month_traffic'] / traffic_total
data['last_month_traffic_ratio'] = data['last_month_traffic'] / traffic_total

# Leftover traffic scaled by 15 / 1024 and divided by the month-1 fee; per
# the original note this estimates a cost-to-fee ratio hinting at a plan
# change.
# NOTE(review): 15 / 1024 looks like a price-per-unit conversion - confirm.
scaled_rest_traffic = data['last_month_traffic_rest'] * 15 / 1024
data['rest_traffic_ratio'] = scaled_rest_traffic / data['1_total_fee']

#### 猜测套餐金额特征

In [14]:
# Guess what is left of the month-1 fee after subtracting an estimated usage
# cost.
# NOTE(review): 0.15 and 0.3 look like assumed unit prices - confirm.
first_month_fee = data['1_total_fee']
data['1_total_fee_call_fee'] = first_month_fee - 0.15 * data['service1_caller_time']
data['1_total_fee_call2_fee'] = first_month_fee - 0.15 * data['service2_caller_time']
billed_traffic = data['month_traffic'] - 2 * data['last_month_traffic']
data['1_total_fee_trfc_fee'] = first_month_fee - billed_traffic * 0.3

# The traffic-based guess is blanked out for rows with service_type == 1.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None

#### 计算4个月 的最大值，最小值，平均值特征

In [15]:
# The four monthly fee columns: '1_total_fee' ... '4_total_fee'.
total_fee = [str(month) + '_total_fee' for month in range(1, 5)]
# Row-wise mean, maximum and minimum over those four months.
fee_history = data[total_fee]
data['total_fee_mean'] = fee_history.mean(axis=1)
data['total_fee_max'] = fee_history.max(axis=1)
data['total_fee_min'] = fee_history.min(axis=1)

#### 再次预处理数据

In [16]:
# Negative values of last_month_traffic_rest are set to 0.
# A single .loc assignment replaces the chained data[col][mask] = 0 form,
# which pandas flags with SettingWithCopyWarning because it may write to a
# temporary copy instead of `data`.
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### 汇总差值、ratio、猜测金额以及其余的部分特征：命名为diff_feature_list

In [17]:
# Derived features built above: differences, ratios, fee statistics and fee
# guesses.
diff_feature_list = [
    'diff_total_fee_1',
    'diff_total_fee_2',
    'diff_total_fee_3',
    'last_month_traffic_rest',
    'rest_traffic_ratio',
    'total_fee_mean',
    'total_fee_max',
    'total_fee_min',
    'total_caller_time',
    'service2_caller_ratio',
    'local_caller_ratio',
    'total_month_traffic',
    'month_traffic_ratio',
    'last_month_traffic_ratio',
    'pay_num_1_total_fee',
    '1_total_fee_call_fee',
    '1_total_fee_call2_fee',
    '1_total_fee_trfc_fee',
]


#### 汇总 类别特征和 数值特征
- 类别特征cate_feature
- 数值特征 num_feature

In [18]:
# Final feature groups. The categorical group is the raw categorical list
# itself (same list object); the numeric group concatenates the raw numeric,
# count, derived and w2v features in that order.
cate_feature = origin_cate_feature
num_feature = [
    *origin_num_feature,
    *count_feature_list,
    *diff_feature_list,
    *w2v_features,
]

#### 将类别特征转成‘category’类型，数值特征转成float型

In [19]:
# Categorical columns get the pandas 'category' dtype (LightGBM can take
# categorical features directly); numeric columns are cast to float.
dtype_by_column = {name: 'category' for name in cate_feature}
dtype_by_column.update({name: float for name in num_feature})
data = data.astype(dtype_by_column)
# Complete model feature list: categorical first, then numeric.
feature = cate_feature + num_feature

#### 打印特征数量，并看看有哪些特征

In [20]:
# Show how many model features there are, followed by their names.
print(len(feature), feature)

110 ['service_type', 'complaint_level', 'contract_type', 'gender', 'is_mix_service', 'is_promise_low_consume', 'many_over_bill', 'net_service', '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'age', 'contract_time', 'former_complaint_fee', 'former_complaint_num', 'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic', 'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time', 'count_1_total_fee', 'count_2_total_fee', 'count_3_total_fee', 'count_4_total_fee', 'count_former_complaint_fee', 'count_pay_num', 'count_contract_time', 'count_last_month_traffic', 'count_online_time', 'count_service_type_1_total_fee', 'count_service_type_2_total_fee', 'count_service_type_3_total_fee', 'count_service_type_4_total_fee', 'count_service_type_former_complaint_fee', 'count_service_type_pay_num', 'count_service_type_contract_time', 'count_service_type_last_month_traffic', 'count_service_type_online_time', 'count_contract_type_1_total_f

#### 筛选出label 非999999 的样本

In [21]:
# Discard every row whose label is 999999.
data = data.loc[data['label'] != 999999]

#### 筛选出项目第一期的训练集
- 作为第一步stacking特征的训练集

In [22]:
# Phase-1 rows (data_type == 1) form the training set of the first-level
# model. train_y stays a one-column DataFrame.
is_first_phase = data['data_type'] == 1
train_x = data.loc[is_first_phase, feature]
train_y = data.loc[is_first_phase, ['label']]

In [23]:
# Number of phase-1 training rows per label.
train_y['label'].value_counts()

90063345    287219
89950166    133224
89950167     73842
99999828     52939
90109916     38096
89950168     33462
99999827     32531
99999826     29054
90155946     22037
99999830     21236
99999825     20350
Name: label, dtype: int64

In [24]:
# Labelled phase-2 rows (data_type == 0 with a non-zero label), i.e. the
# phase-2 training set. Despite the test_* names they are used below as the
# hold-out set for the phase-1 model.
is_labelled_second_phase = (data['data_type'] == 0) & (data['label'] != 0)
test_x = data.loc[is_labelled_second_phase, feature]
test_y = data.loc[is_labelled_second_phase, 'label']

#### 自定义macro F1 score 评价函数

In [25]:
# Custom macro-F1 evaluation metric for the 11-class model.
def evalerror(y, preds):
    """Return the macro-averaged F1 score as a LightGBM custom-metric tuple.

    Keep the signature at exactly two parameters: LightGBM documents custom
    metrics as ``func(y_true, y_pred)``, with longer forms reserved for
    weight and group arguments.

    Args:
        y: true class indices, one per sample.
        preds: per-class scores as a NumPy array. Either 2-D with shape
            (n_samples, 11), or flat 1-D of length n_samples * 11 grouped by
            class first (entry ``j * n_samples + i`` is sample i, class j).
            The flat layout is the one LightGBM 3.x documents for
            multi-class custom metrics.

    Returns:
        ``('f1_score', score, True)``; the trailing ``True`` marks the metric
        as higher-is-better.
    """
    n_classes = 11
    if preds.ndim == 1:
        # The flat layout is class-major, so it must be read as
        # (n_classes, n_samples) and transposed. reshape(-1, n_classes)
        # would mix scores of different samples into one row.
        preds = preds.reshape(n_classes, -1).T
    predicted = preds.argmax(axis=1)
    f_score = f1_score(y, predicted, average='macro')
    return 'f1_score', f_score, True


In [26]:
# LightGBM classifier through its scikit-learn interface;
# objective='multiclass' selects the multi-class model.
classifier_params = dict(
    boosting_type="gbdt",
    num_leaves=120,
    reg_alpha=0,
    reg_lambda=0.,
    max_depth=-1,
    n_estimators=200,
    objective='multiclass',
    subsample=0.9,
    colsample_bytree=0.5,
    subsample_freq=1,
    learning_rate=0.1,
    random_state=2018,
    n_jobs=-1,
)
lgb_model = lgb.LGBMClassifier(**classifier_params)

#### 直接训练模型

In [27]:
# Train once on the whole phase-1 set, without 5-fold cross-validation
# (exercise from the original author: think about why).
#
# fit() arguments used here:
#   eval_set               data scored during training - here the training
#                          data itself, so early stopping never triggers on
#                          held-out performance
#   categorical_feature    columns LightGBM treats as categorical
#   verbose                print the evaluation every 10 rounds
#   early_stopping_rounds  stop if the eval score has not improved for 50 rounds
# No eval_metric is passed, so the default multi_logloss is reported.
#
# The labels are passed as a 1-D Series rather than the one-column DataFrame
# train_y; a column-vector y makes scikit-learn emit a DataConversionWarning.
train_labels = train_y['label']
lgb_model.fit(train_x, train_labels, eval_set=[(train_x, train_labels)], categorical_feature=cate_feature, verbose=10, early_stopping_rounds=50)
# Best evaluation score recorded during training.
print(lgb_model.best_score_)

  return f(*args, **kwargs)


Training until validation scores don't improve for 50 rounds
[10]	training's multi_logloss: 0.407755
[20]	training's multi_logloss: 0.229439
[30]	training's multi_logloss: 0.17654
[40]	training's multi_logloss: 0.151181
[50]	training's multi_logloss: 0.13504
[60]	training's multi_logloss: 0.122897
[70]	training's multi_logloss: 0.112713
[80]	training's multi_logloss: 0.104386
[90]	training's multi_logloss: 0.0971693
[100]	training's multi_logloss: 0.0907238
[110]	training's multi_logloss: 0.0848396
[120]	training's multi_logloss: 0.0794773
[130]	training's multi_logloss: 0.0746602
[140]	training's multi_logloss: 0.0702773
[150]	training's multi_logloss: 0.0661323
[160]	training's multi_logloss: 0.0624024
[170]	training's multi_logloss: 0.0588668
[180]	training's multi_logloss: 0.0555485
[190]	training's multi_logloss: 0.0524379
[200]	training's multi_logloss: 0.0494703
Did not meet early stopping. Best iteration is:
[200]	training's multi_logloss: 0.0494703
defaultdict(<class 'collecti

#### 生成次级模型所需要的特征-并导出数据

In [28]:
# Export the phase-1 model's class probabilities as stacking features for the
# second-level model.
stacking_path = os.path.join(path, 'stack')
stacking_train_csv = os.path.join(stacking_path, 'train.csv')
stacking_test_csv = os.path.join(stacking_path, 'test.csv')

# Regenerate whenever either output file is missing. Checking only for the
# directory would skip generation forever after a run that created the
# directory but failed before writing the files.
if not (os.path.exists(stacking_train_csv) and os.path.exists(stacking_test_csv)):
    print(stacking_path)
    os.makedirs(stacking_path, exist_ok=True)

    labelled_second_phase = (data.data_type == 0) & (data.label != 0)
    unlabelled = data.label == 0  # rows without a label, i.e. the test set

    # Class probabilities for the phase-2 training rows and the test rows.
    train_proba = lgb_model.predict_proba(test_x[feature])
    test_proba = lgb_model.predict_proba(data.loc[unlabelled, feature])
    print(len(train_proba), len(test_proba))

    # Copy the user_id slices so the new columns are written to independent
    # frames rather than to views of `data`.
    stacking_train = data.loc[labelled_second_phase, ['user_id']].copy()
    stacking_test = data.loc[unlabelled, ['user_id']].copy()

    # One stacking_<k> column per probability column of the model.
    for i in range(train_proba.shape[1]):
        stacking_train['stacking_' + str(i)] = train_proba[:, i]
        stacking_test['stacking_' + str(i)] = test_proba[:, i]
    stacking_train.to_csv(stacking_train_csv, index=False)
    stacking_test.to_csv(stacking_test_csv, index=False)

# Offline check: per-class F1 of the phase-1 model on the labelled phase-2
# rows (average=None returns one score per class).
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)

.//stack
374653 160566
[0.81052451 0.938907   0.94464223 0.99760262 0.99667031 0.9621267
 0.93300042 0.81823386 0.75729884 0.83941541 0.52507559]


In [29]:
# Label distribution of the phase-1 training set (train_y is a one-column
# DataFrame, so this counts distinct rows).
train_y.value_counts()

label   
90063345    287219
89950166    133224
89950167     73842
99999828     52939
90109916     38096
89950168     33462
99999827     32531
99999826     29054
90155946     22037
99999830     21236
99999825     20350
dtype: int64