**定义字段选取字典**

In [3]:
import os
import pandas as pd
import numpy as np
from paddle.io import Dataset
from baseline_tools import Data2IdNorm,Data2IdEmb,value2numpy,make_dict_file
# Preprocessing scheme per field:
#   None   -> the field is not used
#   "emb"  -> Embedding preprocessing (Data2IdEmb in read_file)
#   "norm" -> Data2IdNorm with the field's NORM_WEIGHT entry (see read_file)
# Insertion order matters: read_file emits the feature columns in this order.
TAGS = {
    "android_id": None,
    "apptype": "emb",
    "carrier": "emb",
    "dev_height": "emb",
    "dev_ppi": "emb",
    "dev_width": "emb",
    "lan": "emb",
    "media_id": "emb",
    "ntt": "emb",
    "os": "emb",
    "osv": "emb",
    "package": "emb",
    "sid": None,
    "timestamp": "norm",
    "version": "emb",
    "fea_hash": "norm",
    "location": "emb",
    "fea1_hash": "norm",
    "cus_type": "emb",
}

**数据预处理**

In [2]:
# fea_hash, fea1_hash and timestamp are handled with the "norm" scheme, but the
# fea_hash column contains many abnormal entries that cannot be parsed as numbers.
# Every such entry is replaced with 499997879 (half of the column maximum,
# according to the original author) and the CSV files are rewritten in place.
FEA_HASH_FILL_VALUE = 499997879


def _replace_invalid_fea_hash(frame):
    """Overwrite, in place, every fea_hash entry that float() cannot parse.

    Prints the 1-based row number of each replaced entry.

    :param frame: DataFrame with a default RangeIndex and a "fea_hash" column
    """
    for ids, data in enumerate(frame["fea_hash"]):
        try:
            float(data)
        except (TypeError, ValueError, OverflowError):
            # One .loc write on the frame itself; the previous chained form
            # frame["fea_hash"][ids] = ... may assign to a temporary copy.
            frame.loc[ids, "fea_hash"] = FEA_HASH_FILL_VALUE
            print(ids + 1)


datas = pd.read_csv("train.csv")
_replace_invalid_fea_hash(datas)
# to_csv also writes the row index as an unnamed first column; the
# dictionary-generation step reads train.csv back with index_col=0.
datas.to_csv("train.csv")

# Same cleanup for the test split (read as strings, as before).
datas = pd.read_csv("test.csv", dtype=str)
_replace_invalid_fea_hash(datas)
datas.to_csv("test.csv")

In [None]:
# Build the embedding dictionaries and, along the way, print the maximum of every
# "norm" field (and its reciprocal), which is used for normalisation.
TRAIN_PATH = "train.csv"
SAVE_PATH = "emb_dicts"
# Plain-Python replacement for the `%mkdir` magic: works outside IPython and does
# not complain when the directory already exists on a re-run.
os.makedirs(SAVE_PATH, exist_ok=True)
df = pd.read_csv(TRAIN_PATH, index_col=0)
# Take the logarithm of both continuous hash fields so that they are closer to a
# normal distribution (the author reports a slight improvement over raw values).
df['fea1_hash'] = np.log(df['fea1_hash'])
# Fixed copy-paste bug: this used to be np.log(df['fea1_hash']), which replaced
# fea_hash with log(log(fea1_hash)) and threw the real fea_hash values away.
# NOTE: the fea_hash value printed below changes accordingly, so the fea_hash
# entry of NORM_WEIGHT has to be refreshed from this output.
df['fea_hash'] = np.log(df['fea_hash'])

pack = dict()
for tag, tag_method in TAGS.items():
    if tag_method == "norm":
        # Only report statistics for normalised fields; no dictionary is built.
        data = df.loc[:, tag]
        print("{}_max的倒数:{}".format(tag,1/float(data.max())),"--------",float(data.max())/2)
        print("{}_max:{}".format(tag,float(data.max())),"--------min:",float(data.min()))
        continue
    if tag_method != "emb":
        # Unused field (None) or any other scheme: nothing to generate.
        continue
    data = df.loc[:, tag]
    dict_size = make_dict_file(data, SAVE_PATH, dict_name=tag)
    pack[tag] = dict_size + 1  # +1 reserves a default slot for values missing from the dictionary

# Persist the per-field sizes as the text form of the dict.
with open(os.path.join(SAVE_PATH, "size.dict"), "w", encoding="utf-8") as f:
    f.write(str(pack))

print("全部生成完毕")

In [None]:
# Normalisation weights, copied by hand from the values printed by the
# dictionary-generation step above (reciprocal of each column maximum).
# read_file only looks up the fields whose TAGS scheme is "norm".
# NOTE(review): the fea_hash weight is numerically 1 / ln(1 / fea1_hash weight),
# i.e. it looks like it was computed from log(log(fea1_hash)) rather than from
# log(fea_hash) - confirm and regenerate if the log transform is changed.
NORM_WEIGHT = {
    "timestamp": 6.409845522722902e-13,
    "fea_hash": 0.3226648518870102,
    "fea1_hash": 0.045085662640681215,
    "android_id": 1.4086530e-06,
    "dev_height": 0.00011081560283687943,
    "dev_ppi": 0.001388888888888889,
    "dev_width": 0.00011322463768115942,
}

**数据读取方法**

In [None]:
def get_size_dict(dict_path="./emb_dicts/size.dict"):
    """
    Load the recommended Embedding sizes.

    :param dict_path: size.dict generated by run_make_emb_dict.py (the text form of a dict)
    :return: recommended size dict {key: num}
    :raises Exception: whatever error parsing the file content produced; it is
        re-raised after a hint has been printed
    """
    with open(dict_path, "r", encoding="utf-8") as f:
        content = f.read()
    try:
        # NOTE(security): eval executes arbitrary code; only use this on a
        # size.dict file you generated yourself.
        size_dict = eval(content)
    except Exception as e:
        print("size_dict打开失败，请检查", dict_path, "文件是否正常，报错信息如下:\n", e)
        # Previously execution fell through to `return size_dict` with the name
        # unbound, raising UnboundLocalError and hiding the real parse error.
        raise
    return size_dict
# Data reading
def read_file(use_mini_train, is_infer):
    """
    Load one CSV split and turn the fields selected in TAGS into a feature array.

    :param use_mini_train: read "mini_train.csv" instead of "train.csv"
        (has no effect when is_infer is true)
    :param is_infer: read "test.csv" and return no labels
    :return: (features, labels, df) for a training split, (features, df) for the
        test split; df is the loaded DataFrame after the log transform
    :raises Exception: when TAGS contains a scheme other than None, "emb" or "norm"
    """
    # Pick the file name
    emb_dict_path = "./emb_dicts"
    train_name = "mini_train" if use_mini_train else "train"
    file_name = "test" if is_infer else train_name
    # Read the matching csv file
    df = pd.read_csv(file_name + ".csv")
    # Take the logarithm of both continuous hash fields so that they are closer to
    # a normal distribution (the author reports a slight improvement).
    df['fea1_hash'] = np.log(df['fea1_hash'])
    # Fixed copy-paste bug: this used to be np.log(df['fea1_hash']), which replaced
    # fea_hash with log(log(fea1_hash)) and threw the real fea_hash values away.
    # NOTE: NORM_WEIGHT["fea_hash"] should be recomputed as 1 / max(log(fea_hash)).
    df['fea_hash'] = np.log(df['fea_hash'])

    # Register one preprocessing callable per used field
    cols = [tag for tag, tag_method in TAGS.items() if tag_method is not None]
    methods = dict()
    for col in cols:
        # ===== preprocessing scheme registration =====
        if TAGS[col] == "emb":
            methods[col] = Data2IdEmb(dict_path=emb_dict_path, dict_name=col).get_method()
        elif TAGS[col] == "norm":
            methods[col] = Data2IdNorm(norm_weight=NORM_WEIGHT[col]).get_method()
        else:
            # Report the offending scheme itself (the whole TAGS dict was dumped before).
            raise Exception(str(TAGS[col]) + "是未知的预处理方案，请选手在此位置使用elif注册")

    # Walk the rows in order and preprocess every selected field. Iterating the
    # column arrays in lockstep yields the same scalars, in the same order, as a
    # per-cell df.loc[i, col] lookup, without the per-lookup indexing overhead.
    columns = [df[col].to_numpy() for col in cols]
    pack = np.array([
        [methods[col](sample) for col, sample in zip(cols, row)]
        for row in zip(*columns)
    ])

    # Training splits return labels and data, the test split only data
    if is_infer:
        return pack, df
    labels = np.array(df["label"].tolist())
    return pack, labels, df



**训练部分**

In [None]:
import sklearn.model_selection as ms
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import sklearn.metrics as sm


# 调了半天参数不如用默认参数
# model =  XGBClassifier(
#         booster='gbtree',
#         eval_metric='auc',
#         n_estimators=200,
#         learning_rate =0.07,
#         max_depth=6,
#         min_child_weight=1,

#         gamma=0.3,
#         subsample=0.7,
#         colsample_bytree=0.7,
#         colsample_level=0.7,
#         objective= 'binary:logistic',
#         nthread=4,
#         scale_pos_weight=2,
#         reg_alpha=5.4,
#         reg_lambda=1,
#         seed=27,
#         alpha=0.1,
#         eta= 0.1, 
#         silent=0)
# Default hyper-parameters. The evaluation metric is configured on the estimator:
# passing eval_metric to fit() is no longer accepted by XGBoost >= 2.0.
model = XGBClassifier(eval_metric="logloss")
# Load the training features and labels
train_reader = read_file(use_mini_train=False, is_infer=False)
# Hold-out validation: the first 95% of the rows are used for training and the
# remaining rows for evaluation (no shuffling is applied).
bili = int(round(len(train_reader[1]) * 0.95, 0))
train_data = train_reader[0].squeeze()
train_lable = train_reader[1]
# Fixed off-by-one: the hold-out part starts at `bili`; the previous `bili + 1`
# silently dropped row `bili` from both the training and the evaluation set.
eval_set = [(train_data[bili:], train_lable[bili:])]
# Train
model.fit(train_data[:bili], train_lable[:bili], eval_set=eval_set, verbose=True)
pred_test_y = model.predict(train_data[bili:])
# Classification report on the hold-out rows
cr = sm.classification_report(train_lable[bili:], pred_test_y)
print(cr)


**特征重要性可视化**

In [None]:
# Draw XGBoost's built-in feature-importance chart for the trained model,
# then hand the figure to matplotlib for display.
ax = plot_importance(model)
pyplot.show()

**推理部分**

In [None]:
# Predict the test split with the trained model and write the result file.
RESULT_FILE = "./result1.csv"
infer_reader, df = read_file(is_infer=True, use_mini_train=False)
infer_output = model.predict(np.squeeze(infer_reader))
# One row per test sample: its sid (as int64) next to the predicted label.
result_df = pd.DataFrame(
    {
        "sid": np.array(df["sid"], dtype="int64"),
        "label": infer_output,
    }
)
result_df.to_csv(RESULT_FILE, index=False)