In [7]:
import collections
import pickle
import time

import numpy as np
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier  # adaboost
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from xgboost.sklearn import XGBClassifier


ModuleNotFoundError: No module named 'src'

In [8]:
def get_metrics(y_true, y_pred, y_score):
    """Collect classification metrics for one set of predictions.

    Args:
        y_true: true labels.
        y_pred: predicted labels, compared against ``y_true`` by every
            metric except ROC AUC.
        y_score: scores handed to ``roc_auc_score`` together with ``y_true``.

    Returns:
        list: ``[accuracy, precision, recall, roc_auc, balanced_accuracy,
        f1, confusion_matrix]`` in exactly that order.
    """
    # Evaluated top to bottom, so the call order matches the result order.
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        roc_auc_score(y_true, y_score),
        balanced_accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
    ]

In [2]:
def customize_score(true_value, predict):
    """Score predictions by precision.

    Args:
        true_value: true labels.
        predict: predicted labels.

    Returns:
        Whatever ``precision_score(true_value, predict)`` returns.
    """
    precision = precision_score(true_value, predict)
    return precision

In [6]:
def xgb_cv(max_depth, learning_rate, n_estimators, gamma, subsample,
           reg_alpha, reg_lambda):
    """Cross-validated precision for one XGBoost hyperparameter setting.

    This is the objective handed to ``BayesianOptimization`` in ``go``; the
    optimizer is run with ``maximize``, so a larger return value is better.

    The training data is NOT a parameter: the function reads the module-level
    ``df_train``, using the columns from "volume_vale_1" onward as features
    and the "label" column as the target.

    Args:
        max_depth: tree depth; rounded to the nearest int before use.
        learning_rate: passed through unchanged.
        n_estimators: number of boosting rounds; truncated to int.
        gamma: passed through unchanged.
        subsample: passed through unchanged.
        reg_alpha: passed through unchanged.
        reg_lambda: passed through unchanged.

    Returns:
        Mean of the 4-fold cross-validation precision scores.
    """
    # The optimizer proposes floats; the two count-like parameters are cast.
    params = {
        'max_depth': int(round(max_depth)),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'gamma': gamma,
        'subsample': subsample,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }

    my_scorer = make_scorer(customize_score, greater_is_better=True)
    # Fixed: the original referenced `xgbsk.XGBClassifier`, but `xgbsk` is not
    # imported anywhere; the file imports XGBClassifier directly.
    base_learner = XGBClassifier(max_depth=7, learning_rate=0.05, n_estimators=10,  # third dataset: 250
                                 verbosity=0, objective="binary:logistic",
                                 n_jobs=-1, gamma=0, min_child_weight=0.5, max_delta_step=0,
                                 subsample=1, colsample_bytree=0.5, colsample_bylevel=1,
                                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,  # init weight
                                 random_state=666, missing=0, importance_type="total_gain",
                                 use_label_encoder=False,
                                 eval_metric='logloss')
    base_learner.set_params(**params)  # override the defaults with the candidate values
    # NOTE(review): newer imbalanced-learn releases rename `base_estimator`
    # to `estimator` -- confirm against the installed version.
    model = EasyEnsembleClassifier(sampling_strategy=0.5, n_jobs=1, verbose=0, n_estimators=10,
                                   base_estimator=base_learner, random_state=0)

    # Evaluate with precision (see customize_score).
    features = df_train.loc[:, "volume_vale_1":].values
    target = df_train.loc[:, "label"].values
    scores = cross_val_score(model, features, target, cv=4, scoring=my_scorer, n_jobs=-1)
    return np.mean(scores)

In [None]:
def get_test_performance(df_train, best_params):

    base_learner = XGBClassifier(max_depth=7, learning_rate=0.05, n_estimators=10,  # 第三个数据集250
                                       verbosity=0, objective="binary:logistic",
                                       n_jobs=-1, gamma=0, min_child_weight=0.5, max_delta_step=0,
                                       subsample=1, colsample_bytree=0.5, colsample_bylevel=1,
                                       reg_alpha=0, reg_lambda=1, scale_pos_weight=1,  # init weight
                                       random_state=666, missing=0, importance_type="total_gain",
                                       use_label_encoder=False
                                       , eval_metric='logloss')
    base_learner.set_params(**best_params)  # 设置参数
    model = EasyEnsembleClassifier(sampling_strategy=0.5, n_jobs=1, verbose=0, n_estimators=10,
                                   base_estimator=base_learner, random_state=0)

    model.fit(df_train.loc[:, "volume_vale_1":].values, df_train.loc[:, "label"].values)
    print("-------")
    ans_label = model.predict(df_train.loc[:, "volume_vale_1":].values);  # 2
    print(" 训练集的结果 label=1 的个数", sum(ans_label))
    ans_score = model.predict_proba(df_train.loc[:, "volume_vale_1":].values);  # 2
    test_ans = get_metrics(df_train.loc[:, "label"].values, ans_label, ans_score[:, 1])
    print("训练集metrics  acc=%.3f precision=%.3f recall=%.3f \n auc=%.3f balanced_acc=%.3f f1=%.3f" % (test_ans[0],
             

In [10]:
def split_data(df_all, method="order"):
    """Load the data set, label it, and split it chronologically.

    Fixed: the original ignored ``df_all`` and opened the module-level
    ``data_path`` instead. The argument is now what gets loaded, which is
    identical for callers that pass ``data_path``.

    Args:
        df_all: either a path to a pickled DataFrame, or a DataFrame. A
            DataFrame argument is copied, so the caller's object is not
            modified. The frame must have "ratio" and "date_start" columns.
        method: accepted for interface compatibility; the body does not
            consult it and always performs the ordered split below.

    Returns:
        tuple: ``(df_train, df_test)``. Rows are sorted by "date_start"
        ascending and carry a new "label" column (1 where ratio > 1.5,
        else 0). Rows at sorted positions ``0..int(n * 0.8)`` inclusive go to
        the training frame, the rest to the test frame; both are re-indexed
        from 0.
    """
    if isinstance(df_all, pd.DataFrame):
        frame = df_all.copy()
    else:
        # SECURITY: pickle.load can execute arbitrary code; only use it on
        # files from a trusted source.
        with open(df_all, "rb") as f:
            frame = pickle.load(f)

    frame["label"] = (frame["ratio"] > 1.5).astype("int64")
    frame = frame.sort_values("date_start", ascending=True).reset_index(drop=True)

    # Same boundary as the original index lists: i <= int(n * 0.8) is train.
    cut = int(frame.shape[0] * 0.8) + 1
    df_train = frame.iloc[:cut, :].reset_index(drop=True)
    df_test = frame.iloc[cut:, :].reset_index(drop=True)
    return df_train, df_test

In [9]:
def go(df_train, df_test):
    """Tune XGBoost hyperparameters by Bayesian optimization, then evaluate.

    Steps: print the frame sizes, maximize ``xgb_cv`` over the search space
    below, print and pickle the optimizer's best result to
    "best_params.pkl", call ``get_test_performance`` with the best
    parameters, and print the elapsed minutes.

    Note that ``xgb_cv`` reads the module-level ``df_train`` rather than the
    ``df_train`` argument of this function.

    Args:
        df_train: training frame; only ``.shape`` is read here before it is
            forwarded to ``get_test_performance``.
        df_test: test frame; only ``.shape`` is read here.

    Returns:
        None.
    """
    past = time.time()
    print("生成一个网络数据集之后 用模型对他进行评价")
    print("训练集大小", df_train.shape)
    print("测试集大小", df_test.shape)

    # NOTE(review): BayesianOptimization is not imported in any visible cell
    # (presumably `from bayes_opt import BayesianOptimization`) -- confirm.
    search_space = {
        'max_depth': (4, 20),
        'learning_rate': (0.001, 0.5),
        'n_estimators': (10, 500),
        'gamma': (0.001, 3),
        'subsample': (0.2, 0.99),
        'reg_alpha': (0.01, 0.99),
        'reg_lambda': (0.01, 0.99),
    }
    xgb_bo = BayesianOptimization(xgb_cv, search_space)
    # init_points: number of initial random probes; n_iter: number of
    # optimization iterations (i.e. samples).
    xgb_bo.maximize(init_points=5, n_iter=2000)
    print("调参结果", xgb_bo.max)
    with open("best_params.pkl", "wb") as f:
        pickle.dump(xgb_bo.max, f)  # persist the raw optimizer result

    # Fixed: the original passed (df_train, df_test, xgb_bo.max), which does
    # not match get_test_performance(df_train, best_params). `xgb_bo.max` also
    # wraps the parameters as {"target": ..., "params": {...}} with float
    # values, so unwrap it and apply the same integer casts as xgb_cv.
    best_params = dict(xgb_bo.max["params"])
    best_params["max_depth"] = int(round(best_params["max_depth"]))
    best_params["n_estimators"] = int(best_params["n_estimators"])
    get_test_performance(df_train, best_params)
    print("消耗时间", (time.time() - past) / 60)



In [None]:
# Driver cell: build the train/test frames and run the tuning pipeline.
# `data_path` is not defined in any visible cell; it must already exist in the
# kernel. `df_train` is deliberately a module-level name: xgb_cv reads it as a
# global when the optimizer calls it.
df_train,df_test=split_data(data_path,method="order")  # leftover note: a column slice from "volume_differ_1" to the "refund_10-15 difference" column
print(df_train.shape,df_test.shape)
# `model` is only used for the progress message; go() takes no model argument.
for model in ["Boosting"]:
    print("----当前任务----", model)
    go(df_train,df_test)