In [40]:
## 第5版 找到关键特征
# 5 fold
#  score =  88.1107

import pandas as pd
import numpy as np

train = pd.read_csv('./datasets/train.csv')
test1 = pd.read_csv('./datasets/test1.csv')

pd.set_option('display.max_rows', 100)

In [41]:
features = train.drop(['Unnamed: 0', 'label'], axis=1)
test1 = test1.drop(['Unnamed: 0'], axis=1)
labels = train['label']


In [42]:

if 'os' in features.columns:
    features, test1 = features.drop(['os'], axis=1), test1.drop(['os'], axis=1)
if 'sid' in features.columns:
    features = features.drop(['sid'], axis=1)

In [43]:
from utils.process_osv import  process_osv
from utils.process_lan import  process_lan
from utils.add_key_features import  add_key_features

In [44]:
features, test1 = process_osv(features, test1)
features, test1 = process_lan(features, test1)

selected_cols = ['apptype', 'carrier', 'dev_height', 'dev_ppi',
       'dev_width', 'media_id', 'package', 'version', 'fea_hash',
       'location', 'fea1_hash', 'cus_type']

features, test1 = add_key_features(train, features, test1, selected_cols)


In [45]:
features['android_id'].value_counts()
features['lan'].value_counts()
features['carrier'].value_counts()
features['ntt'].value_counts()
features['version'].value_counts()
test1['version'].value_counts()
test1['location'].value_counts()
test1['cus_type'].value_counts()
test1['lan'].value_counts()

# 类别特征
cate_features = ['lan', 'apptype', 'carrier', 'ntt', 'version', 'location', 'cus_type']

In [46]:
import time
from datetime import datetime

def get_date(features):
    if 'timestamp' not in features.columns:
        return
    features2 = features.copy()
    # 除以1000 转化为日期格式
    features2['timestamp'] = features2['timestamp'].apply(lambda x: datetime.fromtimestamp(x/1000))
    
    # 创建时间索引
    temp = pd.DatetimeIndex(features2['timestamp'])
    features2['year'] = temp.year
    features2['month'] = temp.month
    features2['day'] = temp.day
    features2['week_day'] = temp.weekday
    features2['hour'] = temp.hour
    features2['minute'] = temp.minute
    
    
    start_time = features2['timestamp'].min()
    features2['time_diff'] = features2['timestamp'] - start_time
    
    features2['time_diff'] = features2['time_diff'].dt.days * 24 + features2['time_diff'].dt.seconds / 3600
    
    features2.drop(['timestamp', 'year', 'month', 'week_day', 'minute'], axis = 1, inplace=True)
    return features2


features = get_date(features)

test1 = get_date(test1)

In [47]:
 from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 训练集和测试机合并，统一LabelEncoder

all_df = pd.concat([train, test1])
all_df['lan'] = all_df['lan'].astype('str')
all_df['lan'] = le.fit_transform(all_df['lan'])

all_df['lan'].value_counts()

16    317520
13    238827
15     39668
5      26821
21     14993
4      10552
20       807
3        245
1        214
14       147
6        115
22        37
0         20
11        11
19         7
17         3
8          3
2          2
9          2
10         2
7          2
18         1
12         1
Name: lan, dtype: int64

In [50]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

def ensemble_model(clf, train_x, train_y, test):
    sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
    prob = []
    mean_acc = 0
    for k, (train_index, val_index) in enumerate(sk.split(train_x, train_y)):
        train_x_real = train_x.iloc[train_index]
        train_y_real = train_y.iloc[train_index]
        val_x = train_x.iloc[val_index]
        val_y = train_y.iloc[val_index]
        
        clf = clf.fit(train_x_real, train_y_real)
        val_y_pred = clf.predict(val_x)
        
        acc_val = accuracy_score(val_y, val_y_pred)
        print('第{}个子模型， acc{}'.format(k+1, acc_val))
        mean_acc += acc_val / 5
        
        test_y_pred = clf.predict_proba(test)[:, -1]
        prob.append(test_y_pred)
    print(mean_acc)
    mean_prob = sum(prob) / 5
    return mean_prob

In [56]:
features['fea_hash'].value_counts()[300:300]

402980

In [62]:
# 特征变换，对于数值过大的异常值，设定为0
features['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
features['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))

features['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
# 特征变换，对于数值过大的异常值，设定为0

features['lan'] = all_df[all_df['label'].notnull()]['lan']

# test data 
test_fea = test1[features.columns]

test_fea['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
test_fea['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
test_fea['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
test_fea['lan'] = all_df[all_df['label'].isnull()]['lan']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['version

In [64]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb

max_depth = 6
n_estimators = 2000
subsample=0.75
colsample_bytree=0.75
reg_lambda = 0.8

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88916




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.88961




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88926




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.88882




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88994
0.8893580000000001
[0.0650456  0.4270019  0.069857   ... 0.9209366  0.96171397 0.96894646]


TypeError: can only concatenate str (not "int") to str

In [67]:

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')



n_estimators = 2000
subsample=0.75
colsample_bytree=0.75
reg_lambda = 0.8

a.to_csv('result/v7' + 'max_depth=6subsample=0.75colsample_bytree=0.75reg_lambda = 0.8' + '.csv', index=False)

In [69]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb

max_depth = 7
n_estimators = 2000
subsample=0.75
colsample_bytree=0.75
reg_lambda = 0.8

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88875




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.88911




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88941




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.8884




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88958
0.8890499999999999
[0.07575818 0.4792742  0.10179408 ... 0.905814   0.964489   0.96795166]


In [70]:

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')



n_estimators = 2000
subsample=0.75
colsample_bytree=0.75
reg_lambda = 0.8

a.to_csv('result/v7' + 'max_depth=7subsample=0.75colsample_bytree=0.75reg_lambda = 0.8' + '.csv', index=False)

In [72]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb

max_depth = 6
n_estimators = 2000
subsample=0.72
colsample_bytree=0.72
reg_lambda = 0.7

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88869




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.89011




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88923




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.88889




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88961
0.889306
[0.06745136 0.44707346 0.09409674 ... 0.92551994 0.9630693  0.9683505 ]


In [73]:

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')



n_estimators = 2000
subsample=0.75
colsample_bytree=0.75
reg_lambda = 0.8

a.to_csv('result/v7' + 'max_depth=6subsample=0.72colsample_bytree=0.72reg_lambda = 0.7' + '.csv', index=False)

In [74]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb

max_depth = 5
n_estimators = 2000
subsample=0.72
colsample_bytree=0.72
reg_lambda = 0.7

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88908




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.88933




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88893




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.88863




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88973
0.88914
[0.07495724 0.36987686 0.0668446  ... 0.92707384 0.9641236  0.9648153 ]


In [75]:

a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')



n_estimators = 2000
subsample=0.75
colsample_bytree=0.75
reg_lambda = 0.8

a.to_csv('result/v7' + 'max_depth=5subsample=0.72colsample_bytree=0.72reg_lambda = 0.7' + '.csv', index=False)

In [80]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb

# 88.1407

max_depth = 5
n_estimators = 5000
subsample=0.72
colsample_bytree=0.72
reg_lambda = 1.2

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)


a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')

a.to_csv('result/v7' + 'max_depth=5subsample=0.72colsample_bytree=0.72reg_lambda = 1.2' + '.csv', index=False)



Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88817




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.88934




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88874




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.88878




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88949
0.888904
[0.10891336 0.4991042  0.08520729 ... 0.92479163 0.9577077  0.9695404 ]


In [81]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

# 88.1393

import xgboost as xgb

max_depth = 5
n_estimators = 5000
subsample=0.72
colsample_bytree=0.72
reg_lambda = 2

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)


a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')

a.to_csv('result/v7' + 'max_depth=5subsample=0.72colsample_bytree=0.72reg_lambda = 2' + '.csv', index=False)



Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88841




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.88907




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88895




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.88806




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88933
0.888764
[0.08944175 0.45126265 0.09689205 ... 0.9098509  0.9605802  0.9691149 ]


In [83]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb

#88.1993

max_depth = 4
n_estimators = 5000
subsample=0.72
colsample_bytree=0.72
reg_lambda = 1.2

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)


a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')

a.to_csv('result/v7' + 'max_depth=4subsample=0.72colsample_bytree=0.72reg_lambda = 1.2' + '.csv', index=False)



Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第1个子模型， acc0.88798




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第2个子模型， acc0.88919




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第3个子模型， acc0.88837




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第4个子模型， acc0.88825




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


第5个子模型， acc0.88964
0.888686
[0.07341367 0.4156049  0.07484739 ... 0.9254135  0.96811694 0.966746  ]


In [None]:
import time
# v7 xgboost
# v8 使用 catboost
# v9 nn 归一化

import xgboost as xgb


max_depth = 4
n_estimators = 5000
subsample=0.72
colsample_bytree=0.72
reg_lambda = 5

clf = xgb.XGBClassifier(
    max_depth = max_depth,
    learning_rate = 0.05,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
    tree_method = 'gpu_hist',
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_samples = 3,
    eval_metric='auc',
    reg_lambda = reg_lambda
)
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea)
print(result)


a = pd.DataFrame(test1['sid'])
a['label'] = result

a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)

    
import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')

a.to_csv('result/v7' + 'max_depth=4subsample=0.72colsample_bytree=0.72reg_lambda = 5' + '.csv', index=False)