In [1]:
### 基础导包
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [2]:
### 模型导包
import xgboost as xgb
from sklearn.metrics import accuracy_score 

In [3]:
### 基础设置
pd.set_option('display.float_format',lambda x: '%.2f'%x)
pd.set_option('display.expand_frame_repr',False)
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
#pd.set_option('display.max_rows', None)
pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)

In [4]:
train = pd.read_csv("./processed/train_for_model.csv")
test = pd.read_csv("./processed/test_for_model.csv")


In [5]:
def split_data(data,y_feature):
    return train_test_split(data,data[y_feature],test_size=0.20,random_state=20220319)
    raise NotImplementedError

In [6]:
 params = {
              'booster': 'gbtree',
              'objective': 'multi:softmax', # 多分类的问题、
              'num_class': 2, # 类别数，与 multisoftmax 并用
              # 'objective': 'multi:softprob', # 多分类概率
              # 'objective': 'binary:logistic',
            #   'eval_metric': 'auc',
              'gamma': 0.1, # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
              'max_depth': 8, # 构建树的深度，越大越容易过拟合
              'alpha': 0, # L1正则化系数
              'lambda': 10, # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
              'subsample': 0.7, # 随机采样训练样本
              'colsample_bytree': 0.5, # 生成树时进行的列采样
              'min_child_weight': 3,
              # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
              # ，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
              # 这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。
              'silent': 0, # 设置成1则没有运行信息输出，最好是设置为0.
              'eta': 0.03, # 如同学习率
              'seed': 20220319 ,
              'nthread': -1, # cpu 线程数
              'missing': 1,
              'scale_pos_weight': 1.656 # 用来处理正负样本不均衡的问题,通常取：sum(negative cases) / sum(positive cases)
              }


In [7]:
### 注意只对train划出验证集来，test按兵不动
x_train,x_eval,y_train,y_eval = split_data(train,'Survived')   # 注意，train 里面含有标签 ，因此划分完毕之后应该手动去除标签
x_train.drop(columns=['Survived'] , axis= 0 , inplace=True) 
x_eval.drop(columns=['Survived'] , axis= 0 , inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
dtrain = xgb.DMatrix(data= x_train , label = y_train)
dtest = xgb.DMatrix(data = x_eval , label = y_eval)
watchlist = [(dtrain , 'train') , (dtest , 'eval')]

In [11]:
xgb_model = xgb.train(
                    params , 
                    dtrain , 
                    num_boost_round=200 ,
                    evals=watchlist , 
                    early_stopping_rounds=10 ,
                    verbose_eval=500
                      )

Parameters: { missing, scale_pos_weight, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:0.68406	eval-mlogloss:0.68542
[199]	train-mlogloss:0.32988	eval-mlogloss:0.43533


In [12]:
z = xgb.DMatrix(test , label=None , missing=-1)

In [17]:
y_pre = xgb_model.predict(z  , ntree_limit=xgb_model.best_ntree_limit)

In [21]:
y_pre


array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [26]:
pre_int = []
for x in y_pre :
    pre_int.append(int(x))

In [27]:
pre_int

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,


In [28]:
submit = pd.read_csv("./work/gender_submission.csv")
submit["Survived"] =  pre_int

In [29]:
submit.to_csv("./submit/submit_xgb.csv" , index=0)  # 将预测文件保存

In [30]:
xgb_model.save_model("./model/xgboost.json")