## 深入理解xgboost模型三

### sklearn逻辑回归

In [1]:
from sklearn.datasets import load_breast_cancer

In [2]:
# 加载数据集
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

In [3]:
X.shape

(569, 30)

In [4]:
y.shape

(569,)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
print(X.shape, X_train.shape, X_test.shape, y.shape, y_train.shape, y_test.shape)

(569, 30) (455, 30) (114, 30) (569,) (455,) (114,)


In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# 生成模型
lr = LogisticRegression(max_iter=1000)

In [10]:
# 训练模型
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
y_pred = lr.predict(X=X_test)

In [12]:
from sklearn.metrics import classification_report

In [13]:
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=["Benign", "Malignant"]))

              precision    recall  f1-score   support

      Benign       1.00      0.88      0.94        42
   Malignant       0.94      1.00      0.97        72

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



### xgboost逻辑回归

In [14]:
import xgboost as xgb

In [15]:
# 生成数据
xgb_train = xgb.DMatrix(X_train, y_train)

In [16]:
# 参数
params = {"objective":"reg:logistic", "booster":"gblinear"}

In [17]:
# 生成训练模型
model = xgb.train(params=params, dtrain=xgb_train)

In [18]:
# 预测
y_pred = model.predict(xgb.DMatrix(X_test))

In [19]:
# 这里预测结果为概率
print(y_pred)

[4.32575673e-01 3.87444347e-01 9.62227821e-01 1.37321681e-01
 8.50990057e-01 2.47148797e-01 3.40264151e-03 1.42727837e-01
 9.96158421e-01 7.93546736e-01 9.74221587e-01 7.87277222e-01
 6.78809285e-02 9.31225836e-01 8.46819222e-01 7.08813012e-01
 9.28504884e-01 9.42662358e-01 8.91862571e-01 2.44233236e-02
 9.57943082e-01 9.53470647e-01 1.04785919e-01 9.23746049e-01
 3.60998809e-01 8.07484150e-01 6.40015900e-01 3.71876694e-02
 5.70018892e-06 1.87422141e-01 6.94824048e-05 9.61340904e-01
 2.96510346e-02 4.25174385e-01 9.96145368e-01 9.39764738e-01
 4.21916932e-01 8.49101067e-01 8.51741493e-01 9.25813973e-01
 9.96450067e-01 8.45932126e-01 9.29343760e-01 8.43761683e-01
 7.62668967e-01 4.56138141e-02 9.96396482e-01 9.96479452e-01
 7.27292061e-01 7.14597479e-02 5.05780727e-02 1.14327740e-05
 9.47773933e-01 9.86772120e-01 9.83441472e-01 8.91702533e-01
 9.31580007e-01 4.35441464e-01 9.19424236e-01 9.91948366e-01
 9.74674821e-01 5.20324633e-02 8.96234632e-01 6.14954889e-01
 9.69263375e-01 9.300961

In [20]:
import numpy as np
y_pred = np.round(y_pred)

In [21]:
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=["class 0", "class 1"]))

              precision    recall  f1-score   support

     class 0       0.95      0.86      0.90        42
     class 1       0.92      0.97      0.95        72

    accuracy                           0.93       114
   macro avg       0.93      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114

