## 深入理解 XGBoost（六）

### 毒蘑菇二分类问题

In [1]:
def load_fmap(file_name):
    """Parse a feature-description file into one-hot index and name maps.

    A feature starts on a header line whose first whitespace-separated token
    contains a dot, e.g. ``1. cap-shape: bell=b,conical=c,``. Its value list
    may continue on following lines that hold only more ``label=code`` pairs.

    Args:
        file_name: Path of the feature-description file.

    Returns:
        A ``(fmap, nmap)`` tuple. ``fmap[idx][code]`` is the index assigned to
        value ``code`` of original feature ``idx``; these indices start at 1.
        ``nmap[i]`` is the string ``"<name>=<label>"`` of the i-th new feature,
        counted from 0, so every ``fmap`` index equals its ``nmap`` key plus one.

    Raises:
        AssertionError: If the same feature index appears on two header lines.
        ValueError: If a value line appears before any header line.
    """
    fmap = {}
    nmap = {}
    idx = None
    ftype = None

    # The with-block closes the file even when parsing fails part-way.
    with open(file_name) as fmap_file:
        for line in fmap_file:
            # Split on any whitespace (spaces, tabs, newline).
            arr = line.split()
            if not arr:
                # Blank line: nothing to parse.
                continue

            if arr[0].find(".") != -1:
                # Header line: "<idx>. <name>: <values>".
                idx = int(arr[0].strip("."))
                assert idx not in fmap, "duplicate feature index %d" % idx
                fmap[idx] = {}
                ftype = arr[1].strip(":")
                content = arr[2]
            else:
                # Continuation line: more values for the current feature.
                if idx is None:
                    raise ValueError(
                        "value line %r appears before any feature header" % line
                    )
                content = arr[0]

            # Give every value of the feature its own new-feature slot.
            for it in content.split(","):
                if it.strip() == "":
                    # A trailing comma leaves an empty item.
                    continue
                key, value = it.split("=")
                # fmap indices are 1-based, nmap keys are 0-based.
                fmap[idx][value] = len(nmap) + 1
                nmap[len(nmap)] = ftype + "=" + key
    return fmap, nmap

In [2]:
def write_nmap(file_object, nmap):
    """Write the feature map as ``index<TAB>name<TAB>i`` lines.

    Args:
        file_object: Writable text stream; one ``write`` call is made per entry.
        nmap: Mapping whose keys are the consecutive integers
            ``0 .. len(nmap) - 1`` and whose values are feature names.
    """
    row_template = "%d\t%s\ti\n"
    for feature_index in range(len(nmap)):
        feature_name = nmap[feature_index]
        file_object.write(row_template % (feature_index, feature_name))

In [3]:
# Parse the raw feature description into the index map and the name map.
fmap, nmap = load_fmap("dataset/agaricus-lepiota/agaricus-lepiota.fmap")
print(fmap)

# Save the new-feature index -> name mapping. The with-block closes the file
# even if writing fails part-way.
with open("dataset/agaricus-lepiota/feature_map.txt", "w") as file_object:
    write_nmap(file_object, nmap)

{1: {'b': 1, 'c': 2, 'x': 3, 'f': 4, 'k': 5, 's': 6}, 2: {'f': 7, 'g': 8, 'y': 9, 's': 10}, 3: {'n': 11, 'b': 12, 'c': 13, 'g': 14, 'r': 15, 'p': 16, 'u': 17, 'e': 18, 'w': 19, 'y': 20}, 4: {'t': 21, 'f': 22}, 5: {'a': 23, 'l': 24, 'c': 25, 'y': 26, 'f': 27, 'm': 28, 'n': 29, 'p': 30, 's': 31}, 6: {'a': 32, 'd': 33, 'f': 34, 'n': 35}, 7: {'c': 36, 'w': 37, 'd': 38}, 8: {'b': 39, 'n': 40}, 9: {'k': 41, 'n': 42, 'b': 43, 'h': 44, 'g': 45, 'r': 46, 'o': 47, 'p': 48, 'u': 49, 'e': 50, 'w': 51, 'y': 52}, 10: {'e': 53, 't': 54}, 11: {'b': 55, 'c': 56, 'u': 57, 'e': 58, 'z': 59, 'r': 60, '?': 61}, 12: {'f': 62, 'y': 63, 'k': 64, 's': 65}, 13: {'f': 66, 'y': 67, 'k': 68, 's': 69}, 14: {'n': 70, 'b': 71, 'c': 72, 'g': 73, 'o': 74, 'p': 75, 'e': 76, 'w': 77, 'y': 78}, 15: {'n': 79, 'b': 80, 'c': 81, 'g': 82, 'o': 83, 'p': 84, 'e': 85, 'w': 86, 'y': 87}, 16: {'p': 88, 'u': 89}, 17: {'n': 90, 'o': 91, 'w': 92, 'y': 93}, 18: {'n': 94, 'o': 95, 't': 96}, 19: {'c': 97, 'e': 98, 'f': 99, 'l': 100, 'n'

利用上一步得到的特征索引映射（fmap）对原始数据做 one-hot 编码：每条样本写成一行 `标签 索引:1 索引:1 …`（有毒记为 1，无毒记为 0），保存为 LibSVM 格式的 agaricus.txt。

In [4]:
# Convert every raw comma-separated record into one "label idx:1 idx:1 ..." line.
# Both files are managed by with-blocks so they are closed even on error.
with open("dataset/agaricus-lepiota/agaricus-lepiota.data") as data_file:
    with open("dataset/agaricus-lepiota/agaricus.txt", "w") as file_object:
        for line_no, line in enumerate(data_file, start=1):
            if not line.strip():
                # Skip blank lines such as a trailing empty line at end of file.
                continue

            arr = [field.strip() for field in line.split(",")]

            # First column is the class: "p" is written as 1, "e" as 0.
            if arr[0] == "p":
                label = "1"
            elif arr[0] == "e":
                label = "0"
            else:
                raise ValueError(
                    "line %d: unexpected class label %r" % (line_no, arr[0])
                )

            # Column i holds the value code of original feature i; look up the
            # new feature index assigned to that (feature, value) pair.
            features = ["%d:1" % fmap[i][arr[i]] for i in range(1, len(arr))]
            file_object.write(" ".join([label] + features) + "\n")

### xgboost二分类

In [5]:
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split

In [6]:
# agaricus.txt is written with feature indices starting at 1 (load_fmap assigns
# len(nmap) + 1), so declare the file one-based instead of relying on the
# loader's auto-detection heuristic. Column j of X then always corresponds to
# entry j of feature_map.txt, which is numbered from 0.
X, y = load_svmlight_file("dataset/agaricus-lepiota/agaricus.txt", zero_based=False)

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
import xgboost as xgb

In [8]:
# Wrap the train and test splits in DMatrix objects, passing labels by keyword.
xgb_train = xgb.DMatrix(data=X_train, label=y_train)
xgb_test = xgb.DMatrix(data=X_test, label=y_test)

In [9]:
# Training parameters: binary logistic objective, tree booster, max tree depth 6.
params = dict(
    objective="binary:logistic",
    booster="gbtree",
    max_depth=6,
)

In [10]:
# Number of boosting rounds.
num_round = 10

# Datasets passed as `evals` to training, each with the name used in the log.
watch_list = [
    (xgb_train, "training"),
    (xgb_test, "testing"),
]

In [11]:
# Train the booster; the watch-list datasets are evaluated during training.
model = xgb.train(
    params=params,
    dtrain=xgb_train,
    num_boost_round=num_round,
    evals=watch_list,
)

[0]	training-logloss:0.43920	testing-logloss:0.44076
[1]	training-logloss:0.29856	testing-logloss:0.30134
[2]	training-logloss:0.20988	testing-logloss:0.21370
[3]	training-logloss:0.15053	testing-logloss:0.15531
[4]	training-logloss:0.10947	testing-logloss:0.11515
[5]	training-logloss:0.08039	testing-logloss:0.08693
[6]	training-logloss:0.05937	testing-logloss:0.06536
[7]	training-logloss:0.04419	testing-logloss:0.04978
[8]	training-logloss:0.03303	testing-logloss:0.03809
[9]	training-logloss:0.02472	testing-logloss:0.02874


In [12]:
import os

# Create the output directory so this cell also works on a fresh checkout.
os.makedirs("model", exist_ok=True)

# Persist the trained model so it can be reloaded later.
model.save_model("model/agaricus.model")

# dump_model writes a plain-text dump unless told otherwise; request JSON so
# the file content matches its .json name.
model.dump_model("model/agaricus.json", dump_format="json")

In [13]:
# Rebuild a Booster directly from the saved model file.
booster = xgb.Booster(model_file="model/agaricus.model")

In [14]:
# Predict on the held-out set with the reloaded booster; the model was trained
# with objective binary:logistic, so these are scores in [0, 1], not labels.
y_pred = booster.predict(data=xgb_test)
y_pred

array([0.02283082, 0.97443706, 0.97443706, ..., 0.97741723, 0.02283082,
       0.02367815], dtype=float32)

In [15]:
import numpy as np
from sklearn.metrics import classification_report

In [16]:
# Round the predicted scores to hard 0/1 labels.
y_pred = np.round(y_pred)

# Per-class report: label 1 is shown as "有毒" (poisonous), label 0 as "无毒" (edible).
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    labels=[1, 0],
    target_names=["有毒", "无毒"],
)
print(report)

              precision    recall  f1-score   support

          有毒       1.00      0.99      1.00       805
          无毒       0.99      1.00      1.00       820

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [17]:
# Dump the trees again, this time passing the generated feature map file.
booster.dump_model(
    fout="model/agaricus.nice.model",
    fmap="dataset/agaricus-lepiota/feature_map.txt",
)