In [7]:
# /usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np

# 1. Basic usage of XGBoost
# 2. Custom loss function: gradient and second derivative
# 3. binary:logistic / logitraw

# Define f: theta * x
def log_reg(y_hat, y):
    """Custom logistic-loss objective: first and second derivatives.

    The sigmoid is applied to ``y_hat`` to obtain ``p``; the function then
    returns ``p - label`` and ``p * (1 - p)``, i.e. the gradient and the
    second derivative of the logistic loss with respect to ``y_hat``.

    Parameters
    ----------
    y_hat : numeric scalar or numpy array
        Current scores, before the sigmoid.
    y : object exposing ``get_label()``
        Source of the true labels (an ``xgb.DMatrix`` when called by
        ``xgb.train``).

    Returns
    -------
    (grad, hess) : tuple
        Element-wise gradient and second derivative.
    """
    prob = 1.0 / (1.0 + np.exp(-y_hat))
    labels = y.get_label()
    grad = prob - labels
    hess = prob * (1.0 - prob)
    return grad, hess


def error_rate(y_hat, y):
    """Evaluation metric: share of samples whose thresholded prediction is wrong.

    A prediction counts as positive when it is strictly greater than 0.5; it
    is compared element-wise with the labels from ``y.get_label()``.

    NOTE(review): the 0.5 cut-off treats ``y_hat`` as a probability. If the
    installed xgboost passes raw margins to ``feval`` when a custom objective
    is supplied, the cut-off would need to be 0 instead -- confirm.

    Parameters
    ----------
    y_hat : numpy array
        Predicted values.
    y : object exposing ``get_label()``
        Source of the true labels.

    Returns
    -------
    (name, value) : tuple
        The string ``'error'`` and the misclassified fraction as a float.
    """
    labels = y.get_label()
    predicted_positive = y_hat > 0.5
    n_wrong = sum(labels != predicted_positive)
    return 'error', float(n_wrong) / len(y_hat)

In [8]:
# 读取数据
data_train = xgb.DMatrix('../dataset/agaricus_train.txt')
data_test = xgb.DMatrix('../dataset/agaricus_test.txt')
print data_train
print type(data_train)

# 设置参数
param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}  # logitraw
# param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'}
watchlist = [(data_test, 'eval'), (data_train, 'train')]
n_round = 7
# bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate)

# 计算错误率
y_hat = bst.predict(data_test)
y = data_test.get_label()
print y_hat
print y
error = sum(y != (y_hat > 0.5))
error_rate = float(error) / len(y_hat)
print '样本总数：\t', len(y_hat)
print '错误数目：\t%4d' % error
print '错误率：\t%.5f%%' % (100 * error_rate)

<xgboost.core.DMatrix object at 0x7fdb93817d50>
<class 'xgboost.core.DMatrix'>
[0]	eval-error:0.016139	train-error:0.014433
[1]	eval-error:0.016139	train-error:0.014433
[2]	eval-error:0.016139	train-error:0.014433
[3]	eval-error:0.016139	train-error:0.014433
[4]	eval-error:0.002483	train-error:0.003071
[5]	eval-error:0.002483	train-error:0.003071
[6]	eval-error:0.002483	train-error:0.003071
[  6.09937888e-06   9.84727502e-01   6.09937888e-06 ...,   9.99932647e-01
   4.45600620e-07   9.99932647e-01]
[ 0.  1.  0. ...,  1.  0.  1.]
样本总数：	1611
错误数目：	   4
错误率：	0.24829%


In [9]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split  # cross_validation
def iris_type(s):
    """Return the integer class code (0, 1 or 2) for an iris species name.

    Parameters
    ----------
    s : str
        One of ``'Iris-setosa'``, ``'Iris-versicolor'``, ``'Iris-virginica'``.

    Raises
    ------
    KeyError
        If ``s`` is not one of the three known names.
    """
    species = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
    code_of = dict((name, code) for code, name in enumerate(species))
    return code_of[s]


path = '../dataset/iris.data'  # 数据文件路径
data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
data = pd.read_csv(path, header=None)
x, y = data[range(4)], data[4]
y = pd.Categorical(y).codes
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 2, 'eta': 0.3, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}

bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)
result = y_test.reshape(1, -1) == y_hat
print '正确率:\t', float(np.sum(result)) / len(y_hat)
print 'END.....\n'

[0]	eval-merror:0.04	train-merror:0.04
[1]	eval-merror:0.04	train-merror:0.04
[2]	eval-merror:0.02	train-merror:0.02
[3]	eval-merror:0.02	train-merror:0.02
[4]	eval-merror:0.02	train-merror:0.02
[5]	eval-merror:0.02	train-merror:0.02
正确率:	0.98
END.....



In [10]:
def read_data(path):
    """Parse a sparse ``label index:value ...`` text file into dense arrays.

    Every non-blank line is split on whitespace. The first token is the
    integer label; each remaining token is ``column:value``. Blank lines are
    skipped.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    x : numpy.ndarray
        Dense feature matrix with one row per parsed line (including lines
        that carry a label but no features) and ``max column index + 1``
        columns.
    y : numpy.ndarray
        Labels, one per parsed line.

    Raises
    ------
    ValueError
        If a label is not an integer or a feature token is not a valid
        ``int:float`` pair.
    """
    y = []
    row = []
    col = []
    values = []
    r = 0       # index of the row currently being filled
    # The context manager closes the file even if parsing raises.
    with open(path) as source:
        for line in source:
            tokens = line.split()      # split on whitespace
            if not tokens:
                # A blank line carries no sample (e.g. a trailing newline).
                continue
            y.append(int(tokens[0]))
            for token in tokens[1:]:
                key, value = token.split(':')
                row.append(r)
                col.append(int(key))
                values.append(float(value))
            r += 1
    # The shape is passed explicitly so rows without any feature are kept;
    # inferring it from the indices would drop trailing all-zero rows and
    # leave x with fewer rows than y.
    n_cols = max(col) + 1 if col else 0
    x = scipy.sparse.csr_matrix((values, (row, col)), shape=(r, n_cols)).toarray()
    y = np.array(y)
    return x, y

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import scipy.sparse
x, y = read_data('../dataset/agaricus_train.txt')
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6)

# Logistic回归
lr = LogisticRegression(penalty='l2')
lr.fit(x_train, y_train.ravel())
y_hat = lr.predict(x_test)
print 'Logistic回归正确率：', accuracy_score(y_test, y_hat)

# XGBoost
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
y_hat = bst.predict(data_test)
print 'XGBoost正确率：', accuracy_score(y_test, y_hat)

Logistic回归正确率： 1.0
[0]	eval-merror:0.035687	train-merror:0.040696
[1]	eval-merror:0.007291	train-merror:0.009982
[2]	eval-merror:0.000767	train-merror:0.000512
[3]	eval-merror:0.000767	train-merror:0.000512
XGBoost正确率： 0.999232540292


In [15]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split  # cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

data = np.loadtxt('../dataset/wine.data', dtype=float, delimiter=',')
y, x = np.split(data, (1,), axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5)

# Logistic回归
lr = LogisticRegression(penalty='l2')
lr.fit(x_train, y_train.ravel())
y_hat = lr.predict(x_test)
print 'Logistic回归正确率：', accuracy_score(y_test, y_hat)

# XGBoost
y_train[y_train == 3] = 0
y_test[y_test == 3] = 0
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
params = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
bst = xgb.train(params, data_train, num_boost_round=2, evals=watch_list)
y_hat = bst.predict(data_test)
print 'XGBoost正确率：', accuracy_score(y_test, y_hat)

Logistic回归正确率： 0.943820224719
[0]	eval-merror:0.011236	train-merror:0
[1]	eval-merror:0	train-merror:0
XGBoost正确率： 1.0
