In [1]:
import numpy as np
import dpxgboost as xgb
from matplotlib import pyplot as plt
import argparse

import pandas as pd

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [2]:
LD = pd.read_excel('dataset/soybean_data.xlsx', sheet_name = 0, index_col="Unnamed: 0")
LQ = pd.read_excel('dataset/soybean_data.xlsx', sheet_name = 1, index_col="Unnamed: 0")
LY = pd.read_excel('dataset/soybean_data.xlsx', sheet_name = 2, index_col="Unnamed: 0")
QJ = pd.read_excel('dataset/soybean_data.xlsx', sheet_name = 3, index_col="Unnamed: 0")

In [3]:
soy_data = pd.concat([LD, LQ, LY, QJ],axis=0,sort=False)

In [4]:
labels_list = soy_data.index
prefix_labels_list = []
for name in labels_list:
    tmp_name = ""
    for token in name:
        if token.isalpha():
            tmp_name += token
    prefix_labels_list.append(tmp_name)
len(prefix_labels_list)

408

In [5]:
labels_counts_dict = dict()
for name in prefix_labels_list:
    if name in labels_counts_dict:
        labels_counts_dict[name] += 1
    else:
        labels_counts_dict[name] = 1
print(labels_counts_dict)

{'LD': 96, 'LQ': 101, 'LY': 111, 'QJ': 100}


In [6]:
soy_data.index = prefix_labels_list

In [7]:
features = soy_data
del soy_data

In [8]:
features = features.fillna(features.mean())

In [9]:
type(features)

pandas.core.frame.DataFrame

In [10]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 408 entries, LD to QJ
Columns: 2454 entries, 10995.44 to 3998.858
dtypes: float64(2454)
memory usage: 7.6+ MB


In [11]:
char2num_dict = dict()
num = 0
for key in labels_counts_dict:
    char2num_dict[key] = num
    num += 1
del num

In [12]:
char2num_dict

{'LD': 0, 'LQ': 1, 'LY': 2, 'QJ': 3}

In [13]:
labels = []
for i in features.index:
    labels.append(char2num_dict[i])

In [14]:
features = np.array(features)
labels = np.array(labels)

In [15]:
features = normalize(X = features, axis = 1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size = 0.2
)

In [17]:
kRows = 326
kCols = 2454
kClasses = 4                    # number of classes

kRounds = 100                    # number of boosting rounds.

# Generate some random data for demo.
X = np.random.randn(kRows, kCols)
y = np.random.randint(0, 4, size=kRows)

m = xgb.DMatrix(X_train, y_train)

In [18]:
X.shape, X_train.shape

((326, 2454), (326, 2454))

In [19]:
y.shape, y_train.shape

((326,), (326,))

In [20]:
def softmax(x):
    '''Softmax function with x as input vector.'''
    e = np.exp(x)
    return e / np.sum(e)


def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
    '''Loss function.  Computing the gradient and approximated hessian (diagonal).
    Reimplements the `multi:softprob` inside XGBoost.

    '''
    print('======data======',type(data))
    
    labels = data.get_label()
    if data.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = data.get_weight()

    # The prediction is of shape (rows, classes), each element in a row
    # represents a raw prediction (leaf weight, hasn't gone through softmax
    # yet).  In XGBoost 1.0.0, the prediction is transformed by a softmax
    # function, fixed in later versions.
    
    print("predt.shape", predt.shape )
    print("(kRows, kClasses)", (kRows, kClasses))
    
    assert predt.shape == (kRows, kClasses)

    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)

    eps = 1e-6

    # compute the gradient and hessian, slow iterations in Python, only
    # suitable for demo.  Also the one in native XGBoost core is more robust to
    # numeric overflow as we don't do anything to mitigate the `exp` in
    # `softmax` here.
    for r in range(predt.shape[0]):
        target = labels[r]
        p = softmax(predt[r, :])
        for c in range(predt.shape[1]):
            assert target >= 0 or target <= kClasses
            g = p[c] - 1.0 if c == target else p[c]
            g = g * weights[r]
            h = max((2.0 * p[c] * (1.0 - p[c]) * weights[r]).item(), eps)
            grad[r, c] = g
            hess[r, c] = h

    # Right now (XGBoost 1.0.0), reshaping is necessary
    grad = grad.reshape((kRows * kClasses, 1))
    hess = hess.reshape((kRows * kClasses, 1))
    
    grad = grad + np.random.normal(loc=0,scale=0.002,size=grad.shape)
    hess = hess + np.random.normal(loc=0,scale=0.002,size=hess.shape)
    return grad, hess

In [21]:
def predict(booster: xgb.Booster, X):
    '''A customized prediction function that converts raw prediction to
    target class.

    '''
    # Output margin means we want to obtain the raw prediction obtained from
    # tree leaf weight.
    predt = booster.predict(X, output_margin=True)
    out = np.zeros(kRows)
    for r in range(predt.shape[0]):
        # the class with maximum prob (not strictly prob as it haven't gone
        # through softmax yet so it doesn't sum to 1, but result is the same
        # for argmax).
        i = np.argmax(predt[r])
        out[r] = i
    return out


def merror(predt: np.ndarray, dtrain: xgb.DMatrix):
    y = dtrain.get_label()
    # Like custom objective, the predt is untransformed leaf weight when custom objective
    # is provided.

    # With the use of `custom_metric` parameter in train function, custom metric receives
    # raw input only when custom objective is also being used.  Otherwise custom metric
    # will receive transformed prediction.
    assert predt.shape == (kRows, kClasses)
    out = np.zeros(kRows)
    for r in range(predt.shape[0]):
        i = np.argmax(predt[r])
        out[r] = i

    assert y.shape == out.shape

    errors = np.zeros(kRows)
    errors[y != out] = 1.0
    return 'PyMError', np.sum(errors) / kRows


def plot_history(custom_results, native_results):
    fig, axs = plt.subplots(2, 1)
    ax0 = axs[0]
    ax1 = axs[1]

    pymerror = custom_results['train']['PyMError']
    merror = native_results['train']['merror']

    x = np.arange(0, kRounds, 1)
    ax0.plot(x, pymerror, label='Custom objective')
    ax0.legend()
    ax1.plot(x, merror, label='multi:softmax')
    ax1.legend()

    plt.show()

In [22]:
del m

In [23]:
m = xgb.DMatrix(X_train, y_train)

In [24]:
custom_results = {}
booster_custom = xgb.train({'num_class': kClasses,
                                'disable_default_eval_metric': True},
                               m,
                               num_boost_round=kRounds,
                               obj=softprob_obj,
                               custom_metric=merror,
                               evals_result=custom_results,
                               evals=[(m, 'train')])

predt_custom = predict(booster_custom, m)

predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[0]	train-PyMError:0.00307
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[1]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[2]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[3]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[4]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[5]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[6]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[7]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[8]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[9]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[10]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[11]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[12]	train-PyMError:0.00000
predt.shape (326, 4)
(

[65]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[66]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[67]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[68]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[69]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[70]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[71]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[72]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[73]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[74]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[75]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[76]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[77]	train-PyMError:0.00000
predt.shape (326, 4)
(kRows, kClasses) (326, 4)
[78]	train-P

In [25]:
len(y_test)

82

In [26]:
len(predt_custom[:82])

82

In [27]:
m_test = xgb.DMatrix(X_test, y_test)

In [28]:
predt_custom = predict(booster_custom, m_test)

In [29]:
predt_custom

array([3., 3., 0., 0., 1., 2., 1., 0., 1., 0., 3., 2., 0., 0., 3., 1., 1.,
       2., 2., 0., 3., 2., 2., 3., 1., 1., 3., 3., 0., 0., 0., 2., 1., 3.,
       3., 3., 3., 3., 3., 0., 1., 2., 1., 1., 0., 2., 3., 1., 0., 2., 2.,
       3., 2., 1., 1., 1., 2., 1., 3., 3., 3., 3., 1., 0., 2., 3., 2., 2.,
       2., 1., 3., 2., 3., 1., 0., 2., 2., 0., 3., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [30]:
len(predt_custom)

326

In [31]:
accuracy = accuracy_score(y_test, predt_custom[:len(y_test)])
print("accuarcy: %.2f%%" % (accuracy * 100.0))

accuarcy: 76.83%
