In [71]:
#https://www.kaggle.com/tannistha/hyperparameter-grid-search-with-xgboost/edit
#https://github.com/IBM/xgboost-smote-detect-fraud
#https://stats.stackexchange.com/questions/179835/how-to-build-a-confusion-matrix-for-a-multiclass-classifier

In [1]:
#!/usr/bin/python

from __future__ import division

import numpy as np
import xgboost as xgb
import pandas as pd
from itertools import product
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
from sklearn.model_selection import KFold
from collections import Counter
from imblearn.over_sampling import SMOTE 



In [2]:
data_spectrum =  pd.read_csv('Spectrum_baseline.csv') #'Spectrum_baseline.csv'
labels = pd.read_csv('Spectrum_labels.csv').values
labels_new = pd.DataFrame({'Class':labels[:,1].astype('int64')})
total_df = pd.concat([data_spectrum, labels_new], axis =1)
feature = total_df.columns
data = total_df[feature[1:]].values

In [3]:
sz = data.shape
np.random.shuffle(data)

In [4]:
train = data[:int(sz[0] * 0.7), :]
test = data[int(sz[0] * 0.7):, :]

In [5]:
train_X = train[:, :1014]
train_Y = train[:, 1015]
test_X = test[:, :1014]
test_Y = test[:, 1015]

In [6]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['scale_pos_weight']:[0.33, 0.7, 0.1,0.3,0.4]
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 5

In [7]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(train_X, train_Y)#ratio={1:1000, 3:1000, 4:1000}
X_res, y_res = sm.fit_sample(train_X, train_Y)
xg_train = xgb.DMatrix(X_res, label=y_res)
xg_test = xgb.DMatrix(test_X, label=test_Y)
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 5

In [8]:
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
print('Test error using softmax = {}'.format(error_rate))

[0]	train-merror:0.328696	test-merror:0.562482
[1]	train-merror:0.281806	test-merror:0.53714
[2]	train-merror:0.246983	test-merror:0.522575
[3]	train-merror:0.234869	test-merror:0.515875
[4]	train-merror:0.226762	test-merror:0.50568
Test error using softmax = 0.5056801631226333


In [9]:
# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
pred_prob = bst.predict(xg_test).reshape(test_Y.shape[0], 5)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred_label != test_Y) / test_Y.shape[0]
print('Test error using softprob = {}'.format(error_rate))

[0]	train-merror:0.328696	test-merror:0.562482
[1]	train-merror:0.281806	test-merror:0.53714
[2]	train-merror:0.246983	test-merror:0.522575
[3]	train-merror:0.234869	test-merror:0.515875
[4]	train-merror:0.226762	test-merror:0.50568
Test error using softprob = 0.5056801631226333


In [10]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(pred, test_Y)
recall = np.diag(cm) / np.sum(cm, axis = 1)
precision = np.diag(cm) / np.sum(cm, axis = 0)
print(recall, precision)

[0.61368209 0.21698113 0.82556591 0.34792123 0.28571429] [0.88791849 0.55421687 0.33640803 0.46086957 0.55016181]


In [24]:
[0.58225108 0.09482759 0.60494959 0.40322581 0.51612903] [0.3915575  0.30136986 0.70815451 0.07309942 0.04984424]
[0.44293194 0.11981567 0.63309353 0.23228346 0.21621622] [0.60601719 0.46636771 0.23783784 0.34604106 0.2741433 ]#auto
[0.65180467 0.175      0.58859397 0.44444444 0.33962264] [0.40501319 0.030837   0.89917808 0.11180124 0.05980066]#ratio={1:1000, 3:1000, 4:1000}


SyntaxError: invalid syntax (<ipython-input-24-2954b3d8c564>, line 1)