In [1]:
import numpy as np
from datasets import list_datasets, load_dataset
from numpy import genfromtxt

In [2]:
X1 = genfromtxt('record/survey_rotten_tomatoes(train)_basicquestions_unifiedqa-t5-large.csv', delimiter=',')

In [3]:
X2 = genfromtxt('record/survey_rotten_tomatoes(train)_moviequestions_unifiedqa-t5-large.csv', delimiter=',')

In [4]:
X = np.concatenate((X1, X2), axis=1)

In [5]:
print(X1.shape, X2.shape, X.shape)

(8530, 19) (8530, 78) (8530, 97)


In [6]:
dataset = load_dataset('rotten_tomatoes', split='train')

Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (C:\Users\Yizhou Zhao\.cache\huggingface\datasets\rotten_tomatoes_movie_review\default\1.0.0\9198dbc50858df8bdb0d5f18ccaf33125800af96ad8434bc8b829918c987ee8a)


In [7]:
len(dataset)

8530

In [8]:
y = np.asarray([item['label'] for item in dataset])

In [9]:
y.shape

(8530,)

In [10]:
shuffler = np.random.permutation(len(X))
X = X[shuffler]
y = y[shuffler]


In [11]:
X_train = X[:1024]
y_train = y[:1024]

X_test = X[-1024:]
y_test = y[-1024:]

In [12]:
import xgboost as xgb

In [13]:
# use DMatrix for xgbosot
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [14]:
# set xgboost params
param = {
    'max_depth': 15,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 2}  # the number of classes that exist in this datset
num_round = 20  # the number of training iterations

In [15]:
#------------- numpy array ------------------
# training and testing - numpy matrices
bst = xgb.train(param, dtrain, num_round)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [16]:
from sklearn.metrics import precision_score

In [17]:
# extracting most confident predictions
preds = bst.predict(dtrain)
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Training Numpy array precision:", precision_score(y_train, best_preds, average='macro'))

Training Numpy array precision: 0.9631037075597201


In [18]:
preds = bst.predict(dtest)
# extracting most confident predictions
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Test Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

Test Numpy array precision: 0.8397780740876974


In [22]:
print(*np.array(bst.trees_to_dataframe()['Feature']))

f21 f2 f61 f25 f0 f10 f12 f4 Leaf f85 f12 f5 f73 f7 f79 f1 Leaf f51 f27 f17 f14 f27 Leaf f95 Leaf Leaf Leaf f81 f8 f0 f63 f7 Leaf Leaf Leaf f1 Leaf Leaf Leaf f16 f49 f9 Leaf Leaf Leaf f93 f17 f11 f14 f12 f37 f10 f1 f4 f18 Leaf Leaf f0 Leaf f69 f15 Leaf Leaf Leaf Leaf f52 f15 Leaf Leaf f18 f9 Leaf Leaf Leaf Leaf f16 f15 f10 Leaf f35 Leaf Leaf Leaf f15 Leaf f43 Leaf f17 f20 Leaf Leaf f44 f10 Leaf Leaf f14 f5 f11 f9 Leaf Leaf Leaf Leaf Leaf Leaf Leaf Leaf Leaf f10 f34 Leaf f8 Leaf Leaf Leaf f10 Leaf Leaf Leaf f17 f10 f17 f13 Leaf Leaf Leaf Leaf Leaf Leaf f11 Leaf Leaf Leaf Leaf f16 f10 f12 Leaf Leaf Leaf Leaf f73 f17 f5 f16 Leaf Leaf Leaf Leaf f9 Leaf Leaf Leaf f12 Leaf Leaf Leaf f6 Leaf f11 Leaf Leaf Leaf f21 f2 f61 f25 f0 f10 f12 f4 Leaf f85 f12 f5 f73 f7 f79 f1 Leaf f51 f27 f17 f14 f27 Leaf f95 Leaf Leaf Leaf f81 f8 f0 f63 f7 Leaf Leaf Leaf f1 Leaf Leaf Leaf f16 f49 f9 Leaf Leaf Leaf f93 f17 f11 f14 f12 f37 f10 f1 f4 f18 Leaf Leaf f0 Leaf f69 f15 Leaf Leaf Leaf Leaf f52 f15 Leaf Leaf f

 Leaf Leaf f63 f25 Leaf Leaf Leaf f87 f31 Leaf f17 f10 f17 Leaf f49 f5 f16 f31 Leaf f31 Leaf Leaf f35 Leaf f57 Leaf Leaf Leaf Leaf Leaf Leaf f17 f16 Leaf f16 f16 f5 f9 f51 Leaf f14 Leaf Leaf Leaf Leaf Leaf Leaf Leaf Leaf Leaf f4 Leaf Leaf Leaf f5 Leaf f14 f14 f14 Leaf f0 f15 f15 f15 Leaf Leaf Leaf Leaf Leaf Leaf f9 Leaf Leaf f15 Leaf Leaf Leaf f11 f8 f16 Leaf f16 f13 f12 Leaf Leaf f6 Leaf Leaf Leaf f15 Leaf f6 Leaf f51 Leaf Leaf Leaf f11 Leaf Leaf Leaf f0 Leaf f9 Leaf f15 Leaf f15 Leaf Leaf Leaf f14 f14 Leaf Leaf f11 f16 Leaf f10 Leaf Leaf Leaf Leaf f1 f16 f12 Leaf Leaf Leaf f12 Leaf f14 Leaf f11 Leaf Leaf Leaf Leaf Leaf Leaf Leaf f61 f0 f59 f20 f14 f33 f29 f74 Leaf f11 f15 f83 f9 Leaf f23 f70 Leaf f15 f1 f19 f9 f77 f67 f77 f93 f93 f10 f2 Leaf f39 Leaf Leaf f27 Leaf Leaf Leaf Leaf f63 f65 f1 Leaf f73 Leaf Leaf Leaf Leaf f73 f57 f14 f1 f13 f1 Leaf Leaf Leaf Leaf Leaf f43 Leaf Leaf Leaf f37 Leaf Leaf Leaf Leaf f14 Leaf f9 f25 f11 f8 f16 f2 Leaf Leaf Leaf Leaf Leaf Leaf Leaf Leaf Leaf f21

In [20]:
bst.trees_to_dataframe()

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover
0,0,0,0-0,f21,0.0,0-1,0-2,0-1,222.969208,512.000000
1,0,1,0-1,f2,0.0,0-3,0-4,0-3,25.411980,297.500000
2,0,2,0-2,f61,0.0,0-5,0-6,0-5,10.235039,214.500000
3,0,3,0-3,f25,0.0,0-7,0-8,0-7,4.230057,206.500000
4,0,4,0-4,f0,0.0,0-9,0-10,0-9,8.462448,91.000000
...,...,...,...,...,...,...,...,...,...,...
5331,39,92,39-92,Leaf,,,,,0.075851,3.160275
5332,39,93,39-93,Leaf,,,,,0.064282,3.016533
5333,39,94,39-94,Leaf,,,,,-0.008224,2.201080
5334,39,95,39-95,Leaf,,,,,-0.023019,6.286396


In [24]:
bst.get_score(importance_type='gain')

{'f21': 19.96075466319167,
 'f2': 2.02445627476015,
 'f25': 0.8300875767705884,
 'f4': 0.34354396372621215,
 'f1': 0.3602534208681481,
 'f0': 0.8179100411159582,
 'f11': 0.20852000105105728,
 'f52': 0.19519388669999999,
 'f17': 0.25666004916723395,
 'f10': 0.4051910423717647,
 'f20': 0.3993054240714284,
 'f34': 0.6719252913750001,
 'f15': 0.2580535464613955,
 'f14': 0.21389155756091574,
 'f63': 0.3211525117894737,
 'f12': 0.4323473358887998,
 'f18': 0.3827996561219565,
 'f44': 0.27484002825000003,
 'f8': 0.3375952246428571,
 'f9': 0.3120180021491527,
 'f37': 0.36022252925833337,
 'f85': 0.6664879687111109,
 'f51': 0.4510916776814814,
 'f7': 0.4849869640714286,
 'f16': 0.2514209960652172,
 'f5': 0.312922788752174,
 'f73': 0.607460633868889,
 'f6': 0.22496841073928572,
 'f13': 0.21105931155000004,
 'f27': 0.3575682994281249,
 'f35': 0.22726164443076918,
 'f61': 6.305094468821427,
 'f49': 0.5068948980357143,
 'f95': 0.577767406,
 'f69': 0.30885669680291666,
 'f43': 0.26761373809322725,
 '