In [38]:
# Install a blanket "ignore" filter: no Python warning is shown in any later cell.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below - consider narrowing the filter to a specific category.
import warnings; warnings.filterwarnings("ignore")

In [39]:
import numpy as np
import pandas as pd
import xgboost; print(xgboost.__version__)
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

1.2.0


<br>

* Loading Arrays

In [40]:
# Read the three pre-computed feature matrices from .npy files in the working directory.
# presumably k-mer, reverse k-mer and gapped k-mer encodings of the same samples,
# stored in the same row order - TODO confirm against the script that wrote them.
X_kmer, X_revk, X_gapk = (
    np.load(path)
    for path in ('K-mer.npy', 'rev-k-mer.npy', 'gapped_k_mer.npy')
)

<br>

* Generating Y

In [41]:
# Label vector: the first 490 rows get class 1, the remaining 591 rows class 0.
# assumes the feature files list all class-1 samples before the class-0 ones - TODO confirm.
N_POSITIVE = 490
N_NEGATIVE = 591

Y = np.concatenate((np.ones(N_POSITIVE, dtype=int), np.zeros(N_NEGATIVE, dtype=int)))
print(Y.shape)

(1081,)


<br>

* Shuffle

In [42]:
# Apply one shared row permutation (seed 0) to every feature matrix and to the
# labels, so each row stays paired with its label.
X_kmer, X_revk, X_gapk, Y = shuffle(
    X_kmer, X_revk, X_gapk, Y,
    random_state=0,
)

# Sanity check: row counts of features and labels must still agree.
print(X_kmer.shape, Y.shape, sep="\n")

(1081, 340)
(1081,)


<br>

* Test-Train Split

In [43]:
# Split all three feature views and the labels in a single call: 70 % train,
# 30 % test, seed 101. Passing the arrays together applies one row partition
# to every view, which is what the three separate same-seed calls relied on.
(Xktrain, Xktest,
 Xrtrain, Xrtest,
 Xgtrain, Xgtest,
 Yktrain, Yktest) = train_test_split(
    X_kmer, X_revk, X_gapk, Y,
    test_size=0.30,
    random_state=101,
)

# The label split is the same for every view; keep the per-view names that the
# later cells refer to, as independent copies.
Yrtrain, Yrtest = Yktrain.copy(), Yktest.copy()
Ygtrain, Ygtest = Yktrain.copy(), Yktest.copy()

<br>

* Scaling

In [44]:
# Standardise the k-mer view. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler().fit(Xktrain)
Xktrain, Xktest = scaler.transform(Xktrain), scaler.transform(Xktest)

In [45]:
# Standardise the reverse-k-mer view with a fresh scaler: fit on the training
# rows, then transform both the training and the test rows with it.
scaler = StandardScaler().fit(Xrtrain)
Xrtrain, Xrtest = scaler.transform(Xrtrain), scaler.transform(Xrtest)

In [46]:
# Standardise the gapped-k-mer view with a fresh scaler: fit on the training
# rows, then transform both the training and the test rows with it.
scaler = StandardScaler().fit(Xgtrain)
Xgtrain, Xgtest = scaler.transform(Xgtrain), scaler.transform(Xgtest)

<br>

* Feature Selection

In [47]:
# One default-configured XGBoost classifier per feature view; each is trained on
# its own training split so its feature importances can be used for selection.
modelk, modelr, modelg = (xgboost.XGBClassifier() for _ in range(3))

modelk.fit(Xktrain, Yktrain)
modelr.fit(Xrtrain, Yrtrain)
# Left as the final bare expression so the cell still displays the fitted estimator.
modelg.fit(Xgtrain, Ygtrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [48]:
def top_feature_indices(importances, fraction=0.50):
    """Return the indices of the highest-scoring features, best first.

    Parameters
    ----------
    importances : array-like of shape (n_features,)
        One importance score per feature column.
    fraction : float, default 0.50
        Share of the features to keep. The number kept is
        ``int(n_features * fraction)``, i.e. rounded down.

    Returns
    -------
    numpy.ndarray
        Column indices ordered by decreasing importance.
    """
    importances = np.asarray(importances)
    n_keep = int(len(importances) * fraction)
    # argsort is ascending, so reverse it before taking the leading n_keep entries.
    return importances.argsort()[::-1][:n_keep]


# Keep the top half of each view's columns, ranked by its own model's importances.
SFk = top_feature_indices(modelk.feature_importances_)
SFr = top_feature_indices(modelr.feature_importances_)
SFg = top_feature_indices(modelg.feature_importances_)

In [49]:
# Show the k-mer column indices kept by the selection step (highest importance first).
SFk

array([293,  10,  13,   1, 141, 109,  72, 113, 230, 232, 262,   2, 194,
       131,  60, 226, 267, 336, 183, 171,   8,  31, 178,  42, 132, 103,
       217, 264, 162,  16, 191,  90, 321, 328, 312, 258,  45, 281, 195,
       196, 174, 306, 337, 266, 135, 286,  58, 148,  75, 168, 124,  66,
       270, 129, 139,  91, 242, 176,   3, 151, 252,  43, 233, 333, 156,
       185,  40,  12,  29,  26,  80, 107, 147,   7,  20, 325,  14, 105,
        99, 298,  70,  21,  61,  41,  83,  34,  65, 138, 137,  19, 173,
       106, 209,  77, 301,  17,   0,  92,  24, 255,   4,  39, 245, 244,
       179, 120, 204, 319, 133,  73,  67, 278, 303, 329, 207,  74, 199,
       284,  84,  30,  23,  22, 305,  44,  63, 210,  28, 330, 172,  81,
        94, 102, 159, 136, 308, 260, 316, 299, 275, 269, 127, 259,  95,
       152,  38, 119, 224, 221,  55, 323,  48, 111,  35, 161,  53, 250,
        27,  76, 307,  79,  18, 318, 283, 205, 214, 121,  97, 320,  78,
       213])

In [50]:
# Show the reverse-k-mer column indices kept by the selection step (highest importance first).
SFr

array([ 10, 158, 317,  45, 268,   2, 239, 256,  13, 156, 236, 221, 143,
       154,  27, 231,  28, 127,   1, 126, 161,  91,  58,  37,  90, 297,
        78, 153,  16, 138,  18,  48,  76, 106, 176, 174, 181,  43,   7,
       338,   9,   8, 146, 108, 276,  59, 286, 212, 263, 327, 281, 250,
        21,   3, 208, 308,  30, 119, 135,  34, 326,  85, 255, 188,  87,
        83, 131, 216, 309, 264, 117,  57,  19, 229,  73,   4, 260, 306,
         0,  25,  17,  53,  46,  51, 191, 273,  12,  44,  35, 253,  52,
        89, 321, 155,  67, 139, 205,  79, 107, 102,  68, 262,  94, 238,
       104, 307, 213,  99, 304, 230,  47,  32,  60, 165, 168, 329,  75,
       257,  77,  39, 190, 323, 217, 280, 101, 246, 311, 209, 299, 197,
       183,  62,  20, 271,  55, 248, 207, 334, 265, 312,  14,  36,  65,
       318, 179,  40, 322, 148, 147, 291,  95, 111,  70, 136,  42, 305,
        24, 337, 123, 290, 259, 226,  29, 172, 214, 228, 296, 328, 227,
       244])

In [51]:
# Show the gapped-k-mer column indices kept by the selection step (highest importance first).
SFg

array([41,  6,  9, 44, 19, 21, 12, 60, 25,  5, 35, 57, 26, 42, 43,  2, 71,
       34, 32, 10, 58, 74, 51, 73, 31, 37, 30, 38, 53, 69, 49, 65, 47, 76,
        3, 67, 22,  4, 55, 77])

<br>

* Feature Elimination

In [52]:
# Reduce every training matrix to its selected columns, in importance order.
# NOTE(review): the names are overwritten in place, so this cell must run exactly
# once per kernel session - a second run indexes the already-reduced arrays.
Xktrain, Xrtrain, Xgtrain = (
    Xktrain[:, SFk],
    Xrtrain[:, SFr],
    Xgtrain[:, SFg],
)

In [53]:
# Reduce every test matrix to the same selected columns as its training matrix.
# NOTE(review): the names are overwritten in place, so this cell must run exactly
# once per kernel session - a second run indexes the already-reduced arrays.
Xktest, Xrtest, Xgtest = (
    Xktest[:, SFk],
    Xrtest[:, SFr],
    Xgtest[:, SFg],
)

In [54]:
# Join the three reduced views column-wise into one design matrix per split.
# The column order is k-mer, reverse k-mer, gapped k-mer for both train and test.
Xtrain = np.hstack((Xktrain, Xrtrain, Xgtrain))
Xtest = np.hstack((Xktest, Xrtest, Xgtest))

<br>


* Prediction

In [56]:
# Train a default XGBoost classifier on the combined features and predict the
# held-out rows. Yktrain is used as the label vector; the three label splits
# were produced with the same seed from the same Y.
model = xgboost.XGBClassifier().fit(Xtrain, Yktrain)
Yp = model.predict(Xtest)

In [57]:
# Share of held-out rows whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=Yktest, y_pred=Yp)
print(test_accuracy)

0.7169230769230769
