# XGBoost Model

## Imports

In [None]:
# Preprocessing and encoding variables
import pandas as pd
import numpy as np
from scipy.stats import uniform, randint

# Using scikit-learn to encode labels and split data into training and testing sets

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Import the classifier we are using
import xgboost as xgb
from xgboost import XGBClassifier

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Visualising feature importance and making plots
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, precision_score, recall_score, roc_auc_score

In [None]:
def _load_genome(csv_path):
    """Read one genome CSV and return it without its first column."""
    table = pd.read_csv(csv_path)
    # The first CSV column is discarded (presumably a saved row index -- TODO confirm).
    return table.iloc[:, 1:]


G1 = _load_genome('../input/genomes/G1_translated.csv')
G2 = _load_genome('../input/genomes/G2_translated.csv')
G3 = _load_genome('../input/genomes/G3_translated.csv')
G4 = _load_genome('../input/genomes/G4_translated.csv')
G5 = _load_genome('../input/genomes/G5_translated.csv')
G6 = _load_genome('../input/genomes/G6_translated.csv')
G7 = _load_genome('../input/genomes/G7_translated.csv')
G8 = _load_genome('../input/genomes/G8_translated.csv')
G9 = _load_genome('../input/genomes/G9_translated.csv')
G10 = _load_genome('../input/genomes/G10_translated.csv')
# Genomes G11..G15 are currently not loaded:
#G11 = _load_genome('../input/genomes/G11_translated.csv')
#G12 = _load_genome('../input/genomes/G12_translated.csv')
#G13 = _load_genome('../input/genomes/G13_translated.csv')
#G14 = _load_genome('../input/genomes/G14_translated.csv')
#G15 = _load_genome('../input/genomes/G15_translated.csv')

In [None]:
# Down-sample the genomes to 2000 rows each.
# A fixed seed makes the draw reproducible: without it every notebook run
# selects different rows, so the train/test split and all reported scores
# change from run to run.
# NOTE: `sample(2000)` raises ValueError if a genome has fewer than 2000 rows.
SAMPLE_SEED = 42

G1 = G1.sample(2000, random_state=SAMPLE_SEED)
G2 = G2.sample(2000, random_state=SAMPLE_SEED)
G3 = G3.sample(2000, random_state=SAMPLE_SEED)
G4 = G4.sample(2000, random_state=SAMPLE_SEED)
G5 = G5.sample(2000, random_state=SAMPLE_SEED)
# G6 is kept at full size (sampling disabled):
#G6 = G6.sample(4000, random_state=SAMPLE_SEED)
G7 = G7.sample(2000, random_state=SAMPLE_SEED)
G8 = G8.sample(2000, random_state=SAMPLE_SEED)
# G9 is kept at full size (sampling disabled):
#G9 = G9.sample(3000, random_state=SAMPLE_SEED)
G10 = G10.sample(2000, random_state=SAMPLE_SEED)

In [None]:
#frames = [G1, G2, G3, G4, G5]
#G = pd.concat(frames)

frames = [G1, G2, G3, G4, G5, G6, G7, G8, G9, G10]
G = pd.concat(frames)

In [None]:
G = G.dropna()

In [None]:
G.tail(5)

In [None]:
print('The shape of our dataframe is:', G.shape)
print('Rows:', G.shape[0])
print('Columns:', G.shape[1])

## Import single genome

In [None]:
G = pd.read_csv('../input/genomes-part1/G5.features.csv').iloc[:, 1:]

In [None]:
G.tail(5)

In [None]:
print('The shape of our dataframe is:', G.shape)

In [None]:
G = G.dropna()
print('The shape of our dataframe is:', G.shape)

## Preprocessing

### Encoding target variables

In [None]:
def encode_feature(array):
    """Map the categorical values in ``array`` to integer codes.

    A fresh ``LabelEncoder`` is fitted on ``array`` and immediately applied
    to it, so the codes only cover the categories present in this input.

    :param array: array-like of category labels to be encoded
    :return: numerical array holding one integer code per input element
    """
    return preprocessing.LabelEncoder().fit_transform(array)

In [None]:
class_names = ['CDS', 'LORF']
targets = G["Type"].values
print(targets)

In [None]:
targets = encode_feature(targets)
print(targets)

In [None]:
print('The shape of our dataframe is:', G.shape)
print('Rows:', G.shape[0])
print('Columns:', G.shape[1])

### Selecting features and targets and converting data to arrays

In [None]:
# Feature table: every column except the target ('Type') and the
# 'Genome' / 'Dataset' columns
feature_frame = G.drop(columns=['Type', 'Genome', 'Dataset'])

# Keep the column order so feature importances can be mapped back to names
feature_names = list(feature_frame.columns)

# Model input as a plain numpy array
features = np.array(feature_frame)

# Prediction target: the integer-encoded 'Type' values
labels = targets

In [None]:
print('The shape of our features are:', features.shape)

### Split into training and testing sets

In [None]:
# Split the data into training and testing sets -> x = features and y = labels/targets
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [None]:
print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

## Train model - default XGBoost

- **learning_rate**: step size shrinkage used to prevent overfitting. Range is [0,1]
- **max_depth**: determines how deeply each tree is allowed to grow during any boosting round.
- **subsample**: percentage of samples used per tree. Low value can lead to underfitting.
- **colsample_bytree**: percentage of features used per tree. High value can lead to overfitting.
- **n_estimators**: number of trees you want to build.
- **objective**: determines the loss function to be used, e.g. `reg:squarederror` (called `reg:linear` in older XGBoost releases) for regression problems, `reg:logistic` for logistic regression, and `binary:logistic` for binary classification problems that output a probability. XGBoost also supports regularization parameters (listed below) to penalize models as they become more complex and reduce them to simple (parsimonious) models.

- **gamma**: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
- **alpha**: L1 regularization on leaf weights. A large value leads to more regularization.
- **lambda**: L2 regularization on leaf weights and is smoother than L1 regularization.

In [None]:
# Create an XGBoost classifier with a binary logistic objective and log-loss
# as the evaluation metric; all other hyperparameters keep their defaults.
# NOTE(review): the name `xgb` rebinds the `import xgboost as xgb` module
# alias from the imports cell, so the module is no longer reachable as `xgb`
# after this line. Later cells rely on this variable name.
xgb = XGBClassifier(objective="binary:logistic", random_state=42, eval_metric='logloss', use_label_encoder=False)

# Train the model using the training sets
xgb.fit(train_x, train_y)

In [None]:
# Make predictions for test data
y_pred = xgb.predict(test_x)

In [None]:
# Evaluate predictions
print("Accuracy score:", metrics.accuracy_score(test_y, y_pred))
# Precision, Recall and Roc_AUC score
print("Precision score:", metrics.precision_score(test_y, y_pred))
print("Recall score:", metrics.recall_score(test_y, y_pred))
#print("ROC_AUC score:", metrics.roc_auc_score(test_y, y_pred))

### Confusion matrix

In [None]:
# Plot the confusion matrix of the fitted classifier on the test split.
# NOTE(review): `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2. On newer versions the equivalent call is
# `metrics.ConfusionMatrixDisplay.from_estimator(xgb, test_x, test_y)`.
plot_confusion_matrix(xgb, test_x, test_y)  
plt.show()

# Optional: save the figure to disk (disabled)
#plt.savefig('confmatrix_g1_uneven.png', dpi=300, bbox_inches='tight', transparent=True)

## Hyperparameter search

In [None]:
def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    :param scores: sequence of numeric scores (e.g. cross-validation results)
    """
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"Scores: {scores}\nMean: {mean_score:.3f}\nStd: {std_score:.3f}")

In [None]:
def report_best_scores(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    :param results: ``cv_results_``-style mapping providing the entries
        'rank_test_score', 'mean_test_score', 'std_test_score' and 'params'
    :param n_top: ranks 1..n_top are reported; candidates that share a rank
        are all printed
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Indices of every candidate that holds this rank (ties included)
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

# Candidate search space for a randomized hyperparameter search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from [low, high).
# NOTE(review): `params` is rebound in the following cell before the search
# object is created, so this wider space (with colsample_bytree / subsample)
# is not the one actually searched.
params = {
    "colsample_bytree": uniform(0.7, 0.3),  # range [0.7, 1.0]
    "gamma": uniform(0, 0.5),  # range [0, 0.5]
    "learning_rate": uniform(0.03, 0.3), # range [0.03, 0.33]; XGBoost >= 1.0 default is 0.3 (0.1 was the pre-1.0 wrapper default)
    "max_depth": randint(2, 6), # integers 2..5; XGBoost >= 1.0 default is 6 (3 was the pre-1.0 wrapper default)
    "n_estimators": randint(100, 150), # integers 100..149; default 100
    "subsample": uniform(0.6, 0.4)  # range [0.6, 1.0]
}

In [None]:
# Base estimator whose hyperparameters are searched
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Search space: uniform(loc, scale) covers [loc, loc + scale],
# randint(low, high) covers the integers low..high-1
params = dict(
    gamma=uniform(0, 0.5),
    learning_rate=uniform(0.03, 0.3),
    max_depth=randint(2, 6),
    n_estimators=randint(100, 150),
)

# 10 random candidates, each scored with 3-fold cross-validation on all cores
xgb_search = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    random_state=42,
    n_iter=10,
    cv=3,
    verbose=1,
    n_jobs=-1,
    return_train_score=True,
)

xgb_search.fit(train_x, train_y)

In [None]:
report_best_scores(xgb_search.cv_results_, 1)

## Early stopping

In [None]:
# Early stopping must be monitored on data the final evaluation never sees.
# Using the test split as `eval_set` would tune the number of boosting rounds
# on the test data and make the test scores optimistically biased, so a
# validation split is carved out of the training data instead.
fit_x, val_x, fit_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# If more than one evaluation metric is given, the last one is used for early stopping
xgb_model = XGBClassifier(objective="binary:logistic", random_state=42, eval_metric="auc", use_label_encoder=False)

# Stop once the validation AUC has not improved for 5 consecutive rounds
xgb_model.fit(fit_x, fit_y, early_stopping_rounds=5, eval_set=[(val_x, val_y)])

In [None]:
y_pred = xgb_model.predict(test_x)

# Evaluate predictions
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

In [None]:
print("best score: {0}, best iteration: {1}, best ntree limit {2}".format(xgb_model.best_score, xgb_model.best_iteration, xgb_model.best_ntree_limit))

## Train new model with best params

In [None]:
# Create an XGBoost classifier limited to 29 boosting rounds.
# NOTE(review): 29 is hard-coded -- presumably copied from the best iteration
# reported by an earlier early-stopping run; confirm it still matches, since
# neither `xgb_search.best_params_` nor `xgb_model.best_iteration` is read here.
xgb = XGBClassifier(n_estimators=29, objective="binary:logistic", random_state=42, eval_metric='logloss', use_label_encoder=False)

# Train the model using the training sets
xgb.fit(train_x, train_y)

In [None]:
# Make predictions for test data
y_pred = xgb.predict(test_x)

In [None]:
# Evaluate predictions
print("Accuracy score:", metrics.accuracy_score(test_y, y_pred))
# Precision and recall are computed from the hard class predictions
print("Precision score:", metrics.precision_score(test_y, y_pred))
print("Recall score:", metrics.recall_score(test_y, y_pred))
# ROC AUC measures how well the model ranks samples, so it needs the
# predicted positive-class probability. Hard 0/1 predictions reduce the ROC
# curve to a single operating point and misstate the score.
pos_proba = xgb.predict_proba(test_x)[:, 1]
print("ROC_AUC score:", metrics.roc_auc_score(test_y, pos_proba))

## Selected features

In [None]:
feature_imp = pd.Series(xgb.feature_importances_, index = feature_names).sort_values(ascending=False)

In [None]:
selected = feature_imp.iloc[0:200]

In [None]:
selected_feat = list(selected.index)

In [None]:
important_indices = [feature_names.index(x) for x in selected_feat]

In [None]:
# Column positions of the selected features inside the feature matrix
important_indices = list(map(feature_names.index, selected_feat))

# Restrict both splits to the selected columns
train_important = train_x[:, important_indices]
test_important = test_x[:, important_indices]

# Fresh XGBoost classifier to be trained on the reduced feature set
xgb_select = XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

In [None]:
# Train the model using the training sets 
xgb_select.fit(train_important, train_y)

In [None]:
# Predict class labels for the test split using the reduced feature set
pred_y = xgb_select.predict(test_important)

# Model accuracy: how often is the classifier correct?
print("Accuracy score:", metrics.accuracy_score(test_y, pred_y))
# Precision and recall are computed from the hard class predictions
print("Precision score:", metrics.precision_score(test_y, pred_y))
print("Recall score:", metrics.recall_score(test_y, pred_y))
# ROC AUC measures how well the model ranks samples, so it needs the
# predicted positive-class probability. Hard 0/1 predictions reduce the ROC
# curve to a single operating point and misstate the score.
pos_proba_select = xgb_select.predict_proba(test_important)[:, 1]
print("ROC_AUC score:", metrics.roc_auc_score(test_y, pos_proba_select))