In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Importing the proteins data
k2 = []
k2d = []

f = open("../../../../../../Data/Proteins/DENV1/NS4B/DENV1_NS4B.txt", "r")
for x in f:
    if "DSS" in x:
        k2d.append(1)
    elif "DHF" in x:
        k2d.append(1)
    elif x[0] == ">":
        k2d.append(0)
    else:
        x = x.replace("\n", "")
        k2.append(x)

In [3]:
# Testing if the data was imported correctly
if len(k2) == len(k2d):
    print("Testing successful, both the disease column and the sequence column have equal lengths.")

Testing successful, both the disease column and the sequence column have equal lengths.


In [4]:
# Converting the arrays into DataFrames
k2 = pd.DataFrame(k2)

# Attaching the "Disease" label column to the input
k2["Disease"] = k2d

# Renaming the columns
k2 = k2.rename(index=str, columns={0: "Sequence", "Disease": "Disease"})

In [5]:
# Concatinating the input data into one DataFrame
print("The combined shape of the given data is:", str(k2.shape))
print("The length of the combined data is:", str(len(k2.index)))
print("Does the combined data have any null value? ->", k2.isnull().values.any())

# Shuffling the data and then taking a peek
k2 = k2.sample(frac = 1)
print(k2.head())

The combined shape of the given data is: (999, 2)
The length of the combined data is: 999
Does the combined data have any null value? -> False
                                              Sequence  Disease
327  NEMGLLETTKKDLGIGHVAVENHHHAAMLDVDLHPASAWTLYAVAT...        0
463  NEMGLLETTKKDLGIGHVAAENHQHATMLDVDLRPASAWTLYAVAT...        0
483  NEMGLLETTKKDLGIGHVAAENHQHATMLDVDLRPASAWTLYAVAT...        0
391  NEMGLLETTKKDLGIGHVAVENHHHATMLDVDLHPASAWTLYAVAT...        0
684  NEMGLLETTKKDLGIGHVAAENHHHAAMLDVDLHPASAWTLYAVAT...        1


In [6]:
# Function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size = 6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

k2['words'] = k2.apply(lambda x: getKmers(x['Sequence']), axis=1)
k2 = k2.drop('Sequence', axis=1)

k2_texts = list(k2['words'])
for item in range(len(k2_texts)):
    k2_texts[item] = ' '.join(k2_texts[item])

In [7]:
# Creating y and printing the shape of it
y = k2.iloc[:, 0].values
print("The shape of y is:", y.shape)

The shape of y is: (999,)


In [8]:
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
x = cv.fit_transform(k2_texts)

# Print the shape of x
print("The shape of x matrix is:", x.shape)

The shape of x matrix is: (999, 732)


In [9]:
# Splitting the human dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 42, stratify=y)

# Printing the shapes of the train and test matrices
print("The shape of x_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of x_test is:", X_test.shape)
print("The shape of y_test is:", y_test.shape)

The shape of x_train is: (799, 732)
The shape of y_train is: (799,)
The shape of x_test is: (200, 732)
The shape of y_test is: (200,)


## Nested Cross-Validation

In [10]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
# search for an optimal value of eta for XGBoost
ETA_list = [0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.25, 0.3]
# empty list to store scores
scores = []
MD_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

In [11]:
# 1. we will loop through reasonable values of C
for eta in ETA_list:
    print("\nETA =", eta)
    for md in MD_list:
        # Defining the parameters
        param = {
            'eta': eta, 
            'max_depth': md,
            'num_class': 2} 

        steps = 20  # The number of training iterations
        print("\tMD =", md)
        D_train = xgb.DMatrix(X_train, label=y_train)
        D_test = xgb.DMatrix(X_test, label=y_test)
        model = xgb.train(param, D_train, steps)
        from sklearn.metrics import accuracy_score
        preds = model.predict(D_test)
        best_preds = np.asarray([np.argmax(line) for line in preds])

        print("accuracy = {}".format(accuracy_score(y_test, best_preds)))
        scores.append(accuracy_score(y_test, best_preds))
print("The maximum accuracy is", max(scores))


ETA = 0.0001
	MD = 1
accuracy = 0.6
	MD = 2
accuracy = 0.6
	MD = 3
accuracy = 0.6
	MD = 4
accuracy = 0.6
	MD = 5
accuracy = 0.6
	MD = 6
accuracy = 0.6
	MD = 7
accuracy = 0.6
	MD = 8
accuracy = 0.6
	MD = 9
accuracy = 0.6

ETA = 0.001
	MD = 1
accuracy = 0.6
	MD = 2
accuracy = 0.6
	MD = 3
accuracy = 0.6
	MD = 4
accuracy = 0.6
	MD = 5
accuracy = 0.6
	MD = 6
accuracy = 0.6
	MD = 7
accuracy = 0.6
	MD = 8
accuracy = 0.6
	MD = 9
accuracy = 0.6

ETA = 0.01
	MD = 1
accuracy = 0.6
	MD = 2
accuracy = 0.6
	MD = 3
accuracy = 0.6
	MD = 4
accuracy = 0.6
	MD = 5
accuracy = 0.6
	MD = 6
accuracy = 0.6
	MD = 7
accuracy = 0.6
	MD = 8
accuracy = 0.6
	MD = 9
accuracy = 0.6

ETA = 0.1
	MD = 1
accuracy = 0.6
	MD = 2
accuracy = 0.6
	MD = 3
accuracy = 0.6
	MD = 4
accuracy = 0.6
	MD = 5
accuracy = 0.6
	MD = 6
accuracy = 0.6
	MD = 7
accuracy = 0.6
	MD = 8
accuracy = 0.6
	MD = 9
accuracy = 0.6

ETA = 0.15
	MD = 1
accuracy = 0.6
	MD = 2
accuracy = 0.6
	MD = 3
accuracy = 0.6
	MD = 4
accuracy = 0.6
	MD = 5
accuracy =