In [1]:
# Importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the protein data
# The input is read as FASTA-style text: lines starting with ">" are record
# headers, every other non-blank line is taken as a peptide sequence.
k2 = []   # peptide sequences
k2d = []  # label per header: 1 when the header mentions DSS or DHF, else 0

# The context manager closes the file handle even if parsing raises.
with open("../../../../../../Data/Proteins/DENV1/2K/DENV1_2Kpeptide.txt", "r") as f:
    for x in f:
        x = x.replace("\n", "")
        if not x:
            # A blank line is neither a header nor a sequence; skipping it
            # keeps the sequence and label lists the same length.
            continue
        if x[0] == ">":
            # Only header lines are searched for the DSS/DHF tags, so a
            # peptide that happens to contain the residues "DSS" or "DHF"
            # is not mistaken for a label.
            k2d.append(1 if ("DSS" in x or "DHF" in x) else 0)
        else:
            k2.append(x)

# Converting the array into DataFrame
k2 = pd.DataFrame(k2)

# Attaching the "Disease" label column to the input
# (raises ValueError if the number of labels and sequences differ)
k2["Disease"] = k2d

# Renaming the columns; index=str converts the row labels to strings
k2 = k2.rename(index=str, columns={0: "Sequence", "Disease": "Disease"})

# clearing the memory
del k2d

In [3]:
# Report the size of the parsed data and whether any cell is missing
print("The combined shape of the given data is:", str(k2.shape))
print("The length of the combined data is:", str(len(k2.index)))
print("Does the combined data have any null value? ->", k2.isnull().values.any())

# Drop every row that has a missing value in any column
k2 = k2.dropna(how = 'any', axis = 0)

# Shuffling the data
# A fixed seed keeps the row order identical across runs, so everything
# derived from this frame is reproducible after a kernel restart.
k2 = k2.sample(frac = 1, random_state = 42)

In [4]:
# Split a sequence string into overlapping, lower-cased k-mer words (hexamers by default)
def getKmers(sequence, size = 6):
    """Return every window of `size` consecutive characters of `sequence`.

    Windows advance one character at a time and each one is lower-cased.
    When `sequence` is shorter than `size` the result is an empty list.
    """
    last_start = len(sequence) - size
    kmers = []
    for start in range(last_start + 1):
        window = sequence[start:start + size]
        kmers.append(window.lower())
    return kmers

# Tokenise every peptide into its list of k-mer words. getKmers is applied to
# the 'Sequence' column directly instead of row by row over the whole frame:
# only that one column is needed, and the result is always a Series of lists.
k2['words'] = k2['Sequence'].apply(getKmers)
k2 = k2.drop('Sequence', axis=1)

# Join each record's k-mers into one space-separated string
k2_texts = [' '.join(words) for words in k2['words']]

The combined shape of the given data is: (999, 2)
The length of the combined data is: 999
Does the combined data have any null value? -> False
                    Sequence  Disease
766  TPQDNQLAYVVIGLLFMILTVAA        1
672  TPQDNQLAYVVIGLLFMILTVAA        1
553  TPQDNQLAYVVIGLLFMILTVAA        0
560  TPQDNQLAYVVIGLLFMILTVAA        0
926  TPQDNQLAYVVIGLLFMILTVAA        1


In [5]:
# Creating y and printing the shape of it
# The label column is selected by name rather than by position, so this cell
# does not depend on which columns earlier cells added or dropped.
y = k2["Disease"].values
print("The shape of y is:", y.shape)

# Bar chart of the number of samples in each class
k2["Disease"].value_counts().sort_index().plot.bar()

# clearing the memory
del k2

In [6]:
# Bag-of-words features built with CountVectorizer()
# Counting n-grams of the k-mer words is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer

ngram_bounds = (4, 4)  # lower and upper n-gram size are both 4
cv = CountVectorizer(ngram_range=ngram_bounds)
x = cv.fit_transform(k2_texts)

# Drop the vectorizer (and the helper tuple) now that the matrix is built
del cv, ngram_bounds

The shape of y is: (999,)


In [7]:
# Hold out 20% of the samples as a test set. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the dimensions of each partition
print("The shape of x_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of x_test is:", X_test.shape)
print("The shape of y_test is:", y_test.shape)

# Release the unsplit feature matrix and label vector
del x, y

The shape of x matrix is: (999, 49)


## XGBoost

In [11]:
# Wrap the train and test partitions in XGBoost's DMatrix container
import xgboost as xgb
D_test = xgb.DMatrix(X_test, label=y_test)
D_train = xgb.DMatrix(X_train, label=y_train)

# Booster configuration
param = {
    'eta': 0.01,                    # learning rate (step-size shrinkage)
    'max_depth': 3,                 # maximum depth of each tree
    'objective': 'multi:softprob',  # output one probability per class
    'num_class': 2,                 # number of classes for the objective
}

# Number of boosting rounds
steps = 20

# Fit the booster on the training matrix
model = xgb.train(param, D_train, num_boost_round=steps)

In [14]:
# Metrics
# numpy is imported here because the notebook's import cell does not provide
# `np`; without it this cell raises NameError on a fresh kernel.
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

# With the 'multi:softprob' objective the booster is expected to return one
# row of class probabilities per test sample.
preds = model.predict(D_test)
# Predicted class = index of the largest probability in each row
best_preds = np.argmax(preds, axis=1)

print("accuracy = {}".format(accuracy_score(y_test, best_preds)))
# Macro averaging: the unweighted mean of the per-class scores
print("precision = {}".format(precision_score(y_test, best_preds, average='macro')))
print("recall = {}".format(recall_score(y_test, best_preds, average='macro')))

# clearing the memory
del model
del preds
del best_preds

accuracy = 0.6
precision = 0.3
recall = 0.5


  'precision', 'predicted', average, warn_for)
