In [1]:
# Importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the proteins data
ns2b = []
ns2bd = []
       
f = open("../../../../../../Data/Proteins/DENV1/NS2B/DENV1_NS2B.txt", "r")
for x in f:
    if "DSS" in x:
        ns2bd.append(1)
    elif "DHF" in x:
        ns2bd.append(1)
    elif x[0] == ">":
        ns2bd.append(0)
    else:
        x = x.replace("\n", "")
        ns2b.append(x)
        
# Converting the array into DataFrame
ns2b = pd.DataFrame(ns2b)

# Attaching the "Disease" label column to the input
ns2b["Disease"] = ns2bd

# Renaming the columns
ns2b = ns2b.rename(index=str, columns={0: "Sequence", "Disease": "Disease"})

# clearing the memory
del ns2bd

In [3]:
print("The combined shape of the given data is:", str(ns2b.shape))
print("The length of the combined data is:", str(len(ns2b.index)))
print("Does the combined data have any null value? ->", ns2b.isnull().values.any())

ns2b = ns2b.dropna(how = 'any',axis = 0) 

# Shuffling the data and then taking a peek
ns2b = ns2b.sample(frac = 1)

The combined shape of the given data is: (999, 2)
The length of the combined data is: 999
Does the combined data have any null value? -> False


In [4]:
# Function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size = 6):
    return [sequence[x:x + size].lower() for x in range(len(sequence) - size + 1)]

ns2b['words'] = ns2b.apply(lambda x: getKmers(x['Sequence']), axis=1)
ns2b = ns2b.drop('Sequence', axis=1)

ns2b_texts = list(ns2b['words'])
for item in range(len(ns2b_texts)):
    ns2b_texts[item] = ' '.join(ns2b_texts[item])

In [5]:
# Creating y and printing the shape of it
y = ns2b.iloc[:, 0].values
print("The shape of y is:", y.shape)

# clearing the memory
del ns2b

The shape of y is: (999,)


In [6]:
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range = (4,4))
x = cv.fit_transform(ns2b_texts)

# clearing the memory
del cv

In [7]:
# Splitting the human dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 42, stratify=y)

# Printing the shapes of the train and test matrices
print("The shape of x_train is:", X_train.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of x_test is:", X_test.shape)
print("The shape of y_test is:", y_test.shape)

# clearing the memory
del x
del y

The shape of x_train is: (799, 643)
The shape of y_train is: (799,)
The shape of x_test is: (200, 643)
The shape of y_test is: (200,)


## Naive Bayes

In [8]:
# Multinomial Naive Bayes Classifier #
# The alpha parameter was determined by grid search previously
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
X_train = X_train.toarray()
classifier.fit(X_train, y_train)

# Making a prediction of the test set sequences
X_test = X_test.toarray()
y_pred = classifier.predict(X_test)

# clearing the memory
del classifier
del X_train
del X_test
del y_train

In [9]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_pred, name='Predicted')))

def get_metrics(y_test, y_predicted):
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = {} \nprecision = {} \nrecall = {} \nf1 = {}".format(accuracy, precision, recall, f1))

# clearing the memory
del y_pred
del accuracy
del precision
del recall
del f1

Confusion matrix

Predicted   0   1
Actual           
0          31  89
1           2  78
accuracy = 0.545 
precision = 0.750462710941753 
recall = 0.545 
f1 = 0.4957688338493293
