In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Importing the proteins data
# ns4a collects the sequence lines, ns4ad the 0/1 label derived from each header line
ns4a = []
ns4ad = []

# The context manager closes the file handle even if parsing fails part-way through
with open("../../../../../../Data/Proteins/DENV2/NS4A/DENV2_NS4A.txt", "r") as f:
    for line in f:
        line = line.rstrip("\n")
        # Skip blank lines so they are not stored as empty sequences
        if not line:
            continue
        if line[0] == ">":
            # Header line: label 1 when it mentions "DSS" or "DHF", otherwise 0.
            # The substring test is applied to headers only, so a sequence that
            # happens to contain those letters is never mistaken for a label.
            ns4ad.append(1 if ("DSS" in line or "DHF" in line) else 0)
        else:
            # Any other non-empty line is kept as sequence data
            ns4a.append(line)

In [3]:
# Build the working DataFrame: the sequence strings land in column 0 and the
# label list is attached next to them as the "Disease" column
ns4a = pd.DataFrame(ns4a).assign(Disease=ns4ad)

# Give the sequence column a readable name and turn the row labels into strings
ns4a = ns4a.rename(index=str, columns={0: "Sequence"})

In [4]:
# Sanity-checking the assembled DataFrame: shape, row count and missing values
print("The combined shape of the given data is:", str(ns4a.shape))
print("The length of the combined data is:", str(len(ns4a.index)))
print("Does the combined data have any null value? ->", ns4a.isnull().values.any())

# Drop every row that contains at least one missing value
ns4a = ns4a.dropna(how='any',axis=0)

# Shuffling the data and then taking a peek
# A fixed random_state makes the row order identical on every run, so any later
# step that depends on row order gives reproducible results
ns4a = ns4a.sample(frac = 1, random_state = 0)
print(ns4a.head())

The combined shape of the given data is: (603, 2)
The length of the combined data is: 603
Does the combined data have any null value? -> False
                                              Sequence  Disease
394  SLALNLITEMGRLPTFMTQKTRDALDNLAVLHTAEAGGRAYNHALS...        1
417  SLTLNLITEMGRLPTFMTQKARNALDNLAVLHTAEAGGRAYNHALS...        1
200  SLTLNLITEMGRLPTFMTQKARDALDNLAVLHTAEVGGRAYNHALS...        0
406  SLTLNLITEMGRLPTFMTQKARNALDNLAVLHTAEAGGRAYNHALS...        1
548  SLTLNLITEMGRLPTFMTQKTRDALDNLAVLHTAEAGGRAYNHALS...        1


In [5]:
# Split a sequence string into overlapping, lower-cased k-mer words (hexamers by default)
def getKmers(sequence, size = 6):
    """Return every overlapping window of length `size` from `sequence`, lower-cased.

    Windows start at each position from 0 up to len(sequence) - size, so a
    sequence shorter than `size` yields an empty list.
    """
    kmers = []
    last_start = len(sequence) - size
    for start in range(last_start + 1):
        window = sequence[start:start + size]
        kmers.append(window.lower())
    return kmers

# Tokenise every sequence into overlapping k-mer words (getKmers default size).
# Series.apply passes each sequence string straight to getKmers, which avoids
# building a row object per record as a row-wise DataFrame.apply would.
ns4a['words'] = ns4a['Sequence'].apply(getKmers)
# The raw sequence column is not needed once the words have been extracted
ns4a = ns4a.drop('Sequence', axis=1)

# One space-separated "sentence" of k-mer words per sequence
ns4a_texts = [' '.join(words) for words in ns4a['words']]

In [6]:
# Creating y (the label vector) and printing the shape of it
# The label column is selected by name rather than by position, so the result
# does not silently change if the column order of ns4a ever does
y = ns4a["Disease"].values
print("The shape of y is:", y.shape)

The shape of y is: (603,)


In [7]:
# Bag-of-words features built with CountVectorizer()
# Each document is one space-separated string of k-mer words, and the
# vectorizer counts runs of consecutive words in it
from sklearn.feature_extraction.text import CountVectorizer

# Both bounds are 4, so only n-grams of exactly four consecutive words are counted
# (per the original note, this size was previously determined by testing)
ngram_bounds = (4, 4)
cv = CountVectorizer(ngram_range=ngram_bounds)
x = cv.fit_transform(ns4a_texts)

# Report the dimensions of the resulting feature matrix
print("The shape of x matrix is:", x.shape)

The shape of x matrix is: (603, 436)


## Nested Cross-Validation

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Coarse grid of candidate values for the SVM parameter C, one per power of ten
C_list = [
    0.0001, 0.001, 0.01, 0.1,
    1, 10, 100, 1000,
]
# Will hold one mean cross-validation score per entry of C_list
C_scores = list()

In [9]:
# Coarse search: 10-fold cross-validated accuracy of an RBF-kernel SVM for every C in C_list
# Start from an empty list so that re-running this cell does not append
# duplicate entries and leave C_scores misaligned with C_list
C_scores = []
for c in C_list:
    # RBF-kernel SVM with C = c
    svm = SVC(C = c, random_state = 0, kernel = "rbf")
    # One accuracy value per cross-validation fold
    scores = cross_val_score(svm, x, y, cv = 10, scoring = "accuracy")
    # Average over the folds once and reuse it for storage and reporting
    mean_score = scores.mean()
    C_scores.append(mean_score)

    print("The score for C =", c, "is:", mean_score)



The score for C = 0.0001 is: 0.6368111512457164




The score for C = 0.001 is: 0.6368111512457164




The score for C = 0.01 is: 0.6368111512457164




The score for C = 0.1 is: 0.6368111512457164




The score for C = 1 is: 0.7266273038807076




The score for C = 10 is: 0.7415180142632212




The score for C = 100 is: 0.7249606372140409




The score for C = 1000 is: 0.7249879596184126




In [10]:
# Finer grid of candidate values for the SVM parameter C, from 6 to 30
# (10 is absent from this list; it is already part of the coarse C_list grid)
C_list2 = [
    6, 7, 8, 9,
    11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
    25, 30,
]
# Will hold one mean cross-validation score per entry of C_list2
C_scores2 = list()

In [11]:
# Fine search: 10-fold cross-validated accuracy of an RBF-kernel SVM for every C in C_list2
# Start from an empty list so that re-running this cell does not append
# duplicate entries and leave C_scores2 misaligned with C_list2
C_scores2 = []
for c in C_list2:
    # RBF-kernel SVM with C = c
    svm = SVC(C = c, random_state = 0, kernel = "rbf")
    # One accuracy value per cross-validation fold
    scores = cross_val_score(svm, x, y, cv = 10, scoring = "accuracy")
    # Average over the folds once and reuse it for storage and reporting
    mean_score = scores.mean()
    C_scores2.append(mean_score)

    print("The score for C =", c, "is:", mean_score)



The score for C = 6 is: 0.7398513475965546




The score for C = 7 is: 0.7398513475965546




The score for C = 8 is: 0.7398513475965546




The score for C = 9 is: 0.7415180142632212




The score for C = 11 is: 0.7365180142632213




The score for C = 12 is: 0.7348786700009262




The score for C = 13 is: 0.7348786700009262




The score for C = 14 is: 0.7348786700009262




The score for C = 15 is: 0.7348786700009262




The score for C = 16 is: 0.7348786700009262




The score for C = 17 is: 0.7348786700009262




The score for C = 18 is: 0.7348786700009262




The score for C = 19 is: 0.7348786700009262




The score for C = 20 is: 0.7348786700009262




The score for C = 25 is: 0.7348786700009262




The score for C = 30 is: 0.7332120033342595


