In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Importing the proteins data
# ns2b collects the sequence lines, ns2bd collects one 0/1 label per header line
ns2b = []
ns2bd = []

# `with` guarantees the file handle is closed, even if parsing raises
with open("../../../../../../Data/Proteins/DENV1/NS2B/DENV1_NS2B.txt", "r") as f:
    for x in f:
        if x.startswith(">"):
            # Header line: label 1 when it mentions DSS or DHF, otherwise 0.
            # Only header lines are inspected, so a sequence that happens to
            # contain the residues "DSS" or "DHF" is not mistaken for a label.
            ns2bd.append(1 if ("DSS" in x or "DHF" in x) else 0)
        else:
            # Sequence line: store it without the trailing newline
            ns2b.append(x.replace("\n", ""))

In [3]:
# Build the labelled DataFrame in one chain:
#   - the sequence list becomes a single column (initially named 0)
#   - the label list is attached as the "Disease" column
#   - column 0 is renamed to "Sequence" and the index labels are converted to str
ns2b = (
    pd.DataFrame(ns2b)
    .assign(Disease=ns2bd)
    .rename(index=str, columns={0: "Sequence"})
)

In [4]:
# Reporting the size of the labelled data and whether it contains nulls
print("The combined shape of the given data is:", str(ns2b.shape))
print("The length of the combined data is:", str(len(ns2b.index)))
print("Does the combined data have any null value? ->", ns2b.isnull().values.any())

# Dropping every row that has a missing value in any column
ns2b = ns2b.dropna(how='any', axis=0)

# Shuffling the data and then taking a peek.
# random_state pins the permutation so the row order (which the later
# cross-validation splits depend on) is the same on every run.
ns2b = ns2b.sample(frac=1, random_state=0)
print(ns2b.head())

The combined shape of the given data is: (999, 2)
The length of the combined data is: 999
Does the combined data have any null value? -> False
                                              Sequence  Disease
481  SWPLNEGIMAIGIVSILLSSLLKNDVPLAGPLIAGGMLIACYVISG...        0
773  SWPLNEGIMAVGIVSILLSALLKNDVPLAGPLIAGGMLIACYVISG...        1
623  SWPLNEGIMAVGIVSILLSSLLKNDVPLAGPLIAGGMLIACYVISG...        1
704  SWPLNEGIMAVGIVSILLSSLLKNDVPLAGPLIAGGMLIACYVISG...        1
741  SWPLNEGIMAVGIVSILLSSLLKNDVPLAGPLIAGGMLIACYVISG...        1


In [5]:
# Function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size = 6):
    """Return every overlapping window of `size` characters, lower-cased.

    Windows start at each position from 0 up to len(sequence) - size, so a
    sequence shorter than `size` yields an empty list.
    """
    kmers = []
    for start in range(len(sequence) - size + 1):
        window = sequence[start:start + size]
        kmers.append(window.lower())
    return kmers

# Tokenise each row's sequence into its list of k-mer words, then discard the raw sequence
ns2b['words'] = ns2b.apply(lambda row: getKmers(row['Sequence']), axis=1)
ns2b = ns2b.drop(columns='Sequence')

# One space-separated "sentence" of k-mer words per row
ns2b_texts = [' '.join(words) for words in ns2b['words']]

In [6]:
# Creating y (the "Disease" labels) and printing the shape of it.
# The column is selected by name instead of position: position 0 is only the
# label column once "Sequence" has been dropped, so a positional lookup would
# silently return the wrong column if the frame layout differed.
y = ns2b["Disease"].values
print("The shape of y is:", y.shape)

The shape of y is: (999,)


In [7]:
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting: every entry of ns2b_texts is one
# sequence's k-mer words joined by spaces, so the vectorizer's tokens are k-mers
# The n-gram size of 4 was previously determined by testing
# (ngram_range=(4,4) keeps only features made of exactly 4 consecutive tokens)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
# Learn the vocabulary and build the count matrix in one pass
# NOTE: this rebinds `x`, which was the line variable of the file-reading loop
x = cv.fit_transform(ns2b_texts)

# Print the shape of x (rows = sequences, columns = distinct n-gram features)
print("The shape of x matrix is:", x.shape)

The shape of x matrix is: (999, 643)


## Nested Cross-Validation

In [8]:
from sklearn.model_selection import cross_val_score
# candidate numbers of trees (n_estimators) to evaluate for the Random Forest
Tree_list = [10, 100, 1000, 2500, 5000]
# empty list to store the mean cross-validation score of each candidate
RF_scores = []

In [9]:
from sklearn.ensemble import RandomForestClassifier
# Score every candidate forest size with 10-fold cross-validation
for trees in Tree_list:
    # Random Forest with `trees` estimators, entropy criterion and a fixed seed
    forest = RandomForestClassifier(n_estimators=trees, criterion='entropy', random_state=0)
    # Accuracy of this forest size on each of the 10 folds
    scores = cross_val_score(forest, x, y, cv=10, scoring='accuracy')
    # Keep the mean fold accuracy, in the same order as Tree_list
    mean_accuracy = scores.mean()
    RF_scores.append(mean_accuracy)

    print("The score for", trees, "trees, is:", mean_accuracy)

The score for 10 trees, is: 0.5736185618561856
The score for 100 trees, is: 0.5716183618361835
The score for 1000 trees, is: 0.5665981598159815
The score for 2500 trees, is: 0.5656080608060805
The score for 5000 trees, is: 0.5635981598159816


In [10]:
# Finer grid of tree counts around 10: every value from 6 to 14 except 10 itself,
# which is already part of the coarse grid
Tree_list2 = [n for n in range(6, 15) if n != 10]
# Collects the mean cross-validation score of each entry of Tree_list2
RF_scores2 = []

In [11]:
# Score every forest size of the finer grid with 10-fold cross-validation
for trees in Tree_list2:
    # Random Forest with `trees` estimators, entropy criterion and a fixed seed
    forest = RandomForestClassifier(n_estimators=trees, criterion='entropy', random_state=0)
    # Accuracy of this forest size on each of the 10 folds
    scores = cross_val_score(forest, x, y, cv=10, scoring='accuracy')
    # Keep the mean fold accuracy, in the same order as Tree_list2
    mean_accuracy = scores.mean()
    RF_scores2.append(mean_accuracy)

    print("The score for", trees, "trees, is:", mean_accuracy)

The score for 6 trees, is: 0.5776688668866886
The score for 7 trees, is: 0.5776488648864887
The score for 8 trees, is: 0.5706682668266827
The score for 9 trees, is: 0.5756286628662867
The score for 11 trees, is: 0.5736185618561856
The score for 12 trees, is: 0.5755987598759876
The score for 13 trees, is: 0.5726185618561856
The score for 14 trees, is: 0.5816185618561857
