# Libraries and Functions

In [338]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, Flatten, Conv1D, Conv2D, Dropout, BatchNormalization, MaxPooling1D
from keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.metrics import matthews_corrcoef
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.initializers import Constant
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import math
import matplotlib.pyplot as plt

In [339]:
def parse_data(file_path):
    sequences, secondary_structures = [], []
    parsing_sequences = False
    with open(file_path, "r") as file:
        sequence = ""
        secondary_structure = ""
        for line in file:
            line = line.strip()
            if line == "<>" and not parsing_sequences:
                parsing_sequences = True
            elif line == "end" or line == "<end>":
                parsing_sequences = False
                sequence += "!"
                sequences.append(sequence)
                secondary_structures.append(secondary_structure)
                sequence = ""
                secondary_structure = ""
            elif parsing_sequences and line == "<>":
                sequences.append(sequence)
                secondary_structures.append(secondary_structure)
                sequence = ""
                secondary_structure = ""
            elif parsing_sequences:
                parts = line.split()
                if len(parts) == 2:
                    amino_acid, sec_structure = parts
                    sequence += amino_acid
                    secondary_structure += sec_structure
    return sequences, secondary_structures

def parse_data_2(file_path):
    sequences, secondary_structures = [], []
    parsing_sequences = False
    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            if line == "<>":
                parsing_sequences = True
            elif line == "end":
                parsing_sequences = False
            elif parsing_sequences:
                parts = line.split()
                if len(parts) == 2:
                    amino_acid, sec_structure = parts
                    sequences.append(amino_acid)
                    secondary_structures.append(sec_structure)
    return sequences, secondary_structures

# Function to read aligned sequences from a file
def read_aligned_sequences(file_path):
    sequences = {}
    with open(file_path, "r") as file:
        sequence_number = None
        sequence = ""
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if sequence_number is not None:
                    sequences[sequence_number] = sequence
                sequence_number = int(line.split("_")[1])
                sequence = ""
            else:
                sequence += line
        if sequence_number is not None and sequence:
            sequences[sequence_number] = sequence
    return sequences

# Function to align structures with sequences
def align_structure(structures, sequences):
    aligned = {}
    for i in sequences.keys():
        align = []
        sequence = sequences[i]
        structure = structures[i-1]

        j = 0
        for char in sequence:
            if char == '-':
                align.append('-')
            else:
                align.append(structure[j])
                j += 1
        aligned[i] = align
    return aligned

# Function to encode a sequence using one-hot encoding
# def encode_sequence(sequence):
#     amino_acids = 'ACDEFGHIKLMNPQRSTVWY!'
#     num_amino_acids = len(amino_acids)
#     encoded_seq = np.zeros((len(sequence), num_amino_acids), dtype=int)
#     for i, aa in enumerate(sequence):
#         if aa in amino_acids:
#             encoded_seq[i, amino_acids.index(aa)] = 1
#     return encoded_seq

# Function to encode a structure
def encode_structure(structure):
    mapping = {'-': 3, '_': 0, 'e': 1, 'h': 2}
    return [mapping[char] for char in structure]

def calculate_accuracy(y_true, y_pred):

    if len(y_pred.shape) == 2:
        y_pred_labels = np.argmax(y_pred, axis=1)
    else:
        y_pred_labels = y_pred

    class_accuracies = {}
    for class_idx in range(3):
        class_pred_labels = y_pred_labels[y_true == class_idx]
        class_true_labels = y_true[y_true == class_idx]

        class_accuracy = np.sum(class_pred_labels == class_true_labels) / len(class_true_labels)

        # class_names = {0: 'coil', 1: 'β-sheet', 2: 'α-helix'}
        class_accuracies[class_idx] = class_accuracy

    # for class_name, accuracy in class_accuracies.items():
    #     print(f"Accuracy for class {class_name}: {accuracy}")

    total_correct_predictions = np.sum(y_pred_labels == y_true)
    total_accuracy = total_correct_predictions / len(y_true)

    # print("Total Accuracy:", total_accuracy)

    return total_accuracy, class_accuracies

def preprocess_data(aligned_sequences, aligned_sequence_structures, pssm, window_size):
    X = []
    y = []

    for seq_id, sequence in aligned_sequences.items():
        encoded_seq = encode_sequence_new(sequence, pssm)
        encoded_struct = encode_structure(aligned_sequence_structures[seq_id])

        # Pad the sequence symmetrically around each position
        pad_width = ((window_size // 2, window_size // 2), (0, 0))
        padded_seq = np.pad(encoded_seq, pad_width, mode='constant')

        for i in range(len(sequence)):
            # Extract window_size elements centered around the current position
            window_start = i
            window_end = i + window_size
            window = padded_seq[window_start:window_end]

            X.append(window)
            y.append(encoded_struct[i])

    # Remove instances with padding
    for i in range(len(y) - 1, -1, -1):
        if y[i] == 3:
            X.pop(i)
            y.pop(i)

    return np.array(X), np.array(y)

def encode_sequence_new(sequence, pssm):
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    num_amino_acids = len(amino_acids)
    encoded_seq = np.zeros((len(sequence), num_amino_acids), dtype=float)
    for i, aa in enumerate(sequence):
        if aa != "-":
        # for j, a in enumerate(amino_acids):
        #     encoded_seq[i, j] = pssm[i][a]
            # encoded_seq[i, amino_acids.index(aa)] = pssm[i][aa]
            encoded_seq[i, amino_acids.index(aa)] = 1
    return encoded_seq

def create_windows(sequences, secondary_structures, window_size=13):
    X, y = [], []
    padded_sequences = ['_' for _ in range(window_size // 2)] + sequences + ['_' for _ in range(window_size // 2)]
    for i in range(len(sequences)):
        window = padded_sequences[i:i + window_size]
        X.append(window)
        y.append(secondary_structures[i])
    return X, y

def create_windows_new(sequences, window_size):
    num_samples, num_features = sequences.shape
    padded_seq = np.zeros((num_samples, window_size, num_features))
    pad_size = window_size // 2

    for i in range(num_samples):
        start_index = max(0, i - pad_size)
        end_index = min(num_samples, i + pad_size + 1)
        
        # Calculate the indices for the windowed data
        window_start = pad_size - (i - start_index)
        window_end = window_start + (end_index - start_index)
        
        # Copy the data into the padded array
        padded_seq[i, window_start:window_end, :] = sequences[start_index:end_index, :]
    
    return padded_seq

def preprocess_data_2(predictions, window_size):
    sequence_length, num_classes = predictions.shape
    pad_width = ((window_size // 2, window_size // 2), (0, 0))

    # Define windowed_predictions shape considering all windows
    windowed_predictions = np.empty((sequence_length, window_size, num_classes))

    # Pad predictions
    padded_predictions = np.pad(predictions, pad_width, mode='constant')

    # Collect windows into a temporary list (optional)
    windows = []
    for j in range(len(padded_predictions) - window_size + 1):
        window_start = j
        window_end = j + window_size
        window = padded_predictions[window_start:window_end]
        windows.append(window)

    # Convert windows list to NumPy array and assign to windowed_predictions
    windowed_predictions = np.array(windows)

    return windowed_predictions

def preprocess_labels(predictions, window_size):
    num_samples = predictions.shape[0]
    num_classes = 1  # Since predictions is 1D, we have only 1 class
    pad_width = ((window_size // 2, window_size // 2), (0, 0))

    # Define windowed_predictions shape considering all windows
    windowed_predictions = np.empty((num_samples, window_size, num_classes))

    # Pad predictions
    padded_predictions = np.pad(predictions[:, None], pad_width, mode='constant')

    # Collect windows into windowed_predictions directly
    for i in range(num_samples):
        window_start = i
        window_end = i + window_size
        windowed_predictions[i] = padded_predictions[window_start:window_end]

    return windowed_predictions

def calculate_correlation_coefficients(y_test, test_result):
    class_names = {0: 'coil', 1: 'β-sheet', 2: 'α-helix'}
    correlation_coefficients = {}
    for class_idx in range(3):
        actual_labels = (y_test == class_idx).astype(int)
        predicted_labels = (test_result == class_idx).astype(int)
        correlation_coefficient = matthews_corrcoef(actual_labels, predicted_labels)
        correlation_coefficients[class_names[class_idx]] = correlation_coefficient

    # Print correlation coefficients
    for class_name, correlation_coefficient in correlation_coefficients.items():
        print(f"MCC for class {class_name}: {correlation_coefficient}")


def calculate_pssm(aligned_sequences):
    pssm = {}
    sequence_length = len(list(aligned_sequences.values())[0])
    total_sequences = len(aligned_sequences)

    # Initialize counts for each position
    for i in range(sequence_length):
        pssm[i] = {'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 0, 'L': 0,
                   'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0, '-': 0}

    # Count occurrences of each amino acid at each position
    for sequence in aligned_sequences.values():
        for i, aa in enumerate(sequence):
            if aa != "-":
                pssm[i][aa] += 1
                
    # Normalize counts to frequencies and calculate background frequencies
    background_frequencies = {aa: 0 for aa in pssm[0].keys()}
    for i in range(sequence_length):
        total_count = sum(pssm[i].values())
        for aa in pssm[i]:
            if total_count != 0:
                pssm[i][aa] /= total_count
            background_frequencies[aa] += pssm[i][aa] / total_sequences

    # Calculate log-odds scores
    for i in range(sequence_length):
        for aa in pssm[i]:
            if background_frequencies[aa] != 0:
                ratio = pssm[i][aa] / background_frequencies[aa]
                if ratio != 0:
                    pssm[i][aa] = math.log2(ratio)

    return pssm

def plot_history(history):
    # Plot training & validation accuracy values
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Plot training & validation loss values
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

# Data processing
Training and test files are preprocessed into different window sizes and profiled using a position-specific score matrices (pssm)

In [340]:
train_path = 'Q_and_s_data/protein-secondary-structure.train.txt'
train_sequences, train_structures = parse_data(train_path)
aligned_sequences = read_aligned_sequences("Fasta_and_msa/training_msa.txt")
aligned_sequence_structures = align_structure(train_structures, aligned_sequences)
pssm_train = calculate_pssm(aligned_sequences)

test_path = 'Q_and_s_data/protein-secondary-structure.test.txt'
test_sequences, test_structures = parse_data(test_path)
test_aligned_sequences = read_aligned_sequences("Fasta_and_msa/test_msa.txt")
test_aligned_sequence_structures = align_structure(test_structures, test_aligned_sequences)
pssm_test = calculate_pssm(test_aligned_sequences)

X_train_13, y_train = preprocess_data(aligned_sequences, aligned_sequence_structures, pssm_train, 13)
X_train_11, y_train = preprocess_data(aligned_sequences, aligned_sequence_structures, pssm_train, 11)
X_train_9, y_train = preprocess_data(aligned_sequences, aligned_sequence_structures, pssm_train, 9)
X_train_7, y_train = preprocess_data(aligned_sequences, aligned_sequence_structures, pssm_train, 7)
X_train_5, y_train = preprocess_data(aligned_sequences, aligned_sequence_structures, pssm_train, 5)

X_test_13, y_test = preprocess_data(test_aligned_sequences, test_aligned_sequence_structures, pssm_test, 13)
X_test_11, y_test = preprocess_data(test_aligned_sequences, test_aligned_sequence_structures, pssm_test, 11)
X_test_9, y_test = preprocess_data(test_aligned_sequences, test_aligned_sequence_structures, pssm_test, 9)
X_test_7, y_test = preprocess_data(test_aligned_sequences, test_aligned_sequence_structures, pssm_test, 7)
X_test_5, y_test = preprocess_data(test_aligned_sequences, test_aligned_sequence_structures, pssm_test, 5)

In [341]:
# For window size 13
middle = X_train_13.shape[1] // 2
new_train = X_train_13[:, middle, :]
X_train_13 = create_windows_new(new_train, 13)

middle = X_test_13.shape[1] // 2
new_test = X_test_13[:, middle, :]
X_test_13 = create_windows_new(new_test, 13)

# For window size 11
middle = X_train_11.shape[1] // 2
new_train_11 = X_train_11[:, middle, :]
X_train_11 = create_windows_new(new_train_11, 11)

middle = X_test_11.shape[1] // 2
new_test_11 = X_test_11[:, middle, :]
X_test_11 = create_windows_new(new_test_11, 11)

# For window size 9
middle = X_train_9.shape[1] // 2
new_train_9 = X_train_9[:, middle, :]
X_train_9 = create_windows_new(new_train_9, 9)

middle = X_test_9.shape[1] // 2
new_test_9 = X_test_9[:, middle, :]
X_test_9 = create_windows_new(new_test_9, 9)

# For window size 7
middle = X_train_7.shape[1] // 2
new_train_7 = X_train_7[:, middle, :]
X_train_7 = create_windows_new(new_train_7, 7)

middle = X_test_7.shape[1] // 2
new_test_7 = X_test_7[:, middle, :]
X_test_7 = create_windows_new(new_test_7, 7)

# For window size 5
middle = X_train_5.shape[1] // 2
new_train_5 = X_train_5[:, middle, :]
X_train_5 = create_windows_new(new_train_5, 5)

middle = X_test_5.shape[1] // 2
new_test_5 = X_test_5[:, middle, :]
X_test_5 = create_windows_new(new_test_5, 5)

In [342]:
# train_sequences, train_structures = parse_data_2(train_path)
# x13, y = create_windows(train_sequences, train_structures, 13)
# x11, y = create_windows(train_sequences, train_structures, 11)
# x9, y = create_windows(train_sequences, train_structures, 9)
# x7, y = create_windows(train_sequences, train_structures, 7)
# x5, y = create_windows(train_sequences, train_structures, 5)

# X_train_13 = np.array([encode_sequence(sequence) for sequence in x13])
# X_train_11 = np.array([encode_sequence(sequence) for sequence in x11])
# X_train_9 = np.array([encode_sequence(sequence) for sequence in x9])
# X_train_7 = np.array([encode_sequence(sequence) for sequence in x7])
# X_train_5 = np.array([encode_sequence(sequence) for sequence in x5])
# y_train = np.array(encode_structure(y))

# test_sequences, test_structures = parse_data_2(test_path)
# x13t, yt = create_windows(test_sequences, test_structures, 13)
# x11t, yt = create_windows(test_sequences, test_structures, 11)
# x9t, yt = create_windows(test_sequences, test_structures, 9)
# x7t, yt = create_windows(test_sequences, test_structures, 7)
# x5t, yt = create_windows(test_sequences, test_structures, 5)

# X_test_13 = np.array([encode_sequence(sequence) for sequence in x13t])
# X_test_11 = np.array([encode_sequence(sequence) for sequence in x11t])
# X_test_9 = np.array([encode_sequence(sequence) for sequence in x9t])
# X_test_7 = np.array([encode_sequence(sequence) for sequence in x7t])
# X_test_5 = np.array([encode_sequence(sequence) for sequence in x5t])
# y_test = np.array(encode_structure(yt))

In [343]:
def replace_values(array, keep_values, replace_value):
    mask = np.isin(array, keep_values)
    new_array = np.where(mask, 1, replace_value)
    return new_array

def replace_values_multiple(array, replacements):
    masks = [np.isin(array, values) for values in replacements.keys()]
    combined_mask = np.any(masks, axis=0)
    new_array = np.where(combined_mask, array, -1)
    for value, replacement in replacements.items():
        new_array = np.where(array == value, replacement, new_array)
    
    return new_array

# h = 2, e = 1, _ or e = 0
h_not_h = replace_values(y_train, 2, -1)
e_not_e = replace_values(y_train, 1, -1)
c_not_c = replace_values(y_train, 0, -1)
h_not_e = replace_values_multiple(y_train, {0: 0, 1: -1, 2: 1})
e_not_c = replace_values_multiple(y_train, {0: -1, 1: 1, 2: 0})
c_not_h = replace_values_multiple(y_train, {0: 1, 1: 0, 2: -1})

In [344]:
# Define class weights for each SVM
class_weights_hh = compute_class_weight(class_weight='balanced', classes=np.unique(h_not_h), y=h_not_h)
weights_hh = dict(zip(np.unique(h_not_h), class_weights_hh))

class_weights_ee = compute_class_weight(class_weight='balanced', classes=np.unique(e_not_e), y=e_not_e)
weights_ee = dict(zip(np.unique(e_not_e), class_weights_ee))

class_weights_cc = compute_class_weight(class_weight='balanced', classes=np.unique(c_not_c), y=c_not_c)
weights_cc = dict(zip(np.unique(c_not_c), class_weights_cc))

class_weights_he = compute_class_weight(class_weight='balanced', classes=np.unique(h_not_e), y=h_not_e)
weights_he = dict(zip(np.unique(h_not_e), class_weights_he))

class_weights_ec = compute_class_weight(class_weight='balanced', classes=np.unique(e_not_c), y=e_not_c)
weights_ec = dict(zip(np.unique(e_not_c), class_weights_ec))

class_weights_ch = compute_class_weight(class_weight='balanced', classes=np.unique(c_not_h), y=c_not_h)
weights_ch = dict(zip(np.unique(c_not_h), class_weights_ch))

In [345]:
X_train_13_reshaped = X_train_13.reshape(X_train_13.shape[0], -1)

X_train_11_reshaped = X_train_11.reshape(X_train_11.shape[0], -1)
X_train_9_reshaped = X_train_9.reshape(X_train_9.shape[0], -1)
X_train_7_reshaped = X_train_7.reshape(X_train_7.shape[0], -1)
X_train_5_reshaped = X_train_5.reshape(X_train_5.shape[0], -1)

# Model training and evaluation
6 binary SVM classifiers are trained as well as a convolutional neural network (CNN)

In [346]:
# One-vs-Rest SVM classifiers for each pair of classes
svm_h_h = SVC(kernel='rbf', verbose=True, C=1.5, class_weight=weights_hh)
svm_e_e = SVC(kernel='rbf', verbose=True, C=1.5, class_weight=weights_ee)
svm_c_c = SVC(kernel='rbf', verbose=True, C=1.5, class_weight=weights_cc)

# Binary SVM classifiers for each pair of classes
svm_h_e = SVC(kernel='rbf', verbose=True, C=1.5, class_weight=weights_he)
svm_e_c = SVC(kernel='rbf', verbose=True, C=1.5, class_weight=weights_ec)
svm_c_h = SVC(kernel='rbf', verbose=True, C=1.5, class_weight=weights_ch)

# Fit the SVM classifiers
svm_h_h.fit(X_train_11_reshaped, h_not_h)
svm_e_e.fit(X_train_9_reshaped, e_not_e)
svm_c_c.fit(X_train_7_reshaped, c_not_c)
svm_h_e.fit(X_train_9_reshaped, h_not_e)
svm_e_c.fit(X_train_5_reshaped, e_not_c)
svm_c_h.fit(X_train_9_reshaped, c_not_h)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [347]:
h_h_scores = svm_h_h.decision_function(X_train_11_reshaped)
e_e_scores = svm_e_e.decision_function(X_train_9_reshaped)
c_c_scores = svm_c_c.decision_function(X_train_7_reshaped)
h_e_scores = svm_h_e.decision_function(X_train_9_reshaped)
e_c_scores = svm_e_c.decision_function(X_train_5_reshaped)
c_h_scores = svm_c_h.decision_function(X_train_9_reshaped)

In [348]:
def predict_svm_max_d(c_c_scores, e_e_scores, h_h_scores):
  predicted_classes = np.argmax([c_c_scores, e_e_scores, h_h_scores], axis=0)
  return predicted_classes

combined_predictions_max_d = predict_svm_max_d(c_c_scores, e_e_scores, h_h_scores)

In [349]:
def predict_svm_tree1(h_h_scores, e_c_scores):
  # Classify based on first SVM (H/not H)
    predicted_classes = []
    for i in range(len(h_h_scores)):
        if h_h_scores[i] > 0:
            predicted_classes.append(2)
        elif e_c_scores[i,2] > e_c_scores[i,0]:
            predicted_classes.append(1)
        else:
            predicted_classes.append(0)
    return np.array(predicted_classes)

combined_predictions_tree1 = predict_svm_tree1(h_h_scores, e_c_scores)

def predict_svm_tree2(e_e_scores, c_h_scores):
  # Classify based on first SVM (E/not E)
    predicted_classes = []
    for i in range(len(e_e_scores)):
        if e_e_scores[i] > 0:
            predicted_classes.append(1)
        elif c_h_scores[i,2] > c_h_scores[i,0]:
            predicted_classes.append(0)
        else:
            predicted_classes.append(2)
    return np.array(predicted_classes)

def predict_svm_tree3(c_c_scores, h_e_scores):
  # Classify based on first SVM (C/not C)
    predicted_classes = []
    for i in range(len(c_c_scores)):
        if c_c_scores[i] > 0:
            predicted_classes.append(0)
        elif h_e_scores[i,2] > h_e_scores[i,0]:
            predicted_classes.append(2)
        else:
            predicted_classes.append(1)
    return np.array(predicted_classes)

combined_predictions_tree1 = predict_svm_tree1(h_h_scores, e_c_scores)
combined_predictions_tree2 = predict_svm_tree2(e_e_scores, c_h_scores)
combined_predictions_tree3 = predict_svm_tree3(c_c_scores, h_e_scores)

In [350]:
def predict_svm_vote(X_train_11, X_train_9, X_train_7, X_train_5):
    # Predictions from all six binary classifiers
    all_votes = np.vstack([
        svm_h_h.predict(X_train_11),
        svm_e_e.predict(X_train_9),
        svm_c_c.predict(X_train_7),
        svm_h_e.predict(X_train_9),
        svm_e_c.predict(X_train_5),
        svm_c_h.predict(X_train_9)
    ]).T

    # Counting votes using NumPy operations
    votes = np.zeros((len(all_votes), 3), dtype=int)
    votes[:, 2] += np.sum(all_votes[:, [0, 3, 5]], axis=1)
    votes[:, 1] += np.sum(all_votes[:, [1, 3, 4]], axis=1)
    votes[:, 0] += np.sum(all_votes[:, [2, 4, 5]], axis=1)

    # Choosing the class with the maximum votes
    predictions = np.argmax(votes, axis=1)

    # Handling ties
    max_votes = np.max(votes, axis=1)
    tie_mask = (votes == max_votes[:, None])
    tie_counts = np.sum(tie_mask, axis=1)
    predictions[tie_counts > 1] = 0  # Assign class 0 to ties

    return predictions


combined_predictions_vote = predict_svm_vote(X_train_11_reshaped, X_train_9_reshaped, X_train_7_reshaped, X_train_5_reshaped)

In [387]:
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=5, activation='relu', padding='same', input_shape=(13,20)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))

early_stopping = EarlyStopping(monitor='val_loss', patience=6, verbose=1, restore_best_weights=True)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_13, y_train, epochs=100, batch_size=32, validation_data=(X_test_13, y_test),  callbacks=[early_stopping])

NN_pred = model.predict(X_train_13)
combined_predictions_NN = np.argmax(NN_pred, axis=1)

Epoch 1/100


  super().__init__(


[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.4961 - loss: 1.1146 - val_accuracy: 0.5858 - val_loss: 0.9141
Epoch 2/100
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5712 - loss: 0.9201 - val_accuracy: 0.6045 - val_loss: 0.8617
Epoch 3/100
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5900 - loss: 0.8674 - val_accuracy: 0.6213 - val_loss: 0.8377
Epoch 4/100
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6295 - loss: 0.8185 - val_accuracy: 0.6310 - val_loss: 0.8295
Epoch 5/100
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6379 - loss: 0.8020 - val_accuracy: 0.6295 - val_loss: 0.8266
Epoch 6/100
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6477 - loss: 0.7827 - val_accuracy: 0.6344 - val_loss: 0.8364
Epoch 7/100
[1m566/566[0m [32m━

In [355]:
# nn_inputs = np.column_stack((h_h_scores, e_e_scores, c_c_scores, h_e_scores, e_c_scores, c_h_scores))
# # nn_inputs = np.column_stack((h_e_scores, e_c_scores, c_h_scores))
# # nn_inputs = np.column_stack((h_h_scores, e_e_scores, c_c_scores))

# # model = tf.keras.Sequential([
# #     tf.keras.layers.Dense(20, activation='relu'),
# #     tf.keras.layers.Dense(3, activation='softmax')
# # ])

# model = Sequential([
#     # Input(shape=(13, 21)),
#     # Flatten(),
#     Dense(units=27, activation='relu'),
#     # LSTM(units=273, activation='relu'),
#     Dense(units=3, activation='softmax')
# ])

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.fit(nn_inputs, y_train, epochs=10, batch_size=32, verbose=1)

# NN_pred = model.predict(nn_inputs)

# combined_predictions_NN = np.argmax(NN_pred, axis=1)

In [356]:
def compute_weightings(accuracies):
    total_accuracy = sum(accuracies.values())
    weightings = {classifier: accuracy / total_accuracy for classifier, accuracy in accuracies.items()}
    return weightings

In [388]:
print("Max_D")
maxd, maxd_class = calculate_accuracy(y_train, combined_predictions_max_d)
print("Total Accuracy:", maxd)
print("Class Accuracies:", maxd_class)
print("\nTree1")
tree1, tree1_class = calculate_accuracy(y_train, combined_predictions_tree1)
print("Total Accuracy:", tree1)
print("Class Accuracies:", tree1_class)
print("\nTree2")
tree2, tree2_class = calculate_accuracy(y_train, combined_predictions_tree2)
print("Total Accuracy:", tree2)
print("Class Accuracies:", tree2_class)
print("\nTree3")
tree3, tree3_class = calculate_accuracy(y_train, combined_predictions_tree3)
print("Total Accuracy:", tree3)
print("Class Accuracies:", tree3_class)
print("\nVote")
vote, vote_class = calculate_accuracy(y_train, combined_predictions_vote)
print("Total Accuracy:", vote)
print("Class Accuracies:", vote_class)
print("\nNN")
nn, nn_class = calculate_accuracy(y_train, combined_predictions_NN)
print("Total Accuracy:", nn)
print("Class Accuracies:", nn_class)

Max_D
Total Accuracy: 0.9450980392156862
Class Accuracies: {0: 0.91335630320227, 1: 0.977997799779978, 2: 0.9871767007172354}

Tree1
Total Accuracy: 0.8821872410936206
Class Accuracies: {0: 0.824787190920146, 1: 0.8938393839383938, 2: 0.9960878069984785}

Tree2
Total Accuracy: 0.9182546257939795
Class Accuracies: {0: 0.8801175516822051, 1: 0.9958745874587459, 2: 0.9387089763094979}

Tree3
Total Accuracy: 0.8982049157691245
Class Accuracies: {0: 0.8858937981353872, 1: 0.9062156215621562, 2: 0.9182786350793306}

Vote
Total Accuracy: 0.9122341894504281
Class Accuracies: {0: 0.9116335630320227, 1: 0.9042904290429042, 2: 0.9198000434688112}

NN
Total Accuracy: 0.6950013808340237
Class Accuracies: {0: 0.8298540737738144, 1: 0.3943894389438944, 2: 0.6433384046946315}


In [358]:
def svm_jury(combined_predictions_max_d, combined_predictions_tree1,
             combined_predictions_tree2, combined_predictions_tree3, combined_predictions_vote, combined_predictions_NN, accuracies, class_accuracies):
    
    assert combined_predictions_max_d.shape == combined_predictions_vote.shape == combined_predictions_tree1.shape == \
           combined_predictions_tree2.shape == combined_predictions_tree3.shape == combined_predictions_NN.shape, \
           "All predictions must have the same shape."
    
    num_samples = combined_predictions_max_d.shape[0]
    
    jury_predictions = np.zeros((num_samples,3))
    
    for i in range(num_samples):
        # Count the number of votes for each class (C, E, H)
        votes = [0, 0, 0]
        votes[combined_predictions_max_d[i]] += accuracies['Max_D'] * class_accuracies['Max_D'][combined_predictions_max_d[i]]
        votes[combined_predictions_tree1[i]] += accuracies['Tree1'] * class_accuracies['Tree1'][combined_predictions_tree1[i]]
        votes[combined_predictions_tree2[i]] += accuracies['Tree2'] * class_accuracies['Tree2'][combined_predictions_tree2[i]]
        votes[combined_predictions_tree3[i]] += accuracies['Tree3'] * class_accuracies['Tree3'][combined_predictions_tree3[i]]
        votes[combined_predictions_vote[i]] += accuracies['Vote'] * class_accuracies['Vote'][combined_predictions_vote[i]]
        votes[combined_predictions_NN[i]] += accuracies['NN'] * class_accuracies['NN'][combined_predictions_NN[i]]

        # votes[combined_predictions_max_d[i]] += class_accuracies['Max_D'][combined_predictions_max_d[i]]
        # votes[combined_predictions_tree1[i]] += class_accuracies['Tree1'][combined_predictions_tree1[i]]
        # votes[combined_predictions_tree2[i]] += class_accuracies['Tree2'][combined_predictions_tree2[i]]
        # votes[combined_predictions_tree3[i]] += class_accuracies['Tree3'][combined_predictions_tree3[i]]
        # votes[combined_predictions_vote[i]] += class_accuracies['Vote'][combined_predictions_vote[i]]
        # votes[combined_predictions_NN[i]] += class_accuracies['NN'][combined_predictions_NN[i]]

        # votes[combined_predictions_max_d[i]] += accuracies['Max_D']
        # votes[combined_predictions_tree1[i]] += accuracies['Tree1']
        # votes[combined_predictions_tree2[i]] += accuracies['Tree2']
        # votes[combined_predictions_tree3[i]] += accuracies['Tree3']
        # votes[combined_predictions_vote[i]] += accuracies['Vote']
        # votes[combined_predictions_NN[i]] += accuracies['NN']

        # votes[combined_predictions_max_d[i]] += 1
        # votes[combined_predictions_tree1[i]] += 1
        # votes[combined_predictions_tree2[i]] += 1
        # votes[combined_predictions_tree3[i]] += 1
        # votes[combined_predictions_vote[i]] += 1
        # votes[combined_predictions_NN[i]] += 1

        jury_predictions[i] = np.array(votes)

    return jury_predictions

accuracies = {
    'Max_D': maxd,
    'Tree1': tree1,
    'Tree2': tree2,
    'Tree3': tree3,
    'Vote': vote,
    'NN': nn,
}

class_accuracies = {
    'Max_D': maxd_class,
    'Tree1': tree1_class,
    'Tree2': tree2_class,
    'Tree3': tree3_class,
    'Vote': vote_class,
    'NN': nn_class,
}

for key, inner_dict in class_accuracies.items():
    class_accuracies[key][0] *= 0.7

bruh = svm_jury(combined_predictions_max_d, combined_predictions_vote, combined_predictions_tree1, combined_predictions_tree2, combined_predictions_tree3, combined_predictions_NN, accuracies, class_accuracies)

In [359]:
train_result = np.argmax(bruh, axis=1)

In [360]:
train_total_accuracy, train_class_accuracy = calculate_accuracy(y_train, train_result)
print("Total Accuracy:", train_total_accuracy)
print("Class Accuracies:", train_class_accuracy)

Total Accuracy: 0.9249378624689313
Class Accuracies: {0: 0.8881232265910012, 1: 0.9537953795379538, 2: 0.9810910671593132}


# Evaluation

In [361]:
X_test_13_reshaped = X_test_13.reshape(X_test_13.shape[0], -1)

X_test_11_reshaped = X_test_11.reshape(X_test_11.shape[0], -1)
X_test_9_reshaped = X_test_9.reshape(X_test_9.shape[0], -1)
X_test_7_reshaped = X_test_7.reshape(X_test_7.shape[0], -1)
X_test_5_reshaped = X_test_5.reshape(X_test_5.shape[0], -1)

h_h_scores_test = svm_h_h.decision_function(X_test_11_reshaped)
e_e_scores_test = svm_e_e.decision_function(X_test_9_reshaped)
c_c_scores_test = svm_c_c.decision_function(X_test_7_reshaped)
h_e_scores_test = svm_h_e.decision_function(X_test_9_reshaped)
e_c_scores_test = svm_e_c.decision_function(X_test_5_reshaped)
c_h_scores_test = svm_c_h.decision_function(X_test_9_reshaped)

combined_predictions_max_d_test = predict_svm_max_d(c_c_scores_test, e_e_scores_test, h_h_scores_test)
combined_predictions_tree1_test = predict_svm_tree1(h_h_scores_test, e_c_scores_test)
combined_predictions_tree2_test = predict_svm_tree2(e_e_scores_test, c_h_scores_test)
combined_predictions_tree3_test = predict_svm_tree3(c_c_scores_test, h_e_scores_test)
combined_predictions_vote_test = predict_svm_vote(X_test_11_reshaped, X_test_9_reshaped, X_test_7_reshaped, X_test_5_reshaped)

# nn_inputs_test = np.column_stack((h_h_scores_test, e_e_scores_test, c_c_scores_test, h_e_scores_test, e_c_scores_test, c_h_scores_test))
# # nn_inputs_test = np.column_stack((h_e_scores_test, e_c_scores_test, c_h_scores_test))
# # nn_inputs_test = np.column_stack((h_h_scores_test, e_e_scores_test, c_c_scores_test))
# NN_pred_test = model.predict(nn_inputs_test)
# combined_predictions_NN_test = np.argmax(NN_pred_test, axis=1)

NN_pred_test = model.predict(X_test_13)
combined_predictions_NN_test = np.argmax(NN_pred_test, axis=1)

In [389]:
print("Max_D")
maxd_test, maxd_class_test = calculate_accuracy(y_test, combined_predictions_max_d_test)
print("Total Accuracy:", maxd_test)
print("Class Accuracies:", maxd_class_test)
calculate_correlation_coefficients(y_test, combined_predictions_max_d_test)
print("\nTree1")
tree1_test, tree1_class_test = calculate_accuracy(y_test, combined_predictions_tree1_test)
print("Total Accuracy:", tree1_test)
print("Class Accuracies:", tree1_class_test)
calculate_correlation_coefficients(y_test, combined_predictions_tree1_test)
print("\nTree2")
tree2_test, tree2_class_test = calculate_accuracy(y_test, combined_predictions_tree2_test)
print("Total Accuracy:", tree2_test)
print("Class Accuracies:", tree2_class_test)
calculate_correlation_coefficients(y_test, combined_predictions_tree2_test)
print("\nTree3")
tree3_test, tree3_class_test = calculate_accuracy(y_test, combined_predictions_tree3_test)
print("Total Accuracy:", tree3_test)
print("Class Accuracies:", tree3_class_test)
calculate_correlation_coefficients(y_test, combined_predictions_tree3_test)
print("\nVote")
vote_test, vote_class_test = calculate_accuracy(y_test, combined_predictions_vote_test)
print("Total Accuracy:", vote_test)
print("Class Accuracies:", vote_class_test)
calculate_correlation_coefficients(y_test, combined_predictions_vote_test)
print("\nNN")
nn_test, nn_class_test = calculate_accuracy(y_test, combined_predictions_NN_test)
print("Total Accuracy:", nn_test)
print("Class Accuracies:", nn_class_test)
calculate_correlation_coefficients(y_test, combined_predictions_NN_test)

accuracies_test = {
    'Max_D': maxd_test,
    'Tree1': tree1_test,
    'Tree2': tree2_test,
    'Tree3': tree3_test,
    'Vote': vote_test,
    'NN': nn_test,
}

class_accuracies_test = {
    'Max_D': maxd_class_test,
    'Tree1': tree1_class_test,
    'Tree2': tree2_class_test,
    'Tree3': tree3_class_test,
    'Vote': vote_class_test,
    'NN': nn_class_test,
}

bruh_test = svm_jury(combined_predictions_max_d_test, combined_predictions_tree1_test, combined_predictions_tree2_test, combined_predictions_tree3_test, combined_predictions_vote_test, combined_predictions_NN_test, accuracies_test, class_accuracies_test)

Max_D
Total Accuracy: 0.5988636363636364
Class Accuracies: {0: 0.719188767550702, 1: 0.42513368983957217, 2: 0.4793875147232038}
MCC for class coil: 0.3707619591223551
MCC for class β-sheet: 0.2705039085040551
MCC for class α-helix: 0.32199134485576164

Tree1
Total Accuracy: 0.5900568181818182
Class Accuracies: {0: 0.6692667706708268, 1: 0.47192513368983957, 2: 0.5147232037691402}
MCC for class coil: 0.3621438622013262
MCC for class β-sheet: 0.2605310085102478
MCC for class α-helix: 0.3589190876314922

Tree2
Total Accuracy: 0.5971590909090909
Class Accuracies: {0: 0.6983879355174207, 1: 0.42780748663101603, 2: 0.5170789163722026}
MCC for class coil: 0.36875277271724705
MCC for class β-sheet: 0.27788187236321465
MCC for class α-helix: 0.3314932418626386

Tree3
Total Accuracy: 0.5954545454545455
Class Accuracies: {0: 0.6952678107124285, 1: 0.4385026737967914, 2: 0.5076560659599529}
MCC for class coil: 0.3698547018642283
MCC for class β-sheet: 0.2825080133973469
MCC for class α-helix: 0.3

In [395]:
test_result = np.argmax(bruh_test, axis=1)
test_total_accuracy, test_class_accuracy = calculate_accuracy(y_test, test_result)
print("Total Accuracy:", test_total_accuracy)
print("Class Accuracies:", test_class_accuracy)

Total Accuracy: 0.6210227272727272
Class Accuracies: {0: 0.7753510140405616, 1: 0.3783422459893048, 2: 0.48527679623085984}


In [391]:
calculate_correlation_coefficients(y_test, test_result)

MCC for class coil: 0.3797932455727784
MCC for class β-sheet: 0.28198199176172734
MCC for class α-helix: 0.3596039716401946
