In [1]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import tensorflow as tf

import random

from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef


tf.keras.utils.set_random_seed(42)


In [2]:
# Define BLOSUM62 matrix as a dictionary of dictionaries
blosum62 = {
    'A': {'A': 4, 'R': -1, 'N': -2, 'D': -2, 'C': 0, 'Q': -1, 'E': -1, 'G': 0, 'H': -2, 'I': -1, 'L': -1, 'K': -1, 'M': -1, 'F': -2, 'P': -1, 'S': 1, 'T': 0, 'W': -3, 'Y': -2, 'V': 0},
    'R': {'A': -1, 'R': 5, 'N': 0, 'D': -2, 'C': -3, 'Q': 1, 'E': 0, 'G': -2, 'H': 0, 'I': -3, 'L': -2, 'K': 2, 'M': -1, 'F': -3, 'P': -2, 'S': -1, 'T': -1, 'W': -3, 'Y': -2, 'V': -3},
    'N': {'A': -2, 'R': 0, 'N': 6, 'D': 1, 'C': -3, 'Q': 0, 'E': 0, 'G': 0, 'H': 1, 'I': -3, 'L': -3, 'K': 0, 'M': -2, 'F': -3, 'P': -2, 'S': 1, 'T': 0, 'W': -4, 'Y': -2, 'V': -3},
    'D': {'A': -2, 'R': -2, 'N': 1, 'D': 6, 'C': -3, 'Q': 0, 'E': 2, 'G': -1, 'H': -1, 'I': -3, 'L': -4, 'K': -1, 'M': -3, 'F': -3, 'P': -1, 'S': 0, 'T': -1, 'W': -4, 'Y': -3, 'V': -3},
    'C': {'A': 0, 'R': -3, 'N': -3, 'D': -3, 'C': 9, 'Q': -3, 'E': -4, 'G': -3, 'H': -3, 'I': -1, 'L': -1, 'K': -3, 'M': -1, 'F': -2, 'P': -3, 'S': -1, 'T': -1, 'W': -2, 'Y': -2, 'V': -1},
    'Q': {'A': -1, 'R': 1, 'N': 0, 'D': 0, 'C': -3, 'Q': 5, 'E': 2, 'G': -2, 'H': 0, 'I': -3, 'L': -2, 'K': 1, 'M': 0, 'F': -3, 'P': -1, 'S': 0, 'T': -1, 'W': -2, 'Y': -1, 'V': -2},
    'E': {'A': -1, 'R': 0, 'N': 0, 'D': 2, 'C': -4, 'Q': 2, 'E': 5, 'G': -2, 'H': 0, 'I': -3, 'L': -3, 'K': 1, 'M': -2, 'F': -3, 'P': -1, 'S': 0, 'T': -1, 'W': -3, 'Y': -2, 'V': -2},
    'G': {'A': 0, 'R': -2, 'N': 0, 'D': -1, 'C': -3, 'Q': -2, 'E': -2, 'G': 6, 'H': -2, 'I': -4, 'L': -4, 'K': -2, 'M': -3, 'F': -3, 'P': -2, 'S': 0, 'T': -2, 'W': -2, 'Y': -3, 'V': -3},
    'H': {'A': -2, 'R': 0, 'N': 1, 'D': -1, 'C': -3, 'Q': 0, 'E': 0, 'G': -2, 'H': 8, 'I': -3, 'L': -3, 'K': -1, 'M': -2, 'F': -1, 'P': -2, 'S': -1, 'T': -2, 'W': -2, 'Y': 2, 'V': -3},
    'I': {'A': -1, 'R': -3, 'N': -3, 'D': -3, 'C': -1, 'Q': -3, 'E': -3, 'G': -4, 'H': -3, 'I': 4, 'L': 2, 'K': -3, 'M': 1, 'F': 0, 'P': -3, 'S': -2, 'T': -1, 'W': -3, 'Y': -1, 'V': 3},
    'L': {'A': -1, 'R': -2, 'N': -3, 'D': -4, 'C': -1, 'Q': -2, 'E': -3, 'G': -4, 'H': -3, 'I': 2, 'L': 4, 'K': -2, 'M': 2, 'F': 0, 'P': -3, 'S': -2, 'T': -1, 'W': -2, 'Y': -1, 'V': 1},
    'K': {'A': -1, 'R': 2, 'N': 0, 'D': -1, 'C': -3, 'Q': 1, 'E': 1, 'G': -2, 'H': -1, 'I': -3, 'L': -2, 'K': 5, 'M': -1, 'F': -3, 'P': -1, 'S': 0, 'T': -1, 'W': -3, 'Y': -2, 'V': -2},
    'M': {'A': -1, 'R': -1, 'N': -2, 'D': -3, 'C': -1, 'Q': 0, 'E': -2, 'G': -3, 'H': -2, 'I': 1, 'L': 2, 'K': -1, 'M': 5, 'F': 0, 'P': -2, 'S': -1, 'T': -1, 'W': -1, 'Y': -1, 'V': 1},
    'F': {'A': -2, 'R': -3, 'N': -3, 'D': -3, 'C': -2, 'Q': -3, 'E': -3, 'G': -3, 'H': -1, 'I': 0, 'L': 0, 'K': -3, 'M': 0, 'F': 6, 'P': -4, 'S': -2, 'T': -2, 'W': 1, 'Y': 3, 'V': -1},
    'P': {'A': -1, 'R': -2, 'N': -2, 'D': -1, 'C': -3, 'Q': -1, 'E': -1, 'G': -2, 'H': -2, 'I': -3, 'L': -3, 'K': -1, 'M': -2, 'F': -4, 'P': 7, 'S': -1, 'T': -1, 'W': -4, 'Y': -3, 'V': -2},
    'S': {'A': 1, 'R': -1, 'N': 1, 'D': 0, 'C': -1, 'Q': 0, 'E': 0, 'G': 0, 'H': -1, 'I': -2, 'L': -2, 'K': 0, 'M': -1, 'F': -2, 'P': -1, 'S': 4, 'T': 1, 'W': -3, 'Y': -2, 'V': -2},
    'T': {'A': 0, 'R': -1, 'N': 0, 'D': -1, 'C': -1, 'Q': -1, 'E': -1, 'G': -2, 'H': -2, 'I': -1, 'L': -1, 'K': -1, 'M': -1, 'F': -2, 'P': -1, 'S': 1, 'T': 5, 'W': -2, 'Y': -2, 'V': 0},
    'W': {'A': -3, 'R': -3, 'N': -4, 'D': -4, 'C': -2, 'Q': -2, 'E': -3, 'G': -2, 'H': -2, 'I': -3, 'L': -2, 'K': -3, 'M': -1, 'F': 1, 'P': -4, 'S': -3, 'T': -2, 'W': 11, 'Y': 2, 'V': -3},
    'Y': {'A': -2, 'R': -2, 'N': -2, 'D': -3, 'C': -2, 'Q': -1, 'E': -2, 'G': -3, 'H': 2, 'I': -1, 'L': -1, 'K': -2, 'M': -1, 'F': 3, 'P': -3, 'S': -2, 'T': -2, 'W': 2, 'Y': 7, 'V': -1},
    'V': {'A': 0, 'R': -3, 'N': -3, 'D': -3, 'C': -1, 'Q': -2, 'E': -2, 'G': -3, 'H': -3, 'I': 3, 'L': 1, 'K': -2, 'M': 1, 'F': -1, 'P': -2, 'S': -2, 'T': 0, 'W': -3, 'Y': -1, 'V': 4}
}

def create_conservative_dict(matrix, threshold=1):
    """Create a substitution dictionary based on a given substitution matrix and threshold."""
    substitution_dict = {}
    for aa1 in matrix:
        substitution_dict[aa1] = [aa2 for aa2, score in matrix[aa1].items() if score >= threshold and aa1 != aa2]
    return substitution_dict

conservative_substitutions_blosum = create_conservative_dict(blosum62)
print(conservative_substitutions_blosum)

{'A': ['S'], 'R': ['Q', 'K'], 'N': ['D', 'H', 'S'], 'D': ['N', 'E'], 'C': [], 'Q': ['R', 'E', 'K'], 'E': ['D', 'Q', 'K'], 'G': [], 'H': ['N', 'Y'], 'I': ['L', 'M', 'V'], 'L': ['I', 'M', 'V'], 'K': ['R', 'Q', 'E'], 'M': ['I', 'L', 'V'], 'F': ['W', 'Y'], 'P': [], 'S': ['A', 'N', 'T'], 'T': ['S'], 'W': ['F', 'Y'], 'Y': ['H', 'F', 'W'], 'V': ['I', 'L', 'M']}


In [3]:
# Create conservative substitution dictionary based on a threshold
def create_conservative_dict(matrix, threshold=1):
    substitution_dict = {}
    for aa1 in matrix:
        substitution_dict[aa1] = [aa2 for aa2, score in matrix[aa1].items() if score >= threshold and aa1 != aa2]
    return substitution_dict

conservative_substitutions_blosum = create_conservative_dict(blosum62)

# Function to apply conservative mutations
def conservative_mutate_sequence(seq, substitution_dict, mutation_rate=0.05):
    """Randomly mutate a sequence using conservative substitutions."""
    tf.keras.utils.set_random_seed(42)

    mutated_seq = list(seq)
    for i in range(len(seq)):
        if random.random() < mutation_rate:
            if seq[i] in substitution_dict and substitution_dict[seq[i]]:
                mutated_seq[i] = random.choice(substitution_dict[seq[i]])
    return ''.join(mutated_seq)

# Function to generate augmented sequences
def augment_sequences(sequences, substitution_dict, num_augmentations=3, mutation_rate=0.05):
    """Generate augmented sequences by applying conservative mutations."""
    augmented_sequences = []
    for seq in sequences:
        augmented_sequences.append(seq)  # Include the original sequence
        for _ in range(num_augmentations):
            mutated_seq = conservative_mutate_sequence(seq, substitution_dict, mutation_rate)
            augmented_sequences.append(mutated_seq)
    return augmented_sequences

In [4]:
# One-hot encoding function
def one_hot_encode(seq, vocab):
    """One-hot encode a sequence based on a given vocabulary."""
    one_hot = np.zeros((len(seq), len(vocab)), dtype=np.float32)
    for i, char in enumerate(seq):
        if char in vocab:
            one_hot[i, vocab.index(char)] = 1.0
    return one_hot

# Define your vocabularies
aa_vocab = 'ACDEFGHIKLMNPQRSTVWY_'  # 20 amino acids + 1 for gap/unknown
structure_vocab = 'he_'  # h for helix, e for sheet, _ for coil

# Modified prepare_data function with augmentation
def prepare_data(filepath, window_size=13, augment=False, num_augmentations=1, mutation_rate=0.05):
    sequences = []
    structures = []
    current_seq = []
    current_struct = []
    processing_sequence = False  # Track when inside a sequence block

    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line == '<>':  # Toggle processing flag
                if processing_sequence:  # We are ending a sequence block
                    if current_seq and current_struct:
                        # Augment sequences if required
                        seqs_to_encode = [current_seq]
                        if augment:
                            seqs_to_encode = augment_sequences([current_seq], conservative_substitutions_blosum, num_augmentations, mutation_rate)

                        for augmented_seq in seqs_to_encode:
                            seq_encoded = one_hot_encode(augmented_seq, aa_vocab)
                            struct_encoded = one_hot_encode(current_struct, structure_vocab)

                            # Apply sliding window
                            for i in range(len(seq_encoded) - window_size + 1):
                                window = seq_encoded[i:i + window_size]
                                label = struct_encoded[i + window_size // 2]
                                sequences.append(window)
                                structures.append(label)

                    current_seq = []
                    current_struct = []
                processing_sequence = not processing_sequence
                continue

            elif 'end' in line:  # Generalized handling for any 'end' marker
                continue  # Just skip this line, do not end processing sequence

            if processing_sequence:
                parts = line.split()
                if len(parts) != 2:
                    continue  # Skip malformed lines or lines that do not fit expected format
                current_seq.append(parts[0])
                current_struct.append(parts[1])

    return np.array(sequences), np.array(structures)


def prepare_data_for_svm(filepath, window_size=13, type='train'):
    if type == 'train':
        sequences, structures = prepare_data(filepath, augment=True, num_augmentations=1, mutation_rate=0.05)
    elif type == 'test':
        sequences, structures = prepare_data(filepath, augment=False)
    
    # Flatten the windows for SVM processing
    flat_sequences = sequences.reshape(sequences.shape[0], -1)  # Reshape to (number_of_samples, window_size*features_per_aa)
    
    return flat_sequences, structures

In [5]:

# Example paths, replace with your actual file pathsin
train_path = 'Q_and_s_data/protein-secondary-structure.train.txt'
test_path = 'Q_and_s_data/protein-secondary-structure.test.txt'

x_train, y_train = prepare_data_for_svm(train_path, 'train')
x_test, y_test = prepare_data_for_svm(test_path, 'test')

In [6]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(16220, 273) (16220, 3)
(3428, 273) (3428, 3)


In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix

# Assuming x_train and y_train are your input and labels loaded from your function
# Flatten the one-hot encoded labels to a single dimension
y_train_flat = np.argmax(y_train, axis=1)
y_test_flat = np.argmax(y_test, axis=1)

# Scaling the data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train.reshape(x_train.shape[0], -1))
x_test_scaled = scaler.transform(x_test.reshape(x_test.shape[0], -1))

# Initialize and train the SVM
svm_model = SVC(kernel='sigmoid', C=2.0, gamma='auto', random_state=42, verbose=True)
svm_model.fit(x_train_scaled, y_train_flat)

# Predict on test data
y_pred = svm_model.predict(x_test_scaled)

# Compute metrics
q3_score = accuracy_score(y_test_flat, y_pred)
cm = confusion_matrix(y_test_flat, y_pred)

print()
print(f"Q3 Score (Accuracy): {q3_score:.4f}")
print("Confusion Matrix:\n", cm)

# Helper function to calculate MCC for each class
def calculate_mcc_for_each_class(y_true, y_pred, num_classes):
    mcc_scores = []
    for class_id in range(num_classes):
        # Create binary labels for the current class
        y_true_binary = (y_true == class_id).astype(int)
        y_pred_binary = (y_pred == class_id).astype(int)
        
        # Calculate MCC and append to results
        mcc = matthews_corrcoef(y_true_binary, y_pred_binary)
        mcc_scores.append(mcc)
    
    return mcc_scores

# Calculate MCC for each class
mcc_scores = calculate_mcc_for_each_class(y_test_flat, y_pred, 3)

# Print the MCC for each class
class_labels = ['Helix', 'Sheet', 'Coil']
for label, mcc in zip(class_labels, mcc_scores):
    print(f"MCC for {label}: {mcc:.4f}")


[LibSVM]
Q3 Score (Accuracy): 0.6129
Confusion Matrix:
 [[ 416   63  391]
 [ 157  226  269]
 [ 311  136 1459]]
MCC for Helix: 0.2937
MCC for Sheet: 0.3274
MCC for Coil: 0.3394
