# Import Modules

In [1]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix

tf.keras.utils.set_random_seed(42)


# Import and Process Dataset

In [2]:
# convert a sequence of amino acids to a one-hot encoded matrix
def one_hot_encode(seq, vocab):
    # Create a matrix of zeros, with dimensions len(seq) x len(vocab)
    one_hot = np.zeros((len(seq), len(vocab)), dtype=np.float32)
    # Iterate over the sequence and set the appropriate elements to 1.0
    for i, char in enumerate(seq):
        if char in vocab:
            one_hot[i, vocab.index(char)] = 1.0
    return one_hot


# prepare data to enter neural network
def prepare_data(filepath, window_size=13):
    sequences = []
    structures = []
    current_seq = []
    current_struct = []
    processing_sequence = False  # Track when inside a sequence block

    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line == '<>': # Sequence separator
                if processing_sequence: 
                    if current_seq and current_struct:
                        seq_encoded = one_hot_encode(current_seq, aa_vocab) # One-hot encode sequence
                        struct_encoded = one_hot_encode(current_struct, structure_vocab) # One-hot encode structure

                        # Apply sliding window of determined size
                        for i in range(len(seq_encoded) - window_size + 1):
                            window = seq_encoded[i:i + window_size]
                            label = struct_encoded[i + window_size // 2]
                            sequences.append(window)
                            structures.append(label)

                    current_seq = []
                    current_struct = []
                processing_sequence = not processing_sequence
                continue

            elif 'end' in line:  # end of sequence or file
                continue  
            
            # If inside a sequence block, process the sequence
            # handles errors in sequence end lines in the dataset
            if processing_sequence:
                parts = line.split()
                if len(parts) != 2:
                    continue  
                current_seq.append(parts[0])
                current_struct.append(parts[1])

    return np.array(sequences), np.array(structures)

# flatten the sequences for SVM
def prepare_data_for_svm(filepath, window_size=13):
    sequences, structures = prepare_data(filepath, window_size)
    # Flatten the windows for SVM processing
    flat_sequences = sequences.reshape(sequences.shape[0], -1)  # Reshape to (number_of_samples, window_size*features_per_aa)
    
    return flat_sequences, structures

In [25]:

# Example paths, replace with your actual file pathsin
train_path = 'Q_and_s_data/protein-secondary-structure.train.txt'
#test_path = 'Q_and_s_data/protein-secondary-structure.test.txt'
test_path = 'datasets/cb513.txt'


x_train, y_train = prepare_data_for_svm(train_path)
x_test, y_test = prepare_data_for_svm(test_path)

In [26]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(8110, 273) (8110, 3)
(11098, 273) (11098, 3)


# SVM

In [27]:

# Flatten the one-hot encoded labels to a single dimension
y_train_flat = np.argmax(y_train, axis=1)
y_test_flat = np.argmax(y_test, axis=1)

# Scaling the data
scaler = StandardScaler()
# Reshape to (number_of_samples, number_of_features)
x_train_scaled = scaler.fit_transform(x_train.reshape(x_train.shape[0], -1)) 
x_test_scaled = scaler.transform(x_test.reshape(x_test.shape[0], -1)) 

# Initialize and train the SVM using Scikit-learn's SVC
svm_model = SVC(kernel='rbf', C=3, random_state=42, verbose=True)
# Fit the model
svm_model.fit(x_train_scaled, y_train_flat)

# Predict on test data
y_pred = svm_model.predict(x_test_scaled)

# Compute metrics
q3_score = accuracy_score(y_test_flat, y_pred)
cm = confusion_matrix(y_test_flat, y_pred)

# Print the results
print()
print(f"Q3 Score (Accuracy): {(q3_score * 100):.1f}%")
print("Confusion Matrix:\n", cm)

# Helper function to calculate MCC for each class
def calculate_mcc_for_each_class(y_true, y_pred, num_classes):
    mcc_scores = []
    for class_id in range(num_classes):
        # Create binary labels for the current class
        y_true_binary = (y_true == class_id).astype(int)
        y_pred_binary = (y_pred == class_id).astype(int)
        
        # Calculate MCC and append to results
        mcc = matthews_corrcoef(y_true_binary, y_pred_binary)
        mcc_scores.append(mcc)
    
    return mcc_scores

# Calculate MCC for each class
mcc_scores = calculate_mcc_for_each_class(y_test_flat, y_pred, 3)

# Print the MCC for each class
class_labels = ['Helix', 'Sheet', 'Coil']
for label, mcc in zip(class_labels, mcc_scores):
    print(f"MCC for {label}: {mcc:.2f}")


[LibSVM]
Q3 Score (Accuracy): 58.9%
Confusion Matrix:
 [[1967  231 1801]
 [ 481  654 1259]
 [ 534  260 3911]]
MCC for Helix: 0.38
MCC for Sheet: 0.29
MCC for Coil: 0.36
