# Libraries and Functions

In [3]:
# Importing necessary libraries
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense
from sklearn.metrics import matthews_corrcoef

# Function definitions

def parse_data(file_path):
    """Read residue / secondary-structure pairs from a text file.

    Every line is whitespace-stripped before it is examined:

    * a line equal to ``<>`` switches collection on,
    * a line equal to ``end`` switches collection off,
    * while collection is on, a line that splits into exactly two
      whitespace-separated tokens contributes its first token to the
      residue list and its second token to the structure list; any other
      line is ignored.

    Note that all records in the file are appended to the same two flat
    lists, so the boundaries between individual records are not kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(sequences, secondary_structures)`` tuple of two equally long
        lists of strings.
    """
    residues = []
    structures = []
    inside_record = False
    with open(file_path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text == "<>":
                inside_record = True
                continue
            if text == "end":
                inside_record = False
                continue
            if not inside_record:
                continue
            fields = text.split()
            # Anything that is not a "<residue> <structure>" pair is skipped
            if len(fields) != 2:
                continue
            residue, structure = fields
            residues.append(residue)
            structures.append(structure)
    return residues, structures

def create_windows(sequences, secondary_structures, window_size=13):
    """Build one fixed-length residue window per position of ``sequences``.

    The input is padded with ``window_size // 2`` ``'_'`` symbols on each
    side, and the window for position ``i`` is the ``window_size`` padded
    elements starting at padded index ``i``. For an odd ``window_size`` the
    window is centred on residue ``i``; for an even one it holds
    ``window_size // 2`` elements before the residue and
    ``window_size // 2 - 1`` after it.

    Padding is added only at the two ends of the whole input. If the input
    is several records joined together (as ``parse_data`` returns), windows
    near a junction contain residues of the neighbouring record.

    Args:
        sequences: Iterable of residue symbols (list, tuple, string, array).
        secondary_structures: Indexable collection with the label of each
            residue; must provide at least ``len(sequences)`` items.
        window_size: Number of elements per window; must be >= 1.

    Returns:
        ``(X, y)`` where ``X`` is a list of windows (each a list of
        ``window_size`` symbols) and ``y`` is the list of matching labels.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        IndexError: If ``secondary_structures`` is shorter than ``sequences``.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Materialise as a list so list concatenation below works for any iterable
    # (the original `list + sequences` failed for strings, tuples and arrays).
    residues = list(sequences)
    padding = ['_'] * (window_size // 2)
    padded_sequences = padding + residues + padding

    X = [padded_sequences[start:start + window_size] for start in range(len(residues))]
    y = [secondary_structures[position] for position in range(len(residues))]
    return X, y

def one_hot_encode(sequence):
    """One-hot encode a sequence of residue symbols.

    Each element of ``sequence`` becomes one row of 20 integers, one column
    per letter of ``'ACDEFGHIKLMNPQRSTVWY'`` in that order. A row has a
    single 1 in the column of its residue; any element that is not exactly
    one of those 20 letters (for example the ``'_'`` padding symbol) yields
    an all-zero row.

    Args:
        sequence: Sized iterable of residue symbols.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(sequence), 20)``.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    # Exact-match lookup table. A plain `aa in amino_acids` test on a string is
    # a *substring* test, so '' or a multi-letter token such as 'AC' would be
    # treated as a match and encoded as 'A'.
    column_of = {letter: column for column, letter in enumerate(amino_acids)}

    encoded_seq = np.zeros((len(sequence), len(amino_acids)), dtype=int)
    for row, aa in enumerate(sequence):
        column = column_of.get(aa)
        if column is not None:
            encoded_seq[row, column] = 1
    return encoded_seq

def encode_labels(labels):
    """Translate structure label symbols into integer class indices.

    The mapping is ``'_' -> 0``, ``'e' -> 1``, ``'h' -> 2``.

    Args:
        labels: Iterable of label symbols.

    Returns:
        ``numpy.ndarray`` holding one class index per input label.

    Raises:
        KeyError: If a label is not one of ``'_'``, ``'e'`` or ``'h'``.
    """
    class_index = {'_': 0, 'e': 1, 'h': 2}
    codes = []
    for symbol in labels:
        codes.append(class_index[symbol])
    return np.array(codes)

# Data processing

In [4]:
# Locations of the training and test files
train_path = 'Q_and_s_data/protein-secondary-structure.train.txt'
test_path = 'Q_and_s_data/protein-secondary-structure.test.txt'

# Number of residues per sliding window (also fixes the model's input length)
window_size = 13

# Parse both files into flat residue / label lists
X_train, y_train = parse_data(train_path)
X_test, y_test = parse_data(test_path)

# Training split: sliding windows -> one-hot inputs and integer targets
X_train_windows, y_train_windows = create_windows(X_train, y_train, window_size)
X_train_encoded = np.array(list(map(one_hot_encode, X_train_windows)))
y_train_encoded = encode_labels(y_train_windows)

# Test split: identical pipeline
X_test_windows, y_test_windows = create_windows(X_test, y_test, window_size)
X_test_encoded = np.array(list(map(one_hot_encode, X_test_windows)))
y_test_encoded = encode_labels(y_test_windows)

# Model training and evaluation

In [5]:
# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation and batch shuffling (and hence the reported
# metrics) repeat across runs. GPU kernels may still cause small run-to-run
# differences; see tf.config.experimental.enable_op_determinism if exact
# repeatability is required.
tf.keras.utils.set_random_seed(42)

# Define and compile the model: one LSTM layer reads the window of one-hot
# residues, a softmax layer outputs a probability for each of the 3 classes
model = Sequential([
    Input(shape=(window_size, 20)),
    LSTM(units=40),
    Dense(units=3, activation='softmax')
])
# Targets are integer class indices, hence the sparse cross-entropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_encoded, y_train_encoded, epochs=12, batch_size=32)

# Evaluate the model on the held-out test windows
loss, accuracy = model.evaluate(X_test_encoded, y_test_encoded)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.5605 - loss: 0.9432
Epoch 2/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6202 - loss: 0.8285
Epoch 3/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6353 - loss: 0.8067
Epoch 4/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6391 - loss: 0.7952
Epoch 5/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6453 - loss: 0.7827
Epoch 6/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6559 - loss: 0.7703
Epoch 7/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6554 - loss: 0.7651
Epoch 8/12
[1m566/566[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6663 - loss: 0.7443
Epoch 9/12
[1m566/566[0m [32m━━━━━━━━

# Prediction and evaluation

In [6]:
# Class probabilities for every test window, then the most probable class
y_pred_prob = model.predict(X_test_encoded)
y_pred_labels = y_pred_prob.argmax(axis=1)

# Q3: fraction of test residues whose predicted class equals the true class
is_correct = y_pred_labels == y_test_encoded
correct_predictions = is_correct.sum()
total_predictions = y_test_encoded.shape[0]
Q3 = correct_predictions / total_predictions
print("Q3:", Q3)

[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Q3: 0.6082386363636364


In [7]:
# One-vs-rest Matthews correlation coefficient for each structure class.
# The keys follow the integer codes used for the labels: 0 = '_', 1 = 'e', 2 = 'h'.
class_names = {0: 'coil', 1: 'β-sheet', 2: 'α-helix'}
correlation_coefficients = {}
for class_idx, class_name in class_names.items():
    # Binarise both label vectors: 1 where the residue is in this class, else 0
    actual_labels = (y_test_encoded == class_idx).astype(int)
    predicted_labels = (y_pred_labels == class_idx).astype(int)
    correlation_coefficients[class_name] = matthews_corrcoef(actual_labels, predicted_labels)

# Report one coefficient per class
for class_name, correlation_coefficient in correlation_coefficients.items():
    print(f"MCC for class {class_name}: {correlation_coefficient}")

MCC for class coil: 0.3426325633430461
MCC for class β-sheet: 0.3056473124644217
MCC for class α-helix: 0.2820082345444822


1 net:
    Q3 - 62.7%
    Cα - 0.35
    Cβ - 0.29
    Cc - 0.38
    
2 nets:
    Q3 - 64.3%
    Cα - 0.41
    Cβ - 0.31
    Cc - 0.41

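Mine (LSTM in this notebook; these figures do not match the outputs printed above, so they are apparently from a different run):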
    Q3 - 61.1%
    Cα - 0.36
    Cβ - 0.28
    Cc - 0.31

In [None]:
import matplotlib.pyplot as plt

def plot_class_imbalances(y):
    """Show a bar chart of how many times each label occurs in ``y``.

    Bars appear in the order in which the labels are first encountered.
    The figure is drawn with matplotlib's pyplot interface and displayed
    immediately; nothing is returned.

    Args:
        y: Iterable of hashable class labels.
    """
    # Tally the occurrences of every distinct label (first-seen order)
    class_counts = {}
    for label in y:
        class_counts[label] = class_counts.get(label, 0) + 1

    labels = list(class_counts.keys())
    counts = list(class_counts.values())

    plt.figure(figsize=(6, 4))
    plt.bar(labels, counts)
    plt.xticks(labels)
    # Label the figure so it can be understood without reading the code
    plt.xlabel("Class label")
    plt.ylabel("Number of samples")
    plt.title("Class distribution")
    plt.grid(axis='y')
    plt.show()

# Plot how the training residues are distributed over the structure labels.
# y_train is the flat list of label strings returned by parse_data for the
# training file (one entry per residue, before windowing and encoding).
plot_class_imbalances(y_train)
