# Specification

## Project 1 -Predict secondary protein structure given the sequence. 

### Completion requirements:

- Reimplement the network described by Qian and Sejnowski in 1988


- Test and compare your accuracy - using their data


- Implement a single improvement, such as profiling


- Test and compare your accuracy again


- Does the model get similar accuracy on unseen datasets?


- Extend your work to other methods, e.g. can large language models help? How about SVMs?

# Import Modules

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.preprocessing import LabelEncoder


# Import Dataset

In [2]:
# convert a sequence of amino acids to a one-hot encoded matrix
def one_hot_encode(seq, vocab):
    # Create a matrix of zeros, with dimensions len(seq) x len(vocab)
    one_hot = np.zeros((len(seq), len(vocab)), dtype=np.float32)
    # Iterate over the sequence and set the appropriate elements to 1.0
    for i, char in enumerate(seq):
        if char in vocab:
            one_hot[i, vocab.index(char)] = 1.0
    return one_hot


# prepare data to enter neural network
def prepare_data(filepath, window_size=13):
    sequences = []
    structures = []
    current_seq = []
    current_struct = []
    processing_sequence = False  # Track when inside a sequence block

    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line == '<>': # Sequence separator
                if processing_sequence: 
                    if current_seq and current_struct:
                        seq_encoded = one_hot_encode(current_seq, aa_vocab) # One-hot encode sequence
                        struct_encoded = one_hot_encode(current_struct, structure_vocab) # One-hot encode structure

                        # Apply sliding window of determined size
                        for i in range(len(seq_encoded) - window_size + 1):
                            window = seq_encoded[i:i + window_size]
                            label = struct_encoded[i + window_size // 2]
                            sequences.append(window)
                            structures.append(label)

                    current_seq = []
                    current_struct = []
                processing_sequence = not processing_sequence
                continue

            elif 'end' in line:  # end of sequence or file
                continue  
            
            # If inside a sequence block, process the sequence
            # handles errors in sequence end lines in the dataset
            if processing_sequence:
                parts = line.split()
                if len(parts) != 2:
                    continue  
                current_seq.append(parts[0])
                current_struct.append(parts[1])

    return np.array(sequences), np.array(structures)


# Define your vocabularies
aa_vocab = 'ACDEFGHIKLMNPQRSTVWY_'  # 20 amino acids + 1 for gap/unknown
structure_vocab = 'he_'  # h for helix, e for sheet, _ for coil

# Example paths, replace with your actual file pathsin
train_path = 'Q_and_s_data/protein-secondary-structure.train.txt'
test_path = 'Q_and_s_data/protein-secondary-structure.test.txt'

x_train, y_train = prepare_data(train_path)
x_test, y_test = prepare_data(test_path)

In [3]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(8110, 13, 21) (8110, 3)
(1714, 13, 21) (1714, 3)


# Model Architecture 

In [24]:
# Set random seed for reproducibility
tf.keras.utils.set_random_seed(42)

# Define the number of hidden units
hidden_units = 40

# Define your TensorFlow model architecture with explicit input layer
input_layer = Input(shape=(13, 21))
flattened_layer = Flatten()(input_layer)
dense_layer = Dense(hidden_units, activation='sigmoid')(flattened_layer)
output_layer = Dense(3, activation='linear')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='sgd', loss='mean_squared_error', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)

# Fit the model
history = model.fit(
    x_train, y_train,
    epochs=150,
    batch_size=32,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping]
)

# Extract features directly from the dense layer
feature_extractor = Model(inputs=model.input, outputs=dense_layer)

# Extract features
features_train = feature_extractor.predict(x_train)
features_test = feature_extractor.predict(x_test)

# Flatten features
features_train = features_train.reshape(features_train.shape[0], -1)
features_test = features_test.reshape(features_test.shape[0], -1)

# Convert y_train and y_test from one-hot to labels
label_encoder = LabelEncoder()
y_train_labels = label_encoder.fit_transform(np.argmax(y_train, axis=1))
y_test_labels = label_encoder.transform(np.argmax(y_test, axis=1))

# Initialize and train the SVM
svm_model = SVC(kernel='linear', C=3, gamma='auto', random_state=42, verbose=True)
svm_model.fit(features_train, y_train_labels)  # Ensure y_train is flattened

# Predict using the SVM
predictions = svm_model.predict(features_test)

Epoch 1/150
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4790 - loss: 0.2545 - val_accuracy: 0.5566 - val_loss: 0.2020
Epoch 2/150
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step - accuracy: 0.5313 - loss: 0.2054 - val_accuracy: 0.5589 - val_loss: 0.2006
Epoch 3/150
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5325 - loss: 0.2035 - val_accuracy: 0.5595 - val_loss: 0.1993
Epoch 4/150
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 862us/step - accuracy: 0.5351 - loss: 0.2018 - val_accuracy: 0.5589 - val_loss: 0.1980
Epoch 5/150
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 913us/step - accuracy: 0.5365 - loss: 0.2002 - val_accuracy: 0.5583 - val_loss: 0.1968
Epoch 6/150
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 876us/step - accuracy: 0.5375 - loss: 0.1986 - val_accuracy: 0.5589 - val_loss: 0.1957
Epoch 7/150
[

In [25]:
q3_score = accuracy_score(y_test_labels, predictions)
mcc_values = [matthews_corrcoef(y_test_labels == i, predictions == i) for i in range(3)]  # Calculate per-class MCC

# Print results
print("Q3 Score:", round(q3_score * 100, 1))
print("Matthews Correlation Coefficients per class:", mcc_values)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test_labels, predictions)
print("Confusion Matrix:\n", conf_matrix)

Q3 Score: 62.1
Matthews Correlation Coefficients per class: [0.3059098537633514, 0.3176546797501633, 0.3449183493208053]
Confusion Matrix:
 [[196  30 209]
 [ 72 109 145]
 [126  68 759]]
