# Specification

## Project 1 -Predict secondary protein structure given the sequence. 

### Completion requirements:

- Reimplement the network described by Qian and Sejnowski in 1988


- Test and compare your accuracy - using their data


- Implement a single improvement, such as profiling


- Test and compare your accuracy again


- Does the model get similar accuracy on unseen datasets?


- Extend your work to other methods, e.g. can large language models help? How about SVMs?

# Import Modules

In [57]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping


# Import Dataset

In [58]:
# Define a function to convert amino acids and structures to one-hot

def one_hot_encode(seq, vocab):
    """One-hot encode a sequence based on a given vocabulary."""
    one_hot = np.zeros((len(seq), len(vocab)), dtype=np.float32)
    for i, char in enumerate(seq):
        if char in vocab:
            one_hot[i, vocab.index(char)] = 1.0
    return one_hot

def prepare_data(filepath, window_size=13):
    sequences = []
    structures = []
    current_seq = []
    current_struct = []
    processing_sequence = False  # Track when inside a sequence block

    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line == '<>':  # Toggle processing flag
                if processing_sequence:  # We are ending a sequence block
                    if current_seq and current_struct:
                        seq_encoded = one_hot_encode(current_seq, aa_vocab)
                        struct_encoded = one_hot_encode(current_struct, structure_vocab)

                        # Apply sliding window
                        for i in range(len(seq_encoded) - window_size + 1):
                            window = seq_encoded[i:i + window_size]
                            label = struct_encoded[i + window_size // 2]
                            sequences.append(window)
                            structures.append(label)

                    current_seq = []
                    current_struct = []
                processing_sequence = not processing_sequence
                continue

            elif 'end' in line:  # Generalized handling for any 'end' marker
                continue  # Just skip this line, do not end processing sequence

            if processing_sequence:
                parts = line.split()
                if len(parts) != 2:
                    continue  # Skip malformed lines or lines that do not fit expected format
                current_seq.append(parts[0])
                current_struct.append(parts[1])

    return np.array(sequences), np.array(structures)


# Define your vocabularies
aa_vocab = 'ACDEFGHIKLMNPQRSTVWY_'  # 20 amino acids + 1 for gap/unknown
structure_vocab = 'he_'  # h for helix, e for sheet, _ for coil

# Example paths, replace with your actual file pathsin
train_path = 'Q_and_s_data/protein-secondary-structure.train.txt'
test_path = 'Q_and_s_data/protein-secondary-structure.test.txt'

x_train, y_train = prepare_data(train_path)
x_test, y_test = prepare_data(test_path)

In [59]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(8110, 13, 21) (8110, 3)
(1714, 13, 21) (1714, 3)


# Model Architecture 

In [60]:
tf.keras.utils.set_random_seed(42)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Define your TensorFlow model architecture with explicit input layer
input_layer = Input(shape=(13, 21))
flattened_layer = Flatten()(input_layer)
dense_layer = Dense(40, activation='sigmoid')(flattened_layer)
output_layer = Dense(3, activation='linear')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='sgd', loss='mean_squared_error', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)

# Fit the model
history = model.fit(
    x_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping]
)

# Extract features directly from the dense layer
feature_extractor = Model(inputs=model.input, outputs=dense_layer)

# Extract features
features_train = feature_extractor.predict(x_train)
features_test = feature_extractor.predict(x_test)

# Flatten features if they're not already flat (optional based on your network structure)
features_train = features_train.reshape(features_train.shape[0], -1)
features_test = features_test.reshape(features_test.shape[0], -1)

# Convert y_train and y_test from one-hot to labels if they are one-hot encoded
label_encoder = LabelEncoder()
y_train_labels = label_encoder.fit_transform(np.argmax(y_train, axis=1))
y_test_labels = label_encoder.transform(np.argmax(y_test, axis=1))

# Initialize and train the SVM
svm_model = SVC(kernel='linear', C=1.0, gamma='auto', random_state=42, verbose=True)
svm_model.fit(features_train, y_train_labels)  # Ensure y_train is flattened

# Predict using the SVM
predictions = svm_model.predict(features_test)

Epoch 1/100


[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4548 - loss: 0.2350 - val_accuracy: 0.5560 - val_loss: 0.2023
Epoch 2/100
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step - accuracy: 0.5251 - loss: 0.2087 - val_accuracy: 0.5554 - val_loss: 0.2008
Epoch 3/100
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5264 - loss: 0.2067 - val_accuracy: 0.5572 - val_loss: 0.1995
Epoch 4/100
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 949us/step - accuracy: 0.5293 - loss: 0.2049 - val_accuracy: 0.5572 - val_loss: 0.1982
Epoch 5/100
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 987us/step - accuracy: 0.5324 - loss: 0.2032 - val_accuracy: 0.5589 - val_loss: 0.1970
Epoch 6/100
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5334 - loss: 0.2016 - val_accuracy: 0.5624 - val_loss: 0.1959
Epoch 7/100
[1m254/254[0m 

In [61]:
# Ensure you're using y_test_labels for comparison
q3_score = accuracy_score(y_test_labels, predictions)
mcc_values = [matthews_corrcoef(y_test_labels == i, predictions == i) for i in range(3)]  # Calculate per-class MCC

# Print results
print("Q3 Score:", q3_score)
print("Matthews Correlation Coefficients per class:", mcc_values)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test_labels, predictions)
print("Confusion Matrix:\n", conf_matrix)

Q3 Score: 0.6330221703617269
Matthews Correlation Coefficients per class: [0.32194512661418, 0.3431409352308367, 0.35995035070965586]
Confusion Matrix:
 [[200  27 208]
 [ 66 105 155]
 [125  48 780]]
