In [None]:
# Import the required libraries (standard library first, then third-party)
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import files
from keras import backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the dataset: upload an Excel file through the Colab file picker and read it into a DataFrame
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled upload surfaces as an opaque StopIteration below.
    raise ValueError("No file was uploaded; re-run this cell and select the dataset file.")
file_name = next(iter(uploaded))  # name of the first uploaded file

# Pass the path straight to read_excel so pandas opens and closes the workbook itself.
df = pd.read_excel(file_name)

# Keep rows whose `len` value lies in [100, 300] (inclusive) and whose
# `has_nonstd_aa` flag is false.
ss_all = df.query('len.between(100, 300) & ~has_nonstd_aa')
if ss_all.empty:
    raise ValueError("No rows left after filtering on `len` and `has_nonstd_aa`.")

# Build the model inputs and labels from the FILTERED frame; reading them from
# `df` would silently discard the filter applied above.
sequences, sst3_labels = ss_all['seq'].tolist(), ss_all['sst3'].tolist()

In [None]:
# Split the data 60% train / 20% validation / 20% test.
# Step 1: hold out 20% of all samples as the test set.
trainval_seqs, test_seqs, trainval_sst3, test_sst3 = train_test_split(
    sequences,
    sst3_labels,
    test_size=0.2,
    random_state=42,
)
# Step 2: take 25% of the remaining 80% (i.e. 20% of the total) as the validation set.
train_seqs, valid_seqs, train_sst3, valid_sst3 = train_test_split(
    trainval_seqs,
    trainval_sst3,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit the tokenizers on the training split only, so the vocabularies contain
# nothing taken from the validation or test data.

# Input tokenizer: one token per character of the sequence string.
# A word-level Tokenizer() splits on whitespace, so an unspaced sequence string
# would become a single token and could never line up with the per-character
# labels produced by `decoder`.
# NOTE(review): assumes `seq` values are plain strings without spaces - confirm against the dataset.
encoder = Tokenizer(char_level=True)
encoder.fit_on_texts(train_seqs)

# Label tokenizer: one token per character of the sst3 string.
decoder = Tokenizer(char_level=True)
decoder.fit_on_texts(train_sst3)

In [None]:
# Early stopping: end training once validation accuracy has failed to improve
# for 3 consecutive epochs, then restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',  # accuracy should increase; stated explicitly instead of relying on name-based inference
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Encode the training and validation splits into model-ready arrays using the
# tokenizers fitted on the training data.
# NOTE(review): `preprocess_sequences` is not defined in the visible cells -
# confirm it is defined earlier so the notebook survives Restart & Run All.
(X_train, y_train), (X_valid, y_valid) = (
    preprocess_sequences(split_seqs, split_labels, encoder, decoder)
    for split_seqs, split_labels in (
        (train_seqs, train_sst3),
        (valid_seqs, valid_sst3),
    )
)

In [None]:
# Build and compile the per-position tagging model:
# embedding -> bidirectional LSTM -> softmax over label classes at every position.
# The +1 accounts for index 0, which the Keras Tokenizer never assigns to a token.
n_input_tokens = len(encoder.word_index) + 1
n_label_classes = len(decoder.word_index) + 1
padded_length = X_train.shape[1]

model = Sequential()
model.add(Embedding(input_dim=n_input_tokens, output_dim=128, input_length=padded_length))
model.add(Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(n_label_classes, activation='softmax')))

# NOTE(review): `q3_acc` is not defined in the visible cells - confirm it is
# defined earlier so the notebook survives Restart & Run All.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy", q3_acc],
)

In [None]:
# Fit the model on the training arrays, validating after every epoch and
# letting the early-stopping callback cut training short.
fit_options = dict(
    batch_size=128,
    epochs=10,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Plot the training curves recorded by model.fit
def plot_history_metric(fit_history, metric, label):
    """Plot the training and validation curves of one metric against epochs.

    Args:
        fit_history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        metric: key of the training series, e.g. ``'loss'``; the validation
            series is read from ``'val_' + metric``.
        label: human-readable metric name used in the title, y-axis label and legend.

    Raises:
        KeyError: if either series is missing from ``fit_history.history``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(fit_history.history[metric], label=f'Training {label}')
    ax.plot(fit_history.history[f'val_{metric}'], label=f'Validation {label}')
    ax.set_title(f'Training and Validation {label}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(label)
    ax.legend()
    plt.show()


# One figure per tracked quantity: loss, accuracy and Q3 accuracy.
for metric, label in [
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
    ('q3_acc', 'Q3 Accuracy'),
]:
    plot_history_metric(history, metric, label)

In [None]:
# Evaluate the trained model on the held-out test split
# Encode the test data with the same tokenizers used for training.
X_test, y_test = preprocess_sequences(test_seqs, test_sst3, encoder, decoder)

# evaluate() returns the loss followed by the metrics in the order passed to compile().
test_results = model.evaluate(X_test, y_test, verbose=1)

# Report each value next to its label.
report_labels = ("Test Loss:", "Test Accuracy:", "Test Q3 Accuracy:")
for position, report_label in enumerate(report_labels):
    print(report_label, test_results[position])