In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, BatchNormalization, Bidirectional, LSTM
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Step 1: Data Preprocessing
def read_dataset(file_path):
    """Read a dataset stored as consecutive three-line records.

    Every record consists of an id line, a sequence line and a class line,
    in that order. Leading/trailing whitespace is stripped from each field.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(ids, sequences, classes)`` of three equally long lists.

    Raises:
        ValueError: If, after trailing blank lines are dropped, the number of
            lines is not a multiple of three (the last record is incomplete).
    """
    with open(file_path, 'r') as file:
        lines = [line.strip() for line in file]

    # A stray newline at the end of the file must not be mistaken for the
    # start of another (incomplete or empty) record.
    while lines and not lines[-1]:
        lines.pop()

    if len(lines) % 3 != 0:
        raise ValueError(
            f"{file_path}: expected 3 lines per record (id, sequence, class), "
            f"but found {len(lines)} lines"
        )

    # Records are interleaved line by line, so a stride of 3 separates the fields.
    ids = lines[0::3]
    sequences = lines[1::3]
    classes = lines[2::3]
    return ids, sequences, classes

def kmer_encoding(sequence, k=4):
    """Split ``sequence`` into its overlapping windows of length ``k``.

    Windows start at every position from 0 up to ``len(sequence) - k``; a
    sequence shorter than ``k`` yields an empty list.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

def one_hot_encode(sequences, unique_kmers, k=4):
    """One-hot encode the k-mers of every sequence.

    Each sequence is split with ``kmer_encoding``, every k-mer is mapped to
    its position in ``unique_kmers``, and the resulting index list is passed
    through ``to_categorical`` with ``len(unique_kmers)`` classes.

    Args:
        sequences: Iterable of sequence strings.
        unique_kmers: Ordered k-mer vocabulary; a k-mer's position in it is
            the index that gets one-hot encoded.
        k: k-mer length used to split each sequence. Defaults to 4, the
            length that was previously implied by ``kmer_encoding``'s default.

    Returns:
        ``np.array`` built from the per-sequence one-hot matrices.
        NOTE(review): the sequences are presumably all the same length so the
        matrices stack into one regular array - confirm against the dataset.

    Raises:
        KeyError: If a sequence contains a k-mer missing from ``unique_kmers``.
    """
    kmer_to_index = {kmer: i for i, kmer in enumerate(unique_kmers)}
    num_kmers = len(unique_kmers)

    encoded_sequences = []
    for seq in sequences:
        encoded_seq = [kmer_to_index[kmer] for kmer in kmer_encoding(seq, k)]
        encoded_sequences.append(to_categorical(encoded_seq, num_classes=num_kmers))
    return np.array(encoded_sequences)

# Load the raw records from disk
file_path = r'C:\\Users\\shire\\OneDrive\\Desktop\\Project\\dataset.txt'
ids, sequences, classes = read_dataset(file_path)

# Gather every k-mer occurring in the dataset, then reduce to a sorted vocabulary
temp_kmers = [kmer for kmers in map(kmer_encoding, sequences) for kmer in kmers]
unique_kmers = sorted(set(temp_kmers))
num_kmers = len(unique_kmers)

# Turn the class names into integer ids, then into one-hot rows
label_encoder = LabelEncoder()
encoded_labels = to_categorical(label_encoder.fit_transform(classes))

# Hold out 20% of the records for validation (fixed seed for a repeatable split)
X_train_seqs, X_val_seqs, y_train, y_val = train_test_split(
    sequences,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Step 2: Data Generator
class DataGenerator(Sequence):
    """Batch generator that one-hot encodes sequences on demand.

    Only the raw sequence strings are kept in memory; each batch is encoded
    with ``one_hot_encode`` when it is requested. A trailing partial batch is
    dropped (see ``__len__``).

    Args:
        sequences: Indexable collection of sequence strings.
        labels: Indexable collection of labels aligned with ``sequences``.
        unique_kmers: Ordered k-mer vocabulary handed to ``one_hot_encode``.
        batch_size: Number of samples per batch.
        k: k-mer length. NOTE: stored only; ``one_hot_encode`` is called
            without it below, so the encoder's own default length applies.
        shuffle: Whether ``on_epoch_end`` shuffles the sample order.
        **kwargs: Forwarded unchanged to the Keras base-class constructor.
    """

    def __init__(self, sequences, labels, unique_kmers, batch_size=16, k=4, shuffle=True, **kwargs):
        # The Keras base class initialises its own state in __init__ and
        # warns when a subclass skips this call.
        super().__init__(**kwargs)
        self.sequences = sequences
        self.labels = labels
        self.batch_size = batch_size
        self.k = k
        self.shuffle = shuffle
        self.unique_kmers = unique_kmers
        self.num_kmers = len(unique_kmers)
        # Build the initial sample order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a partial last batch is dropped."""
        return len(self.sequences) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_sequences = [self.sequences[i] for i in indexes]
        batch_labels = [self.labels[i] for i in indexes]

        X, y = self.__data_generation(batch_sequences, batch_labels)
        return X, y

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.sequences))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_sequences, batch_labels):
        """Encode one batch of sequences and convert its labels to an array."""
        X = one_hot_encode(batch_sequences, self.unique_kmers)
        y = np.array(batch_labels)
        return X, y

# Initialize data generators
training_generator = DataGenerator(X_train_seqs, y_train, unique_kmers, batch_size=16)
# Validation data is not shuffled: the generator drops the partial last batch,
# so shuffling would change which samples are evaluated from run to run and
# make val_loss (monitored by the LR scheduler) harder to compare.
validation_generator = DataGenerator(X_val_seqs, y_val, unique_kmers, batch_size=16, shuffle=False)

# Step 3: Model Building
def build_model(input_shape, num_classes):
    """Build and compile the Conv1D + bidirectional LSTM classifier.

    Args:
        input_shape: Shape of one sample, ``(timesteps, features)``.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing ``input_shape`` to
        # the first layer, which Keras warns about for Sequential models.
        tf.keras.Input(shape=input_shape),

        # Convolutional block 1
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),

        # Convolutional block 2
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),

        # Recurrent block: the first LSTM returns the full sequence so the
        # second one can consume it; the second returns only its final output.
        Bidirectional(LSTM(32, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(32)),
        Dropout(0.5),

        # Classification head
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Derive the model dimensions from the data
num_classes = y_train.shape[1]
# Number of k-mer windows in the first sequence; used as the timestep count
sequence_length = len(kmer_encoding(sequences[0]))
input_shape = (sequence_length, num_kmers)  # (timesteps, features)
model = build_model(input_shape, num_classes)

# Step 4: Training and Evaluation

# Halve the learning rate after 3 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
)

# Fit on the generator-fed batches
history = model.fit(
    training_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=[reduce_lr],
)

# Score the trained model on the validation batches
loss, accuracy = model.evaluate(validation_generator)
print(f'Validation Accuracy: {accuracy:.4f}')

# Accuracy curves for the training and validation sets
plt.figure(figsize=(10, 6))
plt.plot(history.history['accuracy'], label='Train')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50


  self._warn_if_super_not_called()


[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 117ms/step - accuracy: 0.5327 - loss: 0.6944 - val_accuracy: 0.5476 - val_loss: 0.6784 - learning_rate: 0.0010
Epoch 2/50
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 115ms/step - accuracy: 0.5726 - loss: 0.6798 - val_accuracy: 0.6328 - val_loss: 0.6532 - learning_rate: 0.0010
Epoch 3/50
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 117ms/step - accuracy: 0.6301 - loss: 0.6550 - val_accuracy: 0.6579 - val_loss: 0.6279 - learning_rate: 0.0010
Epoch 4/50
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 118ms/step - accuracy: 0.6575 - loss: 0.6380 - val_accuracy: 0.6842 - val_loss: 0.6133 - learning_rate: 0.0010
Epoch 5/50
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 118ms/step - accuracy: 0.6784 - loss: 0.6192 - val_accuracy: 0.6712 - val_loss: 0.6181 - learning_rate: 0.0010
Epoch 6/50
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━