In [140]:
import pandas as pd
import numpy as np

def read_fasta(file_path):
    """Parse a FASTA file into a DataFrame of record names and sequences.

    A line starting with '>' opens a new record; the text after the '>' is
    the record name. Every other line is stripped of surrounding whitespace
    and appended to the sequence of the record currently being read.
    A record is kept only if both its name and its sequence are non-empty,
    so sequence text that appears before the first header is discarded.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        pandas.DataFrame with columns 'Name' and 'Sequence', one row per
        kept record, in file order.
    """
    names = []
    sequences = []

    def _store(header, parts):
        # A record is kept only when both its name and its sequence are non-empty.
        residues = "".join(parts)
        if header and residues:
            names.append(header)
            sequences.append(residues)

    header = ""
    parts = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                # Continuation line: belongs to the record being accumulated.
                parts.append(text)
                continue
            # Header line: close the previous record, then start a new one.
            _store(header, parts)
            header = text[1:]  # drop the leading '>'
            parts = []

    # The final record has no following header to trigger the store.
    _store(header, parts)
    return pd.DataFrame({'Name': names, 'Sequence': sequences})

def process_sequence(sequence, target_length=320):
    """Pad (by cyclic repetition) or truncate a sequence to a fixed length.

    Sequences shorter than `target_length` are repeated end-to-end until they
    are long enough, then cut; longer sequences are simply cut.

    Args:
        sequence: The sequence to normalise (a string in this notebook).
        target_length: Length of the returned sequence. Defaults to 320.

    Returns:
        The first `target_length` characters of the (possibly repeated)
        sequence.

    Raises:
        ValueError: If `target_length` is negative, or if `sequence` is empty
            (an empty sequence cannot be repeated up to any length).
    """
    if target_length < 0:
        raise ValueError(f"target_length must be non-negative, got {target_length}")
    if len(sequence) == 0:
        # Guard explicitly; otherwise the repeat count below divides by zero.
        raise ValueError("cannot pad an empty sequence to a fixed length")

    if len(sequence) < target_length:
        # Ceiling division: the smallest repeat count that reaches target_length.
        repeats = -(-target_length // len(sequence))
        sequence = sequence * repeats
    return sequence[:target_length]
    

def one_hot_encode_sequence(sequence, encoding_map):
    """One-hot encode a sequence, one row per symbol.

    Args:
        sequence: Iterable of symbols with a length (a string in this notebook).
        encoding_map: Mapping from symbol to a 1-based class label; the label
            minus one is used as the column index.

    Returns:
        Integer numpy array of shape (len(sequence), len(encoding_map)) with a
        single 1 in each row, in the column of that symbol's class.

    Raises:
        KeyError: If a symbol in `sequence` is not a key of `encoding_map`.
    """
    encoded = np.zeros((len(sequence), len(encoding_map)), dtype=int)

    # Iterating a 2-D array yields row views, so writes land in `encoded`.
    for row, symbol in zip(encoded, sequence):
        column = encoding_map[symbol] - 1  # 1-based label -> 0-based column
        row[column] = 1

    return encoded

# Alphabet used for encoding: 20 letters, each mapped to a 1-based class label
# (one_hot_encode_sequence subtracts 1 to obtain the column index).
amino_acid_mapping = {
    'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5,
    'Q': 6, 'E': 7, 'G': 8, 'H': 9, 'I': 10,
    'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15,
    'S': 16, 'T': 17, 'W': 18, 'Y': 19, 'V': 20
}


def load_one_hot_dataset(file_path, encoding_map=amino_acid_mapping):
    """Load a FASTA file and one-hot encode its length-normalised sequences.

    Steps: read the records, pad/truncate every sequence with
    process_sequence, drop records whose processed sequence contains 'U' or
    'X' (letters that are not keys of amino_acid_mapping), and encode the rest.
    The filter looks at the processed sequence, so a 'U' or 'X' that was cut
    off by truncation does not cause a record to be dropped.

    Args:
        file_path: Path of the FASTA file to load.
        encoding_map: Symbol-to-label mapping passed to
            one_hot_encode_sequence. Defaults to amino_acid_mapping.

    Returns:
        Tuple (fasta_df, encoded): the filtered DataFrame (columns 'Name',
        'Sequence', 'Processed_Sequence'; original row index kept) and the
        stacked array of one-hot matrices, one per remaining row.

    Raises:
        KeyError: If a remaining sequence contains a letter that is missing
            from `encoding_map` (only 'U' and 'X' are filtered out here).
        ValueError: From np.stack if no sequences remain after filtering.
    """
    fasta_df = read_fasta(file_path)
    fasta_df = fasta_df.assign(
        Processed_Sequence=fasta_df['Sequence'].apply(process_sequence)
    )

    # One pass for both letters; equivalent to dropping 'U' rows, then 'X' rows.
    has_unsupported = fasta_df['Processed_Sequence'].str.contains('[UX]')
    fasta_df = fasta_df[~has_unsupported].copy()

    encoded = np.stack(
        [
            one_hot_encode_sequence(seq, encoding_map)
            for seq in fasta_df['Processed_Sequence']
        ],
        axis=0,
    )
    return fasta_df, encoded


# Negative training set
negative_train_file_path = '../data/drugfinder/fastadata/Train/negative_train_sequence.fasta'
negative_train_fasta_df, negative_train = load_one_hot_dataset(negative_train_file_path)

# Positive training set
positive_train_file_path = '../data/drugfinder/fastadata/Train/positive_train_sequence.fasta'
positive_train_fasta_df, positive_train = load_one_hot_dataset(positive_train_file_path)

# Negative test set
negative_test_file_path = '../data/drugfinder/fastadata/Independent_Test/negative_test_sequence.fasta'
negative_test_fasta_df, negative_test = load_one_hot_dataset(negative_test_file_path)

# Positive test set
positive_test_file_path = '../data/drugfinder/fastadata/Independent_Test/positive_test_sequence.fasta'
positive_test_fasta_df, positive_test = load_one_hot_dataset(positive_test_file_path)

In [149]:
from sklearn.utils import shuffle

# Training set: positive samples first (label 1.0), then negatives (label 0.0).
X_train = np.concatenate((positive_train, negative_train))
y_train = np.repeat([1.0, 0.0], [len(positive_train), len(negative_train)])

# Independent test set, assembled the same way.
X_test = np.concatenate((positive_test, negative_test))
y_test = np.repeat([1.0, 0.0], [len(positive_test), len(negative_test)])

# Shuffle samples together with their labels; the fixed seed makes the
# resulting order reproducible across runs.
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

In [148]:
# Sanity check of the test tensor; axes are
# (samples, sequence positions, encoding classes).
X_test.shape

(473, 320, 20)