In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [21]:
import os
from Bio import SeqIO

def encode_and_save_fasta(fasta_file, data_file):
    # Define the mapping for encoding
    nucleotide_mapping = {'A': '1', 'C': '2', 'G': '3', 'T': '4'}

    # Ensure the directory exists
    os.makedirs(os.path.dirname(data_file), exist_ok=True)

    # Extract sequences, encode, and save to the file
    with open(data_file, 'w') as f:
        for record in SeqIO.parse(fasta_file, "fasta"):
            label = record.description  # Get the label from the header
            sequence = str(record.seq).upper()  # Ensure the sequence is uppercase

            # Encode the sequence using the nucleotide mapping
            encoded_sequence = ''.join([nucleotide_mapping.get(nuc, '0') for nuc in sequence])

            # Write to the file in the format: class label encoded_data
            f.write(f"{label} {encoded_sequence}\n")

# File paths
fasta_file = "data2/fungi_ITS_cleaned.fasta"
data_file = "data2/encoded_data.txt"

# Call the function
encode_and_save_fasta(fasta_file, data_file)


# genera more than x samples

In [22]:
import os
from collections import Counter
def filter_classes_with_more_than_5_samples(encoded_data_file, filtered_data_file):
    # Step 1: Count occurrences of each class
    class_counts = Counter()

    # Read the encoded data and count class occurrences
    with open(encoded_data_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            class_label = line.split()[0]  # Assume class label is the first part of each line
            class_counts[class_label] += 1

    # Step 2: Filter classes with more than 5 samples
    valid_classes = {cls for cls, count in class_counts.items() if count >= 5}

    # Step 3: Save filtered sequences to another file
    with open(filtered_data_file, 'w') as filtered_f:
        for line in lines:
            class_label = line.split()[0]
            # Write to the filtered file only if the class has more than 5 samples
            if class_label in valid_classes:
                filtered_f.write(line)


encoded_data_file = "data2/encoded_data.txt"
filtered_data_file = "data2/filtered_encoded_data.txt"
filter_classes_with_more_than_5_samples(encoded_data_file, filtered_data_file)

# train test split

In [23]:
import os
import random
from collections import defaultdict

def train_test_split(filtered_data_file, train_file, test_file):
    # Dictionary to hold samples for each class
    class_samples = defaultdict(list)

    # Step 1: Read the filtered data file and organize samples by class
    with open(filtered_data_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            class_label = line.split()[0]
            class_samples[class_label].append(line)

    # Step 2: Split into train and test
    train_samples = []
    test_samples = []

    for class_label, samples in class_samples.items():
        if len(samples) > 1:  # Only take a sample for test if there are multiple samples
            test_sample = random.choice(samples)
            test_samples.append(test_sample)
            # Add the remaining samples to the train set
            train_samples.extend([sample for sample in samples if sample != test_sample])
        else:
            # If only one sample, add it to the train set
            train_samples.extend(samples)

    # Step 3: Save train and test samples to respective files
    with open(train_file, 'w') as train_f, open(test_file, 'w') as test_f:
        train_f.writelines(train_samples)
        test_f.writelines(test_samples)

# File paths
filtered_data_file = "data2/filtered_encoded_data.txt"
train_file = "data2/train_data.txt"
test_file = "data2/test_data.txt"

# Perform train-test split
train_test_split(filtered_data_file, train_file, test_file)


In [26]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

class SequenceDataset(Dataset):
    def __init__(self, data_file):
        self.samples = []
        self.label_mapping = {}  # To keep track of label-to-number mapping
        label_counter = 0
        
        # Read the file and parse each line
        with open(data_file, 'r') as f:
            lines = f.readlines()
            for line in lines:
                parts = line.strip().split(maxsplit=1)
                label = parts[0]
                sequence_str = parts[1]

                # Map class label to a numeric value
                if label not in self.label_mapping:
                    self.label_mapping[label] = label_counter
                    label_counter += 1

                numeric_label = self.label_mapping[label]
                sequence = [int(x) for x in sequence_str]  # Convert encoded sequence characters to integers
                self.samples.append((sequence, numeric_label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sequence, label = self.samples[idx]
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# Custom collate function to pad sequences
def pad_collate(batch):
    sequences, labels = zip(*batch)

    # Find the longest sequence in the batch
    max_length = max(len(seq) for seq in sequences)

    # Pad sequences to the maximum length
    padded_sequences = []
    for seq in sequences:
        padded_seq = torch.cat([seq, torch.zeros(max_length - len(seq))])
        padded_sequences.append(padded_seq)

    # Stack padded sequences and labels
    padded_sequences = torch.stack(padded_sequences)
    labels = torch.tensor(labels, dtype=torch.long)

    return padded_sequences, labels

# File paths
train_file = "data2/train_data.txt"
test_file = "data2/test_data.txt"

# Create Dataset objects
train_dataset = SequenceDataset(train_file)
test_dataset = SequenceDataset(test_file)

# Create DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate)

# Get class mapping
class_mapping = train_dataset.label_mapping
inverse_class_mapping = {v: k for k, v in class_mapping.items()}  # Reverse the mapping to get label names

# Print dataset information
print(f"Number of samples in training dataset: {len(train_dataset)}")
print(f"Class mapping: {class_mapping}")
print(f"Number of unique classes: {len(class_mapping)}")

# Iterate over the DataLoader and print batch information
for batch_idx, (data, labels) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print("Data shape:", data.shape)
    print("Labels:", labels)

    # Map label indices to class names for each batch
    label_names = [inverse_class_mapping[idx.item()] for idx in labels]
    print("Label Names:", label_names)
    print("Number of labels in batch:", len(label_names))
    print()


Number of samples in training dataset: 2500
Class mapping: {'Cortinarius': 0, 'Penicillium': 1, 'Aspergillus': 2, 'Inocybe': 3, 'Colletotrichum': 4, 'Trichoderma': 5, 'Phlegmacium': 6, 'Talaromyces': 7, 'Fusarium': 8, 'Agaricus': 9, 'Amanita': 10, 'Entoloma': 11, 'Orbilia': 12, 'Russula': 13, 'Lactarius': 14, 'Thaxterogaster': 15, 'Elsinoe': 16, 'Psathyrella': 17, 'Cytospora': 18, 'Phyllosticta': 19, 'Mucor': 20, 'Candida': 21, 'Apiospora': 22, 'Hydnum': 23, 'Exophiala': 24, 'Marasmius': 25, 'Zasmidium': 26, 'Hypoxylon': 27, 'Ogataea': 28, 'Ophiostoma': 29, 'Tuber': 30, 'Pluteus': 31, 'Scolecobasidium': 32, 'Lactifluus': 33, 'Metschnikowia': 34, 'Leucoagaricus': 35, 'Sticta': 36, 'Claviceps': 37, 'Sporothrix': 38, 'Gymnopus': 39, 'Xylodon': 40, 'Cladophialophora': 41, 'Distoseptispora': 42, 'Tomentella': 43, 'Ganoderma': 44, 'Derxomyces': 45, 'Otidea': 46, 'Kazachstania': 47, 'Coniochaeta': 48, 'Verrucaria': 49, 'Sarocladium': 50, 'Lipomyces': 51, 'Hygrophorus': 52, 'Tubeufia': 53, 'Ge

# RRCNN-LSTM

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

class RRCNN_LSTM(nn.Module):
    def __init__(self, num_classes, embedding_dim=4):
        super(RRCNN_LSTM, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings=5, embedding_dim=embedding_dim, padding_idx=0)
        
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5, padding=2)
        
        self.lstm = nn.LSTM(input_size=64, hidden_size=128, num_layers=2, batch_first=True, bidirectional=True)
        
        self.fc1 = nn.Linear(128 * 2, 128) 
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.embedding(x)  
        x = x.permute(0, 2, 1)

        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
    
        x = x.permute(0, 2, 1)  
        x, _ = self.lstm(x)
        
        x = x[:, -1, :]  
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x
    
embedding_dim = 4  
num_classes = len(train_dataset.label_mapping)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model, Loss, and Optimizer
model = RRCNN_LSTM(num_classes=num_classes, embedding_dim=embedding_dim)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

# Training loop with testing
num_epochs = 100
train_accuracies = []
test_accuracies = []

for epoch in range(num_epochs):
    # Training
    model.train()
    total_train_loss = 0
    total_correct_train = 0
    total_train_samples = 0

    for batch_idx, (data, labels) in enumerate(train_loader):
        data, labels = data.to(device).long(), labels.to(device)  # Explicitly convert data to LongTensor

        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute training metrics
        total_train_loss += loss.item()
        _, predicted_train = torch.max(outputs, 1)
        total_correct_train += (predicted_train == labels).sum().item()
        total_train_samples += labels.size(0)

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = total_correct_train / total_train_samples * 100
    train_accuracies.append(train_accuracy)
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

    # Evaluate on test dataset
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device).long(), labels.to(device)  # Explicitly convert data to LongTensor
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples * 100
    test_accuracies.append(accuracy)
    print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {accuracy:.2f}%')

    # Adjust learning rate if needed
    scheduler.step(avg_train_loss)


Epoch [1/100], Training Loss: 5.2629, Training Accuracy: 7.44%
Epoch [1/100], Test Accuracy: 0.40%
Epoch [2/100], Training Loss: 5.1314, Training Accuracy: 7.96%
Epoch [2/100], Test Accuracy: 0.40%
Epoch [3/100], Training Loss: 5.1235, Training Accuracy: 7.96%
Epoch [3/100], Test Accuracy: 0.40%
Epoch [4/100], Training Loss: 5.1129, Training Accuracy: 7.88%
Epoch [4/100], Test Accuracy: 0.40%


KeyboardInterrupt: 