In [1]:
# Reading the uploaded FASTA file and modifying the headers to only retain the second string

input_file_path = 'data/fungi_ITS_sorted90.fasta'
output_file_path = 'data/fungi_ITS_cleaned.fasta'

# Open the input file and create the output file
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        if line.startswith('>'):
            # Splitting the line and keeping only the second string
            header_parts = line.strip().split()
            if len(header_parts) > 1:
                modified_header = '>' + header_parts[1] + '\n'
                outfile.write(modified_header)
            else:
                outfile.write(line)
        else:
            # Write sequence lines as they are
            outfile.write(line)

output_file_path


'data/fungi_ITS_cleaned.fasta'

# vectorization

In [2]:
import itertools

def generate_all_kmers(k):
    return [''.join(p) for p in itertools.product('ACGT', repeat=k)]

def kmer_vector(sequence, k):
    # gen all possible k-mers comb
    all_kmers = generate_all_kmers(k)
    vector = [0] * len(all_kmers)
    kmer_to_index = {kmer: idx for idx, kmer in enumerate(all_kmers)}
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i+k]
        if kmer in kmer_to_index:
            vector[kmer_to_index[kmer]] += 1
    return vector

input_file_path = 'data/fungi_ITS_cleaned.fasta'
output_file_path = 'data/fungi_ITS_kmer_vector.txt'

############################# k
k = 4


with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    current_sequence = ""
    current_header = ""
    
    for line in infile:
        if line.startswith('>'):
            if current_sequence:
                vector = kmer_vector(current_sequence, k)
                vector_str = ' '.join(map(str, vector))
                outfile.write(f"{current_header}\t{vector_str}\n")
            current_header = line.strip()
            current_sequence = ""
        else:
            current_sequence += line.strip()
    
    if current_sequence:
        vector = kmer_vector(current_sequence, k)
        vector_str = ' '.join(map(str, vector))
        outfile.write(f"{current_header}\t{vector_str}\n")

output_file_path


'data/fungi_ITS_kmer_vector.txt'

In [3]:
# File paths
input_file_path = 'data/fungi_ITS_kmer_vector.txt'
output_file_path = 'data/fungi_ITS_kmer_vector_with_header.txt'

# k value to consider
k = 6

# Generate all possible k-mers for the header row
all_kmers = generate_all_kmers(k)
header_row = "ID\t" + "\t".join(all_kmers) + "\n"

# Open the input file and create the output file with the header row
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    # Write the header row to the output file
    outfile.write(header_row)
    # Write the rest of the content from the input file
    for line in infile:
        outfile.write(line)

output_file_path


'data/fungi_ITS_kmer_vector_with_header.txt'

# train test split

In [4]:
import random

input_file_path = 'data/fungi_ITS_kmer_vector.txt'
train_file_path = 'data/fungi_ITS_train.txt'
test_file_path = 'data/fungi_ITS_test.txt'

data_by_class = {}

with open(input_file_path, 'r') as infile:
    for line in infile:
        line = line.strip()
        if line.startswith('>'):
            parts = line.split('\t')
            if len(parts) == 2:
                label = parts[0][1:]  # remove '>'
                vector_str = parts[1]
                
                if label not in data_by_class:
                    data_by_class[label] = []
                data_by_class[label].append(line)

test_data = []
train_data = []

for label, samples in data_by_class.items():
    if len(samples) > 1:
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        train_data.extend([sample for sample in samples if sample != test_sample])
    else:
        train_data.extend(samples)

with open(train_file_path, 'w') as train_file:
    for line in train_data:
        train_file.write(line + '\n')

with open(test_file_path, 'w') as test_file:
    for line in test_data:
        test_file.write(line + '\n')


train_file_path, test_file_path


('data/fungi_ITS_train.txt', 'data/fungi_ITS_test.txt')

# start

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Load data and labels from input file
input_file_path = 'data/fungi_ITS_kmer_vector.txt'

labels = []
data_vectors = []

with open(input_file_path, 'r') as infile:
    for line in infile:
        line = line.strip()
        if line.startswith('>'):
            parts = line.split('\t')
            if len(parts) == 2:
                label = parts[0][1:]  # remove '>'
                vector_str = parts[1]
                vector = list(map(int, vector_str.split()))
                labels.append(label)
                data_vectors.append(vector)

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Convert data and labels to tensors
data_tensor = torch.tensor(data_vectors, dtype=torch.float32)
labels_tensor = torch.tensor(encoded_labels, dtype=torch.long)

# Define Dataset class
class KmerDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create the DataLoader
kmer_dataset = KmerDataset(data_tensor, labels_tensor)
dataloader = DataLoader(kmer_dataset, batch_size=32, shuffle=True)

# Create a dictionary to map class indices to class names
class_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}

# Print dataset information
print(len(kmer_dataset))
print(set(labels))
print(len(label_encoder.classes_))

for batch_idx, (data, labels) in enumerate(dataloader):
    print(f"Batch {batch_idx + 1}:")
    print("Data:", data)
    print("Labels:", labels)
    # Map label indices to class names for each batch
    label_names = [class_mapping[idx.item()] for idx in labels]
    print("Label Names:", label_names)
    print("print len of label names:", len(label_names))
    print()

6205
{'Phaeoacremonium', 'Arthrographis', 'Bambusicola', 'Cyclothyriella', 'Anthopsis', 'Anthostomella', 'Pyrenula', 'Striaticonidium', 'Neoantrodia', 'Leiothecium', 'Eriospora', 'Castanediella', 'Neophaeosphaeriopsis', 'Amanita', 'Custingophora', 'Dioszegia', 'Pseudoramichloridium', 'Elbamycella', 'Mallocybe', 'Arecophila', 'Pseudobensingtonia', 'Rhizoplaca', 'Neodendryphiella', 'Eremiomyces', 'Geomyces', 'Nothoramularia', 'Achroiostachys', 'Acanthostigmina', 'Neoramichloridium', 'Aschersonia', 'Kiflimonium', 'Rusavskia', 'Echinusitheca', 'Nagrajchalara', 'Dactylidispora', 'Bulleribasidium', 'Tritirachium', 'Nothocybe', 'Rhynchogastrema', 'Neohelicomyces', 'Alogomyces', 'Kraurogymnocarpa', 'Spizellomyces', 'Naohidea', 'Ascosacculus', 'Strigula', 'Fissuroma', 'Coprinopsis', 'Xylogone', 'Neocudoniella', 'Conocybe', 'Astragalicola', 'Xylophallus', 'Rhexothecium', 'Auxarthron', 'Bullera', 'Nullicamyces', 'Phaeophlebiopsis', 'Strobilomyces', 'Cystinarius', 'Powellomyces', 'Atrophysma', 'Di

In [6]:
# Define a CNN model with Conv1D layers and fully connected layers
class CNNModel(nn.Module):
    def __init__(self, input_length, nb_classes):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(1, 5, 5, padding='valid')
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(5, 10, 5, padding='valid')
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(10 * ((input_length - 4) // 2 - 4) // 2, 500)
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(500, nb_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1) 
        x = self.relu1(self.conv1(x))
        x = self.pool1(x)
        x = self.relu2(self.conv2(x))
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.relu3(self.fc1(x))
        x = self.dropout(x)
        x = self.softmax(self.fc2(x))
        return x

input_size = len(data_vectors[0])
num_classes = len(label_encoder.classes_)
model = CNNModel(input_size, num_classes)
input_size = len(data_vectors[0])
num_classes = len(label_encoder.classes_)
model = CNNModel(input_size, num_classes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_size = int(0.8 * len(kmer_dataset))
test_size = len(kmer_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(kmer_dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training loop with testing
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch_idx, (data, labels) in enumerate(train_loader):
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}')
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples * 100
    print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {accuracy:.2f}%')

  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/100], Training Loss: 7.8682
Epoch [1/100], Test Accuracy: 2.42%
Epoch [2/100], Training Loss: 7.8650
Epoch [2/100], Test Accuracy: 2.42%
Epoch [3/100], Training Loss: 7.8636
Epoch [3/100], Test Accuracy: 2.42%
Epoch [4/100], Training Loss: 7.8636
Epoch [4/100], Test Accuracy: 2.42%
Epoch [5/100], Training Loss: 7.8650
Epoch [5/100], Test Accuracy: 2.42%
Epoch [6/100], Training Loss: 7.8650
Epoch [6/100], Test Accuracy: 2.42%
Epoch [7/100], Training Loss: 7.8650
Epoch [7/100], Test Accuracy: 2.42%
Epoch [8/100], Training Loss: 7.8650
Epoch [8/100], Test Accuracy: 2.42%
Epoch [9/100], Training Loss: 7.8650
Epoch [9/100], Test Accuracy: 2.42%
Epoch [10/100], Training Loss: 7.8636
Epoch [10/100], Test Accuracy: 2.42%
Epoch [11/100], Training Loss: 7.8650
Epoch [11/100], Test Accuracy: 2.42%
Epoch [12/100], Training Loss: 7.8636
Epoch [12/100], Test Accuracy: 2.42%
Epoch [13/100], Training Loss: 7.8650
Epoch [13/100], Test Accuracy: 2.42%
Epoch [14/100], Training Loss: 7.8650
Epoch