In [3]:
# Reading the uploaded FASTA file and modifying the headers to only retain the second string

input_file_path = 'data/fungi_ITS_sorted90.fasta'
output_file_path = 'data/fungi_ITS_cleaned.fasta'

# Open the input file and create the output file
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        if line.startswith('>'):
            # Splitting the line and keeping only the second string
            header_parts = line.strip().split()
            if len(header_parts) > 1:
                modified_header = '>' + header_parts[1] + '\n'
                outfile.write(modified_header)
            else:
                outfile.write(line)
        else:
            # Write sequence lines as they are
            outfile.write(line)

output_file_path


'data/fungi_ITS_cleaned.fasta'

# vectorization

In [11]:
import itertools

def generate_all_kmers(k):
    return [''.join(p) for p in itertools.product('ACGT', repeat=k)]

def kmer_vector(sequence, k):
    # gen all possible k-mers comb
    all_kmers = generate_all_kmers(k)
    vector = [0] * len(all_kmers)
    kmer_to_index = {kmer: idx for idx, kmer in enumerate(all_kmers)}
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i+k]
        if kmer in kmer_to_index:
            vector[kmer_to_index[kmer]] += 1
    return vector

input_file_path = 'data/fungi_ITS_cleaned.fasta'
output_file_path = 'data/fungi_ITS_kmer_vector.txt'

############################# k
k = 4


with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    current_sequence = ""
    current_header = ""
    
    for line in infile:
        if line.startswith('>'):
            if current_sequence:
                vector = kmer_vector(current_sequence, k)
                vector_str = ' '.join(map(str, vector))
                outfile.write(f"{current_header}\t{vector_str}\n")
            current_header = line.strip()
            current_sequence = ""
        else:
            current_sequence += line.strip()
    
    if current_sequence:
        vector = kmer_vector(current_sequence, k)
        vector_str = ' '.join(map(str, vector))
        outfile.write(f"{current_header}\t{vector_str}\n")

output_file_path


'data/fungi_ITS_kmer_vector.txt'

In [10]:
# File paths
input_file_path = 'data/fungi_ITS_kmer_vector.txt'
output_file_path = 'data/fungi_ITS_kmer_vector_with_header.txt'

# k value to consider
k = 6

# Generate all possible k-mers for the header row
all_kmers = generate_all_kmers(k)
header_row = "ID\t" + "\t".join(all_kmers) + "\n"

# Open the input file and create the output file with the header row
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    # Write the header row to the output file
    outfile.write(header_row)
    # Write the rest of the content from the input file
    for line in infile:
        outfile.write(line)

output_file_path


'data/fungi_ITS_kmer_vector_with_header.txt'

# train test split

In [21]:
import random

input_file_path = 'data/fungi_ITS_kmer_vector.txt'
train_file_path = 'data/fungi_ITS_train.txt'
test_file_path = 'data/fungi_ITS_test.txt'

data_by_class = {}

with open(input_file_path, 'r') as infile:
    for line in infile:
        line = line.strip()
        if line.startswith('>'):
            parts = line.split('\t')
            if len(parts) == 2:
                label = parts[0][1:]  # remove '>'
                vector_str = parts[1]
                
                if label not in data_by_class:
                    data_by_class[label] = []
                data_by_class[label].append(line)

test_data = []
train_data = []

for label, samples in data_by_class.items():
    if len(samples) > 1:
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        train_data.extend([sample for sample in samples if sample != test_sample])
    else:
        train_data.extend(samples)

with open(train_file_path, 'w') as train_file:
    for line in train_data:
        train_file.write(line + '\n')

with open(test_file_path, 'w') as test_file:
    for line in test_data:
        test_file.write(line + '\n')


train_file_path, test_file_path


('data/fungi_ITS_train.txt', 'data/fungi_ITS_test.txt')

# start

In [20]:
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

input_file_path = 'data/fungi_ITS_kmer_vector.txt'

labels = []
data_vectors = []

with open(input_file_path, 'r') as infile:
    for line in infile:
        line = line.strip()
        if line.startswith('>'):
            parts = line.split('\t')
            if len(parts) == 2:
                label = parts[0][1:]  # remove >
                vector_str = parts[1]
                vector = list(map(int, vector_str.split()))
                labels.append(label)
                data_vectors.append(vector)


label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
data_tensor = torch.tensor(data_vectors, dtype=torch.float32)
labels_tensor = torch.tensor(encoded_labels, dtype=torch.long)

class KmerDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


kmer_dataset = KmerDataset(data_tensor, labels_tensor)
dataloader = DataLoader(kmer_dataset, batch_size=32, shuffle=True)

print(len(kmer_dataset))
print(set(labels))

for batch_idx, (data, labels) in enumerate(dataloader):
    print(f"Batch {batch_idx + 1}:")
    print("Data:", data)
    print("Labels:", labels)
    print()


6205
{'Laccaria', 'Didymocyrtis', 'Intubia', 'Circinaria', 'Marsupiomyces', 'Penicillago', 'Megasporoporia', 'Singerocybe', 'Corynesporopsis', 'Exserohilum', 'Neocrinula', 'Setophoma', 'Melanconium', 'Verrucocladosporium', 'Itersonilia', 'Remototrachyna', 'Iodosphaeria', 'Calcarisporiella', 'Xeromyces', 'Harringtonia', 'Capnobotryella', 'Gambiomyces', 'Brunneomyces', 'Chapsa', 'Pseudoproboscispora', 'Chytridium', 'Sampaiozyma', 'Ovicillium', 'Cystolepiota', 'Psoroma', 'Sphaeropezia', 'Camporesiomyces', 'Helicoarthrosporum', 'Bryoclavula', 'Helminthosphaeria', 'Arachnomyces', 'Lopharia', 'Squamanita', 'Verruciconidia', 'Geoscypha', 'Phaeoclavulina', 'Auxarthron', 'Bipolaris', 'Fuscosporella', 'Penicillifer', 'Sirenophila', 'Arezzomyces', 'Jobellisia', 'Charcotiana', 'Ramomarthamyces', 'Venturia', 'Paradictyoarthrinium', 'Trichoderma', 'Candelariella', 'Bulleribasidium', 'Periconia', 'Tetraploa', 'Byssoascus', 'Amyloflagellula', 'Leveillula', 'Pleohelicoon', 'Induratia', 'Cora', 'Eucasph