# Comparison between different k-mer sizes

# k = 3, run 10 times, 0.8691

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
from itertools import product
from collections import defaultdict

# Constants
k = 3  # k-mer length; generate_kmers(k) yields 4**k k-mers, one vector slot each
input_fasta = 'data1/fungi_ITS_sorted90.fasta'  # raw FASTA, read by Step 1
cleaned_fasta = 'data1/fungi_ITS_cleaned.fasta'  # written by Step 1 (simplified headers), read by Step 2
kmer_vector_file = 'data1/fungi_ITS_kmer_vector.txt'  # written by Step 2: one "<header>\t<counts>" line per sequence
filtered_vector_file = 'data1/fungi_ITS_kmer_vector_filtered.txt'  # written by Step 3, read by the experiment loop
num_epochs = 100  # training epochs per run
num_runs = 10  # number of independent train/evaluate runs whose best accuracies are averaged

# Helper functions
def generate_kmers(k):
    """Enumerate every DNA k-mer of length ``k``.

    The strings are built over the alphabet ``ACGT`` and come back in the
    order ``itertools.product`` produces them (last position varies
    fastest), giving ``4 ** k`` entries.
    """
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_vector(seq, k, kmer_index):
    """Count how often each indexed k-mer occurs in ``seq``.

    Args:
        seq: Sequence string to scan with a sliding window of width ``k``.
        k: Window (k-mer) length.
        kmer_index: Mapping from k-mer string to its position in the result.

    Returns:
        A list of ``len(kmer_index)`` integer counts. Windows whose text is
        not a key of ``kmer_index`` are ignored.
    """
    counts = [0] * len(kmer_index)
    for start in range(len(seq) - k + 1):
        slot = kmer_index.get(seq[start:start + k])
        if slot is not None:
            counts[slot] += 1
    return counts

class KmerDataset(Dataset):
    """Map-style dataset pairing k-mer count vectors with class labels.

    ``data`` and ``labels`` are stored as given and must be indexable in
    parallel; tensors are created on access, one item at a time.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of items is defined by the feature collection.
        return len(self.data)

    def __getitem__(self, idx):
        # Features become a float tensor, the label a long (int64) tensor.
        features = torch.tensor(self.data[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return (features, target)

class CNNModel(nn.Module):
    """1-D convolutional classifier over a k-mer count vector.

    Three Conv1d -> BatchNorm1d -> LeakyReLU -> MaxPool1d(2) stages
    (32, 64, 128 channels, kernel size 7, 'same' padding) feed a
    three-layer fully connected head that outputs ``num_classes`` logits.

    Args:
        input_dim: Length of the input vector.
        num_classes: Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            layers.extend([
                nn.Conv1d(in_channels, out_channels, 7, padding='same'),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(),
                nn.MaxPool1d(2),
            ])
            in_channels = out_channels

        # Three poolings by 2 shrink the length to input_dim // 8, with
        # 128 channels left after the last convolution stage.
        flattened_size = (input_dim // 8) * 128
        layers.extend([
            nn.Flatten(),
            nn.Linear(flattened_size, 1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes),
        ])
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape (batch_size, input_dim); insert the
        # single input channel that Conv1d requires.
        with_channel = x.unsqueeze(1)
        return self.model(with_channel)

# Step 1: Clean FASTA headers
# A header with more than one whitespace-separated field is replaced by '>'
# plus its second field; single-field headers and all sequence lines are
# copied through unchanged.
with open(input_fasta, 'r') as infile, open(cleaned_fasta, 'w') as outfile:
    for line in infile:
        cleaned = line
        if line.startswith('>'):
            parts = line.strip().split()
            if len(parts) > 1:
                cleaned = f'>{parts[1]}\n'
        outfile.write(cleaned)

# Step 2: k-mer vectorization
# Give every possible k-mer a fixed position in the count vector.
kmers = generate_kmers(k)
kmer_index = dict(zip(kmers, range(len(kmers))))

# Stream the cleaned FASTA and emit one "<header>\t<counts>" line per record.
# Records without any sequence text are skipped.
with open(cleaned_fasta, 'r') as infile, open(kmer_vector_file, 'w') as outfile:
    header, chunks = '', []
    for line in infile:
        if not line.startswith('>'):
            # Sequence line: buffer it until the record ends.
            chunks.append(line.strip())
            continue
        # A new header closes the previous record, if it had any sequence.
        current_sequence = ''.join(chunks)
        if current_sequence:
            vector = kmer_vector(current_sequence, k, kmer_index)
            outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")
        header, chunks = line.strip(), []
    # The last record has no following header to trigger its write.
    current_sequence = ''.join(chunks)
    if current_sequence:
        vector = kmer_vector(current_sequence, k, kmer_index)
        outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")

# Step 3: Filter sequences by genus frequency
# The genus label is the first whitespace-separated field of each vector
# line with its leading '>' removed.
# Pass 1: tally how many vector lines each genus has.
genera_count = defaultdict(int)
with open(kmer_vector_file, 'r') as f:
    for record in f:
        genera_count[record.split()[0][1:]] += 1

# Pass 2: keep only the lines whose genus occurs at least 10 times.
with open(kmer_vector_file, 'r') as infile, open(filtered_vector_file, 'w') as outfile:
    outfile.writelines(
        record for record in infile
        if genera_count[record.split()[0][1:]] >= 10
    )

# Main Experiment Loop
# Runs `num_runs` independent experiments. Each run holds out one randomly
# chosen record per class as the test set, trains a fresh CNNModel on the
# remaining records for `num_epochs` epochs, and keeps the highest test
# accuracy observed after any epoch.
# NOTE(review): the per-run score is the maximum over epochs of the *test*
# accuracy, so the reported average is an optimistic estimate.
best_accuracies = []

# The filtered vector file and the device are identical for every run, so
# read/group the records and pick the device once instead of once per run.
data_by_class = defaultdict(list)
with open(filtered_vector_file, 'r') as f:
    for line in f:
        label = line.split('\t')[0][1:]
        data_by_class[label].append(line.strip())

if not data_by_class:
    raise ValueError(f'{filtered_vector_file} contains no records; nothing to train on')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for run in range(num_runs):
    # Hold out one random record per class; everything else is training data.
    train_data, test_data = [], []
    for samples in data_by_class.values():
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        # NOTE(review): exclusion is by value, so records identical to the
        # held-out line are dropped from training as well.
        train_data.extend(s for s in samples if s != test_sample)

    # Each record is "<header>\t<space-separated counts>"; split it once and
    # derive both the integer vector and the label (header without '>').
    train_fields = [record.split('\t') for record in train_data]
    train_vectors = [list(map(int, fields[1].split())) for fields in train_fields]
    train_labels = [fields[0][1:] for fields in train_fields]

    test_fields = [record.split('\t') for record in test_data]
    test_vectors = [list(map(int, fields[1].split())) for fields in test_fields]
    test_labels = [fields[0][1:] for fields in test_fields]

    # The encoder is fitted on the training labels only, so every test label
    # must also occur in the training set or transform() will raise.
    label_encoder = LabelEncoder().fit(train_labels)
    train_labels_encoded = label_encoder.transform(train_labels)
    test_labels_encoded = label_encoder.transform(test_labels)

    train_loader = DataLoader(KmerDataset(train_vectors, train_labels_encoded), batch_size=32, shuffle=True)
    test_loader = DataLoader(KmerDataset(test_vectors, test_labels_encoded), batch_size=32)

    # Fresh model and optimizer for every run.
    model = CNNModel(len(train_vectors[0]), len(label_encoder.classes_)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    max_accuracy = 0
    for epoch in range(num_epochs):
        # One pass over the training data.
        model.train()
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out records after every epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f"Run {run+1}, Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

        max_accuracy = max(max_accuracy, accuracy)

    best_accuracies.append(max_accuracy)

avg_accuracy = sum(best_accuracies) / len(best_accuracies)
print(f'Average Highest Accuracy over {num_runs} runs: {avg_accuracy:.4f}')


Run 1, Epoch 1: Accuracy = 0.1852
Run 1, Epoch 2: Accuracy = 0.3951
Run 1, Epoch 3: Accuracy = 0.6173
Run 1, Epoch 4: Accuracy = 0.6790
Run 1, Epoch 5: Accuracy = 0.7284
Run 1, Epoch 6: Accuracy = 0.6420
Run 1, Epoch 7: Accuracy = 0.7284
Run 1, Epoch 8: Accuracy = 0.7407
Run 1, Epoch 9: Accuracy = 0.7531
Run 1, Epoch 10: Accuracy = 0.8148
Run 1, Epoch 11: Accuracy = 0.7778
Run 1, Epoch 12: Accuracy = 0.7901
Run 1, Epoch 13: Accuracy = 0.8395
Run 1, Epoch 14: Accuracy = 0.8642
Run 1, Epoch 15: Accuracy = 0.7407
Run 1, Epoch 16: Accuracy = 0.7901
Run 1, Epoch 17: Accuracy = 0.7531
Run 1, Epoch 18: Accuracy = 0.7654
Run 1, Epoch 19: Accuracy = 0.7654
Run 1, Epoch 20: Accuracy = 0.7531
Run 1, Epoch 21: Accuracy = 0.7778
Run 1, Epoch 22: Accuracy = 0.7778
Run 1, Epoch 23: Accuracy = 0.8519
Run 1, Epoch 24: Accuracy = 0.8642
Run 1, Epoch 25: Accuracy = 0.8025
Run 1, Epoch 26: Accuracy = 0.8272
Run 1, Epoch 27: Accuracy = 0.8519
Run 1, Epoch 28: Accuracy = 0.8642
Run 1, Epoch 29: Accuracy = 0

# k = 5, run 10 times, 0.942

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
from itertools import product
from collections import defaultdict

# Constants
k = 5  # k-mer length; generate_kmers(k) yields 4**k k-mers, one vector slot each
input_fasta = 'data1/fungi_ITS_sorted90.fasta'  # raw FASTA, read by Step 1
cleaned_fasta = 'data1/fungi_ITS_cleaned.fasta'  # written by Step 1 (simplified headers), read by Step 2
kmer_vector_file = 'data1/fungi_ITS_kmer_vector.txt'  # written by Step 2: one "<header>\t<counts>" line per sequence
filtered_vector_file = 'data1/fungi_ITS_kmer_vector_filtered.txt'  # written by Step 3, read by the experiment loop
num_epochs = 100  # training epochs per run
num_runs = 10  # number of independent train/evaluate runs whose best accuracies are averaged

# Helper functions
def generate_kmers(k):
    """Enumerate every DNA k-mer of length ``k``.

    The strings are built over the alphabet ``ACGT`` and come back in the
    order ``itertools.product`` produces them (last position varies
    fastest), giving ``4 ** k`` entries.
    """
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_vector(seq, k, kmer_index):
    """Count how often each indexed k-mer occurs in ``seq``.

    Args:
        seq: Sequence string to scan with a sliding window of width ``k``.
        k: Window (k-mer) length.
        kmer_index: Mapping from k-mer string to its position in the result.

    Returns:
        A list of ``len(kmer_index)`` integer counts. Windows whose text is
        not a key of ``kmer_index`` are ignored.
    """
    counts = [0] * len(kmer_index)
    for start in range(len(seq) - k + 1):
        slot = kmer_index.get(seq[start:start + k])
        if slot is not None:
            counts[slot] += 1
    return counts

class KmerDataset(Dataset):
    """Map-style dataset pairing k-mer count vectors with class labels.

    ``data`` and ``labels`` are stored as given and must be indexable in
    parallel; tensors are created on access, one item at a time.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of items is defined by the feature collection.
        return len(self.data)

    def __getitem__(self, idx):
        # Features become a float tensor, the label a long (int64) tensor.
        features = torch.tensor(self.data[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return (features, target)

class CNNModel(nn.Module):
    """1-D convolutional classifier over a k-mer count vector.

    Three Conv1d -> BatchNorm1d -> LeakyReLU -> MaxPool1d(2) stages
    (32, 64, 128 channels, kernel size 7, 'same' padding) feed a
    three-layer fully connected head that outputs ``num_classes`` logits.

    Args:
        input_dim: Length of the input vector.
        num_classes: Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            layers.extend([
                nn.Conv1d(in_channels, out_channels, 7, padding='same'),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(),
                nn.MaxPool1d(2),
            ])
            in_channels = out_channels

        # Three poolings by 2 shrink the length to input_dim // 8, with
        # 128 channels left after the last convolution stage.
        flattened_size = (input_dim // 8) * 128
        layers.extend([
            nn.Flatten(),
            nn.Linear(flattened_size, 1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes),
        ])
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape (batch_size, input_dim); insert the
        # single input channel that Conv1d requires.
        with_channel = x.unsqueeze(1)
        return self.model(with_channel)

# Step 1: Clean FASTA headers
# A header with more than one whitespace-separated field is replaced by '>'
# plus its second field; single-field headers and all sequence lines are
# copied through unchanged.
with open(input_fasta, 'r') as infile, open(cleaned_fasta, 'w') as outfile:
    for line in infile:
        cleaned = line
        if line.startswith('>'):
            parts = line.strip().split()
            if len(parts) > 1:
                cleaned = f'>{parts[1]}\n'
        outfile.write(cleaned)

# Step 2: k-mer vectorization
# Give every possible k-mer a fixed position in the count vector.
kmers = generate_kmers(k)
kmer_index = dict(zip(kmers, range(len(kmers))))

# Stream the cleaned FASTA and emit one "<header>\t<counts>" line per record.
# Records without any sequence text are skipped.
with open(cleaned_fasta, 'r') as infile, open(kmer_vector_file, 'w') as outfile:
    header, chunks = '', []
    for line in infile:
        if not line.startswith('>'):
            # Sequence line: buffer it until the record ends.
            chunks.append(line.strip())
            continue
        # A new header closes the previous record, if it had any sequence.
        current_sequence = ''.join(chunks)
        if current_sequence:
            vector = kmer_vector(current_sequence, k, kmer_index)
            outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")
        header, chunks = line.strip(), []
    # The last record has no following header to trigger its write.
    current_sequence = ''.join(chunks)
    if current_sequence:
        vector = kmer_vector(current_sequence, k, kmer_index)
        outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")

# Step 3: Filter sequences by genus frequency
# The genus label is the first whitespace-separated field of each vector
# line with its leading '>' removed.
# Pass 1: tally how many vector lines each genus has.
genera_count = defaultdict(int)
with open(kmer_vector_file, 'r') as f:
    for record in f:
        genera_count[record.split()[0][1:]] += 1

# Pass 2: keep only the lines whose genus occurs at least 10 times.
with open(kmer_vector_file, 'r') as infile, open(filtered_vector_file, 'w') as outfile:
    outfile.writelines(
        record for record in infile
        if genera_count[record.split()[0][1:]] >= 10
    )

# Main Experiment Loop
# Runs `num_runs` independent experiments. Each run holds out one randomly
# chosen record per class as the test set, trains a fresh CNNModel on the
# remaining records for `num_epochs` epochs, and keeps the highest test
# accuracy observed after any epoch.
# NOTE(review): the per-run score is the maximum over epochs of the *test*
# accuracy, so the reported average is an optimistic estimate.
best_accuracies = []

# The filtered vector file and the device are identical for every run, so
# read/group the records and pick the device once instead of once per run.
data_by_class = defaultdict(list)
with open(filtered_vector_file, 'r') as f:
    for line in f:
        label = line.split('\t')[0][1:]
        data_by_class[label].append(line.strip())

if not data_by_class:
    raise ValueError(f'{filtered_vector_file} contains no records; nothing to train on')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for run in range(num_runs):
    # Hold out one random record per class; everything else is training data.
    train_data, test_data = [], []
    for samples in data_by_class.values():
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        # NOTE(review): exclusion is by value, so records identical to the
        # held-out line are dropped from training as well.
        train_data.extend(s for s in samples if s != test_sample)

    # Each record is "<header>\t<space-separated counts>"; split it once and
    # derive both the integer vector and the label (header without '>').
    train_fields = [record.split('\t') for record in train_data]
    train_vectors = [list(map(int, fields[1].split())) for fields in train_fields]
    train_labels = [fields[0][1:] for fields in train_fields]

    test_fields = [record.split('\t') for record in test_data]
    test_vectors = [list(map(int, fields[1].split())) for fields in test_fields]
    test_labels = [fields[0][1:] for fields in test_fields]

    # The encoder is fitted on the training labels only, so every test label
    # must also occur in the training set or transform() will raise.
    label_encoder = LabelEncoder().fit(train_labels)
    train_labels_encoded = label_encoder.transform(train_labels)
    test_labels_encoded = label_encoder.transform(test_labels)

    train_loader = DataLoader(KmerDataset(train_vectors, train_labels_encoded), batch_size=32, shuffle=True)
    test_loader = DataLoader(KmerDataset(test_vectors, test_labels_encoded), batch_size=32)

    # Fresh model and optimizer for every run.
    model = CNNModel(len(train_vectors[0]), len(label_encoder.classes_)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    max_accuracy = 0
    for epoch in range(num_epochs):
        # One pass over the training data.
        model.train()
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out records after every epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f"Run {run+1}, Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

        max_accuracy = max(max_accuracy, accuracy)

    best_accuracies.append(max_accuracy)

avg_accuracy = sum(best_accuracies) / len(best_accuracies)
print(f'Average Highest Accuracy over {num_runs} runs: {avg_accuracy:.4f}')


Run 1, Epoch 1: Accuracy = 0.1605
Run 1, Epoch 2: Accuracy = 0.3704
Run 1, Epoch 3: Accuracy = 0.6667
Run 1, Epoch 4: Accuracy = 0.8025
Run 1, Epoch 5: Accuracy = 0.8148
Run 1, Epoch 6: Accuracy = 0.7901
Run 1, Epoch 7: Accuracy = 0.7901
Run 1, Epoch 8: Accuracy = 0.7778
Run 1, Epoch 9: Accuracy = 0.8395
Run 1, Epoch 10: Accuracy = 0.8148
Run 1, Epoch 11: Accuracy = 0.7778
Run 1, Epoch 12: Accuracy = 0.7531
Run 1, Epoch 13: Accuracy = 0.8642
Run 1, Epoch 14: Accuracy = 0.8272
Run 1, Epoch 15: Accuracy = 0.7901
Run 1, Epoch 16: Accuracy = 0.8642
Run 1, Epoch 17: Accuracy = 0.8519
Run 1, Epoch 18: Accuracy = 0.9012
Run 1, Epoch 19: Accuracy = 0.8025
Run 1, Epoch 20: Accuracy = 0.8765
Run 1, Epoch 21: Accuracy = 0.9259
Run 1, Epoch 22: Accuracy = 0.9630
Run 1, Epoch 23: Accuracy = 0.7778
Run 1, Epoch 24: Accuracy = 0.9136
Run 1, Epoch 25: Accuracy = 0.8148
Run 1, Epoch 26: Accuracy = 0.8765
Run 1, Epoch 27: Accuracy = 0.8519
Run 1, Epoch 28: Accuracy = 0.8272
Run 1, Epoch 29: Accuracy = 0

# k = 6, run 10 times, 0.9519

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
from itertools import product
from collections import defaultdict

# Constants
k = 6  # k-mer length; generate_kmers(k) yields 4**k k-mers, one vector slot each
input_fasta = 'data1/fungi_ITS_sorted90.fasta'  # raw FASTA, read by Step 1
cleaned_fasta = 'data1/fungi_ITS_cleaned.fasta'  # written by Step 1 (simplified headers), read by Step 2
kmer_vector_file = 'data1/fungi_ITS_kmer_vector.txt'  # written by Step 2: one "<header>\t<counts>" line per sequence
filtered_vector_file = 'data1/fungi_ITS_kmer_vector_filtered.txt'  # written by Step 3, read by the experiment loop
num_epochs = 100  # training epochs per run
num_runs = 10  # number of independent train/evaluate runs whose best accuracies are averaged

# Helper functions
def generate_kmers(k):
    """Enumerate every DNA k-mer of length ``k``.

    The strings are built over the alphabet ``ACGT`` and come back in the
    order ``itertools.product`` produces them (last position varies
    fastest), giving ``4 ** k`` entries.
    """
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_vector(seq, k, kmer_index):
    """Count how often each indexed k-mer occurs in ``seq``.

    Args:
        seq: Sequence string to scan with a sliding window of width ``k``.
        k: Window (k-mer) length.
        kmer_index: Mapping from k-mer string to its position in the result.

    Returns:
        A list of ``len(kmer_index)`` integer counts. Windows whose text is
        not a key of ``kmer_index`` are ignored.
    """
    counts = [0] * len(kmer_index)
    for start in range(len(seq) - k + 1):
        slot = kmer_index.get(seq[start:start + k])
        if slot is not None:
            counts[slot] += 1
    return counts

class KmerDataset(Dataset):
    """Map-style dataset pairing k-mer count vectors with class labels.

    ``data`` and ``labels`` are stored as given and must be indexable in
    parallel; tensors are created on access, one item at a time.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of items is defined by the feature collection.
        return len(self.data)

    def __getitem__(self, idx):
        # Features become a float tensor, the label a long (int64) tensor.
        features = torch.tensor(self.data[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return (features, target)

class CNNModel(nn.Module):
    """1-D convolutional classifier over a k-mer count vector.

    Three Conv1d -> BatchNorm1d -> LeakyReLU -> MaxPool1d(2) stages
    (32, 64, 128 channels, kernel size 7, 'same' padding) feed a
    three-layer fully connected head that outputs ``num_classes`` logits.

    Args:
        input_dim: Length of the input vector.
        num_classes: Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            layers.extend([
                nn.Conv1d(in_channels, out_channels, 7, padding='same'),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(),
                nn.MaxPool1d(2),
            ])
            in_channels = out_channels

        # Three poolings by 2 shrink the length to input_dim // 8, with
        # 128 channels left after the last convolution stage.
        flattened_size = (input_dim // 8) * 128
        layers.extend([
            nn.Flatten(),
            nn.Linear(flattened_size, 1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes),
        ])
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape (batch_size, input_dim); insert the
        # single input channel that Conv1d requires.
        with_channel = x.unsqueeze(1)
        return self.model(with_channel)

# Step 1: Clean FASTA headers
# A header with more than one whitespace-separated field is replaced by '>'
# plus its second field; single-field headers and all sequence lines are
# copied through unchanged.
with open(input_fasta, 'r') as infile, open(cleaned_fasta, 'w') as outfile:
    for line in infile:
        cleaned = line
        if line.startswith('>'):
            parts = line.strip().split()
            if len(parts) > 1:
                cleaned = f'>{parts[1]}\n'
        outfile.write(cleaned)

# Step 2: k-mer vectorization
# Give every possible k-mer a fixed position in the count vector.
kmers = generate_kmers(k)
kmer_index = dict(zip(kmers, range(len(kmers))))

# Stream the cleaned FASTA and emit one "<header>\t<counts>" line per record.
# Records without any sequence text are skipped.
with open(cleaned_fasta, 'r') as infile, open(kmer_vector_file, 'w') as outfile:
    header, chunks = '', []
    for line in infile:
        if not line.startswith('>'):
            # Sequence line: buffer it until the record ends.
            chunks.append(line.strip())
            continue
        # A new header closes the previous record, if it had any sequence.
        current_sequence = ''.join(chunks)
        if current_sequence:
            vector = kmer_vector(current_sequence, k, kmer_index)
            outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")
        header, chunks = line.strip(), []
    # The last record has no following header to trigger its write.
    current_sequence = ''.join(chunks)
    if current_sequence:
        vector = kmer_vector(current_sequence, k, kmer_index)
        outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")

# Step 3: Filter sequences by genus frequency
# The genus label is the first whitespace-separated field of each vector
# line with its leading '>' removed.
# Pass 1: tally how many vector lines each genus has.
genera_count = defaultdict(int)
with open(kmer_vector_file, 'r') as f:
    for record in f:
        genera_count[record.split()[0][1:]] += 1

# Pass 2: keep only the lines whose genus occurs at least 10 times.
with open(kmer_vector_file, 'r') as infile, open(filtered_vector_file, 'w') as outfile:
    outfile.writelines(
        record for record in infile
        if genera_count[record.split()[0][1:]] >= 10
    )

# Main Experiment Loop
# Runs `num_runs` independent experiments. Each run holds out one randomly
# chosen record per class as the test set, trains a fresh CNNModel on the
# remaining records for `num_epochs` epochs, and keeps the highest test
# accuracy observed after any epoch.
# NOTE(review): the per-run score is the maximum over epochs of the *test*
# accuracy, so the reported average is an optimistic estimate.
best_accuracies = []

# The filtered vector file and the device are identical for every run, so
# read/group the records and pick the device once instead of once per run.
data_by_class = defaultdict(list)
with open(filtered_vector_file, 'r') as f:
    for line in f:
        label = line.split('\t')[0][1:]
        data_by_class[label].append(line.strip())

if not data_by_class:
    raise ValueError(f'{filtered_vector_file} contains no records; nothing to train on')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for run in range(num_runs):
    # Hold out one random record per class; everything else is training data.
    train_data, test_data = [], []
    for samples in data_by_class.values():
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        # NOTE(review): exclusion is by value, so records identical to the
        # held-out line are dropped from training as well.
        train_data.extend(s for s in samples if s != test_sample)

    # Each record is "<header>\t<space-separated counts>"; split it once and
    # derive both the integer vector and the label (header without '>').
    train_fields = [record.split('\t') for record in train_data]
    train_vectors = [list(map(int, fields[1].split())) for fields in train_fields]
    train_labels = [fields[0][1:] for fields in train_fields]

    test_fields = [record.split('\t') for record in test_data]
    test_vectors = [list(map(int, fields[1].split())) for fields in test_fields]
    test_labels = [fields[0][1:] for fields in test_fields]

    # The encoder is fitted on the training labels only, so every test label
    # must also occur in the training set or transform() will raise.
    label_encoder = LabelEncoder().fit(train_labels)
    train_labels_encoded = label_encoder.transform(train_labels)
    test_labels_encoded = label_encoder.transform(test_labels)

    train_loader = DataLoader(KmerDataset(train_vectors, train_labels_encoded), batch_size=32, shuffle=True)
    test_loader = DataLoader(KmerDataset(test_vectors, test_labels_encoded), batch_size=32)

    # Fresh model and optimizer for every run.
    model = CNNModel(len(train_vectors[0]), len(label_encoder.classes_)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    max_accuracy = 0
    for epoch in range(num_epochs):
        # One pass over the training data.
        model.train()
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out records after every epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f"Run {run+1}, Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

        max_accuracy = max(max_accuracy, accuracy)

    best_accuracies.append(max_accuracy)

avg_accuracy = sum(best_accuracies) / len(best_accuracies)
print(f'Average Highest Accuracy over {num_runs} runs: {avg_accuracy:.4f}')


Run 1, Epoch 1: Accuracy = 0.1605
Run 1, Epoch 2: Accuracy = 0.4444
Run 1, Epoch 3: Accuracy = 0.6790
Run 1, Epoch 4: Accuracy = 0.8272
Run 1, Epoch 5: Accuracy = 0.7778
Run 1, Epoch 6: Accuracy = 0.8765
Run 1, Epoch 7: Accuracy = 0.8642
Run 1, Epoch 8: Accuracy = 0.8272
Run 1, Epoch 9: Accuracy = 0.8395
Run 1, Epoch 10: Accuracy = 0.8765
Run 1, Epoch 11: Accuracy = 0.8395
Run 1, Epoch 12: Accuracy = 0.8272
Run 1, Epoch 13: Accuracy = 0.8272
Run 1, Epoch 14: Accuracy = 0.8025
Run 1, Epoch 15: Accuracy = 0.8025
Run 1, Epoch 16: Accuracy = 0.8272
Run 1, Epoch 17: Accuracy = 0.9383
Run 1, Epoch 18: Accuracy = 0.8272
Run 1, Epoch 19: Accuracy = 0.8519
Run 1, Epoch 20: Accuracy = 0.8272
Run 1, Epoch 21: Accuracy = 0.8765
Run 1, Epoch 22: Accuracy = 0.8519
Run 1, Epoch 23: Accuracy = 0.8519
Run 1, Epoch 24: Accuracy = 0.8395
Run 1, Epoch 25: Accuracy = 0.9012
Run 1, Epoch 26: Accuracy = 0.8395
Run 1, Epoch 27: Accuracy = 0.8025
Run 1, Epoch 28: Accuracy = 0.8395
Run 1, Epoch 29: Accuracy = 0

# k = 3, filter threshold 5, run 10 times, 0.668

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
from itertools import product
from collections import defaultdict

# Constants
k = 3  # k-mer length; generate_kmers(k) yields 4**k k-mers, one vector slot each
input_fasta = 'data1/fungi_ITS_sorted90.fasta'  # raw FASTA, read by Step 1
cleaned_fasta = 'data1/fungi_ITS_cleaned.fasta'  # written by Step 1 (simplified headers), read by Step 2
kmer_vector_file = 'data1/fungi_ITS_kmer_vector.txt'  # written by Step 2: one "<header>\t<counts>" line per sequence
filtered_vector_file = 'data1/fungi_ITS_kmer_vector_filtered.txt'  # written by Step 3, read by the experiment loop
num_epochs = 100  # training epochs per run
num_runs = 10  # number of independent train/evaluate runs whose best accuracies are averaged

# Helper functions
def generate_kmers(k):
    """Enumerate every DNA k-mer of length ``k``.

    The strings are built over the alphabet ``ACGT`` and come back in the
    order ``itertools.product`` produces them (last position varies
    fastest), giving ``4 ** k`` entries.
    """
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_vector(seq, k, kmer_index):
    """Count how often each indexed k-mer occurs in ``seq``.

    Args:
        seq: Sequence string to scan with a sliding window of width ``k``.
        k: Window (k-mer) length.
        kmer_index: Mapping from k-mer string to its position in the result.

    Returns:
        A list of ``len(kmer_index)`` integer counts. Windows whose text is
        not a key of ``kmer_index`` are ignored.
    """
    counts = [0] * len(kmer_index)
    for start in range(len(seq) - k + 1):
        slot = kmer_index.get(seq[start:start + k])
        if slot is not None:
            counts[slot] += 1
    return counts

class KmerDataset(Dataset):
    """Map-style dataset pairing k-mer count vectors with class labels.

    ``data`` and ``labels`` are stored as given and must be indexable in
    parallel; tensors are created on access, one item at a time.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of items is defined by the feature collection.
        return len(self.data)

    def __getitem__(self, idx):
        # Features become a float tensor, the label a long (int64) tensor.
        features = torch.tensor(self.data[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return (features, target)

class CNNModel(nn.Module):
    """1-D convolutional classifier over a k-mer count vector.

    Three Conv1d -> BatchNorm1d -> LeakyReLU -> MaxPool1d(2) stages
    (32, 64, 128 channels, kernel size 7, 'same' padding) feed a
    three-layer fully connected head that outputs ``num_classes`` logits.

    Args:
        input_dim: Length of the input vector.
        num_classes: Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            layers.extend([
                nn.Conv1d(in_channels, out_channels, 7, padding='same'),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(),
                nn.MaxPool1d(2),
            ])
            in_channels = out_channels

        # Three poolings by 2 shrink the length to input_dim // 8, with
        # 128 channels left after the last convolution stage.
        flattened_size = (input_dim // 8) * 128
        layers.extend([
            nn.Flatten(),
            nn.Linear(flattened_size, 1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes),
        ])
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape (batch_size, input_dim); insert the
        # single input channel that Conv1d requires.
        with_channel = x.unsqueeze(1)
        return self.model(with_channel)

# Step 1: Clean FASTA headers
# A header with more than one whitespace-separated field is replaced by '>'
# plus its second field; single-field headers and all sequence lines are
# copied through unchanged.
with open(input_fasta, 'r') as infile, open(cleaned_fasta, 'w') as outfile:
    for line in infile:
        cleaned = line
        if line.startswith('>'):
            parts = line.strip().split()
            if len(parts) > 1:
                cleaned = f'>{parts[1]}\n'
        outfile.write(cleaned)

# Step 2: k-mer vectorization
# Give every possible k-mer a fixed position in the count vector.
kmers = generate_kmers(k)
kmer_index = dict(zip(kmers, range(len(kmers))))

# Stream the cleaned FASTA and emit one "<header>\t<counts>" line per record.
# Records without any sequence text are skipped.
with open(cleaned_fasta, 'r') as infile, open(kmer_vector_file, 'w') as outfile:
    header, chunks = '', []
    for line in infile:
        if not line.startswith('>'):
            # Sequence line: buffer it until the record ends.
            chunks.append(line.strip())
            continue
        # A new header closes the previous record, if it had any sequence.
        current_sequence = ''.join(chunks)
        if current_sequence:
            vector = kmer_vector(current_sequence, k, kmer_index)
            outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")
        header, chunks = line.strip(), []
    # The last record has no following header to trigger its write.
    current_sequence = ''.join(chunks)
    if current_sequence:
        vector = kmer_vector(current_sequence, k, kmer_index)
        outfile.write(f"{header}\t{' '.join(map(str, vector))}\n")

# Step 3: Filter sequences by genus frequency
# The genus label is the first whitespace-separated field of each vector
# line with its leading '>' removed.
# Pass 1: tally how many vector lines each genus has.
genera_count = defaultdict(int)
with open(kmer_vector_file, 'r') as f:
    for record in f:
        genera_count[record.split()[0][1:]] += 1

# Pass 2: keep only the lines whose genus occurs at least 5 times.
with open(kmer_vector_file, 'r') as infile, open(filtered_vector_file, 'w') as outfile:
    outfile.writelines(
        record for record in infile
        if genera_count[record.split()[0][1:]] >= 5
    )

# Main Experiment Loop
# Runs `num_runs` independent experiments. Each run holds out one randomly
# chosen record per class as the test set, trains a fresh CNNModel on the
# remaining records for `num_epochs` epochs, and keeps the highest test
# accuracy observed after any epoch.
# NOTE(review): the per-run score is the maximum over epochs of the *test*
# accuracy, so the reported average is an optimistic estimate.
best_accuracies = []

# The filtered vector file and the device are identical for every run, so
# read/group the records and pick the device once instead of once per run.
data_by_class = defaultdict(list)
with open(filtered_vector_file, 'r') as f:
    for line in f:
        label = line.split('\t')[0][1:]
        data_by_class[label].append(line.strip())

if not data_by_class:
    raise ValueError(f'{filtered_vector_file} contains no records; nothing to train on')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for run in range(num_runs):
    # Hold out one random record per class; everything else is training data.
    train_data, test_data = [], []
    for samples in data_by_class.values():
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        # NOTE(review): exclusion is by value, so records identical to the
        # held-out line are dropped from training as well.
        train_data.extend(s for s in samples if s != test_sample)

    # Each record is "<header>\t<space-separated counts>"; split it once and
    # derive both the integer vector and the label (header without '>').
    train_fields = [record.split('\t') for record in train_data]
    train_vectors = [list(map(int, fields[1].split())) for fields in train_fields]
    train_labels = [fields[0][1:] for fields in train_fields]

    test_fields = [record.split('\t') for record in test_data]
    test_vectors = [list(map(int, fields[1].split())) for fields in test_fields]
    test_labels = [fields[0][1:] for fields in test_fields]

    # The encoder is fitted on the training labels only, so every test label
    # must also occur in the training set or transform() will raise.
    label_encoder = LabelEncoder().fit(train_labels)
    train_labels_encoded = label_encoder.transform(train_labels)
    test_labels_encoded = label_encoder.transform(test_labels)

    train_loader = DataLoader(KmerDataset(train_vectors, train_labels_encoded), batch_size=32, shuffle=True)
    test_loader = DataLoader(KmerDataset(test_vectors, test_labels_encoded), batch_size=32)

    # Fresh model and optimizer for every run.
    model = CNNModel(len(train_vectors[0]), len(label_encoder.classes_)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    max_accuracy = 0
    for epoch in range(num_epochs):
        # One pass over the training data.
        model.train()
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out records after every epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f"Run {run+1}, Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

        max_accuracy = max(max_accuracy, accuracy)

    best_accuracies.append(max_accuracy)

avg_accuracy = sum(best_accuracies) / len(best_accuracies)
print(f'Average Highest Accuracy over {num_runs} runs: {avg_accuracy:.4f}')


Run 1, Epoch 1: Accuracy = 0.0607
Run 1, Epoch 2: Accuracy = 0.1660
Run 1, Epoch 3: Accuracy = 0.2632
Run 1, Epoch 4: Accuracy = 0.3360
Run 1, Epoch 5: Accuracy = 0.4291
Run 1, Epoch 6: Accuracy = 0.4696
Run 1, Epoch 7: Accuracy = 0.5182
Run 1, Epoch 8: Accuracy = 0.5061
Run 1, Epoch 9: Accuracy = 0.5344
Run 1, Epoch 10: Accuracy = 0.5628
Run 1, Epoch 11: Accuracy = 0.5506
Run 1, Epoch 12: Accuracy = 0.4939
Run 1, Epoch 13: Accuracy = 0.5668
Run 1, Epoch 14: Accuracy = 0.5547
Run 1, Epoch 15: Accuracy = 0.5506
Run 1, Epoch 16: Accuracy = 0.5223
Run 1, Epoch 17: Accuracy = 0.4899
Run 1, Epoch 18: Accuracy = 0.5668
Run 1, Epoch 19: Accuracy = 0.5668
Run 1, Epoch 20: Accuracy = 0.5668
Run 1, Epoch 21: Accuracy = 0.5304
Run 1, Epoch 22: Accuracy = 0.5951
Run 1, Epoch 23: Accuracy = 0.6154
Run 1, Epoch 24: Accuracy = 0.6154
Run 1, Epoch 25: Accuracy = 0.5668
Run 1, Epoch 26: Accuracy = 0.5223
Run 1, Epoch 27: Accuracy = 0.5587
Run 1, Epoch 28: Accuracy = 0.5142
Run 1, Epoch 29: Accuracy = 0

# k = 5, filter threshold 5, run 10 times, 0.7547

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
from itertools import product
from collections import defaultdict

# Constants
k = 5  # k-mer length; generate_kmers(k) yields 4**k k-mers, one vector slot each
input_fasta = 'data1/fungi_ITS_sorted90.fasta'  # raw FASTA, read by Step 1
cleaned_fasta = 'data1/fungi_ITS_cleaned.fasta'  # written by Step 1 (simplified headers), read by Step 2
kmer_vector_file = 'data1/fungi_ITS_kmer_vector.txt'  # written by Step 2: one "<header>\t<counts>" line per sequence
filtered_vector_file = 'data1/fungi_ITS_kmer_vector_filtered.txt'  # not used in the visible part of this cell; presumably the Step 3 output as in the earlier cells
num_epochs = 100  # presumably training epochs per run, as in the earlier cells
num_runs = 10  # presumably the number of repeated runs, as in the earlier cells

# Helper functions
def generate_kmers(k):
    """Enumerate every DNA k-mer of length ``k``.

    The strings are built over the alphabet ``ACGT`` and come back in the
    order ``itertools.product`` produces them (last position varies
    fastest), giving ``4 ** k`` entries.
    """
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_vector(seq, k, kmer_index):
    """Count how often each indexed k-mer occurs in ``seq``.

    Args:
        seq: Sequence string to scan with a sliding window of width ``k``.
        k: Window (k-mer) length.
        kmer_index: Mapping from k-mer string to its position in the result.

    Returns:
        A list of ``len(kmer_index)`` integer counts. Windows whose text is
        not a key of ``kmer_index`` are ignored.
    """
    counts = [0] * len(kmer_index)
    for start in range(len(seq) - k + 1):
        slot = kmer_index.get(seq[start:start + k])
        if slot is not None:
            counts[slot] += 1
    return counts

class KmerDataset(Dataset):
    """Map-style dataset pairing k-mer count vectors with class labels.

    ``data`` and ``labels`` are stored as given and must be indexable in
    parallel; tensors are created on access, one item at a time.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of items is defined by the feature collection.
        return len(self.data)

    def __getitem__(self, idx):
        # Features become a float tensor, the label a long (int64) tensor.
        features = torch.tensor(self.data[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return (features, target)

class CNNModel(nn.Module):
    """1-D convolutional classifier over a k-mer count vector.

    Three stages of Conv1d (kernel 7, 'same' padding) -> BatchNorm1d ->
    LeakyReLU -> MaxPool1d(2) with 32, 64 and 128 channels feed a fully
    connected head (1024 -> 256 -> num_classes) that outputs class logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            layers += [
                nn.Conv1d(in_channels, out_channels, 7, padding='same'),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(),
                nn.MaxPool1d(2),
            ]
            in_channels = out_channels

        # Three poolings by 2 shrink the length to input_dim // 8; the last
        # conv stage has 128 channels.
        flat_features = (input_dim // 8) * 128
        layers += [
            nn.Flatten(),
            nn.Linear(flat_features, 1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape (batch_size, input_dim); Conv1d needs an
        # explicit channel axis, so insert one at position 1.
        with_channel = x.unsqueeze(1)
        return self.model(with_channel)

# Step 1: Clean FASTA headers
# Copy the FASTA file, replacing every header that has at least two
# whitespace-separated tokens with '>' + its second token. Headers with a
# single token and all sequence lines are copied unchanged.
with open(input_fasta, 'r') as src, open(cleaned_fasta, 'w') as dst:
    for raw in src:
        if not raw.startswith('>'):
            dst.write(raw)
            continue
        tokens = raw.strip().split()
        if len(tokens) > 1:
            dst.write(f'>{tokens[1]}\n')
        else:
            dst.write(raw)

# Step 2: k-mer vectorization
# Turn every record of the cleaned FASTA into one output line of the form
# "<header>\t<space-separated k-mer counts>".
kmers = generate_kmers(k)
kmer_index = {kmer: position for position, kmer in enumerate(kmers)}

# First collect (header, sequence chunks) pairs. The leading entry with an
# empty header catches any sequence text that precedes the first '>' line.
records = [['', []]]
with open(cleaned_fasta, 'r') as infile:
    for line in infile:
        if line.startswith('>'):
            records.append([line.strip(), []])
        else:
            records[-1][1].append(line.strip())

# Then write one line per record; records without any sequence are skipped.
with open(kmer_vector_file, 'w') as outfile:
    for header, chunks in records:
        sequence = ''.join(chunks)
        if not sequence:
            continue
        counts = kmer_vector(sequence, k, kmer_index)
        outfile.write(header + '\t' + ' '.join(map(str, counts)) + '\n')

# Step 3: Filter sequences by genus frequency
# Pass 1: count how many vector lines carry each label (the first
# whitespace-separated field without its leading character).
genera_count = defaultdict(int)
with open(kmer_vector_file, 'r') as vectors:
    for record in vectors:
        genera_count[record.split()[0][1:]] += 1

# Pass 2: keep only the lines whose label was seen at least 5 times.
with open(kmer_vector_file, 'r') as infile, open(filtered_vector_file, 'w') as outfile:
    outfile.writelines(
        record for record in infile
        if genera_count[record.split()[0][1:]] >= 5
    )

# Main Experiment Loop
# For each run: hold out one randomly chosen sample per class as the test set,
# train a freshly initialised CNN on the remaining samples, and keep the best
# test accuracy observed over the epochs.
# NOTE(review): the per-run figure is the maximum over epochs of the accuracy
# measured on the held-out split itself, so the reported average is an
# optimistic estimate; a separate validation split would be needed to pick the
# epoch without bias.
best_accuracies = []

# The filtered vectors and the compute device are the same for every run, so
# they are loaded / resolved once here instead of once per run.
data_by_class = defaultdict(list)
with open(filtered_vector_file, 'r') as f:
    for line in f:
        # Label = first tab-separated field without its leading '>' character.
        label = line.split('\t')[0][1:]
        data_by_class[label].append(line.strip())

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for run in range(num_runs):
    train_data, test_data = [], []
    for samples in data_by_class.values():
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        # Comparison is by line text, so every line identical to the held-out
        # one is kept out of the training set as well.
        train_data.extend(s for s in samples if s != test_sample)

    train_vectors = [list(map(int, line.split('\t')[1].split())) for line in train_data]
    train_labels = [line.split('\t')[0][1:] for line in train_data]

    test_vectors = [list(map(int, line.split('\t')[1].split())) for line in test_data]
    test_labels = [line.split('\t')[0][1:] for line in test_data]

    # The encoder is fitted on the training labels only; this relies on every
    # test label also occurring in the training set.
    label_encoder = LabelEncoder().fit(train_labels)
    train_labels_encoded = label_encoder.transform(train_labels)
    test_labels_encoded = label_encoder.transform(test_labels)

    train_loader = DataLoader(KmerDataset(train_vectors, train_labels_encoded), batch_size=32, shuffle=True)
    test_loader = DataLoader(KmerDataset(test_vectors, test_labels_encoded), batch_size=32)

    # A new model and optimizer per run, so runs do not share learned weights.
    model = CNNModel(len(train_vectors[0]), len(label_encoder.classes_)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    max_accuracy = 0
    for epoch in range(num_epochs):
        # One pass over the training data.
        model.train()
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out samples after every epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f"Run {run+1}, Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

        if accuracy > max_accuracy:
            max_accuracy = accuracy

    best_accuracies.append(max_accuracy)

avg_accuracy = sum(best_accuracies) / len(best_accuracies)
print(f'Average Highest Accuracy over {num_runs} runs: {avg_accuracy:.4f}')


Run 1, Epoch 1: Accuracy = 0.0526
Run 1, Epoch 2: Accuracy = 0.1538
Run 1, Epoch 3: Accuracy = 0.2551
Run 1, Epoch 4: Accuracy = 0.3725
Run 1, Epoch 5: Accuracy = 0.4413
Run 1, Epoch 6: Accuracy = 0.5547
Run 1, Epoch 7: Accuracy = 0.5668
Run 1, Epoch 8: Accuracy = 0.5506
Run 1, Epoch 9: Accuracy = 0.5830
Run 1, Epoch 10: Accuracy = 0.6032
Run 1, Epoch 11: Accuracy = 0.6437
Run 1, Epoch 12: Accuracy = 0.6073
Run 1, Epoch 13: Accuracy = 0.6559
Run 1, Epoch 14: Accuracy = 0.6599
Run 1, Epoch 15: Accuracy = 0.6721
Run 1, Epoch 16: Accuracy = 0.5951
Run 1, Epoch 17: Accuracy = 0.6356
Run 1, Epoch 18: Accuracy = 0.5668
Run 1, Epoch 19: Accuracy = 0.6194
Run 1, Epoch 20: Accuracy = 0.6275
Run 1, Epoch 21: Accuracy = 0.6275
Run 1, Epoch 22: Accuracy = 0.5385
Run 1, Epoch 23: Accuracy = 0.5951
Run 1, Epoch 24: Accuracy = 0.6154
Run 1, Epoch 25: Accuracy = 0.6518
Run 1, Epoch 26: Accuracy = 0.6721
Run 1, Epoch 27: Accuracy = 0.6356
Run 1, Epoch 28: Accuracy = 0.6518
Run 1, Epoch 29: Accuracy = 0

# k = 6, genera with >= 5 sequences, run 10 times, 0.7704

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
from itertools import product
from collections import defaultdict

# Constants
# k-mer length; the feature vector holds one count per k-mer over ACGT (4**k entries).
k = 6
# Raw FASTA input, read by Step 1.
input_fasta = 'data1/fungi_ITS_sorted90.fasta'
# Step 1 output: same records with each header reduced to its second token.
cleaned_fasta = 'data1/fungi_ITS_cleaned.fasta'
# Step 2 output: one "header<TAB>space-separated counts" line per sequence.
kmer_vector_file = 'data1/fungi_ITS_kmer_vector.txt'
# Step 3 output: the lines of kmer_vector_file whose label occurs at least 5 times.
filtered_vector_file = 'data1/fungi_ITS_kmer_vector_filtered.txt'
# NOTE: the other k cells in this notebook use the same intermediate paths, so
# running any of them overwrites these files.
# Training epochs per run.
num_epochs = 100
# Number of independent split-and-train repetitions that are averaged.
num_runs = 10

# Helper functions
def generate_kmers(k):
    """Return all ``4 ** k`` strings of length ``k`` over the alphabet ACGT.

    The strings come out in the order produced by
    ``itertools.product('ACGT', repeat=k)``, i.e. 'AA', 'AC', 'AG', 'AT', 'CA', ...
    """
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_vector(seq, k, kmer_index):
    """Count how often each indexed k-mer occurs in ``seq``.

    Every overlapping window of width ``k`` is looked up in ``kmer_index``
    (a mapping from k-mer string to position). Windows that are not keys of
    the mapping are ignored; the lookup is case-sensitive.

    Returns a list of ``len(kmer_index)`` integer counts.
    """
    counts = [0] * len(kmer_index)
    windows = (seq[start:start + k] for start in range(len(seq) - k + 1))
    for window in windows:
        slot = kmer_index.get(window)
        if slot is not None:
            counts[slot] += 1
    return counts

class KmerDataset(Dataset):
    """Dataset pairing k-mer count vectors with their encoded class labels."""

    def __init__(self, data, labels):
        # data[i] is the feature vector for sample i, labels[i] its class id.
        self.data, self.labels = data, labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Features become a float tensor, the label a long (int64) tensor.
        features = torch.tensor(self.data[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

class CNNModel(nn.Module):
    """1-D convolutional classifier over a k-mer count vector.

    Three stages of Conv1d (kernel 7, 'same' padding) -> BatchNorm1d ->
    LeakyReLU -> MaxPool1d(2) with 32, 64 and 128 channels feed a fully
    connected head (1024 -> 256 -> num_classes) that outputs class logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            layers += [
                nn.Conv1d(in_channels, out_channels, 7, padding='same'),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(),
                nn.MaxPool1d(2),
            ]
            in_channels = out_channels

        # Three poolings by 2 shrink the length to input_dim // 8; the last
        # conv stage has 128 channels.
        flat_features = (input_dim // 8) * 128
        layers += [
            nn.Flatten(),
            nn.Linear(flat_features, 1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 256),
            nn.LeakyReLU(),
            nn.Linear(256, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # x is expected to have shape (batch_size, input_dim); Conv1d needs an
        # explicit channel axis, so insert one at position 1.
        with_channel = x.unsqueeze(1)
        return self.model(with_channel)

# Step 1: Clean FASTA headers
# Copy the FASTA file, replacing every header that has at least two
# whitespace-separated tokens with '>' + its second token. Headers with a
# single token and all sequence lines are copied unchanged.
with open(input_fasta, 'r') as src, open(cleaned_fasta, 'w') as dst:
    for raw in src:
        if not raw.startswith('>'):
            dst.write(raw)
            continue
        tokens = raw.strip().split()
        if len(tokens) > 1:
            dst.write(f'>{tokens[1]}\n')
        else:
            dst.write(raw)

# Step 2: k-mer vectorization
# Turn every record of the cleaned FASTA into one output line of the form
# "<header>\t<space-separated k-mer counts>".
kmers = generate_kmers(k)
kmer_index = {kmer: position for position, kmer in enumerate(kmers)}

# First collect (header, sequence chunks) pairs. The leading entry with an
# empty header catches any sequence text that precedes the first '>' line.
records = [['', []]]
with open(cleaned_fasta, 'r') as infile:
    for line in infile:
        if line.startswith('>'):
            records.append([line.strip(), []])
        else:
            records[-1][1].append(line.strip())

# Then write one line per record; records without any sequence are skipped.
with open(kmer_vector_file, 'w') as outfile:
    for header, chunks in records:
        sequence = ''.join(chunks)
        if not sequence:
            continue
        counts = kmer_vector(sequence, k, kmer_index)
        outfile.write(header + '\t' + ' '.join(map(str, counts)) + '\n')

# Step 3: Filter sequences by genus frequency
# Pass 1: count how many vector lines carry each label (the first
# whitespace-separated field without its leading character).
genera_count = defaultdict(int)
with open(kmer_vector_file, 'r') as vectors:
    for record in vectors:
        genera_count[record.split()[0][1:]] += 1

# Pass 2: keep only the lines whose label was seen at least 5 times.
with open(kmer_vector_file, 'r') as infile, open(filtered_vector_file, 'w') as outfile:
    outfile.writelines(
        record for record in infile
        if genera_count[record.split()[0][1:]] >= 5
    )

# Main Experiment Loop
# For each run: hold out one randomly chosen sample per class as the test set,
# train a freshly initialised CNN on the remaining samples, and keep the best
# test accuracy observed over the epochs.
# NOTE(review): the per-run figure is the maximum over epochs of the accuracy
# measured on the held-out split itself, so the reported average is an
# optimistic estimate; a separate validation split would be needed to pick the
# epoch without bias.
best_accuracies = []

# The filtered vectors and the compute device are the same for every run, so
# they are loaded / resolved once here instead of once per run.
data_by_class = defaultdict(list)
with open(filtered_vector_file, 'r') as f:
    for line in f:
        # Label = first tab-separated field without its leading '>' character.
        label = line.split('\t')[0][1:]
        data_by_class[label].append(line.strip())

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for run in range(num_runs):
    train_data, test_data = [], []
    for samples in data_by_class.values():
        test_sample = random.choice(samples)
        test_data.append(test_sample)
        # Comparison is by line text, so every line identical to the held-out
        # one is kept out of the training set as well.
        train_data.extend(s for s in samples if s != test_sample)

    train_vectors = [list(map(int, line.split('\t')[1].split())) for line in train_data]
    train_labels = [line.split('\t')[0][1:] for line in train_data]

    test_vectors = [list(map(int, line.split('\t')[1].split())) for line in test_data]
    test_labels = [line.split('\t')[0][1:] for line in test_data]

    # The encoder is fitted on the training labels only; this relies on every
    # test label also occurring in the training set.
    label_encoder = LabelEncoder().fit(train_labels)
    train_labels_encoded = label_encoder.transform(train_labels)
    test_labels_encoded = label_encoder.transform(test_labels)

    train_loader = DataLoader(KmerDataset(train_vectors, train_labels_encoded), batch_size=32, shuffle=True)
    test_loader = DataLoader(KmerDataset(test_vectors, test_labels_encoded), batch_size=32)

    # A new model and optimizer per run, so runs do not share learned weights.
    model = CNNModel(len(train_vectors[0]), len(label_encoder.classes_)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    max_accuracy = 0
    for epoch in range(num_epochs):
        # One pass over the training data.
        model.train()
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the held-out samples after every epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f"Run {run+1}, Epoch {epoch+1}: Accuracy = {accuracy:.4f}")

        if accuracy > max_accuracy:
            max_accuracy = accuracy

    best_accuracies.append(max_accuracy)

avg_accuracy = sum(best_accuracies) / len(best_accuracies)
print(f'Average Highest Accuracy over {num_runs} runs: {avg_accuracy:.4f}')


Run 1, Epoch 1: Accuracy = 0.0445
Run 1, Epoch 2: Accuracy = 0.1093
Run 1, Epoch 3: Accuracy = 0.2065
Run 1, Epoch 4: Accuracy = 0.2915
Run 1, Epoch 5: Accuracy = 0.4453
Run 1, Epoch 6: Accuracy = 0.5101
Run 1, Epoch 7: Accuracy = 0.5304
Run 1, Epoch 8: Accuracy = 0.6113
Run 1, Epoch 9: Accuracy = 0.5911
Run 1, Epoch 10: Accuracy = 0.6316
Run 1, Epoch 11: Accuracy = 0.6518
Run 1, Epoch 12: Accuracy = 0.6640
Run 1, Epoch 13: Accuracy = 0.6802
Run 1, Epoch 14: Accuracy = 0.6883
Run 1, Epoch 15: Accuracy = 0.6640
Run 1, Epoch 16: Accuracy = 0.6640
Run 1, Epoch 17: Accuracy = 0.7085
Run 1, Epoch 18: Accuracy = 0.6518
Run 1, Epoch 19: Accuracy = 0.6478
Run 1, Epoch 20: Accuracy = 0.6761
Run 1, Epoch 21: Accuracy = 0.6640
Run 1, Epoch 22: Accuracy = 0.7166
Run 1, Epoch 23: Accuracy = 0.6680
Run 1, Epoch 24: Accuracy = 0.7206
Run 1, Epoch 25: Accuracy = 0.7409
Run 1, Epoch 26: Accuracy = 0.6883
Run 1, Epoch 27: Accuracy = 0.7530
Run 1, Epoch 28: Accuracy = 0.7449
Run 1, Epoch 29: Accuracy = 0

# Choosing the k-mer length


Let's mathematically evaluate how to choose the optimal k-mer length for DNA sequences of approximately 750 nucleotides.

### Step-by-step Mathematical Reasoning:

Given:

- DNA sequence length \( L \approx 750 \text{ nucleotides} \).

- DNA is composed of 4 nucleotides: \(\{A, T, C, G\}\).

---

## Step 1: Calculate the number of possible k-mers:

- For a k-mer of length \(k\), the number of possible combinations is:
\[
4^k
\]

| k-mer length (k) | Number of combinations (4^k) |
|------------------|------------------------------|
| 3                | \(4^3 = 64\)                   |
| 4                | \(4^4 = 256\)                  |
| 5                | \(4^5 = 1024\)                 |
| 6                | \(4^6 = 4096\)                 |
| 7                | \(4^7 = 16384\)                |

---

## Step 2: Calculate the number of k-mers in a sequence of length 750:

- For a sequence of length \(L\), sliding a window of width \(k\) one nucleotide at a time yields exactly
\[
L - k + 1
\]
overlapping k-mers, which is close to \(L\) whenever \(k \ll L\).

For \(L=750\):

| k-mer length (k) | Number of k-mers per sequence |
|------------------|-------------------------------|
| 4                | \(750 - 4 + 1 = 747\)          |
| 5                | \(750 - 5 + 1 = 746\)          |
| 6                | \(750 - 6 + 1 = 745\)          |
| 7                | \(750 - 7 + 1 = 744\)          |

(Approximately equal numbers.)

---

## Step 3: Calculate Expected Occurrences per k-mer:

- If every nucleotide were drawn independently and uniformly at random, the expected number of occurrences of any particular k-mer in one sequence would be approximately:
\[
\text{Frequency per k-mer} = \frac{\text{Total k-mers in sequence}}{4^k}
\]

| k-mer length (k) | Frequency per unique k-mer     |
|------------------|--------------------------------|
| 4                | \(747 / 256 \approx 2.918\)     |
| 5                | \(746 / 1024 \approx 0.729\)    |
| 6                | \(745 / 4096 \approx 0.182\)    |
| 7                | \(744 / 16384 \approx 0.045\)   |

---

## Analysis of Results:

- **k = 4**:
  - Each unique 4-mer appears ~3 times on average per sequence.
  - Might be too frequent to distinguish meaningful motifs clearly; could generate noise.

- **k = 5**:
  - Each unique 5-mer appears ~0.73 times per sequence (once every ~1.37 sequences).
  - Good balance between capturing recurring motifs without excessive repetition or sparsity.

- **k = 6**:
  - Each unique 6-mer appears ~0.18 times per sequence (approximately once every ~5.5 sequences).
  - Sparse but manageable, good for identifying distinct patterns in larger datasets.

- **k = 7**:
  - Each unique 7-mer appears ~0.045 times per sequence (once every ~22 sequences).
  - Potentially too sparse for effective motif detection unless you have a very large dataset.

---

## Recommended Choice (Mathematically Justified):

- **5-mers (k=5)** provide a balanced frequency (an expected ~0.73 occurrences of each k-mer per sequence, i.e. close to one), capturing clear, distinguishable motifs without excessive sparsity or redundancy.

- **6-mers** could also be good if your analysis focuses on more specific motifs, and your dataset is sufficiently large.

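Thus, under this uniform-random heuristic, the suggested k-mer length for DNA sequences of around 750 nucleotides is given below. This is a rule of thumb rather than a proof of optimality, so the choice should still be checked against measured classification accuracy: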

\[
\boxed{k = 5 \text{ or } k = 6}
\]

Would you like to analyze this further with actual data distributions?

# 