# In [None]:
import pandas as pd

path_to_bold = "~/Downloads/BOLD_Public.19-Apr-2024.tsv"

# Fixed seed so the sampled unseen / fine-tuning splits are identical on
# every run instead of changing each time the script is executed.
RANDOM_SEED = 42


def preprocess_barcodes(
    df: pd.DataFrame, min_length: int = 200, max_n_fraction: float = 0.5
) -> pd.DataFrame:
    """Clean the ``nucraw`` column and filter out unusable sequences.

    Steps, in order:
      1. Drop rows whose ``nucraw`` is missing.
      2. Replace every character other than A, C, G, T with ``N``.
      3. Strip trailing runs of ``N``.
      4. Drop duplicate sequences (first occurrence is kept).
      5. Drop sequences shorter than ``min_length``.
      6. Drop sequences whose fraction of ``N`` exceeds ``max_n_fraction``.

    Args:
        df: Frame with a string ``nucraw`` column. It is not modified.
        min_length: Minimum sequence length (after truncation) to keep.
        max_n_fraction: Maximum allowed share of ``N`` characters.

    Returns:
        A new, filtered DataFrame that keeps the original index labels.
    """
    # Explicit copy: assigning to a column of a filtered slice would otherwise
    # raise SettingWithCopyWarning (and is unsafe under chained assignment).
    cleaned = df[df["nucraw"].notna()].copy()

    # NOTE(review): the pattern is case-sensitive, so lowercase bases (a/c/g/t)
    # are also turned into N. Confirm the input is upper-case only.
    cleaned["nucraw"] = cleaned["nucraw"].str.replace("[^ACGT]", "N", regex=True)

    # Truncate trailing Ns BEFORE de-duplicating; otherwise sequences that
    # differ only in trailing ambiguous bases survive as exact duplicates.
    cleaned["nucraw"] = cleaned["nucraw"].str.replace("N+$", "", regex=True)
    cleaned = cleaned.drop_duplicates(subset="nucraw")

    cleaned = cleaned[cleaned["nucraw"].str.len() >= min_length]

    n_fraction = cleaned["nucraw"].str.count("N") / cleaned["nucraw"].str.len()
    return cleaned[n_fraction <= max_n_fraction]


def split_datasets(
    df_preprocessed: pd.DataFrame,
    rare_species_threshold: int = 20,
    unseen_frac: float = 0.01,
    finetune_frac: float = 0.05,
    random_state=RANDOM_SEED,
):
    """Split the preprocessed frame into pretrain / fine-tune / unseen sets.

    Args:
        df_preprocessed: Frame with ``species`` and ``genus`` columns. Index
            labels must be unique because rows are removed by label.
        rare_species_threshold: Species with fewer rows than this are
            candidates for the unseen set.
        unseen_frac: Fraction of the rare-species rows sampled as unseen.
        finetune_frac: Fraction of the remaining genus-labelled rows sampled
            for fine-tuning.
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        A ``(training_df, finetuning_df, unseen_df)`` tuple. The three frames
        share no index labels, and ``training_df`` contains no species that
        appears in ``unseen_df``.
    """
    # Unseen set: a sample of the rows belonging to rarely observed species.
    labelled = df_preprocessed[df_preprocessed["species"].notna()]
    species_counts = labelled["species"].value_counts()
    rare_species = species_counts[species_counts < rare_species_threshold].index
    unseen_df = labelled[labelled["species"].isin(rare_species)].sample(
        frac=unseen_frac, random_state=random_state
    )
    training_df = df_preprocessed.drop(unseen_df.index)

    # Fine-tuning set for genus classification: sampled from what is left,
    # restricted to rows that actually carry a genus label.
    finetuning_df = training_df[training_df["genus"].notna()].sample(
        frac=finetune_frac, random_state=random_state
    )
    training_df = training_df.drop(finetuning_df.index)

    # Remove every remaining row of an unseen species from the pretraining set.
    # NOTE(review): the fine-tuning sample is drawn before this filter, so it
    # may still contain other barcodes of species present in unseen_df.
    # Confirm that this is intended.
    training_df = training_df[~training_df["species"].isin(unseen_df["species"])]

    return training_df, finetuning_df, unseen_df


if __name__ == "__main__":
    df = pd.read_csv(path_to_bold, sep="\t", on_bad_lines="skip")

    # Preprocess dataset
    df_preprocessed = preprocess_barcodes(df)

    # Create splits
    training_df, finetuning_df, unseen_df = split_datasets(df_preprocessed)

    # Save the datasets
    training_df.to_csv("~/Downloads/BOLD_pretrain.tsv", index=False, sep="\t")
    finetuning_df.to_csv("~/Downloads/BOLD_finetune.tsv", index=False, sep="\t")
    unseen_df.to_csv("~/Downloads/BOLD_unseen.tsv", index=False, sep="\t")
