In [None]:
import pandas as pd
import os
from pathlib import Path
from brisc.manuscript_analysis.utils import get_path

In [None]:
# Root folder for every input and output of this notebook. The user-specific
# absolute path below is passed to `get_path` together with the
# "barcode_diversity_analysis" key, and DATA_ROOT is whatever `get_path` returns.
# NOTE(review): presumably `get_path` lets a per-machine configuration override
# this default — confirm in brisc.manuscript_analysis.utils.
DATA_ROOT = get_path("barcode_diversity_analysis", "/Users/blota/Data/brisc")

In [None]:
# Source data to download before running this notebook:
#
# - Saunders et al. (2023): Source Data for Fig. 1f, doi: 10.1038/s41467-022-34334-1
#
# - Clark et al. (2021): GSM4519333
#
# - Zhang et al. (2024): SRR23310757 (used in the figure), SRR23310756, SRR23310758
#
# - Shin et al. (2024): GSE277536 (all data); GSM8524116 and GSM8524119 are used in the figure

In [None]:
# Convert Saunders et al. data
# Reads Figure_1f.csv from the Saunders source data and flattens it into a single
# column: each `n_unique_umibc_pairs` value is written `n_count` times, one per line.
data_path = DATA_ROOT / "saunders_source_data"

saunders_df = pd.read_csv(
    data_path / "Figure_1f.csv",
    sep=",",
    usecols=["file_type", "n_unique_umibc_pairs", "n_count"],
)
# Keep only the "Pseudotyped_Concatenated" rows and skip the first of them.
# NOTE(review): the reason the first matching row is dropped is not visible here
# (presumably a placeholder/zero bin) — confirm against the source table.
saunders_df = saunders_df[saunders_df.file_type == "Pseudotyped_Concatenated"].iloc[1:]

# `Series.repeat` expands the table without a Python-level loop. Rows whose count
# is 0 are simply skipped, whereas building per-row lists and exploding them turns
# an empty list into NaN, which then breaks the integer cast.
single_column = (
    saunders_df["n_unique_umibc_pairs"]
    .repeat(saunders_df["n_count"].astype(int).to_numpy())
    .astype(int)
    .reset_index(drop=True)
)

saunders_output_path = (
    data_path.parent
    / "collapsed_barcodes"
    / "saunders_pseudotyped"
    / "saunders_pseudotyped_bowtie_ed2.txt"
)
# `to_csv` does not create missing folders, so make sure the target folder exists.
saunders_output_path.parent.mkdir(parents=True, exist_ok=True)
single_column.to_csv(saunders_output_path, index=False, header=False)


# Convert Shin et al. data
# Every "<prefix>_BarcodeCounts.tsv" found in GSE277536_RAW is rewritten as a
# headerless, tab-separated "count<TAB>barcode" table sorted by decreasing count,
# inside collapsed_barcodes/<prefix>/.
input_dir = DATA_ROOT / "raw" / "GSE277536_RAW"
output_dir = DATA_ROOT / "collapsed_barcodes"
barcode_counts_suffix = "_BarcodeCounts.tsv"

# Find all barcode counts files. os.listdir returns entries in arbitrary order,
# so sort them to make the processing order (and the log) reproducible.
files = sorted(f for f in os.listdir(input_dir) if f.endswith(barcode_counts_suffix))
if not files:
    print(f"No *{barcode_counts_suffix} files found in {input_dir}")

# (input path, exception) for every file that could not be converted
failed_files = []

for file_name in files:
    input_file_path = input_dir / file_name
    # Extract prefix for output subfolder
    prefix = file_name[: -len(barcode_counts_suffix)]
    output_subfolder = output_dir / prefix
    output_subfolder.mkdir(parents=True, exist_ok=True)
    output_file_path = output_subfolder / f"{prefix}.txt"
    ed2_output_file_path = output_subfolder / f"{prefix}_bowtie_ed2.txt"

    # Read the input file and process it
    try:
        counts_df = pd.read_csv(input_file_path, sep="\t", header=0)
        # Strip the underscores embedded in the barcode strings
        counts_df["barcode"] = counts_df["barcode"].str.replace("_", "", regex=False)
        counts_df = (
            counts_df[["UMI_Count", "barcode"]]
            .rename(columns={"UMI_Count": "count"})
            .sort_values(by="count", ascending=False)
        )

        # Save to the output files. The same table is written under both names:
        # no collapsing is performed here, the "_bowtie_ed2" file is a plain copy.
        counts_df.to_csv(output_file_path, sep="\t", header=False, index=False)
        counts_df.to_csv(ed2_output_file_path, sep="\t", header=False, index=False)
        print(f"Processed and saved: {output_file_path}")

    except Exception as e:
        # Best effort: one unreadable file must not stop the others, but remember
        # it so the final message does not claim success.
        failed_files.append((input_file_path, e))
        print(f"Failed to process {input_file_path}: {e}")

if failed_files:
    print(f"Processing finished with {len(failed_files)} of {len(files)} file(s) failed:")
    for failed_path, _ in failed_files:
        print(f"  {failed_path}")
else:
    print("Processing complete!")

# Convert Clark et al. data
# Rewrites the headerless two-column GSM4519333 table with its columns swapped.
clark_data_path = DATA_ROOT / "clark_rabies"

clark_raw_df = pd.read_csv(
    clark_data_path / "GSM4519333_Viral.Barcodes.count.table.tsv",
    sep="\t",
    header=None,
)
# There are no headers, so the columns are labelled 0 and 1; switch them and
# resave under a different name.
# NOTE(review): presumably this gives the same count-then-barcode layout as the
# Shin et al. output — confirm the column contents of the input table.
clark_df = clark_raw_df[[1, 0]]

# NOTE(review): unlike the Saunders and Shin outputs, this file is written directly
# into collapsed_barcodes rather than into a per-dataset subfolder — confirm that
# downstream readers expect this location.
clark_output_path = (
    clark_data_path.parent / "collapsed_barcodes" / "clark_rabies_bowtie_ed2.txt"
)
# `to_csv` does not create missing folders, so make sure the target folder exists.
clark_output_path.parent.mkdir(parents=True, exist_ok=True)
clark_df.to_csv(
    clark_output_path,
    sep="\t",
    header=False,
    index=False,
)

# Convert Zhang et al. data by running batch_bowtie_collapse.sh on
# SRR23310756.fastq, SRR23310757.fastq and SRR23310758.fastq.
# Before running the script, ensure that the fastq files to process are listed in fastqs.txt.