In [None]:
import pandas as pd
import os
from pathlib import Path
from brisc.manuscript_analysis.utils import get_path
import requests, zipfile, io

In [None]:
# Root folder of the local copy of the data. This path is machine-specific:
# edit it to point to your own data folder before running the notebook.
DATA_ROOT = r"E:\temp\brisc_data\brisc"


# Add data_root to sys.path, to find the config.yml file
import sys

# Only append once, so that re-running this cell does not pile up duplicate
# entries in sys.path.
if DATA_ROOT not in sys.path:
    sys.path.append(DATA_ROOT)


# From here on DATA_ROOT is the value returned by get_path rather than the raw
# string above; the cells below combine it with sub-folders using the `/`
# operator.
DATA_ROOT = get_path("barcode_diversity_analysis", DATA_ROOT)

In [None]:
# To download source data, go to:
# Saunders et al. (2023) Source Data 1f doi: 10.1038/s41467-022-34334-1

# Clark et al. (2021) GSM4519333

# Zhang et al. (2024) SRR23310757 (in fig.), SRR23310756, SRR23310758

# Shin et al. (2024) GSE277536 (all data), GSM8524116, GSM8524119 (in fig.)

In [None]:
# Convert Saunders et al. data
data_path = DATA_ROOT / "saunders_source_data"

if not data_path.exists():
    print('Downloading raw data from Saunders et al (2023)')
    url2data = r"https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-022-34334-1/MediaObjects/41467_2022_34334_MOESM6_ESM.zip"
    req = requests.get(url2data, timeout=60)
    # Fail loudly on an HTTP error instead of handing an error page to ZipFile.
    req.raise_for_status()
    # The archive is extracted next to `data_path`.
    # NOTE(review): this assumes the zip unpacks into a "saunders_source_data"
    # folder -- confirm against the archive layout.
    with zipfile.ZipFile(io.BytesIO(req.content)) as zipcontent:
        zipcontent.extractall(data_path.parent)


df = pd.read_csv(
    data_path / "Figure_1f.csv",
    sep=",",
    usecols=["file_type", "n_unique_umibc_pairs", "n_count"],
)
# Keep only the "Pseudotyped_Concatenated" rows.
# NOTE(review): `[1:]` then drops the first of those rows; the reason is not
# documented here -- confirm it is intentional.
df = df[df.file_type == "Pseudotyped_Concatenated"][1:]

# Expand the (value, count) rows into a single column in which each
# `n_unique_umibc_pairs` value appears `n_count` times. `Series.repeat` does
# this without a row-wise apply and simply skips rows whose count is zero.
single_column = (
    df["n_unique_umibc_pairs"]
    .repeat(df["n_count"].astype(int).to_numpy())
    .astype(int)
    .reset_index(drop=True)
)

target_folder = data_path.parent / "collapsed_barcodes" / "saunders_pseudotyped"
# parents=True: the intermediate "collapsed_barcodes" folder may not exist yet.
target_folder.mkdir(parents=True, exist_ok=True)
single_column.to_csv(
    target_folder / "saunders_pseudotyped_bowtie_ed2.txt",
    index=False,
    header=False,
)

In [None]:
# Convert Shin et al. data

# The data from Shin et al. is private until Sep 30, 2026. The code to download it is
# available in their preprint, but the download needs to be done manually. Download the
# tar file and extract its contents into the input_dir defined here:
input_dir = DATA_ROOT / "raw" / "GSE277536_RAW"
output_dir = DATA_ROOT / "collapsed_barcodes"

# The input folder has to be populated by hand, so give an actionable error if
# it is missing instead of the bare error raised by os.listdir.
if not input_dir.is_dir():
    raise FileNotFoundError(
        f"Shin et al. data not found in {input_dir}. Download the GSE277536 tar "
        "file manually and extract its contents there before running this cell."
    )

# Find all barcode counts files (sorted so the processing order is reproducible)
counts_suffix = "_BarcodeCounts.tsv"
files = sorted(f for f in os.listdir(input_dir) if f.endswith(counts_suffix))
if not files:
    print(f"No *{counts_suffix} files found in {input_dir}")

# Files that could not be converted; reported once the loop is done.
failed_files = []

for file_name in files:
    input_file_path = input_dir / file_name
    # Extract prefix for output subfolder
    prefix = file_name.split(counts_suffix)[0]
    output_subfolder = output_dir / prefix
    output_subfolder.mkdir(parents=True, exist_ok=True)
    output_file_path = output_subfolder / f"{prefix}.txt"
    ed2_output_file_path = output_subfolder / f"{prefix}_bowtie_ed2.txt"

    # Read the input file and process it
    try:
        df = pd.read_csv(input_file_path, sep="\t", header=0)
        # Strip underscores from the barcode strings
        df["barcode"] = df["barcode"].str.replace("_", "", regex=False)
        # Keep the count column first, then the barcode, most abundant first
        df = df[["UMI_Count", "barcode"]].rename(columns={"UMI_Count": "count"})
        df = df.sort_values(by="count", ascending=False)

        # Save to the output file. The same table is written under both names.
        # NOTE(review): the "_bowtie_ed2" copy is identical to the plain one;
        # presumably downstream code expects that file name -- confirm.
        df.to_csv(output_file_path, sep="\t", header=False, index=False)
        df.to_csv(ed2_output_file_path, sep="\t", header=False, index=False)
        print(f"Processed and saved: {output_file_path}")

    except Exception as e:
        # Best effort: keep converting the remaining files, but remember the
        # failure so it is not lost in the output.
        failed_files.append(file_name)
        print(f"Failed to process {input_file_path}: {e}")

print("Processing complete!")
if failed_files:
    print(f"{len(failed_files)} file(s) could not be processed: {failed_files}")


In [None]:

# Convert Clark et al. data
clark_data_path = DATA_ROOT / "clark_rabies"
clark_file = clark_data_path / "GSM4519333_Viral.Barcodes.count.table.tsv"

# Check for the file itself rather than its folder, so that an empty folder
# (e.g. left behind by an interrupted run) still triggers the download.
if not clark_file.exists():
    print('Downloading data from Clark et al.')
    url2data = r"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4519nnn/GSM4519333/suppl/GSM4519333%5FViral.Barcodes.count.table.tsv.gz"
    df = pd.read_csv(url2data, compression='gzip', sep='\t', header=0)
    clark_data_path.mkdir(parents=True, exist_ok=True)
    # NOTE(review): to_csv writes the DataFrame index as the first column by
    # default (no index=False here). The header-less reader below then selects
    # columns 1 and 0, so this only yields the intended two columns if read_csv
    # used the first column of the downloaded file as the index -- TODO confirm
    # against the downloaded file.
    df.to_csv(clark_file, sep='\t', header=None)


clark_df = pd.read_csv(
    clark_file,
    sep="\t",
    header=None,
)
# switch the two columns and then resave with a different name, there are no headers
target_folder = clark_data_path.parent / "collapsed_barcodes" / "clark_rabies"
# parents=True: the intermediate "collapsed_barcodes" folder may not exist yet.
target_folder.mkdir(parents=True, exist_ok=True)

clark_df = clark_df[[1, 0]]
clark_df.to_csv(
    target_folder / "clark_rabies_bowtie_ed2.txt",
    sep="\t",
    header=False,
    index=False,
)


In [None]:

# Convert Zhang et al. data by running batch_bowtie_collapse.sh on SRR23310756.fastq SRR23310757.fastq SRR23310758.fastq
# Before running the script, ensure that the fastq files to process are listed in fastqs.txt