In [3]:
from utils import download_extract
import csv

# Upper bound on how many distinct values of the last TSV column
# get_unique_mzml_file_names collects before it stops reading.
NUM_UNIQUE_MZML_FILES = 2

# Zip archive to fetch for each split name accepted by generate_tsv.
# Going by the archive names, "train" is a lung data set and "test" a colon one.
DATASET_URLS = {
    "train": "http://proteomics.ucsd.edu/data/cse291_2022/lung_top20_dcf82dfcd2b8456b800d07e682d494b4.zip",
    "test": "http://proteomics.ucsd.edu/data/cse291_2022/colon_top20_87bb3840244542918c777f63352a6115.zip",
}

In [4]:
def get_unique_mzml_file_names(filename, max_files=None):
    """Return the first distinct values found in the last column of a TSV file.

    The first line of the file is treated as a header and skipped. Rows are
    read in order and reading stops as soon as ``max_files`` distinct values
    have been collected, so the file is not necessarily read to the end.

    Args:
        filename: Path of the tab-separated file to read.
        max_files: Maximum number of distinct values to return. When omitted,
            the module-level ``NUM_UNIQUE_MZML_FILES`` is used.

    Returns:
        A list of distinct last-column values in order of first appearance.
        It is shorter than ``max_files`` when the file holds fewer distinct
        values, and empty for an empty or header-only file or when
        ``max_files`` is not positive.
    """
    if max_files is None:
        max_files = NUM_UNIQUE_MZML_FILES
    if max_files <= 0:
        return []

    # A dict keeps insertion order, so the result is deterministic
    # (a set would return the names in arbitrary order).
    unique_mzml_files = {}
    # newline="" is what the csv module expects from files it reads.
    with open(filename, "r", newline="") as tsv_file:
        reader = csv.reader(tsv_file, delimiter="\t")
        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                # csv yields [] for a blank line; there is no last column to read.
                continue
            unique_mzml_files.setdefault(row[-1], None)
            if len(unique_mzml_files) >= max_files:
                break
    return list(unique_mzml_files)



def generate_tsv(type):
    """Fetch a split's full table and write a reduced ``{type}.tsv`` next to it.

    Steps:
      1. Call ``download_extract`` with ``DATASET_URLS[type]``, the directory
         ``./data/{type}/raw`` and the file name ``{type}_full.tsv``.
      2. Ask ``get_unique_mzml_file_names`` which file paths (last column) to
         keep and ignore every row that belongs to any other file.
      3. For every (file path, scan number) pair, walk its rows in file order.
         Rows whose peptide has already been kept for that scan are ignored.
         Of the remaining rows the first is kept, the next four are skipped,
         the following three are kept and everything after that is skipped,
         so at most four rows are kept per scan.
      4. Write the header followed by the kept rows to
         ``./data/{type}/raw/{type}.tsv`` (tab-separated, ``\\n`` line ends).

    Columns read: index 2 is parsed as the integer scan number, index 9 is the
    peptide, and the last column is the file path.

    Args:
        type: Split name; must be a key of ``DATASET_URLS``. The name shadows
            the ``type`` builtin but is kept so existing callers keep working.

    Raises:
        KeyError: If ``type`` is not a key of ``DATASET_URLS``.
        ValueError: If the full table is empty or a kept row has a
            non-integer scan number.
    """
    raw_dir = f"./data/{type}/raw"
    data_filename = f"{type}_full.tsv"
    full_path = f"{raw_dir}/{data_filename}"
    # NOTE(review): download_extract comes from utils; whether it skips work
    # when the archive is already present is not visible here - confirm.
    download_extract(DATASET_URLS[type], raw_dir, data_filename)

    # Only membership is needed, so a set is the natural container.
    selected_files = set(get_unique_mzml_file_names(full_path))

    required_rows = []
    # Scans are identified by (file path, scan number): rows from several
    # files are mixed here, and keying on the scan number alone would make a
    # scan in one file look "already seen" because of another file.
    # NOTE(review): assumes scan numbers can repeat across files - confirm.
    seen_scans = set()
    # Selection state of the scan currently being walked.
    # NOTE(review): this assumes all rows of one scan are contiguous - confirm.
    current_peptides = []
    count = 1

    with open(full_path, "r", newline="") as tsv_file:
        reader = csv.reader(tsv_file, delimiter='\t')
        header = next(reader, None)
        if header is None:
            raise ValueError(f"{full_path} is empty; expected a header row")
        for row in reader:
            if not row:
                # Blank line: no columns to inspect.
                continue
            filepath = row[-1]
            if filepath not in selected_files:
                continue
            peptide = row[9]
            scan_key = (filepath, int(row[2]))

            if scan_key not in seen_scans:
                # First row of a new scan is always kept and resets the state.
                seen_scans.add(scan_key)
                current_peptides = [peptide]
                count = 1
                required_rows.append(row)
            elif peptide in current_peptides:
                # Peptide already kept for this scan; does not advance count.
                continue
            elif count < 5:
                # Skip the next four rows with a not-yet-kept peptide.
                count += 1
            elif count < 8:
                # Keep the following three rows.
                count += 1
                required_rows.append(row)
                current_peptides.append(peptide)
            # count >= 8: four rows are already kept for this scan, skip.

    # newline="" stops the platform from rewriting the explicit '\n' terminator.
    with open(f"{raw_dir}/{type}.tsv", "w", newline="") as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
        writer.writerow(header)
        writer.writerows(required_rows)

In [5]:
# Build the reduced training table: fetches DATASET_URLS["train"] and writes
# ./data/train/raw/train.tsv.
generate_tsv("train")


05/26 20:07:17 [NOTICE] Downloading 1 item(s)
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 0B/0B CN:1 DL:0B]
[#ee756e 416KiB/2.5GiB(0%) CN:5 DL:3.4MiB ETA:12m22s]
[#ee756e 3.3MiB/2.5GiB(0%) CN:5 DL:3.0MiB ETA:13m58s]
[#ee756e 6.5MiB/2.5GiB(0%) CN:5 DL:3.1MiB ETA:13m42s]
[#ee756e 9.8MiB/2.5GiB(0%) CN:5 DL:3.2MiB ETA:13m19s]
[#ee756e 13MiB/2.5GiB(0%) CN:5 DL:3.2MiB ETA:13m6s]
[#ee756e 16MiB/2.5GiB(0%) CN:5 DL:3.3MiB ETA:12m52s]
[#ee756e 20MiB/2.5GiB(0%) CN:5 DL:3.4MiB ETA:12m20s]
[#ee756e 24MiB/2.5GiB(0%) CN:5 DL:3.4MiB ETA:12m20s]
[#ee756e 28MiB/2.5GiB(1%) CN:5 DL:3.4MiB ETA:12m9s]
[#ee756e 32MiB/2.5GiB(1%) CN:5 DL:3.5MiB ETA:11m48s]
[#ee756e 36MiB/2.5GiB(1%) CN:5 DL:3.6MiB ETA:11m31s]
[#ee756e 39MiB/2.5GiB(1%) CN:5 DL:3.7MiB ETA:11m24s]
[#ee756e 42MiB/2.5GiB(1%) CN:5 DL:3.6MiB ETA:11m24s]
[#ee756e

KeyboardInterrupt: 

In [None]:
# Build the reduced test table: fetches DATASET_URLS["test"] and writes
# ./data/test/raw/test.tsv.
generate_tsv("test")