In [None]:
import os
import subprocess
import pandas as pd
import dxdy

# Command functions

In [None]:
# download command
def dx_download(project_name, file_name, dest_name):
    """Download one file from a DNAnexus project with the `dx download` CLI.

    Args:
        project_name: Project identifier placed before the ':' in the remote path.
        file_name: Path of the file inside the project (may contain spaces).
        dest_name: Local path passed to `dx download -o`.

    Raises:
        subprocess.CalledProcessError: if `dx download` exits with a non-zero status.
    """
    remote_path = f"{project_name}:{file_name}"
    print(f"Downloading {file_name} from {project_name} to {dest_name}")
    # Pass an argument list instead of a shell string: every value reaches `dx`
    # as exactly one argument, so spaces or shell metacharacters in either path
    # are neither split nor interpreted by a shell.
    subprocess.run(["dx", "download", remote_path, "-o", str(dest_name)], check=True)

# upload command
def dx_upload(file_name, dest_name):
    """Upload one local file to DNAnexus with the `dx upload` CLI.

    Args:
        file_name: Local path of the file to upload.
        dest_name: Remote path passed to `dx upload -o`.

    Raises:
        subprocess.CalledProcessError: if `dx upload` exits with a non-zero status.
    """
    print(f"Uploading {file_name} to {dest_name}")
    # Argument list + no shell: paths containing spaces or shell metacharacters
    # are handed to `dx` verbatim instead of being split or expanded.
    subprocess.run(["dx", "upload", str(file_name), "-o", str(dest_name)], check=True)

# Data preparation

In [None]:
## Download genotype data from DNA Nexus
# Create the local download directory. exist_ok=True makes this idempotent
# without a separate existence check (no check-then-create race).
os.makedirs("UKB_genotype_data", exist_ok=True)


# project_id = dxpy.api.user_get_project({'project': dxpy.DXProject('').get_id()})['project']
# project_name = dxpy.DXProject(project_id).describe()['name']
# NOTE(review): despite the variable name, the value below looks like a project
# ID rather than a human-readable name; it is used as the prefix of `dx` paths.
project_name = "project-GxqpVq0Jpp5Py82xVbZV198y"

# UK Biobank folder (inside the project) that holds the genotype files
genotype_folder = "/Bulk/Imputation/UKB imputation from genotype"


## Download files for chromosomes 1-22, X, and XY

In [None]:
# Chr1-22, X, XY
def file_download(chr_num):
    """Download the four per-chromosome files for `chr_num` into UKB_genotype_data/.

    For each of the extensions bgen, bgen.bgi, mfi.txt and sample (in that
    order), fetches `ukb22828_c{chr_num}_b0_v3.<ext>` from `genotype_folder`
    in `project_name` and stores it locally as `chr{chr_num}.<ext>`.
    """
    remote_stem = f"{genotype_folder}/ukb22828_c{chr_num}_b0_v3"
    local_stem = f"UKB_genotype_data/chr{chr_num}"
    for extension in ("bgen", "bgen.bgi", "mfi.txt", "sample"):
        dx_download(
            project_name,
            f"{remote_stem}.{extension}",
            f"{local_stem}.{extension}",
        )


## Convert BGEN files to PLINK format

In [None]:
## bgen transform to plink format
def bgen_to_plink(bgen_file, sample_file, output_prefix, keep_file):
    """Convert a BGEN file to PLINK bed/bim/fam files by running `plink2`.

    Args:
        bgen_file: Path passed to `--bgen` (read with the `ref-first` modifier).
        sample_file: Path passed to `--sample`.
        output_prefix: Prefix passed to `--out`. The variant list given to
            `--extract` is read from `{output_prefix}.mfi_filtered.txt`, so that
            file must already exist.
        keep_file: Path passed to `--keep`.

    Raises:
        subprocess.CalledProcessError: if `plink2` exits with a non-zero status.
    """
    print(f"Transforming {bgen_file} to plink format...")
    # Build an argument list rather than a shell string so that paths with
    # spaces or shell metacharacters are passed to plink2 unmodified.
    cmd = [
        "plink2",
        "--bgen", str(bgen_file), "ref-first",
        "--sample", str(sample_file),
        "--extract", f"{output_prefix}.mfi_filtered.txt",
        "--keep", str(keep_file),
        "--make-bed",
        "--out", str(output_prefix),
    ]
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)
    print(f"Transformed {bgen_file} to plink format.")

## Filter SNPs with info score >= 0.9 (extract high quality imputed SNPs)

In [None]:
def filter_info(input_file, output_file, info_col=7, snp_col=1, threshold=0.9):
    """Write the SNP identifiers whose info score is at least `threshold`.

    Each line of `input_file` is split on whitespace. The field at index
    `info_col` (0-based) is parsed as a float; when it is >= `threshold`, the
    field at index `snp_col` is written to `output_file`, one per line.
    Lines with too few fields or a non-numeric score are skipped silently.
    """
    print(f"Filtering SNPs with info score >= {threshold} from {input_file} to {output_file}...")
    with open(input_file, 'r') as source, open(output_file, 'w') as sink:
        for record in source:
            columns = record.split()
            try:
                # Indices are 0-based: with the defaults, the 8th column is the score.
                if float(columns[info_col]) >= threshold:
                    sink.write(f"{columns[snp_col]}\n")
            except (IndexError, ValueError):
                # Malformed line (short or non-numeric score): ignore it.
                pass

## Subset and filter subjects (to be updated)

In [None]:
# Subset and filter sample file create (to be updated)

def subset_samples(input_file, output_file, filter_caucasian: bool = True):
    """Build a PLINK `--keep` file (FID, IID) from a participant CSV.

    Args:
        input_file: Comma-separated participant table. Must contain an `eid`
            column, and a `p22006` column when `filter_caucasian` is True.
            Column names are stripped of surrounding whitespace.
        output_file: Destination path; written tab-separated, without header,
            with `eid` repeated as both FID and IID.
        filter_caucasian: When True, keep only rows whose `p22006` value is 1.

    Raises:
        KeyError: if a required column is missing from the table.
    """
    print(f"Subsetting samples from {input_file} to {output_file}...")
    # Load CSV and standardize column names
    df = pd.read_csv(input_file, sep=",")
    df.columns = [col.strip() for col in df.columns]
    # Filter by Caucasian
    if filter_caucasian:
        # read_csv parses a column of 1/blank values as numeric (1.0 / NaN), so
        # comparing against the string "1" would match nothing. Coerce to
        # numeric so 1, 1.0 and "1" are all treated as the same code.
        is_caucasian = pd.to_numeric(df["p22006"], errors="coerce") == 1
        df = df[is_caucasian]
    else:
        print("No filtering applied for Caucasian samples.")

    # format for plink keep: two columns, FID then IID, both set to eid
    keep_df = pd.DataFrame({"FID": df["eid"], "IID": df["eid"]})

    # Save to file
    keep_df.to_csv(output_file, sep="\t", index=False, header=False)
    print(f"Saved {len(keep_df)} samples to {output_file}")
    

# Build the sample keep-list. The participant table ("full_participant.csv")
# was prepared beforehand with the cohort app and lives in the project.
input_file = "participant_table.csv"
output_file = "image_subset.txt"

# Fetch the table to the local file, then write every participant
# (no Caucasian filter) to the keep-list.
dx_download(project_name, "full_participant.csv", input_file)
subset_samples(input_file, output_file, filter_caucasian=False)

# Main pipeline

In [None]:
# Chromosomes to process: 1-22 plus X and XY, as strings to match the file names
chr_num_list = [str(n) for n in range(1, 23)] + ["X", "XY"]

# Project folder that receives the converted PLINK files; customize this path
genotype_destination_folder = "/GWAS_pipeline/Genotype_data"

# Sample keep-list written earlier in the notebook by subset_samples().
# It is the same for every chromosome, so it is NOT regenerated inside the loop
# (the previous in-loop call passed the MFI paths to subset_samples, which
# would have parsed the MFI file as the participant CSV and overwritten the
# filtered SNP list).
keep_file = "image_subset.txt"

for chr_num in chr_num_list:
    # Download bgen, bgi, mfi, and sample files
    file_download(chr_num)

    output_prefix = f"UKB_genotype_data/chr{chr_num}"

    # Filter the MFI file down to SNP IDs with info score >= 0.9.
    # bgen_to_plink() reads this list from "{output_prefix}.mfi_filtered.txt".
    mfi_file = f"{output_prefix}.mfi.txt"
    mfi_filtered_file = f"{output_prefix}.mfi_filtered.txt"
    filter_info(mfi_file, mfi_filtered_file, info_col=7, snp_col=1, threshold=0.9)

    # Convert BGEN to PLINK format. The input must be the .bgen file itself
    # (it was previously pointed at the .mfi.txt file by mistake).
    bgen_file = f"{output_prefix}.bgen"
    sample_file = f"{output_prefix}.sample"
    bgen_to_plink(bgen_file, sample_file, output_prefix, keep_file)

    # Upload the chromosome's PLINK files to the project
    for plink_ext in ("bed", "bim", "fam"):
        dx_upload(
            f"{output_prefix}.{plink_ext}",
            f"{genotype_destination_folder}/chr{chr_num}_62k.{plink_ext}",
        )
    
    
